In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/marketing-strategy-personalised-offer/sample.csv
/kaggle/input/marketing-strategy-personalised-offer/train_data.csv
/kaggle/input/marketing-strategy-personalised-offer/test_data.csv


In [2]:
### Importing the required libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score , make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


In [3]:
############################# Reading the training and test data ###########################################
# Both CSVs are loaded in one pass; the unpacking order matches the path order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/marketing-strategy-personalised-offer/train_data.csv',
        '/kaggle/input/marketing-strategy-personalised-offer/test_data.csv',
    )
)

In [4]:
################################### Testing the value counts for checking the imbalance and categories ##########
# Collect the object-dtype columns first, then print each one's category frequencies.
object_columns = [name for name in train.columns if train[name].dtypes == object]
for name in object_columns:
    print(name)
    print(train[name].value_counts())
        

offer expiration
2days      6791
10hours    5588
Name: offer expiration, dtype: int64
income_range
₹25000 - ₹37499     2010
₹37500 - ₹49999     1752
₹12500 - ₹24999     1743
₹100000 or More     1706
₹50000 - ₹62499     1617
Less than ₹12500    1012
₹87500 - ₹99999      880
₹75000 - ₹87499      830
₹62500 - ₹74999      829
Name: income_range, dtype: int64
no_visited_Cold drinks
less1    3304
1~3      3147
never    2903
4~8      1776
gt8      1051
Name: no_visited_Cold drinks, dtype: int64
Restaur_spend_less_than20
1~3      5248
4~8      3485
less1    2052
gt8      1265
never     208
Name: Restaur_spend_less_than20, dtype: int64
Marital Status
Married partner      4987
Single               4665
Unmarried partner    2104
Divorced              512
Widowed               111
Name: Marital Status, dtype: int64
restaurant type
Cold drinks             3471
2 star restaurant       2642
Take-away restaurant    2341
4 star restaurant       2199
Restaurant with pub     1726
Name: restaurant type, d

In [5]:
####################### Checking for null values ###########################
# Per-column count of missing entries (isna is the alias of isnull).
train.isna().sum()

offer expiration                            0
income_range                                0
no_visited_Cold drinks                    198
travelled_more_than_15mins_for_offer        0
Restaur_spend_less_than20                 121
Marital Status                              0
restaurant type                             0
age                                         0
Prefer western over chinese                 0
travelled_more_than_25mins_for_offer        0
travelled_more_than_5mins_for_offer         0
no_visited_bars                            93
gender                                      0
car                                     12268
restuarant_same_direction_house             0
Cooks regularly                             0
Customer type                               0
Qualification                               0
is foodie                                   0
no_Take-aways                             144
Job/Job Industry                            0
restuarant_opposite_direction_hous

In [6]:
######################################## Treating the null values with filling techniques ###########################
def nan_values(train):
    """Fill missing values in the visit/spend frequency columns with each column's mode.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing the five frequency columns listed below. A missing
        column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the columns are filled in place.

    Notes
    -----
    The mode is computed from the frame that is passed in, so a train frame
    and a test frame are each imputed with their own most frequent value.
    """
    columns_to_fill = (
        'no_visited_Cold drinks',
        'no_Take-aways',
        'Restaur_spend_greater_than20',
        'no_visited_bars',
        'Restaur_spend_less_than20',
    )
    for column in columns_to_fill:
        modes = train[column].mode()
        # mode() is empty when the column has no non-null value at all; there is
        # nothing sensible to fill with, so leave the column as it is instead of
        # failing on modes[0].
        if modes.empty:
            continue
        train[column] = train[column].fillna(modes.iloc[0])
    return train

In [7]:
###########################Converting the income range variable to a numerical variable #####################
# Representative rupee value for every income bracket label handled here:
# the midpoint of a bounded bracket, and the lower bound of the open-ended top bracket.
_INCOME_BRACKET_VALUES = {
    'Less than ₹12500': (0 + 12500) / 2,
    '₹12500 - ₹24999': (12500 + 24999) / 2,
    '₹25000 - ₹37499': (25000 + 37499) / 2,
    '₹37500 - ₹49999': (37500 + 49999) / 2,
    '₹50000 - ₹62499': (50000 + 62499) / 2,
    '₹62500 - ₹74999': (62500 + 74999) / 2,
    '₹75000 - ₹87499': (75000 + 87499) / 2,
    '₹87500 - ₹99999': (87500 + 99999) / 2,
    '₹100000 or More': 100000,
}


def income_range(df):
    """Convert one row's ``income_range`` label to a numeric value.

    Intended for ``DataFrame.apply(income_range, axis=1)``.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single row; only ``df['income_range']`` is read.

    Returns
    -------
    int or float
        The bracket midpoint, or 100000 for the open-ended top bracket.

    Raises
    ------
    ValueError
        If the label is not one of the known brackets. Previously any
        unrecognised label (including NaN) was silently treated as the
        '₹62500 - ₹74999' bracket.
    """
    label = df['income_range']
    try:
        return _INCOME_BRACKET_VALUES[label]
    except (KeyError, TypeError):
        # TypeError covers unhashable labels; both cases mean "not a known bracket".
        raise ValueError(f"Unknown income_range label: {label!r}") from None

In [8]:
###################################### Creating a single function for pre-processing of both the train and test data.
def preprocessing(train, a):
    """Turn a raw train or test frame into an all-numeric, max-scaled feature frame.

    Steps, in order: drop ``car``; impute the frequency columns via
    ``nan_values``; encode ``offer expiration``, ``gender``, the target (train
    only), ``income_range`` and ``age`` numerically; map the ordinal frequency
    columns and ``Qualification`` to integer ranks; one-hot encode the nominal
    columns; drop ``Job/Job Industry``; divide every column by its maximum.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw frame as read from the CSV. It is not modified; work happens on the
        copy returned by ``drop``.
    a : str
        Pass ``'train'`` to also encode ``Offer Accepted`` as 1 ('Yes') / 0.
        Any other value leaves the target untouched.

    Returns
    -------
    pandas.DataFrame
        The processed frame with float columns.

    Notes
    -----
    * Scaling and one-hot encoding use only the frame passed in, so train and
      test frames are scaled by their own maxima and may end up with different
      dummy columns if their category sets differ.
    * NOTE(review): the Qualification scale ranks 'Associates degree' (5) above
      'Bachelors degree' (4) -- confirm this ordering is intended.
    """
    train = train.drop(['car'], axis=1)
    train = nan_values(train)

    train['offer expiration'] = np.where(train['offer expiration'] == '2days', 48, 10)
    train['gender'] = np.where(train['gender'] == 'Female', 1, 0)
    if a == 'train':
        train['Offer Accepted'] = np.where(train['Offer Accepted'] == 'Yes', 1, 0)
    train['income_range'] = train.apply(income_range, axis=1)
    # The two textual age buckets get fixed stand-in values before the int cast.
    train['age'] = np.where(
        train['age'] == '50plus',
        55,
        np.where(train['age'] == 'below21', 10, train['age']),
    )
    train['age'] = train['age'].astype(int)

    ############################### Created a scale maker as the data is ordinal #################################
    scale_maker = {'never': 0, 'less1': 1, '1~3': 2, '4~8': 3, 'gt8': 4}
    ordinal_columns = (
        'Restaur_spend_greater_than20',
        'no_Take-aways',
        'no_visited_bars',
        'Restaur_spend_less_than20',
        'no_visited_Cold drinks',
    )
    for column in ordinal_columns:
        train[column] = train[column].replace(scale_maker)

    ########################## One hot encoding the data for categorical variables ################################
    # Processed one at a time, in this order, so the dummy columns are appended
    # in the same order as before.
    nominal_columns = ('drop location', 'Customer type', 'Marital Status', 'restaurant type', 'Climate')
    for column in nominal_columns:
        dummies = pd.get_dummies(train[column], drop_first=True)
        train = pd.concat([train, dummies], axis=1).drop([column], axis=1)

    scale = {'Some High School':1,'High School Graduate':2,'Some college - no degree':3,'Bachelors degree':4,'Associates degree':5,'Graduate degree (Masters or Doctorate)':6}
    train['Qualification'] = train['Qualification'].replace(scale)
    train = train.drop(['Job/Job Industry'], axis=1)

    for column in train.columns:
        # Cast first: boolean dummy columns (newer pandas) do not support division.
        values = train[column].astype(float)
        column_max = values.max()
        # A column whose maximum is 0 would become NaN/inf when divided by it;
        # keep such a column unchanged instead.
        train[column] = values / column_max if column_max != 0 else values
    return train

In [9]:
##################################### Preprocessing the train data and creating train and hold-out datasets #############
train = preprocessing(train,'train')
X = train.drop(['Offer Accepted'], axis=1)
# Select the target as a 1-D Series: a one-column DataFrame makes the
# scikit-learn estimators warn about a column-vector y on fit.
y = train['Offer Accepted']
# 70/30 split with a fixed seed; x_test / y_test are the hold-out part of the
# labelled data, not the competition test file.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=32)

In [10]:
####################### Checking Support vector Classifier for choosing the best model #############################
# svc = SVC()
# svc.fit(x_train,y_train)
# f1_score(svc.predict(x_test) , y_test)

In [11]:
####################### Checking K Nearest Neighbours for choosing the best model #############################
# knc = KNeighborsClassifier()
# knc.fit(x_train,y_train)
# f1_score(knc.predict(x_test) , y_test)

In [12]:
# ####################### Checking XG Boost for choosing the best model #############################
# d = XGBClassifier()
# d.fit(x_train,y_train)
# f1_score(d.predict(x_test) , y_test)

In [13]:
####################### Checking Random Forest Classifier for choosing the best model #############################
# Fixed seed so the baseline score is reproducible across runs.
rff = RandomForestClassifier(random_state=42)
# np.ravel gives fit() a 1-D target whether y_train is a Series or a one-column frame.
rff.fit(x_train, np.ravel(y_train))
# f1_score expects (y_true, y_pred) in that order.
f1_score(y_test, rff.predict(x_test))

  This is separate from the ipykernel package so we can avoid doing imports until


0.6956149732620321

In [15]:
########################## Using Grid Search for Hyper Parameter Tuning ###############################
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [30,40,50],
#     'max_features': [2, 3,4,5],
#     'min_samples_leaf': [1,2,5,20],
#     'min_samples_split': [2,3,5,20],
#     'n_estimators': [100,80,50,90]
# }
# d.get_params()

In [17]:
########################### Grid Search for Random Forest ################################
# f1 = make_scorer(f1_score , average='macro')
# rf= GridSearchCV(rff,param_grid,scoring= f1)
# rf.fit(x_train,y_train)
# rf.best_params_

In [18]:
########################## Training the RF ######################
# Hyper-parameters of the final random forest, gathered in one place.
# NOTE(review): presumably taken from the grid search above -- confirm.
rf_params = dict(
    criterion='gini',
    max_depth=40,
    max_features=5,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=80,
    random_state=42,
)
rff = RandomForestClassifier(**rf_params)

In [19]:
#################################### Testing for F1 Score ##############################
# np.ravel gives fit() a 1-D target whether y_train is a Series or a one-column frame.
rff.fit(x_train, np.ravel(y_train))
# f1_score expects (y_true, y_pred) in that order.
f1_score(y_test, rff.predict(x_test))


  """Entry point for launching an IPython kernel.


0.6976344086021505

In [20]:
################################### Predicting for test data set and submitting ############################
test = preprocessing(test,'test')
# The test dummies are built independently of the training ones, so line the
# columns up with the features the model was fitted on: same set, same order,
# with 0 for any dummy column the test data did not produce.
test = test.reindex(columns=x_train.columns, fill_value=0)
predictions = rff.predict(test)
# 'id' is the zero-based row position of each test record.
df = pd.DataFrame({
    'id': np.arange(len(predictions)),
    'Offer Accepted': np.where(predictions == 1, 'Yes', 'No'),
})
df.to_csv('/kaggle/working/submission.csv',index=False)