### Stage 1: Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
#from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              AdaBoostClassifier)
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

### Stage 2: Setting up and reading data!

In [2]:
# Read the training and test sets from the working directory.
df, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')
df.shape, test.shape  # bare mid-cell expression: evaluated, but not echoed by the notebook
# Keep a handle on the target column before the feature frame is modified.
labels = df['click']

#df.head()
# Optional diagnostic: fraction of missing values per column in each frame.
#print(df.isnull().sum(axis=0)/df.shape[0], test.isnull().sum(axis=0)/test.shape[0])
# Replace every missing value in both frames with the sentinel -999.
df, test = df.fillna(-999), test.fillna(-999)

### Stage 3: Making New Features or Feature Engineering!!

In [4]:
cols = ['siteid','offerid','category','merchant']

# Cast the identifier-like columns to object dtype, one frame at a time.
for frame in (df, test):
    for x in cols:
        frame[x] = frame[x].astype('object')

# Integer-encode each column with a single encoder fitted on the train and
# test values together, so a given value gets the same code in both frames.
cat_cols = ['category', 'merchant', 'siteid']
for col in cat_cols:
    train_values = list(df[col].values)
    test_values = list(test[col].values)
    lbl = LabelEncoder().fit(train_values + test_values)
    df[col] = lbl.transform(train_values)
    test[col] = lbl.transform(test_values)

In [5]:
# Same shared-encoder scheme as the previous cell, applied to the three
# columns that are one-hot encoded further down.
cat_cols = ['browserid', 'countrycode', 'devid']
for col in cat_cols:
    raw_train, raw_test = list(df[col].values), list(test[col].values)
    lbl = LabelEncoder()
    # Fit on the concatenation so both frames share one code table.
    lbl.fit(raw_train + raw_test)
    df[col], test[col] = lbl.transform(raw_train), lbl.transform(raw_test)

df.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click
0,IDsrk7SoW,2017-01-14 09:42:09,128865,887235,48,127,4,3,0,0
1,IDmMSxHur,2017-01-18 17:50:53,142053,178235,59,65,1,9,1,0
2,IDVLNN0Ut,2017-01-11 12:46:49,2618,518539,69,15,0,2,0,0
3,ID32T6wwQ,2017-01-17 10:18:43,243406,390352,117,507,2,3,2,0
4,IDqUShzMg,2017-01-14 16:02:33,154278,472937,36,276,3,9,1,0


In [6]:
# One-hot encoding.
# Both frames are encoded against the *union* of levels seen in train and
# test. Calling get_dummies on each frame independently would emit a column
# only for the levels that frame happens to contain, so a level present in
# just one of them would leave `df` and `test` with different feature columns
# and break `predict` on the test set.
dummy_fields = ['browserid', 'countrycode', 'devid']
for each in dummy_fields:
    # Sorted shared level set (the columns hold integer codes at this point).
    levels = np.union1d(df[each].unique(), test[each].unique())
    dummies = pd.get_dummies(df[each].astype('category').cat.set_categories(levels),
                             prefix=each, drop_first=False)
    dummies_test = pd.get_dummies(test[each].astype('category').cat.set_categories(levels),
                                  prefix=each, drop_first=False)
    df = pd.concat([df, dummies], axis=1)
    test = pd.concat([test, dummies_test], axis=1)

# Remove the parent columns now that their indicator columns exist.
target_fields = ['browserid', 'countrycode', 'devid']
df = df.drop(target_fields, axis=1)
test = test.drop(target_fields, axis=1)

In [7]:
# Separate the datetime string into individual calendar/clock columns.
# The parts are cast to int: the raw splits are zero-padded strings ('01',
# '09', ...), which would leave object-dtype columns in the feature matrix.
for frame in (df, test):
    # 'YYYY-MM-DD HH:MM:SS' -> 'Date' and 'Time' (both are dropped in a later cell).
    frame[['Date', 'Time']] = frame.datetime.str.split(expand=True)

    date_parts = frame.Date.str.split('-', expand=True)   # 0: year, 1: month, 2: day
    time_parts = frame.Time.str.split(':', expand=True)   # 0: hour, 1: minute, 2: second

    # frame['Year'] = date_parts[0].astype(int)
    frame['Month'] = date_parts[1].astype(int)
    frame['Day'] = date_parts[2].astype(int)
    frame['Hour'] = time_parts[0].astype(int)
    frame['Minute'] = time_parts[1].astype(int)

# Seconds are probably irrelevant; decide after running the model once.
#df['second']=[d.split(':')[2] for d in df.Time]

In [8]:
# Remove the columns that are not used as model features.
target_fields = ['datetime', 'Date', 'Time', 'ID', 'offerid']
df = df.drop(target_fields, axis=1)
test = test.drop(target_fields, axis=1)
# The target was saved in `labels` when the data was loaded; drop it from the
# feature frame. `axis` is passed by keyword because recent pandas releases
# no longer accept it positionally.
df = df.drop('click', axis=1)

In [8]:
# This model performed well; now testing the one below.
# nn = MLPClassifier(activation = 'logistic', solver= 'adam', alpha=1e-5,
#                    hidden_layer_sizes=(50), random_state=1)

# nn = MLPClassifier(activation = 'logistic', solver= 'sgd', alpha=1e-5, early_stopping = True,
#                    validation_fraction = 0.4,
#                    hidden_layer_sizes=(50), random_state=1, verbose=True, shuffle=True,
#                   learning_rate='adaptive')

In [9]:
# 

# # training the Neural Network!!
# nn.fit(df,labels)


# submitted with accuracy of 0.6465
# Iteration 1, loss = 1.08616382
# Validation score: 0.964399
# Iteration 2, loss = 0.73218327
# Validation score: 0.966454
# Iteration 3, loss = 0.54930338
# Validation score: 0.970922
# Iteration 4, loss = 0.36410596
# Validation score: 0.949773
# Iteration 5, loss = 0.15450817
# Validation score: 0.965143
# Iteration 6, loss = 0.14502448
# Validation score: 0.965415
# Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
# Out[12]:
# MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
#        beta_2=0.999, early_stopping=True, epsilon=1e-08,
#        hidden_layer_sizes=50, learning_rate='adaptive',
#        learning_rate_init=0.001, max_iter=200, momentum=0.9,
#        nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
#        solver='adam', tol=0.0001, validation_fraction=0.2, verbose=True,
#        warm_start=False)

In [10]:
# pred = nn.predict_proba(test)
# test1 = pd.read_csv('test.csv')
# ID = test1.ID
# del(test1)
# preds= pd.DataFrame(pred)
# preds.columns = ['remove', 'click']
# #del(pred)
# sub = pd.DataFrame({'ID':ID, 'click': preds.click})
# sub.to_csv('isingh3.csv', index=False)

## Stage 4: Trying different boosted classifiers

In [13]:
# --- Model settings ---------------------------------------------------------
RANDOM_SEED = 13
n_estimators = 29      # ensemble size passed to the classifiers in this notebook
n_classes = 2

# --- Plot settings (not referenced by the visible cells) ---------------------
cmap = plt.cm.RdYlBu
plot_colors = "ryb"
plot_step, plot_step_coarser = 0.02, 0.5


# clfDT = DecisionTreeClassifier(max_depth=None)  # 0.61106
# clfDT.fit(df,labels)
# predDT = clfDT.predict_proba(test)
# test1 = pd.read_csv('test.csv')
# ID = test1.ID
# del(test1)
# preds= pd.DataFrame(predDT)
# preds.columns = ['remove', 'click']
# sub = pd.DataFrame({'ID':ID, 'click': preds.click})
# sub.to_csv('isingh3_DT.csv', index=False)
# del(preds)
# del(predDT)

# clfRF = RandomForestClassifier(n_estimators=n_estimators, verbose = 1)
# clfRF.fit(df,labels)
# predRF = clfRF.predict_proba(test)
# preds= pd.DataFrame(predRF)
# preds.columns = ['remove', 'click']
# sub = pd.DataFrame({'ID':ID, 'click': preds.click})
# sub.to_csv('isingh3_RF.csv', index=False)
# del(preds)
# del(predRF)

# clfETC = ExtraTreesClassifier(n_estimators=n_estimators, verbose = 1) 0.66707
# clfETC.fit(df,labels)
# predETC = clfETC.predict_proba(test)
# preds= pd.DataFrame(predETC)
# preds.columns = ['remove', 'click']
# sub = pd.DataFrame({'ID':ID, 'click': preds.click})
# sub.to_csv('isingh3_etc.csv', index=False)
# del(preds)
# del(predETC)

#0.67505    # clfada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_estimators,
    #                            algorithm='SAMME')
    # clfada.fit(df,labels)
    # predada = clfada.predict_proba(test)
    # preds= pd.DataFrame(predada)
    # preds.columns = ['remove', 'click']
    # sub = pd.DataFrame({'ID':ID, 'click': preds.click})
    # sub.to_csv('isingh3_ada.csv', index=False)
    # del(preds)
    # del(predada)
    
# #df = df.drop('click',1)
# del(xgtrain)
# del(xgtest)
# 66.554    # clfada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=n_estimators,
            #                            algorithm='SAMME.R')
            # clfada.fit(df,labels)
            # predada = clfada.predict_proba(test)
            # preds= pd.DataFrame(predada)
            # preds.columns = ['remove', 'click']
            # test1 = pd.read_csv('test.csv')
            # ID = test1.ID
            # sub = pd.DataFrame({'ID':ID, 'click': preds.click})
            # sub.to_csv('isingh3_ada.csv', index=False)
            # del(preds)
            # del(predada)

In [9]:
# Show the first rows of the training feature frame.
df.head()

Unnamed: 0,siteid,category,merchant,browserid_0,browserid_1,browserid_2,browserid_3,browserid_4,browserid_5,browserid_6,...,countrycode_4,countrycode_5,devid_0,devid_1,devid_2,devid_3,Month,Day,Hour,Minute
0,128865,48,127,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1,14,9,42
1,142053,59,65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1,18,17,50
2,2618,69,15,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1,11,12,46
3,243406,117,507,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1,17,10,18
4,154278,36,276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1,14,16,2


In [10]:
# Show the first rows of the test feature frame for comparison with `df`.
test.head()

Unnamed: 0,siteid,category,merchant,browserid_0,browserid_1,browserid_2,browserid_3,browserid_4,browserid_5,browserid_6,...,countrycode_4,countrycode_5,devid_0,devid_1,devid_2,devid_3,Month,Day,Hour,Minute
0,20623,48,164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1,22,9,55
1,101616,36,276,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,22,3,54
2,119811,91,326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1,21,10,25
3,48038,33,59,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,22,14,45
4,145050,130,209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1,22,9,34


In [None]:
# clfada = AdaBoostClassifier(ExtraTreesClassifier(n_estimators=n_estimators, verbose = 1), n_estimators=n_estimators,
#                            algorithm='SAMME.R')
# clfada.fit(df,labels)
# predada = clfada.predict_proba(test)
# preds= pd.DataFrame(predada)
# preds.columns = ['remove', 'click']
# test1 = pd.read_csv('test.csv')
# ID = test1.ID
# sub = pd.DataFrame({'ID':ID, 'click': preds.click})
# sub.to_csv('isingh3_trial1.csv', index=False)
# del(preds)
# del(predada)

### Extra Stage: Local testing (not part of the competition)

In [11]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 25% of the labelled rows for local evaluation. The split is seeded
# with RANDOM_SEED so the hold-out set (and every score computed on it) is the
# same on each run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    df, labels, test_size=.25, random_state=RANDOM_SEED)

In [19]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score

# Candidate models compared in the evaluation loop, keyed by the label used
# in the results table. Every estimator is seeded with RANDOM_SEED so that
# repeated runs produce the same scores.
classifiers = {
    'clfada': AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_estimators,
                                algorithm='SAMME', random_state=RANDOM_SEED),
#     'naive_bayes': MultinomialNB(),
    'svm': SGDClassifier(loss='hinge', class_weight='balanced', random_state=RANDOM_SEED),
    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
    # 'log' is kept for the version this notebook was written against.
    'logistic_regression': SGDClassifier(loss='log', class_weight='balanced',
                                         random_state=RANDOM_SEED),
    # Neural Network classifier. Only arguments that the 'lbfgs' solver uses
    # are passed; momentum, nesterovs_momentum, power_t, shuffle and
    # validation_fraction apply to the sgd/adam solvers and had no effect here.
    'NN': MLPClassifier(activation='relu', alpha=1e-05,
       solver='lbfgs', hidden_layer_sizes=(40,),
       random_state=RANDOM_SEED, verbose=True)
}

In [None]:
# cvs = cross validation scores: one row per classifier holding the per-fold
# CV accuracies followed by the hold-out accuracy and F1.
n_folds = 5
cvs = []
for k, v in classifiers.items():
    print("Fitting {} model".format(k))
    try:
        cv_score = cross_val_score(v, x_train, y_train, scoring='accuracy', cv=n_folds, n_jobs=-1)
        print(cv_score)

        # Fit once and reuse the predictions for both metrics: refitting per
        # metric doubles the training time and scores two different fits.
        y_pred = v.fit(x_train, y_train).predict(x_test)
        # scikit-learn metrics take (y_true, y_pred) in that order.
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        cvs.append(np.hstack((cv_score, acc, f1)))
    except ValueError as e:
        # A model that rejects the feature matrix is reported and recorded as
        # a row of NaNs, so `cvs` keeps exactly one row per classifier and the
        # results table built from it stays aligned with `classifiers`.
        print(e)
        cvs.append(np.full(n_folds + 2, np.nan))

Fitting clfada model


In [None]:
# Tabulate the collected scores: one row per classifier, one column per score.
score_columns = ['cv_1', 'cv_2', 'cv_3', 'cv_4', 'cv_5',  'test_acc','test_f1']
test_results = pd.DataFrame(cvs, index=classifiers.keys(), columns=score_columns)

# Spread and centre of the five cross-validation folds (the first five columns).
fold_scores = test_results.iloc[:,:5]
test_results['cv_std'] = fold_scores.std(axis=1)
test_results['cv_mean'] = fold_scores.mean(axis=1)

In [None]:
# Display the per-classifier score table built in the previous cell.
test_results