In [316]:
#library imports
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import gc
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, learning_curve
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import pickle
import matplotlib.pyplot as plt
import lightgbm

In [317]:
#seaborn configs
sns.set(rc={'figure.figsize':(12,9)})
sns.set_palette("husl", 9)

In [318]:
#import training saple csv file
train_raw_df = pd.read_csv("train_sample.csv")
train_raw_df.head()

Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10584553,5.625844e+18,1,14102306,1005,1,57fe1b20,5b626596,f028772b,ecad2386,...,1,0,21153,320,50,2420,2,35,100188,69
1,4696831,6.228909e+18,1,14102204,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,1,0,21725,320,50,2502,0,35,100083,221
2,6397405,1.364623e+19,1,14102210,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,1,0,15703,320,50,1722,0,35,-1,79
3,33330095,1.858927e+18,1,14102906,1005,0,85f751fd,c4e18dd6,50e219e0,3c4b944d,...,1,0,22738,320,50,2636,0,47,100111,70
4,34284672,1.44964e+19,0,14102911,1005,1,ce3307ec,7e091613,f028772b,ecad2386,...,1,0,23369,320,50,2680,3,815,100156,42


In [319]:
#dropping an unnamed column
train_raw_df.drop(["Unnamed: 0","id"],1,inplace=True)
train_raw_df.head()

Unnamed: 0,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1,14102306,1005,1,57fe1b20,5b626596,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,21153,320,50,2420,2,35,100188,69
1,1,14102204,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,21725,320,50,2502,0,35,100083,221
2,1,14102210,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,15703,320,50,1722,0,35,-1,79
3,1,14102906,1005,0,85f751fd,c4e18dd6,50e219e0,3c4b944d,2347f47a,0f2161f8,...,1,0,22738,320,50,2636,0,47,100111,70
4,0,14102911,1005,1,ce3307ec,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,23369,320,50,2680,3,815,100156,42


In [320]:
#method to convert unix timestamp to date, hour, minute, year, month, day etc.
def to_date_column(df):
    df["dt_hour"] = pd.to_datetime(df["hour"], format="%y%m%d%H")
    df["year"] = df["dt_hour"].dt.year
    df["month"] = df["dt_hour"].dt.month
    df["day"] = df["dt_hour"].dt.day
    df["int_hour"] = df["dt_hour"].dt.hour
    df["is_weekday"] = df["dt_hour"].dt.dayofweek
    df["is_weekend"] = df.apply(lambda x: x["is_weekday"] in [5, 6], axis=1)

In [321]:
to_date_column(train_raw_df)

In [322]:
train_raw_df.head()

Unnamed: 0,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,C19,C20,C21,dt_hour,year,month,day,int_hour,is_weekday,is_weekend
0,1,14102306,1005,1,57fe1b20,5b626596,f028772b,ecad2386,7801e8d9,07d7df22,...,35,100188,69,2014-10-23 06:00:00,2014,10,23,6,3,False
1,1,14102204,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,35,100083,221,2014-10-22 04:00:00,2014,10,22,4,2,False
2,1,14102210,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,35,-1,79,2014-10-22 10:00:00,2014,10,22,10,2,False
3,1,14102906,1005,0,85f751fd,c4e18dd6,50e219e0,3c4b944d,2347f47a,0f2161f8,...,47,100111,70,2014-10-29 06:00:00,2014,10,29,6,2,False
4,0,14102911,1005,1,ce3307ec,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,...,815,100156,42,2014-10-29 11:00:00,2014,10,29,11,2,False


In [323]:
len(train_raw_df)

2021448

In [324]:
#counting unique values in each column
train_raw_df.nunique()

click                    2
hour                   240
C1                       7
banner_pos               7
site_id               3042
site_domain           3521
site_category           22
app_id                3929
app_domain             242
app_category            27
device_id           281246
device_ip           971277
device_model          5790
device_type              5
device_conn_type         4
C14                   2366
C15                      8
C16                      9
C17                    426
C18                      4
C19                     67
C20                    161
C21                     60
dt_hour                240
year                     1
month                    1
day                     10
int_hour                24
is_weekday               7
is_weekend               2
dtype: int64

In [325]:
#dropping id columns
train_raw_df.drop(["app_id","device_id","device_ip","dt_hour","site_id","hour"],axis = 1,inplace = True)
train_raw_df.head()

Unnamed: 0,click,C1,banner_pos,site_domain,site_category,app_domain,app_category,device_model,device_type,device_conn_type,...,C18,C19,C20,C21,year,month,day,int_hour,is_weekday,is_weekend
0,1,1005,1,5b626596,f028772b,7801e8d9,07d7df22,6e1e2240,1,0,...,2,35,100188,69,2014,10,23,6,3,False
1,1,1005,0,f3845767,28905ebd,7801e8d9,07d7df22,4ea23a13,1,0,...,0,35,100083,221,2014,10,22,4,2,False
2,1,1005,0,f3845767,28905ebd,7801e8d9,07d7df22,edead9f4,1,0,...,0,35,-1,79,2014,10,22,10,2,False
3,1,1005,0,c4e18dd6,50e219e0,2347f47a,0f2161f8,1f0bc64f,1,0,...,0,47,100111,70,2014,10,29,6,2,False
4,0,1005,1,7e091613,f028772b,7801e8d9,07d7df22,c3f7117b,1,0,...,3,815,100156,42,2014,10,29,11,2,False


In [326]:
#label encoding categorical columns
label_encoder = LabelEncoder()
for x in train_raw_df.columns:
    label_encoder.fit(train_raw_df[x])
    train_raw_df[x] = label_encoder.transform(train_raw_df[x])
with open('LECatTransformer.pkl', 'wb') as pkl:
    pickle.dump(label_encoder, pkl)

In [327]:
train_raw_df.head()

Unnamed: 0,click,C1,banner_pos,site_domain,site_category,app_domain,app_category,device_model,device_type,device_conn_type,...,C18,C19,C20,C21,year,month,day,int_hour,is_weekday,is_weekend
0,1,2,1,1267,20,108,0,2474,1,0,...,2,2,134,18,0,0,2,6,3,0
1,1,2,0,3343,1,108,0,1758,1,0,...,0,2,61,54,0,0,1,4,2,0
2,1,2,0,3343,1,108,0,5379,1,0,...,0,2,0,22,0,0,1,10,2,0
3,1,2,0,2739,5,26,3,662,1,0,...,0,8,83,19,0,0,8,6,2,0
4,0,2,1,1770,20,108,0,4395,1,0,...,3,46,112,10,0,0,8,11,2,0


In [328]:
train_raw_df.shape

(2021448, 24)

In [329]:
#train test split
X_train, X_val, y_train, y_val = train_test_split(train_raw_df.drop("click",1,),train_raw_df["click"], test_size=0.25, random_state=123)

In [330]:
X_train, X_test, y_train, y_test = train_test_split(train_raw_df.drop("click",1,),train_raw_df["click"], test_size=0.25, random_state=66)

### Baseline evaluation: Logistic Regression

Here I am using logistic regression algorithm for my baseline evaluation. And accuracy as my metric.

In [332]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(verbose=True)
clf.fit(X_train, y_train)


y_val_pred = clf.predict(X_val)
y_train_pred = clf.predict(X_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.4s finished


In [369]:
print(clf.coef_)
print(X_train.columns)

[[ 1.90612252e-04  1.12563181e-03  2.12885570e-05  1.62584502e-03
   1.38769182e-03 -1.46850429e-02  2.49713837e-05 -4.78771443e-04
  -3.18027723e-03 -5.71096512e-04  6.71329561e-04  7.35295575e-03
   2.71273754e-03  7.62954674e-04 -2.88630618e-03 -4.36482051e-03
  -8.92996705e-03  0.00000000e+00  0.00000000e+00  7.08688690e-03
   8.24599935e-03  3.31380561e-03  2.11413064e-04]]
Index(['C1', 'banner_pos', 'site_domain', 'site_category', 'app_domain',
       'app_category', 'device_model', 'device_type', 'device_conn_type',
       'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'day',
       'int_hour', 'is_weekday'],
      dtype='object')


In [371]:
pd.DataFrame(zip(X_train.columns, np.transpose(clf.coef_.tolist()[0])), columns=['features', 'coef']).sort_values(by='coef',ascending=False)

Unnamed: 0,features,coef
11,C16,0.007353
19,is_weekday,0.007087
12,C17,0.002713
3,site_category,0.001626
4,app_domain,0.001388
1,banner_pos,0.001126
13,C18,0.000763
10,C15,0.000671
0,C1,0.000191
6,device_model,2.5e-05


As per the logstic regression model, the most important feature is C16 followed by is_weekday and C17 etc. It is to be noted that logstic regression assumes that there exists a linear relationship between the predictor variables and the target.

In [337]:
roc_auc_score(y_train, y_train_pred)

0.5

In [338]:
roc_auc_score(y_val, y_val_pred)

0.5

In [341]:
accuracy_score(y_train, y_train_pred)

0.8303532913040553

In [342]:
accuracy_score(y_val, y_val_pred)

0.830911307142207

Since the dataset is heavily imbalanced, the vanilla logistic regression model outputs 0.83 as accuracy. Since the dataset is imbalanced, using accuracy as a metric is incorrect. 

In [343]:
#converting series to dataframe
y_train = y_train.to_frame("click")
y_test = y_test.to_frame("click")
y_val = y_val.to_frame("click")

In [344]:
#obtaining class weights
weight_0 = len(X_train)/(2 * len(y_train[y_train['click'] == 0]))
weight_1 = len(X_train)/(2 * len(y_train[y_train['click'] == 1]))

In [345]:
print(weight_0, weight_1)

0.602153330680196 2.947301505837892


In [346]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(verbose=True, class_weight={0:weight_0,1:weight_1})
clf.fit(X_train, y_train)


y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)

  return f(**kwargs)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.7s finished


In [347]:
accuracy_score(y_train, y_train_pred)

0.5720691306429846

In [348]:
accuracy_score(y_val, y_val_pred)

0.5721126637934787

Here, instead of Undersampling or oversampling, one can use class weights to deal with imbalanced datasets. Now using accuracy makes sense, because the ratio of the labels are equal.

In [349]:
y_test_pred = clf.predict(X_test)

In [350]:
accuracy_score(y_test,y_test_pred)

0.5728606424701501

The accuracy scores of the model is low. Indicating an underfitting problem. This issue is due to the high bias in the model. Using non parametric models can take care of the underfitting problem.

### LightGBM

In [351]:
LightGBM = lightgbm.LGBMClassifier(application = 'binary', objective = 'binary', boosting_type = 'dart', metric = 'auc', scale_pos_weight = (len(y_train[y_train['click'] == 0]) / len(y_train[y_train['click'] == 1])))

In [352]:
LightGBM.fit(X_train,y_train)

  return f(**kwargs)


LGBMClassifier(application='binary', boosting_type='dart', metric='auc',
               objective='binary', scale_pos_weight=4.894603011675784)

In [353]:
FeatureImportanceDf = pd.DataFrame(LightGBM.feature_importances_,
                                  index = X_train.columns,
                                  columns = ['ImportanceScore']).sort_values('ImportanceScore',ascending=False)
with(pd.option_context('display.max_rows', 100)):
    print(FeatureImportanceDf)

                  ImportanceScore
site_domain                   723
app_domain                    255
site_category                 242
C14                           233
C21                           225
app_category                  181
C18                           158
C19                           152
C17                           141
C20                           118
C16                           102
device_model                   97
banner_pos                     92
device_type                    76
int_hour                       57
device_conn_type               54
C1                             34
day                            30
C15                            21
is_weekday                      9
year                            0
month                           0
is_weekend                      0


As per the LightGBM algorithm, site_domain is of the highest importance when it comes to predict the click. Followed by app_domain, C14, site_category etc

In [354]:
#dropping columns with 0 importance score
columns = list(FeatureImportanceDf[FeatureImportanceDf['ImportanceScore'] == 0].index)
X_train.drop(columns = columns, axis = 1, inplace=True)
X_test.drop(columns = columns, axis = 1, inplace=True)
X_val.drop(columns = columns, axis = 1, inplace=True)

In [355]:
params = {
    'learning_rate': [0.05,0.1],
    'min_child_weight': [1,3,5,7],
    'min_split_gain': [0.05,0.1,0.3,0.5,0.7,0.9,1.0,5.0],
    'reg_alpha': [0.01, 0.05, 0.1, 0.5, 1.0, 3.0, 10.0, 15.0],
    'subsample': [0.6,0.7,0.8,0.9,1.0],
    'colsample_bytree': [0.6,0.7,0.8,0.9,1.0],
    'reg_lambda': [0.01,0.05,0.1,0.5,1.0,2.0,3.0],
    'min_data_in_leaf': [30, 45, 60],
    'max_depth': [3,5],
    'num_leaves': [7, 31],
    'early_stopping_rounds':[5]
}

In [356]:
cvset = StratifiedKFold(n_splits=5)
RandomSearch = RandomizedSearchCV(LightGBM, param_distributions=params, n_jobs=20, n_iter=50, cv=cvset, verbose = 3)

In [357]:
#fitting training data
RandomSearch.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  88 tasks      | elapsed:  5.7min
[Parallel(n_jobs=20)]: Done 250 out of 250 | elapsed: 23.9min finished
  return f(**kwargs)


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=LGBMClassifier(application='binary',
                                            boosting_type='dart', metric='auc',
                                            objective='binary',
                                            scale_pos_weight=4.894603011675784),
                   n_iter=50, n_jobs=20,
                   param_distributions={'colsample_bytree': [0.6, 0.7, 0.8, 0.9,
                                                             1.0],
                                        'early_stopping_rounds': [5],
                                        'learning_rate': [0.05, 0.1],
                                        'max_depth': [3, 5],
                                        'min_child_weight': [1, 3, 5, 7],
                                        'min_data_in_leaf': [30, 45, 60],
                                        'min_split_gain': [0.05, 0.1, 0.3, 0.5,
      

In [359]:
RandomSearch.best_params_

{'subsample': 0.7,
 'reg_lambda': 0.1,
 'reg_alpha': 0.1,
 'num_leaves': 31,
 'min_split_gain': 0.3,
 'min_data_in_leaf': 60,
 'min_child_weight': 5,
 'max_depth': 5,
 'learning_rate': 0.05,
 'early_stopping_rounds': 5,
 'colsample_bytree': 0.6}

In [360]:
labels = RandomSearch.classes_
y_train_pred = RandomSearch.predict_proba(X_train)
y_train_pred_df = pd.DataFrame(y_train_pred, columns = labels)
y_train_pred_df.head()

Unnamed: 0,0,1
0,0.549934,0.450066
1,0.588309,0.411691
2,0.421581,0.578419
3,0.469338,0.530662
4,0.85409,0.14591


In [361]:
y_val_pred = RandomSearch.predict_proba(X_val)
y_val_pred_df = pd.DataFrame(y_val_pred, columns = labels)
y_val_pred_df.head()

Unnamed: 0,0,1
0,0.524754,0.475246
1,0.450921,0.549079
2,0.703519,0.296481
3,0.38109,0.61891
4,0.453711,0.546289


In [362]:
roc_auc_score(y_train['click'].values, y_train_pred_df[1].values)

0.7063274086967859

In [363]:
roc_auc_score(y_val['click'].values, y_val_pred_df[1].values)

0.7044988543758264

As we can see the roc_auc_score for both the training and validation set are almost same.

In [364]:
y_test_pred = RandomSearch.predict_proba(X_test)
y_test_pred_df = pd.DataFrame(y_test_pred, columns = labels)
y_test_pred_df.head()

Unnamed: 0,0,1
0,0.522544,0.477456
1,0.293602,0.706398
2,0.668904,0.331096
3,0.653659,0.346341
4,0.543801,0.456199


In [365]:
roc_auc_score(y_test['click'].values, y_test_pred_df[1].values)

0.7059210487327757

In [366]:
print('Accuracy Score: ', accuracy_score(y_test['click'], np.where(y_test_pred_df[1] > 0.705, 1, 0))*100, flush = True)

Accuracy Score:  82.99436839334972


The final accuracy score on the test model is about 83% which is better as compared to logistic regression.