In [111]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import gc
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, learning_curve
from sklearn.metrics import roc_auc_score, roc_curve, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import pickle
import matplotlib.pyplot as plt
import lightgbm

In [112]:
sns.set(rc={'figure.figsize':(12,9)})
sns.set_palette("husl", 9)

In [113]:
train_raw_df = pd.read_csv("train_sample.csv")
train_raw_df.head()

Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10584553,5.625844e+18,1,14102306,1005,1,57fe1b20,5b626596,f028772b,ecad2386,...,1,0,21153,320,50,2420,2,35,100188,69
1,4696831,6.228909e+18,1,14102204,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,1,0,21725,320,50,2502,0,35,100083,221
2,6397405,1.364623e+19,1,14102210,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,1,0,15703,320,50,1722,0,35,-1,79
3,33330095,1.858927e+18,1,14102906,1005,0,85f751fd,c4e18dd6,50e219e0,3c4b944d,...,1,0,22738,320,50,2636,0,47,100111,70
4,34284672,1.44964e+19,0,14102911,1005,1,ce3307ec,7e091613,f028772b,ecad2386,...,1,0,23369,320,50,2680,3,815,100156,42


In [114]:
train_raw_df.drop(["Unnamed: 0","id"],1,inplace=True)
train_raw_df.head()

Unnamed: 0,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1,14102306,1005,1,57fe1b20,5b626596,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,21153,320,50,2420,2,35,100188,69
1,1,14102204,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,21725,320,50,2502,0,35,100083,221
2,1,14102210,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,15703,320,50,1722,0,35,-1,79
3,1,14102906,1005,0,85f751fd,c4e18dd6,50e219e0,3c4b944d,2347f47a,0f2161f8,...,1,0,22738,320,50,2636,0,47,100111,70
4,0,14102911,1005,1,ce3307ec,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,23369,320,50,2680,3,815,100156,42


In [115]:
#method to convert unix timestamp to date, hour, minute, year, month, day etc.
def to_date_column(df):
    df["dt_hour"] = pd.to_datetime(df["hour"], format="%y%m%d%H")
    df["year"] = df["dt_hour"].dt.year
    df["month"] = df["dt_hour"].dt.month
    df["day"] = df["dt_hour"].dt.day
    df["int_hour"] = df["dt_hour"].dt.hour
    df["is_weekday"] = df["dt_hour"].dt.dayofweek
    df["is_weekend"] = df.apply(lambda x: x["is_weekday"] in [5, 6], axis=1)

In [116]:
to_date_column(train_raw_df)

In [117]:
train_raw_df.head()

Unnamed: 0,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,C19,C20,C21,dt_hour,year,month,day,int_hour,is_weekday,is_weekend
0,1,14102306,1005,1,57fe1b20,5b626596,f028772b,ecad2386,7801e8d9,07d7df22,...,35,100188,69,2014-10-23 06:00:00,2014,10,23,6,3,False
1,1,14102204,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,35,100083,221,2014-10-22 04:00:00,2014,10,22,4,2,False
2,1,14102210,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,35,-1,79,2014-10-22 10:00:00,2014,10,22,10,2,False
3,1,14102906,1005,0,85f751fd,c4e18dd6,50e219e0,3c4b944d,2347f47a,0f2161f8,...,47,100111,70,2014-10-29 06:00:00,2014,10,29,6,2,False
4,0,14102911,1005,1,ce3307ec,7e091613,f028772b,ecad2386,7801e8d9,07d7df22,...,815,100156,42,2014-10-29 11:00:00,2014,10,29,11,2,False


In [118]:
len(train_raw_df)

2021448

In [119]:
#counting unique values in each column
train_raw_df.nunique()

click                    2
hour                   240
C1                       7
banner_pos               7
site_id               3042
site_domain           3521
site_category           22
app_id                3929
app_domain             242
app_category            27
device_id           281246
device_ip           971277
device_model          5790
device_type              5
device_conn_type         4
C14                   2366
C15                      8
C16                      9
C17                    426
C18                      4
C19                     67
C20                    161
C21                     60
dt_hour                240
year                     1
month                    1
day                     10
int_hour                24
is_weekday               7
is_weekend               2
dtype: int64

In [120]:
#dropping id columns
train_raw_df.drop(["app_id","device_id","device_ip","dt_hour","site_id","hour"],axis = 1,inplace = True)
train_raw_df.head()

Unnamed: 0,click,C1,banner_pos,site_domain,site_category,app_domain,app_category,device_model,device_type,device_conn_type,...,C18,C19,C20,C21,year,month,day,int_hour,is_weekday,is_weekend
0,1,1005,1,5b626596,f028772b,7801e8d9,07d7df22,6e1e2240,1,0,...,2,35,100188,69,2014,10,23,6,3,False
1,1,1005,0,f3845767,28905ebd,7801e8d9,07d7df22,4ea23a13,1,0,...,0,35,100083,221,2014,10,22,4,2,False
2,1,1005,0,f3845767,28905ebd,7801e8d9,07d7df22,edead9f4,1,0,...,0,35,-1,79,2014,10,22,10,2,False
3,1,1005,0,c4e18dd6,50e219e0,2347f47a,0f2161f8,1f0bc64f,1,0,...,0,47,100111,70,2014,10,29,6,2,False
4,0,1005,1,7e091613,f028772b,7801e8d9,07d7df22,c3f7117b,1,0,...,3,815,100156,42,2014,10,29,11,2,False


In [121]:
#label encoding categorical columns
label_encoder = LabelEncoder()
for x in train_raw_df.columns:
    label_encoder.fit(train_raw_df[x])
    train_raw_df[x] = label_encoder.transform(train_raw_df[x])
with open('LECatTransformer.pkl', 'wb') as pkl:
    pickle.dump(label_encoder, pkl)

In [122]:
train_raw_df.head()

Unnamed: 0,click,C1,banner_pos,site_domain,site_category,app_domain,app_category,device_model,device_type,device_conn_type,...,C18,C19,C20,C21,year,month,day,int_hour,is_weekday,is_weekend
0,1,2,1,1267,20,108,0,2474,1,0,...,2,2,134,18,0,0,2,6,3,0
1,1,2,0,3343,1,108,0,1758,1,0,...,0,2,61,54,0,0,1,4,2,0
2,1,2,0,3343,1,108,0,5379,1,0,...,0,2,0,22,0,0,1,10,2,0
3,1,2,0,2739,5,26,3,662,1,0,...,0,8,83,19,0,0,8,6,2,0
4,0,2,1,1770,20,108,0,4395,1,0,...,3,46,112,10,0,0,8,11,2,0


In [123]:
train_raw_df.shape

(2021448, 24)

In [124]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(train_raw_df.drop("click",1,),train_raw_df["click"], test_size=0.30, random_state=66)

### Baseline evaluation: Logistic Regression

Here I am using logistic regression algorithm for my baseline evaluation. And accuracy as my metric.

In [125]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(verbose=True)
clf.fit(X_train, y_train)


y_train_pred = clf.predict(X_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.7s finished


In [133]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, f1_score

f1_score(y_train, y_train_pred)

0.30258544300247886

In [140]:
roc_auc_score(y_train, y_train_pred)

0.5611731715720456

In [127]:
accuracy_score(y_train, y_train_pred)

0.8304029715628054

Since the dataset is heavily imbalanced, the vanilla logistic regression model outputs 0.83 as accuracy. Since the dataset is imbalanced, using accuracy as a metric is incorrect. 

In [128]:
#converting series to dataframe
y_train = y_train.to_frame("click")
y_test = y_test.to_frame("click")

In [129]:
#obtaining class weights
weight_0 = len(X_train)/(2 * len(y_train[y_train['click'] == 0]))
weight_1 = len(X_train)/(2 * len(y_train[y_train['click'] == 1]))

In [130]:
print(weight_0, weight_1)

0.6021173058412927 2.9481648623646772


In [131]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(verbose=True, class_weight={0:weight_0,1:weight_1})
clf.fit(X_train, y_train)


y_train_pred = clf.predict(X_train)

  return f(**kwargs)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.4s finished


In [134]:
accuracy_score(y_train, y_train_pred)

0.5605934362440487

Here, instead of Undersampling or oversampling, one can use class weights to deal with imbalanced datasets. Now using accuracy makes sense, because the ratio of the labels are equal.

In [137]:
y_test_pred = clf.predict(X_test)

In [138]:
accuracy_score(y_test,y_test_pred)

0.5617964002737309

The accuracy scores of the model is low. Indicating an underfitting problem. This issue is due to the high bias in the model. Using non parametric models can take care of the underfitting problem.