In [24]:
#Importing Essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
# Load the raw tables (train, test, view log, item catalogue) from CSV files
# located in the current working directory.
tr, ts, view, item = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'view_log.csv', 'item_data.csv')
)

In [3]:
# Parse the impression_time column into datetime64 for both the train and test frames
for _frame in (tr, ts):
    _frame['impression_time'] = pd.to_datetime(_frame['impression_time'])
# Summary of the train frame: dtypes, non-null counts, memory usage
tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237609 entries, 0 to 237608
Data columns (total 7 columns):
impression_id      237609 non-null object
impression_time    237609 non-null datetime64[ns]
user_id            237609 non-null int64
app_code           237609 non-null int64
os_version         237609 non-null object
is_4G              237609 non-null int64
is_click           237609 non-null int64
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 12.7+ MB


### <font color='green'>Training dataset contains 237609 records and no missing values</font>
### <font color='green'>columns in train dataset</font>
> impression_id - Unique id of the ad

> impression_time - timestamp of the ad (dates range from 2018-11-15 to 2018-12-13)

> user_id - Unique Id of each user (74723 unique users)

> app_code - Application Code for a partner website (490 unique partner websites)

> os_version - Version of operating system. Three categories (latest-54.3%, intermediate-23.4%, old-22.2%)

> is_4G - 1-Using 4G, 0-No 4G (0-63.9%, 1-36.1%)

> is_click - target variable, 1-ad got clicked, 0-ad was not clicked (only 4.57% hit ratio)

### <font color='green'>Test dataset contains 90675 records and no missing values</font>
### <font color='green'>columns in test dataset</font>
> impression_id - Unique id of the ad

> impression_time - timestamp of the ad (dates range from 2018-12-12 to 2018-12-18)

> user_id - Unique Id of each user (34079 unique users)

> app_code - Application Code for a partner website (373 unique partner websites)

> os_version - Version of operating system. Three categories (latest-53.8%, intermediate-23.4%, old-22.7%)

> is_4G - 1-Using 4G, 0-No 4G (0-64.2%, 1-35.8%)

In [27]:
# Analysing the item dataset: print each column's dtype and non-null count,
# plus the frame's memory usage.
item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132761 entries, 0 to 132760
Data columns (total 6 columns):
item_id         132761 non-null int64
item_price      132761 non-null int64
category_1      132761 non-null int64
category_2      132761 non-null int64
category_3      132761 non-null int64
product_type    132761 non-null int64
dtypes: int64(6)
memory usage: 6.1 MB


### <font color='green'>Item dataset contains 132761 records and no missing values</font>
### <font color='green'>columns in item dataset</font>
> item_id - Unique id of the item

> item_price - price of the item (min:5, max=1340800, median=2944, mean=10826), Right skewness in price

> category_1 - category depth 1 (17 unique category_1)

> category_2 - category depth 2 (79 unique category_2)

> category_3 - category depth 3 (335 unique category_3)

> product_type - anonymized item type (7959 unique product_type values)

In [4]:
# Parse the server_time column of the view log into datetime64
view['server_time'] = pd.to_datetime(view['server_time'])
# Analysing the view_log dataset: dtypes and memory usage
view.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3118622 entries, 0 to 3118621
Data columns (total 5 columns):
server_time    datetime64[ns]
device_type    object
session_id     int64
user_id        int64
item_id        int64
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 119.0+ MB


### <font color='green'>View_log dataset contains 3118622 records and no missing values</font>
### <font color='green'>columns in View_log dataset</font>
> server_time - Timestamp of the log (dates range from 2018-10-15 to 2018-12-11)

> device_type - Device type of the user. Three categories (android-99.97%, iphone-0.027%, web-less than 0.001%)

> session_id - Browser session id (1014970 unique sessions)

> user_id - user id (89157 unique user ids)

> item_id - item id (126708 unique items)

In [5]:
# Load the user-level and impression-level aggregation tables from CSV
user_agg, imp_agg = (
    pd.read_csv(csv_name) for csv_name in ('user_agg.csv', 'imp_agg.csv')
)

In [8]:
# Derive weekday name, month, hour and day-of-month from impression_time.
# The same four columns are added, in the same order, to both train and test.
for _frame in (tr, ts):
    _stamp = _frame['impression_time']
    # Series.dt.weekday_name was removed in pandas 1.0; dt.day_name() yields the
    # same English day labels and is available in older pandas as well.
    _frame['impression_weekday_name'] = _stamp.dt.day_name()
    _frame['impression_month'] = _stamp.dt.month
    # The timestamp is rounded to the NEAREST hour before the hour is taken, so
    # e.g. 23:45 is encoded as hour 0. Lowercase 'h' is used because the
    # uppercase 'H' frequency alias is deprecated in recent pandas.
    _frame['impression_hour'] = _stamp.dt.round('h').dt.hour
    _frame['impression_day'] = _stamp.dt.day

In [9]:
#Mean Encoding on training data
def MeanEncoding(df, col, trgt, alpha=5, splits=4):
    """Add an out-of-fold, smoothed mean (target) encoding of ``col`` to ``df``.

    For each of ``splits`` shuffled folds, the per-category target statistics
    are computed on the other folds and written to the held-out rows as

        (count * mean + global_mean * alpha) / (count + alpha)

    so that no row is encoded with its own target value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col`` and ``trgt``. It is modified in place (a new
        column ``col + '_Enc'`` is added) and also returned.
    col : str
        Name of the categorical column to encode.
    trgt : str
        Name of the target column.
    alpha : int or float, default 5
        Smoothing weight pulling small categories toward the global mean.
    splits : int, default 4
        Number of KFold splits.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Held-out categories that never appear in the
        corresponding fitting folds are left as NaN in the new column.
    """
    # Prior and fold split come from the frame that was passed in, not from a
    # notebook-level global, so the function works for any input frame.
    mean_g = df[trgt].mean()
    newcol = col + '_Enc'
    df[newcol] = np.nan
    enc_pos = df.columns.get_loc(newcol)

    kf = KFold(n_splits=splits, random_state=100, shuffle=True)
    # KFold.split yields POSITIONAL indices, hence .iloc: this stays correct
    # even when df does not carry a default RangeIndex.
    for fit_pos, hold_pos in kf.split(df):
        fold_stats = df.iloc[fit_pos].groupby(col)[trgt].agg(['count', 'mean'])
        map_enc = ((fold_stats['count'] * fold_stats['mean']) + (mean_g * alpha)) / \
                  (fold_stats['count'] + alpha)
        df.iloc[hold_pos, enc_pos] = df[col].iloc[hold_pos].map(map_enc).values

    df[newcol] = df[newcol].astype('float')
    return df

In [10]:
#Mean Encoding with 4folds and regularisation parameter(alpha) as 5
# Columns that receive an out-of-fold mean encoding (MeanEncoding defaults: 4 folds, alpha=5)
col_list =['user_id', 'app_code', 'os_version', 'is_4G', 'impression_weekday_name', 'impression_hour',
           'impression_day', 'impression_month']

trgt = 'is_click'

for x in col_list:
    newcol = x+'_Enc'
    ts[newcol] = np.nan
    tr = MeanEncoding(tr, col=x, trgt=trgt)
    # Test rows receive, per category, the average of the train out-of-fold encodings
    map_enc = tr.groupby(x)[newcol].mean()
    ts[newcol] = ts[x].map(map_enc).astype('float')
    # Rows still without an encoding fall back to the overall mean of the train target
    tr[newcol] = tr[newcol].fillna(tr[trgt].mean())
    ts[newcol] = ts[newcol].fillna(tr[trgt].mean())

In [13]:
# Build the modelling frame used for the hold-out model comparison
# Raw columns removed before modelling
drop_col = ['impression_time', 'app_code', 'os_version', 'is_4G', 'impression_weekday_name', 'impression_hour',
           'impression_day', 'impression_month']

tr1 = tr.drop(columns=drop_col)
# Left-join the user-level aggregates on user_id, then discard the join key
tr1 = tr1.merge(user_agg, on='user_id', how='left').drop(columns=['user_id'])

# Left-join the impression-level aggregates on impression_id, then discard the join key
model_data = tr1.merge(imp_agg, on='impression_id', how='left').drop(columns=['impression_id'])

# Rows without a match in an aggregate table get 0 for that table's columns
for x1 in user_agg.columns.drop('user_id'):
    model_data[x1] = model_data[x1].fillna(0)

for x1 in imp_agg.columns.drop('impression_id'):
    model_data[x1] = model_data[x1].fillna(0)


# 75% / 25% train / validation split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    model_data.drop(columns=['is_click']),
    model_data['is_click'],
    test_size=0.25, random_state=100, stratify=model_data['is_click'],
)

In [14]:
#Testing Random Forest Model
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=100,
    n_jobs=-1,
    oob_score=True,
    class_weight="balanced_subsample",
)
rf.fit(X_train, y_train)

# Ten largest feature importances
top_importances = (
    pd.Series(rf.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
    .head(10)
)
print("Feature Importance:\n"+str(top_importances))

# AUC followed by the confusion matrix, first on the training split, then on the validation split
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(roc_auc_score(y_part, rf.predict_proba(X_part)[:,1]))
    print(confusion_matrix(y_part, rf.predict(X_part)))

Feature Importance:
app_code_Enc                               0.211349
user_id_Enc                                0.147234
user_session_id_nunique                    0.033343
user_category_3_nunique                    0.021822
user_server_month_12_item_count            0.020892
user_server_month_12_item_nunique          0.019327
user_item_id_nunique                       0.018367
user_server_weekday_Sunday_item_nunique    0.017153
user_category_2_nunique                    0.016924
imp_server_month_12_item_nunique           0.015522
dtype: float64
0.7099464577124308
[[107370  62690]
 [  2656   5490]]
0.6974393410286228
[[35765 20922]
 [  894  1822]]


In [15]:
#Testing Catboost Model
cat = CatBoostClassifier(iterations=4000, eval_metric='AUC')
# Evaluate on the validation split during training, stop after 100 rounds
# without improvement and log every 500 iterations
fit_params = dict(early_stopping_rounds=100, eval_set=[(X_test, y_test)], verbose=500)
cat.fit(X_train, y_train, **fit_params)
# Positive-class probabilities for the training and validation splits
y_pred1_prob, y_pred2_prob = (cat.predict_proba(part)[:,1] for part in (X_train, X_test))
print(roc_auc_score(y_train, y_pred1_prob))
print(roc_auc_score(y_test, y_pred2_prob))

Learning rate set to 0.074288
0:	test: 0.6230736	best: 0.6230736 (0)	total: 835ms	remaining: 55m 38s
500:	test: 0.7526571	best: 0.7527189 (487)	total: 3m 28s	remaining: 24m 14s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7527189098
bestIteration = 487

Shrink model to first 488 iterations.
0.7857968100194501
0.7527189098195806


In [17]:
#Testing LGBM Model
lgbm = LGBMClassifier(learning_rate=0.05, colsample_bytree=0.5, subsample=0.8, subsample_freq=1,
                      max_bin=31, n_estimators=4000, min_child_samples=250, num_leaves=8,
                      objective='binary', scale_pos_weight=1)

# Early stopping after 100 rounds without improvement, progress log every 500 rounds.
# LightGBM 4.0 removed the `early_stopping_rounds` / `verbose` keywords of fit();
# the callback API is used when the installed version provides it, otherwise the
# legacy keywords are kept so older installations behave as before.
try:
    from lightgbm import early_stopping, log_evaluation
    fit_params = {'callbacks': [early_stopping(stopping_rounds=100), log_evaluation(period=500)]}
except ImportError:
    fit_params = {'early_stopping_rounds': 100, 'verbose': 500}
fit_params.update({'eval_set': [(X_train, y_train), (X_test, y_test)],
                   'eval_metric': 'auc'})

lgbm.fit(X_train, y_train, **fit_params)
print("Feature Importance:\n"+
      str(pd.Series(lgbm.feature_importances_, index=X_train.columns).sort_values(ascending=False).head(10)))
# Positive-class probabilities and AUC on the training and validation splits
y_pred1_prob = lgbm.predict_proba(X_train)[:,1]
y_pred2_prob = lgbm.predict_proba(X_test)[:,1]
print(roc_auc_score(y_train, y_pred1_prob))
print(roc_auc_score(y_test, y_pred2_prob))

Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.803761	training's binary_logloss: 0.157459	valid_1's auc: 0.75948	valid_1's binary_logloss: 0.165682
Early stopping, best iteration is:
[439]	training's auc: 0.799019	training's binary_logloss: 0.158448	valid_1's auc: 0.759628	valid_1's binary_logloss: 0.165679
Feature Importance:
user_id_Enc                          207
app_code_Enc                         206
imp_server_month_12_item_count        60
user_session_id_nunique               52
impression_hour_Enc                   49
impression_day_Enc                    42
impression_weekday_name_Enc           41
impression_month_Enc                  40
user_server_month_12_item_nunique     39
os_version_Enc                        28
dtype: int32
0.7990188227063546
0.7596281130398165


In [19]:
#Selected LGBM model based on accuracy and speed
#Preparing the full training matrix and the matching test matrix for the final output

#columns to drop
drop_col = ['impression_time', 'app_code', 'os_version', 'is_4G', 'impression_weekday_name', 'impression_hour',
           'impression_day', 'impression_month']


def _build_feature_frame(frame):
    """Turn a train/test frame into a model-ready feature frame.

    Drops the raw columns listed in ``drop_col``, left-joins the user-level
    aggregates on ``user_id`` and the impression-level aggregates on
    ``impression_id`` (dropping each key after its join), and fills the
    aggregate columns of unmatched rows with 0.
    """
    merged = (
        frame.drop(drop_col, axis='columns')
        .merge(user_agg, on='user_id', how='left').drop(['user_id'], axis='columns')
        .merge(imp_agg, on='impression_id', how='left').drop(['impression_id'], axis='columns')
    )
    agg_cols = (list(user_agg.columns.drop('user_id'))
                + list(imp_agg.columns.drop('impression_id')))
    merged[agg_cols] = merged[agg_cols].fillna(0)
    return merged


#Full training data (no hold-out split: validation happens inside the k-fold loop)
model_data = _build_feature_frame(tr)
X_train, y_train = model_data.drop(['is_click'], axis=1), model_data['is_click']

#Preparing test_data with exactly the same pipeline
test_data = _build_feature_frame(ts)
# The left joins must not add rows (duplicate keys in an aggregate table would),
# otherwise the predictions no longer line up row-for-row with ts['impression_id'].
assert len(test_data) == len(ts), "aggregate merge changed the number of test rows"
# Score on exactly the training feature columns, in training order; a missing
# column fails loudly here instead of inside the model.
test_data = test_data[X_train.columns]

In [25]:
def kfoldvalidationLGBM(X_train, y_train, X_test, splits=10):
    """Fit one LightGBM classifier per stratified fold and average the test predictions.

    Each fold model is trained on the remaining folds with early stopping
    (100 rounds) monitored on the held-out fold. The per-fold AUC and the AUC
    over all pooled out-of-fold predictions are printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y_train : pandas.Series
        Binary target aligned row-for-row with ``X_train``.
    X_test : pandas.DataFrame
        Features to score with every fold model.
    splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    numpy.ndarray
        Mean over the fold models of the predicted positive-class probability
        for each row of ``X_test``.
    """
    # LightGBM 4.0 removed the `early_stopping_rounds` / `verbose` keywords of
    # fit(); prefer the callback API and fall back to the legacy keywords only
    # when the installed version does not provide the callbacks.
    try:
        from lightgbm import early_stopping, log_evaluation
        use_callbacks = True
    except ImportError:
        use_callbacks = False

    skf = StratifiedKFold(n_splits=splits, random_state=100, shuffle=True)
    test_preds = []   # one test prediction vector per fold
    oof_true = []     # held-out targets, fold by fold
    oof_pred = []     # matching out-of-fold predictions

    for fold_no, (fit_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
        X_tr1, y_tr1 = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
        X_ts1, y_ts1 = X_train.iloc[val_idx], y_train.iloc[val_idx]

        lgbm = LGBMClassifier(learning_rate=0.05, colsample_bytree=0.5, subsample=0.8, subsample_freq=1,
                              max_bin=31, n_estimators=4000, min_child_samples=250, num_leaves=8,
                              objective='binary', scale_pos_weight=1)

        fit_params = {'eval_set': [(X_tr1, y_tr1), (X_ts1, y_ts1)], 'eval_metric': 'auc'}
        if use_callbacks:
            # fresh callback objects for every fold
            fit_params['callbacks'] = [early_stopping(stopping_rounds=100),
                                       log_evaluation(period=500)]
        else:
            fit_params.update({'early_stopping_rounds': 100, 'verbose': 500})
        lgbm.fit(X_tr1, y_tr1, **fit_params)

        print('Fold :', fold_no)
        pred_ts1 = lgbm.predict_proba(X_ts1, num_iteration=lgbm.best_iteration_)[:, 1]
        print('AUC Score:\t', roc_auc_score(y_ts1, pred_ts1))
        oof_true.append(np.asarray(y_ts1))
        oof_pred.append(pred_ts1)
        # score the test set with the same (best) iteration as the validation fold
        test_preds.append(lgbm.predict_proba(X_test, num_iteration=lgbm.best_iteration_)[:, 1])

    print('Total AUC Score:\t', roc_auc_score(np.concatenate(oof_true), np.concatenate(oof_pred)))
    return np.mean(test_preds, 0)

In [26]:
# Final test prediction: mean of the predictions of the fold models
# (kfoldvalidationLGBM uses its default of 10 stratified folds)
pred_lgbm = kfoldvalidationLGBM(X_train=X_train, y_train=y_train, X_test=test_data)

Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.797486	training's binary_logloss: 0.158598	valid_1's auc: 0.765982	valid_1's binary_logloss: 0.165097
Early stopping, best iteration is:
[675]	training's auc: 0.807137	training's binary_logloss: 0.156463	valid_1's auc: 0.767018	valid_1's binary_logloss: 0.16495
Fold : 1
AUC Score:	 0.767017889074955
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.798491	training's binary_logloss: 0.158486	valid_1's auc: 0.762968	valid_1's binary_logloss: 0.165043
Early stopping, best iteration is:
[481]	training's auc: 0.797401	training's binary_logloss: 0.158707	valid_1's auc: 0.763357	valid_1's binary_logloss: 0.164994
Fold : 2
AUC Score:	 0.763356760106663
Training until validation scores don't improve for 100 rounds
[500]	training's auc: 0.798561	training's binary_logloss: 0.15844	valid_1's auc: 0.757447	valid_1's binary_logloss: 0.166111
Early stopping, best iteration is:
[65

In [27]:
# Preparing the final probability data for submission: one row per test
# impression with its predicted click probability
out = ts[['impression_id']].assign(is_click=pred_lgbm)
out.to_csv('KFoldLGBM_sub1.csv', index=False)