In [2]:
import math
import os
import time

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, recall_score, precision_score, SCORERS

from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 999

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/student-shopee-code-league-marketing-analytics/train.csv
/kaggle/input/student-shopee-code-league-marketing-analytics/sample_submission_0_1.csv
/kaggle/input/student-shopee-code-league-marketing-analytics/test.csv
/kaggle/input/student-shopee-code-league-marketing-analytics/users.csv


In [3]:
pd.set_option('display.max_columns', None)
random_state = 42

In [4]:
df_user = pd.read_csv("/kaggle/input/student-shopee-code-league-marketing-analytics/users.csv")
df_train = pd.read_csv("/kaggle/input/student-shopee-code-league-marketing-analytics/train.csv")
df_test = pd.read_csv("/kaggle/input/student-shopee-code-league-marketing-analytics/test.csv")

df_train

Unnamed: 0,country_code,grass_date,user_id,subject_line_length,last_open_day,last_login_day,last_checkout_day,open_count_last_10_days,open_count_last_30_days,open_count_last_60_days,login_count_last_10_days,login_count_last_30_days,login_count_last_60_days,checkout_count_last_10_days,checkout_count_last_30_days,checkout_count_last_60_days,open_flag,row_id
0,4,2019-07-16 00:00:00+08:00,43,44,19,6,18,0,2,4,12,43,99,0,5,10,0,0
1,4,2019-07-16 00:00:00+08:00,102,44,9,4,8,2,9,17,18,48,90,1,1,4,1,1
2,6,2019-07-16 00:00:00+08:00,177,49,14,5,5,0,4,12,24,69,119,5,19,27,0,2
3,1,2019-07-16 00:00:00+08:00,184,49,49,9,53,0,0,1,9,23,69,1,3,6,0,3
4,6,2019-07-16 00:00:00+08:00,221,49,227,6,221,0,0,0,2,5,5,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73534,6,2019-09-02 00:00:00+08:00,127613,39,24,36,279,0,1,1,0,0,0,0,0,0,0,73534
73535,2,2019-09-02 00:00:00+08:00,127620,38,46,10,51,0,0,1,0,0,0,0,0,0,0,73535
73536,2,2019-09-02 00:00:00+08:00,127696,32,Never open,Never login,Never checkout,0,0,0,0,0,0,0,0,0,0,73536
73537,2,2019-09-02 00:00:00+08:00,127807,38,5,34,Never checkout,2,4,4,0,0,0,0,0,0,1,73537


In [5]:
df_test

Unnamed: 0,country_code,grass_date,user_id,subject_line_length,last_open_day,last_login_day,last_checkout_day,open_count_last_10_days,open_count_last_30_days,open_count_last_60_days,login_count_last_10_days,login_count_last_30_days,login_count_last_60_days,checkout_count_last_10_days,checkout_count_last_30_days,checkout_count_last_60_days,row_id
0,6,2019-09-03 00:00:00+08:00,0,35,27,2,13,2,3,4,10,34,134,0,6,18,0
1,6,2019-09-03 00:00:00+08:00,130,35,7,5,383,1,1,1,5,5,5,0,0,0,1
2,5,2019-09-03 00:00:00+08:00,150,25,34,1,3,0,0,0,13,19,38,2,2,2,2
3,1,2019-09-03 00:00:00+08:00,181,36,63,5,5,0,0,0,43,110,173,2,5,5,3
4,5,2019-09-03 00:00:00+08:00,192,23,6,5,54,0,0,0,4,12,39,0,0,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55965,6,2019-09-29 00:00:00+08:00,127348,53,Never open,4,8,0,0,0,0,0,0,0,0,0,55965
55966,6,2019-09-29 00:00:00+08:00,127396,53,59,802,1207,0,0,1,0,0,0,0,0,0,55966
55967,6,2019-09-29 00:00:00+08:00,127574,43,Never open,7,Never checkout,0,0,0,0,0,0,0,0,0,55967
55968,6,2019-09-29 00:00:00+08:00,127887,43,5,5,6,2,5,14,0,0,0,0,0,0,55968


In [6]:
df_user

Unnamed: 0,user_id,attr_1,attr_2,attr_3,age,domain
0,0,,1.0,0.0,,@gmail.com
1,1,1.0,1.0,2.0,50.0,@gmail.com
2,2,,1.0,0.0,,other
3,3,,1.0,0.0,,@gmail.com
4,4,1.0,1.0,2.0,33.0,@gmail.com
...,...,...,...,...,...,...
127881,127921,,1.0,0.0,,@yahoo.com
127882,127922,1.0,1.0,0.0,20.0,@gmail.com
127883,127923,,1.0,0.0,,@gmail.com
127884,127924,,0.0,0.0,,@gmail.com


In [7]:
y_data = df_train['open_flag']
x_data = df_train.drop(['open_flag'], axis = 1)

In [8]:
user_dict = {}
for row in df_user.itertuples():
    user_dict[row.user_id] = (row.attr_1,row.attr_2,row.attr_3,row.age,row.domain)

In [9]:
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, random_state=random_state, shuffle=True)

In [10]:
print('Size of x_train is:', len(x_train))
print('Size of y_train is:', len(y_train))
print('Size of x_val is:', len(x_val))
print('Size of y_val is:', len(y_val))

Size of x_train is: 58831
Size of y_train is: 58831
Size of x_val is: 14708
Size of y_val is: 14708


In [11]:
x_train['date_time'] = pd.to_datetime(x_train['grass_date'])
x_val['date_time'] = pd.to_datetime(x_val['grass_date'])

In [12]:
def time_to_categorical_series(df,type="hour"):
    if type == "hour":
        return df['date_time'].dt.hour.astype('category')
    elif type == "dayofweek":
        return df['date_time'].dt.dayofweek.astype('category')
    elif type == "month":
        return df['date_time'].dt.month.astype('category')
    else:
        return None
    
def time_to_categorical(df):
    hour_series = time_to_categorical_series(df,type='hour')
    dayofweek_series = time_to_categorical_series(df,type='dayofweek')
    month_series = time_to_categorical_series(df,type='month')

    df['hour'] = hour_series
    df['dayofweek'] = dayofweek_series
    df['month'] = month_series

In [13]:
time_to_categorical(x_train)
time_to_categorical(x_val)

In [14]:
def get_user_feature(user_id,i):
    if user_id in user_dict:
        return user_dict[user_id][i]

In [15]:
user_feat_list = ['attr1', 'attr2', 'attr3', 'age', 'domain']

for i in range(len(user_feat_list)):
    x_train[user_feat_list[i]] = x_train['user_id'].apply(lambda x: get_user_feature(x,i))
    x_val[user_feat_list[i]] = x_val['user_id'].apply(lambda x: get_user_feature(x,i))

In [16]:
# Obtain medians to fill na with median
x_train['age'] = x_train['age'].fillna((x_train['age'].median()))
x_train['attr3'] = x_train['attr3'].fillna((x_train['attr3'].median()))

x_val['age'] = x_val['age'].fillna((x_train['age'].median())) # Keep x_train's median
x_val['attr3'] = x_val['attr3'].fillna((x_train['attr3'].median())) # x_train's median

In [17]:
# For categorical features, fill with a random term
x_train['attr1'] = x_train['attr1'].fillna(-1)
x_train['attr2'] = x_train['attr1'].fillna(-1)

x_val['attr1'] = x_val['attr1'].fillna(-1)
x_val['attr2'] = x_val['attr2'].fillna(-1)

In [18]:
df_holder = x_train[['last_open_day','last_login_day','last_checkout_day']].copy()

In [19]:
df_holder = df_holder[-df_holder['last_open_day'].str.contains('Never')]
df_holder = df_holder[-df_holder['last_login_day'].str.contains('Never')]
df_holder = df_holder[-df_holder['last_checkout_day'].str.contains('Never')]
df_holder = df_holder.astype(int)

In [20]:
train_max_open = df_holder['last_open_day'].max()
train_max_login = df_holder['last_login_day'].max()
train_max_checkout = df_holder['last_checkout_day'].max()
print(train_max_open, train_max_login, train_max_checkout)

804 18107 1445


In [21]:
# Never Cases
x_train.loc[x_train['last_open_day'] == 'Never open','last_open_day'] = str(train_max_open)
x_train.loc[x_train['last_login_day'] == 'Never login', 'last_login_day'] = str(train_max_login)
x_train.loc[x_train['last_checkout_day'] == 'Never checkout', 'last_checkout_day'] = str(train_max_checkout)

x_val.loc[x_val['last_open_day'] == 'Never open','last_open_day'] = str(train_max_open)
x_val.loc[x_val['last_login_day'] == 'Never login', 'last_login_day'] = str(train_max_login)
x_val.loc[x_val['last_checkout_day'] == 'Never checkout', 'last_checkout_day'] = str(train_max_checkout)

In [22]:
cat_features = ['grass_date','country_code', 'user_id', 'hour', 'dayofweek', 'month', 'domain', 'attr1', 'attr2']

num_features = ['subject_line_length',
       'last_open_day', 'last_login_day', 'last_checkout_day',
       'open_count_last_10_days', 'open_count_last_30_days',
       'open_count_last_60_days', 'login_count_last_10_days',
       'login_count_last_30_days', 'login_count_last_60_days',
       'checkout_count_last_10_days', 'checkout_count_last_30_days',
       'checkout_count_last_60_days','attr3', 'age']


training_features = cat_features.copy()
training_features.extend(num_features)

In [23]:
remove_features = ['grass_date', 'user_id', 'hour', 'last_open_day', 'last_login_day', 'last_checkout_day']

for i in range(len(remove_features)):
    training_features.remove(remove_features[i])

In [24]:
for col in cat_features:
    x_train[col] = x_train[col].astype('category')
    x_val[col] = x_val[col].astype('category')

In [25]:
x_train2 = x_train[training_features].copy()
x_val2 = x_val[training_features].copy()
x_train2 = pd.get_dummies(x_train2)
x_val2 = pd.get_dummies(x_val2)

In [34]:
xgb = XGBClassifier(random_state=42)

fitted_model = xgb.fit(x_train2, y_train)
y_val_pred = xgb.predict(x_val2)

In [35]:
print("Accuracy (XGBoost Baseline) =", accuracy_score(y_val, y_val_pred))
print("AUC ROC Score (XGBoost Baseline) =", roc_auc_score(y_true=y_val, y_score=xgb.predict_proba(x_val2)[:,1]))

Accuracy (XGBoost Baseline) = 0.885096546097362
AUC ROC Score (XGBoost Baseline) = 0.878449865064532


In [36]:
lgb = LGBMClassifier(random_state=42)

fitted_model = lgb.fit(x_train2, y_train)
y_val_pred = lgb.predict(x_val2)

In [37]:
print("Accuracy (LightGBM Baseline) =", accuracy_score(y_val, y_val_pred))
print("AUC ROC Score (LightGBM Baseline) =", roc_auc_score(y_true=y_val, y_score=lgb.predict_proba(x_val2)[:,1]))

Accuracy (LightGBM Baseline) = 0.8855724775632309
AUC ROC Score (LightGBM Baseline) = 0.8813296312223307


In [38]:
### Hyperparameter Tuning

### Model 1: XGBoost Classifier
xgb_tm = time.time()

xgb = XGBClassifier(objective='binary:logistic',
                    eval_metric='auc',
                    nthread=-1,
                    random_state=42)

parametersGrid = {'booster': ['gbtree', 'dart'],
                  'learning_rate': [0.05, 0.1, 0.2]}

best_xgb = GridSearchCV(estimator=xgb,
                        param_grid=parametersGrid,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        verbose=5)

best_xgb.fit(x_train2, y_train)

y_val_pred = best_xgb.predict(x_val2)
print("Accuracy (XGBoost) =", accuracy_score(y_val, y_val_pred))
print("ROC AUC Score (XGBoost) =", roc_auc_score(y_true=y_val, y_score=best_xgb.predict_proba(x_val2)[:,1]))

print('Runtime:', round((time.time()-xgb_tm)/60,2), 'mins')
best_xgb.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.6min finished


Accuracy (XGBoost) = 0.8867963013326081
ROC AUC Score (XGBoost) = 0.8811528471521586
Runtime: 3.85 mins


{'booster': 'gbtree', 'learning_rate': 0.1}

In [39]:
xgb_tm = time.time()

xgb = XGBClassifier(booster='gbtree',
                    learning_rate=0.1,
                    objective='binary:logistic',
                    eval_metric='auc',
                    nthread=-1,
                    random_state=42)

parametersGrid = {'min_child_weight': [5, 10, 15],
                  'gamma': [0.1, 0.3, 0.5],
                  'max_depth': [5, 10, 15]}

best_xgb = GridSearchCV(estimator=xgb,
                        param_grid=parametersGrid,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        verbose=5)

best_xgb.fit(x_train2, y_train)

y_val_pred = best_xgb.predict(x_val2)
print("Accuracy (XGBoost) =", accuracy_score(y_val, y_val_pred))
print("ROC AUC Score (XGBoost) =", roc_auc_score(y_true=y_val, y_score=best_xgb.predict_proba(x_val2)[:,1]))

print('Runtime:', round((time.time()-xgb_tm)/60,2), 'mins')
best_xgb.best_params_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed: 17.5min finished


Accuracy (XGBoost) = 0.8862523796573294
ROC AUC Score (XGBoost) = 0.8787963521939762
Runtime: 17.76 mins


{'gamma': 0.3, 'max_depth': 5, 'min_child_weight': 10}

In [40]:
xgb_tm = time.time()

xgb = XGBClassifier(booster='gbtree',
                    learning_rate=0.1,
                    objective='binary:logistic',
                    eval_metric='auc',
                    nthread=-1,
                    random_state=42)

parametersGrid = {'reg_alpha': [0.1, 0.3, 0.5, 0.7, 0.9],
                  'reg_lambda': [0.1, 0.3, 0.5, 0.7, 0.9]}

best_xgb = GridSearchCV(estimator=xgb,
                        param_grid=parametersGrid,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        verbose=5)

best_xgb.fit(x_train2, y_train)

y_val_pred = best_xgb.predict(x_val2)
print("Accuracy (XGBoost) =", accuracy_score(y_val, y_val_pred))
print("ROC AUC Score (XGBoost) =", roc_auc_score(y_true=y_val, y_score=best_xgb.predict_proba(x_val2)[:,1]))

print('Runtime:', round((time.time()-xgb_tm)/60,2), 'mins')
best_xgb.best_params_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   56.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  9.9min finished


Accuracy (XGBoost) = 0.8867283111231983
ROC AUC Score (XGBoost) = 0.8811128420248867
Runtime: 10.19 mins


{'reg_alpha': 0.7, 'reg_lambda': 0.9}

In [46]:
xgb = XGBClassifier(booster='gbtree',
                    learning_rate=0.1,
                    objective='binary:logistic',
                    eval_metric='auc',
                    nthread=-1,
                    random_state=42)

xgb.fit(x_train2, y_train)

y_val_pred = xgb.predict(x_val2)
print("Accuracy (XGBoost) =", accuracy_score(y_val, y_val_pred))
print("ROC AUC Score (XGBoost) =", roc_auc_score(y_true=y_val, y_score=xgb.predict_proba(x_val2)[:,1]))

Accuracy (XGBoost) = 0.8867963013326081
ROC AUC Score (XGBoost) = 0.8811528471521586


In [48]:
x_test = df_test.copy()

# All the required manipulations

# Date
x_test['date_time'] = pd.to_datetime(x_test['grass_date'])
time_to_categorical(x_test)

# Merge User and Test
for i in range(len(user_feat_list)):
    x_test[user_feat_list[i]] = x_test['user_id'].apply(lambda x: get_user_feature(x,i))

# Impute Age and Attributes
x_test['age'] = x_test['age'].fillna((x_train['age'].median())) # Keep x_train's median
x_test['attr3'] = x_test['attr3'].fillna((x_train['attr3'].median())) # x_train's median

x_test['attr1'] = x_test['attr1'].fillna(-1)
x_test['attr2'] = x_test['attr2'].fillna(-1)

# Change features into the correct types
for col in cat_features:
    x_test[col] = x_test[col].astype('category')
    
x_test2 = x_test[training_features].copy()
x_test2 = pd.get_dummies(x_test2)

x_test2['month_7'] = 0
x_test2['month_8'] = 0

x_test3 = pd.DataFrame(x_test2[x_train2.columns].copy())

y_test = pd.DataFrame(xgb.predict(x_test3))

submission = pd.DataFrame(x_test['row_id'])
submission['open_flag'] = y_test

submission

Unnamed: 0,row_id,open_flag
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
55965,55965,0
55966,55966,0
55967,55967,0
55968,55968,0


In [49]:
submission.to_csv('submission (xgb).csv', index = False)

In [56]:
feat_importances = pd.Series(xgb.feature_importances_, index=x_train2.columns).sort_values(ascending=False)
feat_importances

open_count_last_10_days        0.319628
open_count_last_30_days        0.083878
country_code_4                 0.083242
country_code_5                 0.045317
country_code_7                 0.037190
country_code_1                 0.035165
country_code_3                 0.028377
month_8                        0.027496
dayofweek_0                    0.024964
country_code_2                 0.021605
open_count_last_60_days        0.018007
subject_line_length            0.015927
dayofweek_5                    0.014297
month_7                        0.014214
domain_@qq.com                 0.013351
country_code_6                 0.012638
dayofweek_6                    0.012257
month_9                        0.012195
domain_@gmail.com              0.012165
dayofweek_4                    0.011749
domain_other                   0.010311
dayofweek_1                    0.010054
domain_@ymail.com              0.009763
dayofweek_2                    0.008996
login_count_last_30_days       0.008798


In [43]:
### Model 3: LightGBM Classifier
lgb_tm = time.time()

lgb = LGBMClassifier(n_estimators=500,
                     objective='binary',
                     random_state=42)

parametersGrid = {'boosting_type': ['gbdt', 'dart'],
                  'learning_rate': [0.05, 0.1, 0.2]}

best_lgb = GridSearchCV(estimator=lgb,
                        param_grid=parametersGrid,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        verbose=5)

best_lgb.fit(x_train2, y_train)

y_val_pred = best_lgb.predict(x_val2)
print("Accuracy (LightGBM) =", accuracy_score(y_val, y_val_pred))
print("ROC AUC Score (LightGBM) =", roc_auc_score(y_true=y_val, y_score=best_lgb.predict_proba(x_val2)[:,1]))

print('Runtime:', round((time.time()-lgb_tm)/60, 2), 'mins')
best_lgb.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  5.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  5.9min finished


Accuracy (LightGBM) = 0.8861843894479196
ROC AUC Score (LightGBM) = 0.8814717855311688
Runtime: 7.29 mins


{'boosting_type': 'dart', 'learning_rate': 0.1}

In [44]:
lgb_tm = time.time()

lgb = LGBMClassifier(boosting_type='dart',
                     n_estimators=500,
                     learning_rate=0.1,
                     objective='binary',
                     random_state=42)

parametersGrid = {'max_drop': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}

best_lgb = GridSearchCV(estimator=lgb,
                        param_grid=parametersGrid,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        verbose=5)

best_lgb.fit(x_train2, y_train)

y_val_pred = best_lgb.predict(x_val2)
print("Accuracy (LightGBM) =", accuracy_score(y_val, y_val_pred))
print("ROC AUC Score (LightGBM) =", roc_auc_score(y_true=y_val, y_score=best_lgb.predict_proba(x_val2)[:,1]))

print('Runtime:', round((time.time()-lgb_tm)/60, 2), 'mins')
best_lgb.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   17.9s finished


Accuracy (LightGBM) = 0.8861843894479196
ROC AUC Score (LightGBM) = 0.8807228695632998
Runtime: 0.45 mins


{'max_drop': 0}

In [57]:
lgb_tm = time.time()

lgb = LGBMClassifier(boosting_type='dart',
                     n_estimators=500,
                     learning_rate=0.1,
                     objective='binary',
                     random_state=42)

parametersGrid = {'max_depth': [10, 30, 50],
                  'min_split_gain': [0.3, 0.5, 0.7]}

best_lgb = GridSearchCV(estimator=lgb,
                        param_grid=parametersGrid,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        verbose=5)

best_lgb.fit(x_train2, y_train)

y_val_pred = best_lgb.predict(x_val2)
print("Accuracy (LightGBM) =", accuracy_score(y_val, y_val_pred))
print("ROC AUC Score (LightGBM) =", roc_auc_score(y_true=y_val, y_score=best_lgb.predict_proba(x_val2)[:,1]))

print('Runtime:', round((time.time()-lgb_tm)/60, 2), 'mins')
best_lgb.best_params_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 17.0min finished


Accuracy (LightGBM) = 0.8857084579820506
ROC AUC Score (LightGBM) = 0.8802323501926862
Runtime: 18.37 mins


{'max_depth': 30, 'min_split_gain': 0.5}

In [58]:
lgb_tm = time.time()

lgb = LGBMClassifier(boosting_type='dart',
                     n_estimators=500,
                     learning_rate=0.1,
                     objective='binary',
                     random_state=42)

parametersGrid = {'reg_alpha': [0.1, 0.3, 0.5, 0.7, 0.9],
                  'reg_lambda': [0.1, 0.3, 0.5, 0.7, 0.9]}

best_lgb = GridSearchCV(estimator=lgb,
                        param_grid=parametersGrid,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1,
                        verbose=5)

best_lgb.fit(x_train2, y_train)

y_val_pred = best_lgb.predict(x_val2)
print("Accuracy (LightGBM) =", accuracy_score(y_val, y_val_pred))
print("ROC AUC Score (LightGBM) =", roc_auc_score(y_true=y_val, y_score=best_lgb.predict_proba(x_val2)[:,1]))

print('Runtime:', round((time.time()-lgb_tm)/60, 2), 'mins')
best_lgb.best_params_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 23.5min
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed: 46.3min finished


Accuracy (LightGBM) = 0.8867963013326081
ROC AUC Score (LightGBM) = 0.8812676422266695
Runtime: 47.7 mins


{'reg_alpha': 0.3, 'reg_lambda': 0.9}

In [59]:
x_test = df_test.copy()

# All the required manipulations

# Date
x_test['date_time'] = pd.to_datetime(x_test['grass_date'])
time_to_categorical(x_test)

# Merge User and Test
for i in range(len(user_feat_list)):
    x_test[user_feat_list[i]] = x_test['user_id'].apply(lambda x: get_user_feature(x,i))

# Impute Age and Attributes
x_test['age'] = x_test['age'].fillna((x_train['age'].median())) # Keep x_train's median
x_test['attr3'] = x_test['attr3'].fillna((x_train['attr3'].median())) # x_train's median

x_test['attr1'] = x_test['attr1'].fillna(-1)
x_test['attr2'] = x_test['attr2'].fillna(-1)

# Change features into the correct types
for col in cat_features:
    x_test[col] = x_test[col].astype('category')
    
x_test2 = x_test[training_features].copy()
x_test2 = pd.get_dummies(x_test2)

x_test2['month_7'] = 0
x_test2['month_8'] = 0

x_train3 = x_train2.copy()
x_train3 = x_train3.append(x_val2.copy())

y_train2 = y_train.copy()
y_train2 = y_train2.append(y_val.copy())

In [60]:
lgb = LGBMClassifier(boosting_type='dart',
                     n_estimators=500,
                     learning_rate=0.1,
                     objective='binary',
                     random_state=42)

lgb.fit(x_train3, y_train2)

LGBMClassifier(boosting_type='dart', n_estimators=500, objective='binary',
               random_state=42)

In [61]:
y_test = pd.DataFrame(lgb.predict(x_test2))

submission = pd.DataFrame(x_test['row_id'])
submission['open_flag'] = y_test

submission

Unnamed: 0,row_id,open_flag
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
55965,55965,0
55966,55966,0
55967,55967,0
55968,55968,0


In [62]:
submission.to_csv('submission (lgb).csv', index = False)