In [2]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt

In [3]:
feature_train = pd.read_csv("feature_train.csv")
feature_test = pd.read_csv("feature_test.csv")

In [4]:
res = pd.read_csv('sample_submission_2.csv')
feature_test = pd.merge(res, feature_test, on='user_id_hash', how='left')

In [5]:
# load training data
X_train_7 = feature_train[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66']]
y_train_7 = feature_train['user_purchase_binary_7_days']

X_train_14 = feature_train[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66']]
y_train_14 = feature_train['user_purchase_binary_14_days']

## Random Forest

In [10]:
# train test split
X_train, X_val, y_train, y_val = train_test_split(X_train_7, y_train_7, test_size=0.2)

In [5]:
from sklearn.ensemble import RandomForestClassifier
rf_7 = RandomForestClassifier(n_estimators=1000, max_depth=20,criterion='entropy',oob_score=True)

In [6]:
# fit random forest model
rf_7.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [8]:
# get validation AUC score 
y_hat = rf_7.predict_proba(X_val)
y_pred = pd.DataFrame(y_hat)[1]
roc_auc_score(y_val,y_pred)

0.9860080594014213

In [170]:
# validation score for purchasing in 14 days
X_train, X_val, y_train, y_val = train_test_split(X_train_14, y_train_14, test_size=0.2)
rf_14 = RandomForestClassifier(n_estimators=1000, max_depth=20,criterion='entropy',oob_score=True)
rf_14.fit(X_train,y_train)
y_hat = rf_14.predict_proba(X_val)
y_pred = pd.DataFrame(y_hat)[1]
roc_auc_score(y_val,y_pred)

0.9843854784338717

## Gradient Boosting

In [200]:
X_train_7 = feature_train[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66']]
y_train_7 = feature_train['user_purchase_binary_7_days']

X_train_14 = feature_train[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66']]
y_train_14 = feature_train['user_purchase_binary_14_days']

In [197]:
X_train, X_val, y_train, y_val = train_test_split(X_train_7, y_train_7, test_size=0.2)
model_7 = xgb.XGBClassifier(n_estimators=400,max_depth=25, learning_rate=0.05)
model_7.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=25, min_child_weight=1, missing=None, n_estimators=400,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [198]:
# AUC score for validation
y_hat = model_7.predict_proba(X_val)
y_pred = pd.DataFrame(y_hat)[1]
roc_auc_score(y_val,y_pred)

0.9848283012795341

In [210]:
X_train, X_val, y_train, y_val = train_test_split(X_train_14, y_train_14, test_size=0.2)
model_14 = xgb.XGBClassifier(n_estimators=400,max_depth=25,learning_rate=0.05)
model_14.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=25, min_child_weight=1, missing=None, n_estimators=400,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [211]:
# AUC score for validation
y_hat = model_14.predict_proba(X_val)
y_pred = pd.DataFrame(y_hat)[1]
roc_auc_score(y_val,y_pred)

0.9846913659820868

In [242]:
X_pred = feature_test[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66']]

In [243]:
y_pred_7 = model_7.predict_proba(X_pred)
y_pred_7 = pd.DataFrame(y_pred_7)[1]

y_pred_14 = model_14.predict_proba(X_pred)
y_pred_14 = pd.DataFrame(y_pred_14)[1]

In [249]:
result = result[['user_id_hash','7_days','14_days']]

In [251]:
res['user_purchase_binary_7_days'] = pred_7
res['user_purchase_binary_14_days'] = pred_14
res = res.fillna(0) # new users

In [252]:
res.columns = ['user_id_hash', 'user_purchase_binary_7_days', 'user_purchase_binary_14_days']

In [254]:
res.to_csv('result_gb.csv',index=False)

## Under sampling

notice that the dataset is unbalanced, we decide to do under sampling.

In [22]:
# under sampling to get balanced data
df_train, df_val = train_test_split(feature_train, test_size=0.2)
no_purchase = len(df_train[df_train['user_purchase_binary_7_days'] == 1])
non_purchase_indices = df_train[df_train['user_purchase_binary_7_days'] == 0].index
random_indices = np.random.choice(non_purchase_indices, no_purchase, replace=False)
purchase_indices = df_train[df_train['user_purchase_binary_7_days'] == 1].index
under_sample_indices = np.concatenate([purchase_indices,random_indices])
under_sample = df_train.loc[under_sample_indices]

In [23]:
under_sample.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days,unique_session,purchase_before,value_purchase,sessions_duration,os_freq,pct_country,pct_city,0,66,67
76112,c77cb6840dc194463c2cd31446c2afd6898b6394d46896...,1.0,1.0,1.0,0.0,0.0,0.0,0.676836,0.00457,0.047619,19.0,0.0,0.0
56572,70f92c8fa7e57977200b94723afb03057feb235512574c...,1.0,1.0,41.0,4.0,7.672,0.047972,0.676836,0.00306,0.5,77.0,0.0,72.0
137969,b1dc3dd93616585476a047c86610131af1edee22772891...,1.0,1.0,5.0,0.0,0.0,0.004336,0.676836,0.00457,0.029197,0.0,0.0,0.0
27842,5e4a5038d943864a0afd8f6fe3a4b4911277b70b069143...,1.0,1.0,37.0,9.0,43.336999,0.068701,0.676836,0.003346,0.041667,76.0,0.0,0.0
1380,7ab6d7a12a41aa35981369dafd3fba4d4fbc2ffaac6fae...,1.0,1.0,5.0,5.0,9.065,0.00284,0.311655,0.00457,0.021333,122.0,0.0,118.0


In [10]:
X_train_7 = under_sample[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66']]
y_train_7 = under_sample['user_purchase_binary_7_days']
X_val_7 = df_val[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66']]
y_val_7 = df_val['user_purchase_binary_7_days']

In [106]:
model_7_under = xgb.XGBClassifier(n_estimators=5000, max_depth=20, learning_rate=0.01, reg_alpha=0.005)
model_7_under.fit(X_train_7,y_train_7)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=20, min_child_weight=1, missing=None, n_estimators=5000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0.005, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [107]:
# AUC for validation
y_hat = model_7_under.predict_proba(X_val_7)
y_pred = pd.DataFrame(y_hat)[1]
roc_auc_score(y_val_7,y_pred)

0.968679047816572

In [13]:
df_train, df_val = train_test_split(feature_train, test_size=0.2)
no_purchase = len(df_train[df_train['user_purchase_binary_14_days'] == 1])
non_purchase_indices = df_train[df_train['user_purchase_binary_14_days'] == 0].index
random_indices = np.random.choice(non_purchase_indices, no_purchase, replace=False)
purchase_indices = df_train[df_train['user_purchase_binary_14_days'] == 1].index
under_sample_indices = np.concatenate([purchase_indices,random_indices])
under_sample = df_train.loc[under_sample_indices]

In [14]:
X_train_14 = under_sample[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66']]
y_train_14 = under_sample['user_purchase_binary_14_days']
X_val_14 = df_val[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66']]
y_val_14 = df_val['user_purchase_binary_14_days']

In [104]:
model_14_under = xgb.XGBClassifier(n_estimators=5000, max_depth=20, learning_rate=0.01, reg_alpha=0.005)
model_14_under.fit(X_train_14,y_train_14)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=20, min_child_weight=1, missing=None, n_estimators=5000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0.005, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [105]:
# Auc for validation
y_hat = model_14_under.predict_proba(X_val_14)
y_pred = pd.DataFrame(y_hat)[1]
roc_auc_score(y_val_14,y_pred)

0.9650458241672074

#### Predict

In [108]:
X_pred = feature_test[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66']]

In [109]:
y_pred_7 = model_7_under.predict_proba(X_pred)
y_pred_7 = pd.DataFrame(y_pred_7)[1]

In [None]:
y_pred_14 = model_14_under.predict_proba(X_pred)
y_pred_14 = pd.DataFrame(y_pred_14)[1]

In [None]:
res['user_purchase_binary_7_days'] = y_pred_7
res['user_purchase_binary_14_days'] = y_pred_14
res = res.fillna(0) # new users

In [None]:
res.to_csv('result.csv',index=False)

## NN

In [62]:
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [127]:
# Predict purchase in 7 days
df_train, df_val = train_test_split(feature_train, test_size=0.2)
no_purchase = len(df_train[df_train['user_purchase_binary_7_days'] == 1])
non_purchase_indices = df_train[df_train['user_purchase_binary_7_days'] == 0].index
random_indices = np.random.choice(non_purchase_indices, no_purchase, replace=False)
purchase_indices = df_train[df_train['user_purchase_binary_7_days'] == 1].index
under_sample_indices = np.concatenate([purchase_indices,random_indices])
under_sample = df_train.loc[under_sample_indices]

X_train_7 = under_sample[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66','67']]
y_train_7 = under_sample['user_purchase_binary_7_days']
X_val_7 = df_val[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66','67']]
y_val_7 = df_val['user_purchase_binary_7_days']

In [128]:
X_train = np.array(X_train_7)
y_train = np.array(y_train_7)
X_val = np.array(X_val_7)
y_val = np.array(y_val_7)

In [129]:
# Normalize
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
scaler.fit(X_val)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [66]:
import torch

In [130]:
from torch.autograd import Variable
feature_train_v = Variable(torch.FloatTensor(X_train), requires_grad = False)
labels_train_v = Variable(torch.FloatTensor(y_train), requires_grad = False)
feature_test_v = Variable(torch.FloatTensor(X_val), requires_grad = False)
labels_test_v = Variable(torch.FloatTensor(y_val), requires_grad = False)

In [132]:
# try a simple NN with 2 layers
class LinearClassifier(nn.Module):
    def __init__(self,M=200):
        super(LinearClassifier, self).__init__()
        self.h_layer_1 = nn.Linear(9, M)
        self.h_layer_2 = nn.Linear(M, 1)
        self.s_layer_1 = nn.ReLU()
        self.s_layer_2 = nn.Sigmoid()
        
    def forward(self,x):
        y = self.h_layer_1(x)
        y = self.s_layer_1(y)
        y = self.h_layer_2(y)
        p = self.s_layer_2(y)
        return p

In [72]:
model = LinearClassifier() #declaring the classifier to an object
loss_fn = nn.BCELoss() #calculates the loss
optim = torch.optim.SGD(model.parameters(), lr = 0.01)

In [133]:
all_losses = []
for num in range(5000):
    pred = model(feature_train_v) 
    loss = loss_fn(pred, labels_train_v) 
    all_losses.append(loss.data)
    optim.zero_grad() 
    loss.backward() 
    optim.step() 

In [74]:
# get predict values
predicted_values = []
pred = []
for num in range(len(feature_test_v)):
    predicted_values.append(model(feature_test_v[num]))
    pred.append(predicted_values[num].data.numpy())

In [75]:
y_pred = np.stack(pred, axis=1)[0]

In [76]:
# calculate AUC
roc_auc_score(y_val,y_pred)

0.98034204713739

In [111]:
#Predict purchase in 14 days
df_train, df_val = train_test_split(feature_train, test_size=0.2)
no_purchase = len(df_train[df_train['user_purchase_binary_14_days'] == 1])
non_purchase_indices = df_train[df_train['user_purchase_binary_14_days'] == 0].index
random_indices = np.random.choice(non_purchase_indices, no_purchase, replace=False)
purchase_indices = df_train[df_train['user_purchase_binary_14_days'] == 1].index
under_sample_indices = np.concatenate([purchase_indices,random_indices])
under_sample = df_train.loc[under_sample_indices]

X_train_14 = under_sample[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66','67']]
y_train_14 = under_sample['user_purchase_binary_14_days']
X_val_14 = df_val[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66','67']]
y_val_14 = df_val['user_purchase_binary_14_days']

In [112]:
X_train_14 = np.array(X_train_14)
y_train_14 = np.array(y_train_14)
X_val_14 = np.array(X_val_14)
y_val_14 = np.array(y_val_14)

In [113]:
scaler = MinMaxScaler()
scaler.fit(X_train_14)
scaler.fit(X_val_14)

feature_train_v_14 = Variable(torch.FloatTensor(X_train_14), requires_grad = False)
labels_train_v_14 = Variable(torch.FloatTensor(y_train_14), requires_grad = False)
feature_test_v_14 = Variable(torch.FloatTensor(X_val_14), requires_grad = False)
labels_test_v_14 = Variable(torch.FloatTensor(y_val_14), requires_grad = False)

In [114]:
model = LinearClassifier() #declaring the classifier to an object
loss_fn = nn.BCELoss() #calculates the loss
optim = torch.optim.SGD(model.parameters(), lr = 0.01)

In [115]:
all_losses = []
for num in range(5000):
    pred = model(feature_train_v_14) 
    loss = loss_fn(pred, labels_train_v_14) 
    all_losses.append(loss.data)
    optim.zero_grad() 
    loss.backward() 
    optim.step() 

  "Please ensure they have the same size.".format(target.size(), input.size()))


In [116]:
# get predict values
predicted_values = []
pred = []
for num in range(len(feature_test_v_14)):
    predicted_values.append(model(feature_test_v_14[num]))
    pred.append(predicted_values[num].data.numpy())

In [117]:
y_pred = np.stack(pred, axis=1)[0]
roc_auc_score(y_val_14,y_pred)

0.9807317077701562

In [134]:
# add to 3 layers with dropout and lower learning rate
class LinearClassifier(nn.Module):
    def __init__(self,M=300,N=200,p=0.3):
        super(LinearClassifier, self).__init__()
        self.h_layer_1 = nn.Linear(9, M)
        self.h_layer_2 = nn.Linear(M, N)
        self.h_layer_3 = nn.Linear(N, 1)
        self.s_layer_1 = nn.ReLU()
        self.s_layer_2 = nn.ReLU()
        self.s_layer_3 = nn.Sigmoid()
        self.dropout = nn.Dropout(p)
    def forward(self,x):
        y = self.h_layer_1(x)
        y = self.s_layer_1(y)
        y = self.h_layer_2(y)
        y = self.s_layer_2(y)
        y = self.dropout(y)
        y = self.h_layer_3(y)
        p = self.s_layer_3(y)
        return p


In [135]:
model = LinearClassifier() #declaring the classifier to an object
loss_fn = nn.BCELoss() #calculates the loss
optim = torch.optim.SGD(model.parameters(), lr = 0.05)

all_losses = []
for num in range(5000):
    pred = model(feature_train_v) 
    loss = loss_fn(pred, labels_train_v) 
    all_losses.append(loss.data)
    optim.zero_grad() 
    loss.backward() 
    optim.step() 
    
predicted_values = []
pred = []
for num in range(len(feature_test_v)):
    predicted_values.append(model(feature_test_v[num]))
    pred.append(predicted_values[num].data.numpy())

In [136]:
y_pred = np.stack(pred, axis=1)[0]
roc_auc_score(y_val,y_pred)

0.9836856261984555

In [134]:
model2 = LinearClassifier() #declaring the classifier to an object
loss_fn = nn.BCELoss() #calculates the loss
optim = torch.optim.SGD(model2.parameters(), lr = 0.05)

all_losses = []
for num in range(5000):
    pred = model2(feature_train_v_14) 
    loss = loss_fn(pred, labels_train_v_14) 
    all_losses.append(loss.data)
    optim.zero_grad() 
    loss.backward() 
    optim.step() 
    
predicted_values = []
pred = []
for num in range(len(feature_test_v_14)):
    predicted_values.append(model2(feature_test_v_14[num]))
    pred.append(predicted_values[num].data.numpy())

  "Please ensure they have the same size.".format(target.size(), input.size()))


In [135]:
y_pred = np.stack(pred, axis=1)[0]
roc_auc_score(y_val_14,y_pred)

0.982436112110718

#### predict

In [143]:
X_pred = feature_test[['unique_session','purchase_before', 'value_purchase','sessions_duration','pct_country','pct_city','0','66','67']]
X_pred = np.array(X_pred)
scaler.fit(X_pred)
X_pred = Variable(torch.FloatTensor(X_pred), requires_grad = False)
predicted_values = []
pred_7 = []
for num in range(len(X_pred)):
    predicted_values.append(model(X_pred[num]))
    pred_7.append(predicted_values[num].data.numpy())

predicted_values = []
pred_14 = []    
for num in range(len(X_pred)):
    predicted_values.append(model2(X_pred[num]))
    pred_14.append(predicted_values[num].data.numpy())

In [144]:
y_pred_7 = np.stack(pred_7, axis=1)[0]
y_pred_14 = np.stack(pred_14, axis=1)[0]

In [148]:
res = pd.read_csv('res.csv')

In [150]:
res['user_purchase_binary_7_days'] = y_pred_7
res['user_purchase_binary_14_days'] = y_pred_14
res = res.fillna(0) # new users

In [151]:
res.to_csv('result_nn.csv',index=False)

Gradient boosting gives us the best result, Thus, we do hyper-parameter tunning on Gradient boosting

## Hyperparameter Tunning

In [21]:
def feature_imp(model):
    feat_imp = pd.Series(model.get_booster().get_score(importance_type='weight')).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title = 'Feature Importance')
    plt.ylabel('Feature Importance Score')
    
def xgbfit(model):  
    model.fit(X_train_7, y_train_7, eval_metric='auc')
    y_hat_7 = model.predict_proba(X_val_7)
    y_pred_7 = pd.DataFrame(y_hat_7)[1]
    auc_7 = roc_auc_score(y_val_7, y_pred_7)
    print(f"Val AUC for 7 days: {auc_7}")
    
    model.fit(X_train_14, y_train_14, eval_metric='auc')
    y_hat_14 = model.predict_proba(X_val_14)
    y_pred_14 = pd.DataFrame(y_hat_14)[1]
    auc_14 = roc_auc_score(y_val_14, y_pred_14)
    print(f"Val AUC for 14 days: {auc_14}")

In [22]:
xgb1 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=400,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=42)

xgbfit(xgb1)

Val AUC for 7 days: 0.9850454241482748
Val AUC for 14 days: 0.9851820988184624


In [34]:
# Tuning max_depth and child_weight
from sklearn.model_selection import GridSearchCV 

param_test_1 = {
 'max_depth': [15,20,22,25,30],
 'min_child_weight':range(1,6,2)
}

gsearch_1 = GridSearchCV(estimator = xgb1, 
                         param_grid = param_test_1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)

gsearch_1.fit(X_train_7, y_train_7)
gsearch_1.best_params_, gsearch_1.best_score_

# after done, repeat same but with one below and one above optimum value

({'max_depth': 15, 'min_child_weight': 1}, 0.9843710400468686)

In [24]:
%%time
param_test_2 = {
 'max_depth':[3,4],
 'min_child_weight':[2,3,4]
}
gsearch = GridSearchCV(estimator = xgb1, 
                         param_grid = param_test_2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)

gsearch.fit(X_train_7, y_train_7)
print(gsearch.best_params_, gsearch.best_score_)


{'max_depth': 3, 'min_child_weight': 4} 0.9866538236863512
CPU times: user 1.38 s, sys: 16 ms, total: 1.4 s
Wall time: 13.1 s


In [25]:
# recalibrate the parameters:
xgb2 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=400,
 max_depth=3,
 min_child_weight=3,
 gamma=0.4,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=42)

xgbfit(xgb2)

Val AUC for 7 days: 0.9859297428622483
Val AUC for 14 days: 0.9864148721971132


In [26]:
# tune subsample and colsample_bytree
param_test_4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}

gsearch = GridSearchCV(estimator = xgb2, 
                         param_grid = param_test_4, scoring='roc_auc', n_jobs=4, iid=False, cv=5)

gsearch.fit(X_train_7, y_train_7)
print(gsearch.best_params_, gsearch.best_score_)

{'colsample_bytree': 0.6, 'subsample': 0.7} 0.9868046208063405


In [27]:
%%time
#trying values in around 0.7
param_test_5 = {
 'subsample':[i/100.0 for i in range(65,80,5)],
 'colsample_bytree':[i/100.0 for i in range(55,70,5)]
}

gsearch = GridSearchCV(estimator = xgb2, 
                         param_grid = param_test_5, scoring='roc_auc', n_jobs=4, iid=False, cv=5)

gsearch.fit(X_train_7, y_train_7)
print(gsearch.best_params_, gsearch.best_score_)

{'colsample_bytree': 0.55, 'subsample': 0.7} 0.9868046208063405
CPU times: user 1.33 s, sys: 18.4 ms, total: 1.35 s
Wall time: 15.8 s


In [50]:
# Tuning n_estimators (num trees)
param_test_6 = {
 'n_estimator':[400, 500, 600,700, 900]
}

gsearch = GridSearchCV(estimator = xgb2, 
                         param_grid = param_test_6, scoring='roc_auc', n_jobs=4, iid=False, cv=5)

gsearch.fit(X_train_7, y_train_7)
print(gsearch.best_params_, gsearch.best_score_)
#this gives us the best as the lowest, if i drill down until 5 it
# still says its the best param, but recalibrating I see the decrease
# in AUC, overfit

{'n_estimator': 400} 0.9864362980277516


In [6]:
# recalibrating
xgb3 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=19,
 min_child_weight=3,
 gamma=0.4,
 subsample=0.7,
 colsample_bytree=0.55,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=42)

xgbfit(xgb3)

NameError: name 'xgbfit' is not defined

In [30]:
# tuning regulatizatiomn
param_test_7 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch = GridSearchCV(estimator = xgb3, 
                         param_grid = param_test_7, scoring='roc_auc', n_jobs=4, iid=False, cv=5)

gsearch.fit(X_train_7, y_train_7)
print(gsearch.best_params_, gsearch.best_score_)

{'reg_alpha': 0.1} 0.9868471193145428


In [31]:
# lower learning rare
param_test_8 = {
 'learning_rate':[0.001, 0.01, 0.1]
}
gsearch = GridSearchCV(estimator = xgb3, 
                         param_grid = param_test_8, scoring='roc_auc', n_jobs=4, iid=False, cv=5)

gsearch.fit(X_train_7, y_train_7)
print(gsearch.best_params_, gsearch.best_score_)

{'learning_rate': 0.1} 0.9868046208063405


In [7]:
# Final tuned 
xgb4 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=19,
 min_child_weight=3,
 gamma=0.2,
reg_alpha=0.005,
 subsample=0.8,
 colsample_bytree=1,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1)



#(n_estimators=1000,max_depth=30, learning_rate=0.01)

### Predictiction of tuned XGB

In [11]:
X_pred = feature_test[['unique_session','purchase_before', 'value_purchase','sessions_duration',
                       'pct_country','pct_city','0','66']]

In [14]:
xgb4.fit(X_train, y_train, eval_metric='auc')
y_hat_7 = xgb4.predict_proba(X_val)
y_pred_7 = pd.DataFrame(y_hat_7)[1]
auc_7 = roc_auc_score(y_val, y_pred_7)
pred_7 = xgb4.predict_proba(X_pred)
pred_7 = pd.DataFrame(pred_7)[1]
print(f"Val AUC for 7 days: {auc_7}")
#feature_imp(xgb4)

Val AUC for 7 days: 0.9502327303636142


In [15]:
xgb4.fit(X_train_14, y_train_14, eval_metric='auc')
y_hat_14 = xgb4.predict_proba(X_val)
y_pred_14 = pd.DataFrame(y_hat_14)[1]
auc_14 = roc_auc_score(y_val, y_pred_14)
pred_14 = xgb4.predict_proba(X_pred)
pred_14 = pd.DataFrame(pred_14)[1]
print(f"Val AUC for 14 days: {auc_14}")
#feature_imp(xgb4)

Val AUC for 14 days: 0.9987044682484745


In [16]:
res['user_purchase_binary_7_days'] = pred_7
res['user_purchase_binary_14_days'] = pred_14
res = res.fillna(0) # new users

In [17]:
res.to_csv('result_ht.csv',index=False)