In [1]:
# Before beginning, I did research on how best to deal with the incredibly imbalanced dataset and different sampling methods.
# I initially did my modeling in RapidMiner, but found that I wasn't getting good results that generalized well when I submitted
# to Kaggle. I also didn't think the algorithms I was using were doing that well.
# I also did research on which models are best for this kind of data, and came across some boosting algorithms that would work
# efficiently with high performance generalization, so I switched to Python. I chose to try to optimize various versions of 
# XGBoost, CatBoost, and LightGBM, and tried various ensembling methods. I found that my best results came from an ensemble
# of my best of each of the three types of models, using a threshold of 0.6.

In [2]:
# I used the following resources/tutorials about parameter optimization and XGBoost, CatBoost, and LightGBM:
# https://github.com/catboost/tutorials/blob/master/classification/classification_with_parameter_tuning_tutorial.ipynb
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# https://medium.com/analytics-vidhya/hyperparameters-optimization-for-lightgbm-catboost-and-xgboost-regressors-using-bayesian-6e7c495947a9
# https://github.com/catboost/catboost

In [3]:
# This notebook only includes the models with my best parameters for each.

In [4]:
# import packages & modules
from __future__ import absolute_import, division, print_function, unicode_literals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score, roc_curve, classification_report, recall_score, f1_score, accuracy_score, precision_score
from sklearn.utils import shuffle
import lightgbm as lgb
import catboost as cgb
import catboost.datasets as cbd
import catboost.utils as cbu
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
import warnings
import hyperopt
import sys
warnings.filterwarnings('ignore')

In [5]:
# seet seed - my favorite number, obviously
seed = 5
np.random.seed(seed)

In [6]:
# import training dataset
data = pd.read_csv('HW4 - training data.csv',header=0)

In [7]:
# sampling - I used undersampling of the majority class and found this to work
# better than oversampling the minority. I tested multiple ratios, but found that 
# a 1:4 ratio worked best for all of my models, using all of the minority class.

minority_class = np.array(data[data.adopter == 1].index)
majority_class = data[data.adopter == 0].index

# a 1:4 ratio using all of the minority means 1540 minority and 6160 majority
# let's randomly select the 6160 majority samples
# I did this without replacement to add more variety to the samples
random_majority_class = np.random.choice(majority_class, 6160, replace = False)
random_majority_class = np.array(random_majority_class)

# combine both classes now
sample = np.concatenate([minority_class, random_majority_class])

# get the data using the sample indices and shuffle the data before split
sample_data = data.iloc[sample,:]
sample_data = shuffle(sample_data)

# split into X and y
X = sample_data.iloc[:, data.columns != 'adopter']
y = sample_data.iloc[:, data.columns == 'adopter']

# split dataset for cross validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

print ("Number of train instances: {}".format(len(X_train)))
print ("Number of test instances: {}".format(len(X_test)))

Number of train instances: 5390
Number of test instances: 2310


## LightGBM 

In [8]:
# I used this tutorial as a resource for tuning my parameters for LightGBM using Bayesian Optimization:
# https://medium.com/analytics-vidhya/hyperparameters-optimization-for-lightgbm-catboost-and-xgboost-regressors-using-bayesian-6e7c495947a9

In [9]:
# make train and test into a lightgbm Dataset
train_data = lgb.Dataset(X_train, label = y_train)
test_data = lgb.Dataset(X_test, label = y_test)

In [10]:
# final parameters based on optimization
parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'l1',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 46,
    'feature_fraction': 0.8804924233810146,
    'bagging_fraction': 0.8120618558202543,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 1,
    'max_depth': 12,
    'min_child_weight': 30.200387989636972,
    'min_split_gain': 0.031982756484661944
}

# train model
lgbm = lgb.train(parameters,
                       train_data,
                       valid_sets=test_data,
                       verbose_eval=10,
                       num_boost_round=8000,
                       early_stopping_rounds=3500)

Training until validation scores don't improve for 3500 rounds
[10]	valid_0's l1: 0.338022
[20]	valid_0's l1: 0.344689
[30]	valid_0's l1: 0.344838
[40]	valid_0's l1: 0.342264
[50]	valid_0's l1: 0.34023
[60]	valid_0's l1: 0.336653
[70]	valid_0's l1: 0.333756
[80]	valid_0's l1: 0.331738
[90]	valid_0's l1: 0.329531
[100]	valid_0's l1: 0.326918
[110]	valid_0's l1: 0.324378
[120]	valid_0's l1: 0.321728
[130]	valid_0's l1: 0.31907
[140]	valid_0's l1: 0.317121
[150]	valid_0's l1: 0.315655
[160]	valid_0's l1: 0.314135
[170]	valid_0's l1: 0.312524
[180]	valid_0's l1: 0.310824
[190]	valid_0's l1: 0.309525
[200]	valid_0's l1: 0.308348
[210]	valid_0's l1: 0.307378
[220]	valid_0's l1: 0.306174
[230]	valid_0's l1: 0.305242
[240]	valid_0's l1: 0.304308
[250]	valid_0's l1: 0.300741
[260]	valid_0's l1: 0.297999
[270]	valid_0's l1: 0.298542
[280]	valid_0's l1: 0.298478
[290]	valid_0's l1: 0.298076
[300]	valid_0's l1: 0.296737
[310]	valid_0's l1: 0.295344
[320]	valid_0's l1: 0.294641
[330]	valid_0's l1: 

[2760]	valid_0's l1: 0.248072
[2770]	valid_0's l1: 0.24826
[2780]	valid_0's l1: 0.248441
[2790]	valid_0's l1: 0.248718
[2800]	valid_0's l1: 0.248875
[2810]	valid_0's l1: 0.248572
[2820]	valid_0's l1: 0.24848
[2830]	valid_0's l1: 0.248371
[2840]	valid_0's l1: 0.248263
[2850]	valid_0's l1: 0.248405
[2860]	valid_0's l1: 0.248493
[2870]	valid_0's l1: 0.248526
[2880]	valid_0's l1: 0.24859
[2890]	valid_0's l1: 0.24832
[2900]	valid_0's l1: 0.248256
[2910]	valid_0's l1: 0.248053
[2920]	valid_0's l1: 0.247843
[2930]	valid_0's l1: 0.247857
[2940]	valid_0's l1: 0.247955
[2950]	valid_0's l1: 0.247793
[2960]	valid_0's l1: 0.247558
[2970]	valid_0's l1: 0.247648
[2980]	valid_0's l1: 0.247696
[2990]	valid_0's l1: 0.247768
[3000]	valid_0's l1: 0.247838
[3010]	valid_0's l1: 0.247396
[3020]	valid_0's l1: 0.247103
[3030]	valid_0's l1: 0.246783
[3040]	valid_0's l1: 0.246584
[3050]	valid_0's l1: 0.247091
[3060]	valid_0's l1: 0.247452
[3070]	valid_0's l1: 0.247472
[3080]	valid_0's l1: 0.247462
[3090]	valid_0

[5490]	valid_0's l1: 0.241603
[5500]	valid_0's l1: 0.241673
[5510]	valid_0's l1: 0.24188
[5520]	valid_0's l1: 0.242085
[5530]	valid_0's l1: 0.242177
[5540]	valid_0's l1: 0.24222
[5550]	valid_0's l1: 0.241869
[5560]	valid_0's l1: 0.241642
[5570]	valid_0's l1: 0.24203
[5580]	valid_0's l1: 0.242248
[5590]	valid_0's l1: 0.242171
[5600]	valid_0's l1: 0.242114
[5610]	valid_0's l1: 0.24214
[5620]	valid_0's l1: 0.2422
[5630]	valid_0's l1: 0.242095
[5640]	valid_0's l1: 0.242088
[5650]	valid_0's l1: 0.241864
[5660]	valid_0's l1: 0.241789
[5670]	valid_0's l1: 0.241748
[5680]	valid_0's l1: 0.241679
[5690]	valid_0's l1: 0.24176
[5700]	valid_0's l1: 0.241758
[5710]	valid_0's l1: 0.241403
[5720]	valid_0's l1: 0.241169
[5730]	valid_0's l1: 0.241428
[5740]	valid_0's l1: 0.241556
[5750]	valid_0's l1: 0.241546
[5760]	valid_0's l1: 0.24157
[5770]	valid_0's l1: 0.241799
[5780]	valid_0's l1: 0.2419
[5790]	valid_0's l1: 0.241916
[5800]	valid_0's l1: 0.241886
[5810]	valid_0's l1: 0.241829
[5820]	valid_0's l1:

In [11]:
# predict on X_test and classify based on a threshold of 0.6
y_pred = lgbm.predict(X_test)
y_pred[y_pred >= 0.6] = 1
y_pred[y_pred < 0.6] = 0

print ("F1: ", f1_score(y_pred, y_test))
print ("Recall: ", recall_score(y_pred, y_test))
print ("Prec.: ", precision_score(y_pred, y_test))
print ("Acc.: ", accuracy_score(y_pred, y_test))

F1:  0.41854934601664684
Recall:  0.46437994722955145
Prec.:  0.38095238095238093
Acc.:  0.7883116883116883


In [12]:
# now import test set without labels and make predictions
test = pd.read_csv('HW4 - test data.csv',header=0)

y_pred_lgbm = lgbm.predict(test)

y_pred_lgbm = pd.DataFrame({'prediction(adopter)': y_pred_lgbm })
y_pred_lgbm

Unnamed: 0,prediction(adopter)
0,0.005829
1,0.007001
2,0.141774
3,0.000410
4,0.000414
...,...
86676,0.000778
86677,0.843112
86678,0.036194
86679,0.319560


## Catboost

In [13]:
# I used this tutorial as a resource for tuning my parameters for Catboost using Hyperopt:
# https://github.com/catboost/tutorials/blob/master/classification/classification_with_parameter_tuning_tutorial.ipynb

In [14]:
# final parameters based on optimization
parameters = {
    'learning_rate': 0.4234185321620083, 
    'depth': 5, 
    'l2_leaf_reg': 9.464266235679002, 
    'task_type': 'CPU', 
    'loss_function': 'Logloss', 
    'eval_metric': 'AUC', 
    'custom_metric': ['AUC'], 
    'iterations': 100, 
    'random_seed': 20181224
}


# train model
clf = cgb.CatBoostClassifier(iterations=100, learning_rate=0.4234185321620083,
                             loss_function='Logloss', depth=5, 
                             l2_leaf_reg=9.464266235679002,random_seed=20181224,
                             eval_metric='AUC',custom_metric=['AUC'],
                             task_type='CPU')

cgbm = clf.fit(X_train,y_train)

0:	total: 60.9ms	remaining: 6.03s
1:	total: 65.6ms	remaining: 3.21s
2:	total: 70.7ms	remaining: 2.29s
3:	total: 75ms	remaining: 1.8s
4:	total: 85.8ms	remaining: 1.63s
5:	total: 89.6ms	remaining: 1.4s
6:	total: 93.6ms	remaining: 1.24s
7:	total: 97.8ms	remaining: 1.13s
8:	total: 104ms	remaining: 1.05s
9:	total: 115ms	remaining: 1.03s
10:	total: 118ms	remaining: 957ms
11:	total: 123ms	remaining: 905ms
12:	total: 128ms	remaining: 854ms
13:	total: 133ms	remaining: 818ms
14:	total: 137ms	remaining: 777ms
15:	total: 141ms	remaining: 741ms
16:	total: 149ms	remaining: 725ms
17:	total: 153ms	remaining: 695ms
18:	total: 157ms	remaining: 669ms
19:	total: 161ms	remaining: 645ms
20:	total: 166ms	remaining: 624ms
21:	total: 171ms	remaining: 606ms
22:	total: 176ms	remaining: 589ms
23:	total: 184ms	remaining: 583ms
24:	total: 192ms	remaining: 575ms
25:	total: 198ms	remaining: 563ms
26:	total: 207ms	remaining: 560ms
27:	total: 214ms	remaining: 551ms
28:	total: 220ms	remaining: 538ms
29:	total: 225ms	rem

In [15]:
# predict on X_test and classify based on a threshold of 0.6
y_pred = cgbm.predict(X_test)
y_pred[y_pred >= 0.6] = 1
y_pred[y_pred < 0.6] = 0

print ("F1: ", f1_score(y_pred, y_test))
print ("Recall: ", recall_score(y_pred, y_test))
print ("Prec.: ", precision_score(y_pred, y_test))
print ("Acc.: ", accuracy_score(y_pred, y_test))

F1:  0.4010554089709762
Recall:  0.5135135135135135
Prec.:  0.329004329004329
Acc.:  0.8034632034632034


In [16]:
# now import test set without labels and make predictions
test = pd.read_csv('HW4 - test data.csv',header=0)

y_pred_cgbm = cgbm.predict_proba(test)
y_pred_cgbm = pd.DataFrame({'prediction(adopter)': y_pred_cgbm[:, 1]})
y_pred_cgbm

Unnamed: 0,prediction(adopter)
0,0.121667
1,0.017169
2,0.089084
3,0.001871
4,0.075810
...,...
86676,0.057459
86677,0.130759
86678,0.089009
86679,0.072103


## XGBoost

In [18]:
# I used this tutorial as a resource for tuning my parameters for XGBoost using Bayesian Optimization:
# https://medium.com/analytics-vidhya/hyperparameters-optimization-for-lightgbm-catboost-and-xgboost-regressors-using-bayesian-6e7c495947a9

In [19]:
# create train and test xgb matrices
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)

In [20]:
# final parameters based on optimization
num_rounds = 100

params = {
    'max_depth':8,
    'n_estimators': 5000,
    'gamma': 0,
    'scale_pos_weight': 1,
    'eta': 0.1,
    'reg_alpha': 0.05,
    'objective': 'binary:logistic',
    'seed': 557,
    'colsample_bytree': 1,
    'min_child_weight': 20.0,
    'subsample': 1
}

test_train_split = [(dtest, 'test'), (dtrain, 'train')]

# train model
boost = xgb.train(params,
                 dtrain,
                 num_rounds, 
                 test_train_split)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	test-error:0.20563	train-error:0.18293
[1]	test-error:0.19654	train-error:0.17699
[2]	test-error:0.19394	train-error:0.17514
[3]	test-error:0.19740	train-error:0.17347
[4]	test-error:0.19351	train-error:0.17236
[5]	test-error:0.19784	train-error:0.17050
[6]	test-error:0.19697	train-error:0.16976
[7]	test-error:0.19740	train-error:0.16939
[8]	test-error:0.19654	train-error:0.16920
[9]	test-error:0.19740	train-error:0.16939
[10]	test-error:0.19610	train-error:0.16883
[11]	test-error:0.19481	train-error:0.16549
[12]	test-error:0.19437	train-error:0.16568
[13]	test-error:0.19134	train-error:0.16345
[14]	test-error:0.19264	train-error:0.16178
[15]	test-error:0.19307	train-error:0.16197
[16]	test-error:0.

In [21]:
# predict on dtest and classify based on a threshold of 0.6
y_pred = boost.predict(dtest)
y_pred[y_pred >= 0.6] = 1
y_pred[y_pred < 0.6] = 0

print (accuracy_score(y_pred, y_test))
print (f1_score(y_pred, y_test))
print (recall_score(y_pred, y_test))
print (precision_score(y_pred, y_test))

0.8056277056277056
0.2838915470494418
0.5393939393939394
0.19264069264069264


In [22]:
# now import test set without labels and make predictions
test = pd.read_csv('HW4 - test data.csv',header=0)
test = xgb.DMatrix(test)

y_pred_xgbm = boost.predict(test)

y_pred_xgbm = pd.DataFrame({'prediction(adopter)': y_pred_xgbm})
y_pred_xgbm

Unnamed: 0,prediction(adopter)
0,0.129160
1,0.028232
2,0.112173
3,0.015492
4,0.032708
...,...
86676,0.020865
86677,0.457763
86678,0.113139
86679,0.155981


## Ensemble 

In [24]:
# get user id
test = pd.read_csv('HW4 - test data.csv',header=0)
user_id = test[['user_id']]
user_id

Unnamed: 0,user_id
0,5
1,41
2,77
3,99
4,106
...,...
86676,1708912
86677,1708924
86678,1708946
86679,1708972


In [25]:
# rename
y_pred_lgbm = y_pred_lgbm.rename(columns={'prediction(adopter)':'LightGBM'})
y_pred_cgbm = y_pred_cgbm.rename(columns={'prediction(adopter)':'CatBoost'})
y_pred_xgbm = y_pred_xgbm.rename(columns={'prediction(adopter)':'XGBoost'})

# combine three sets of predictions and calculate average
ensemble = pd.concat([y_pred_lgbm, y_pred_cgbm, y_pred_xgbm], axis=1)
ensemble['Average'] = ensemble.mean(axis=1)
ensemble

Unnamed: 0,LightGBM,CatBoost,XGBoost,Average
0,0.005829,0.121667,0.129160,0.085552
1,0.007001,0.017169,0.028232,0.017467
2,0.141774,0.089084,0.112173,0.114344
3,0.000410,0.001871,0.015492,0.005924
4,0.000414,0.075810,0.032708,0.036311
...,...,...,...,...
86676,0.000778,0.057459,0.020865,0.026367
86677,0.843112,0.130759,0.457763,0.477212
86678,0.036194,0.089009,0.113139,0.079447
86679,0.319560,0.072103,0.155981,0.182548


In [26]:
# classify each prediction using threshold of 0.6
ensemble['prediction(adopter)'] = [1 if x>0.6 else 0 for x in ensemble['Average']]
ensemble

Unnamed: 0,LightGBM,CatBoost,XGBoost,Average,prediction(adopter)
0,0.005829,0.121667,0.129160,0.085552,0
1,0.007001,0.017169,0.028232,0.017467,0
2,0.141774,0.089084,0.112173,0.114344,0
3,0.000410,0.001871,0.015492,0.005924,0
4,0.000414,0.075810,0.032708,0.036311,0
...,...,...,...,...,...
86676,0.000778,0.057459,0.020865,0.026367,0
86677,0.843112,0.130759,0.457763,0.477212,0
86678,0.036194,0.089009,0.113139,0.079447,0
86679,0.319560,0.072103,0.155981,0.182548,0


In [27]:
# add user_id
ensemble = pd.concat([ensemble,user_id],axis=1)
ensemble

Unnamed: 0,LightGBM,CatBoost,XGBoost,Average,prediction(adopter),user_id
0,0.005829,0.121667,0.129160,0.085552,0,5
1,0.007001,0.017169,0.028232,0.017467,0,41
2,0.141774,0.089084,0.112173,0.114344,0,77
3,0.000410,0.001871,0.015492,0.005924,0,99
4,0.000414,0.075810,0.032708,0.036311,0,106
...,...,...,...,...,...,...
86676,0.000778,0.057459,0.020865,0.026367,0,1708912
86677,0.843112,0.130759,0.457763,0.477212,0,1708924
86678,0.036194,0.089009,0.113139,0.079447,0,1708946
86679,0.319560,0.072103,0.155981,0.182548,0,1708972


In [28]:
# dataframe of just user_id and predictions
final_preds = ensemble[['user_id','prediction(adopter)']]
final_preds

Unnamed: 0,user_id,prediction(adopter)
0,5,0
1,41,0
2,77,0
3,99,0
4,106,0
...,...,...
86676,1708912,0
86677,1708924,0
86678,1708946,0
86679,1708972,0


In [29]:
# save final predictions
final_preds.to_csv('final_preds.csv',index=False)