In [100]:
import pandas as pd
import numpy as np 
import pickle
from datetime import datetime 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, f1_score ,classification_report 
from sklearn.ensemble import RandomForestClassifier   
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import cross_val_score  
from imblearn.over_sampling import SMOTE
import lightgbm as lgb  
from bayes_opt import BayesianOptimization


In [86]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py): started
  Building wheel for bayesian-optimization (setup.py): finished with status 'done'
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11689 sha256=86d168f4e216e7dae73fd92e6ccca8305f511e7c0aea82f99d62a4cea3f376a7
  Stored in directory: c:\users\kaue\appdata\local\pip\cache\wheels\fd\9b\71\f127d694e02eb40bcf18c7ae9613b88a6be4470f57a8528c5b
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
You should consider upgrading via the 'C:\ProgramData\Anaconda3\python.exe -m pip install --upgrade pip' command.


In [31]:
!pip install -U imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.7.0


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
You should consider upgrading via the 'C:\ProgramData\Anaconda3\python.exe -m pip install --upgrade pip' command.


In [28]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.1.1-py2.py3-none-win_amd64.whl (754 kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.1.1


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
You should consider upgrading via the 'C:\ProgramData\Anaconda3\python.exe -m pip install --upgrade pip' command.


# Create the baseline model ! 

In order to have a comparison of gain of performance, as we saw on the EDA part, above 5 transactions or less than a month of the last transactions have a higher rate of return, so that simple rule will be our model to beat: 

Baseline model: 

is_returning_customer = last_order < 1 month OR total_transaction > 5

In [58]:
#Loading Data  

ds1 = pd.read_csv("~/Reicarnation-Blues-Book/data/machine_learning_challenge_order_data.csv",low_memory=False)  

ds2 = pd.read_csv("~/Reicarnation-Blues-Book/data/machine_learning_challenge_labeled_data.csv",low_memory=False)  

final_ds = pd.read_csv('~/Reicarnation-Blues-Book/data/final_ds.csv')  

final_ds  =pd.merge(final_ds,ds2,how='left',on='customer_id')

final_ds.index = final_ds.customer_id  

final_ds =final_ds.drop('customer_id',axis=1)

ds = pd.merge(ds1,ds2,how='left',on='customer_id')   

In [88]:
#Split the data into train and test 

xtrain = final_ds.drop('is_returning_customer',axis=1)

labels = final_ds['is_returning_customer']

x, x_val, y, y_val = train_test_split(xtrain,labels,test_size=0.2,train_size=0.8)

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.25,train_size =0.75)

In [79]:
# Calculating the baseline model 

pred_train_baseline = np.where( (x_train['less_one_month_order_1'] ==1) |( x_train['total_transac_bigger_5_1']==1 ),1,0) 

print("Baseline train f1_score Score -> ",f1_score(pred_train_baseline, y_train,average='micro')*100) 

print('-*-'*20) 

pred_test_baseline = np.where( (x_test['less_one_month_order_1'] ==1) |( x_test['total_transac_bigger_5_1']==1 ),1,0) 

print("Baseline f1_score Score -> ",f1_score(pred_test_baseline, y_test,average='micro')*100)

Baseline train f1_score Score ->  79.09664364819078
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
Baseline f1_score Score ->  78.95948340836406


In [73]:
# Oversampling as the categories as unbalanced 

sm = SMOTE(random_state=42) 

x_train_res, y_train_res = sm.fit_resample(x_train, y_train) 


# Models! 

I will create try three of them without tunning, and the winner I will tune the hyperparameters, I will keep all the hyperparameters in the default mode, only the number of trees and max_iter I will set to 1000  
 - logistic regression in case the f1 is similar among the models,  
    it will be great due its interpretation  
 - Random Forest, great general algorithm, does not overfit with pruned branches and it is solid 
 - XGBoost, the microsoft version, the lightgbm as it is faster and equally or more accurate  
 
 https://towardsdatascience.com/catboost-vs-light-gbm-vs-xgboost-5f93620723db

In [80]:
# Let's start with the simpler model, logistic regression

lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(x_train,y_train)
print("Cross Val Score: ",cross_val_score(lr,x_train,y_train, scoring='f1_micro',cv=5).mean())
print('-*-'*20)
#predictions_test_lr = lr.predict(x_test)
# Use accuracy_score function to get the accuracy
#print("LR test F1 Score -> ",round(f1_score(predictions_test_lr, y_test,average='micro')*100),2)

print('-*-'*20) 

predictions_test_lr = lr.predict(x_test)
# Use accuracy_score function to get the accuracy
print("LR test F1 Score -> ",round(f1_score(predictions_test_lr, y_test,average='micro')*100),2) 

Cross Val Score:  0.8297990896547688
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
LR test F1 Score ->  83 2


In [82]:
# Second Model to test is the solid Randon Forest

rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(x_train,y_train)
#print("Cross Val Score: ",cross_val_score(rf,x_train_res,y_train_res, scoring='f1_weighted',cv=5).mean())
print('-*-'*20)


predictions_train_RF = rf.predict(x_train)
# Use accuracy_score function to get the accuracy
print("RF train F1 Score -> ",round(f1_score(predictions_train_RF, y_train,average='micro')*100),2)


print('-*-'*20)


predictions_test_RF = rf.predict(x_test)
# Use accuracy_score function to get the accuracy
print("RF test F1 Score -> ",round(f1_score(predictions_test_RF, y_test,average='micro')*100),2) 



-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
RF val F1 Score ->  99 2
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
RF test F1 Score ->  82 2


In [85]:
# Last but not least the Light XGBoot  

lgbm = lgb.LGBMClassifier(
    num_iterations =1000,
) 

lgbm.fit(x_train,y_train) 


predictions_train_lgbm = lgbm.predict(x_train)
# Use accuracy_score function to get the accuracy
print("RF train F1 Score -> ",round(f1_score(predictions_train_lgbm, y_train,average='micro')*100),2) 

print('-*-'*20)


predictions_test_lgbm = lgbm.predict(x_test)
# Use accuracy_score function to get the accuracy
print("RF test F1 Score -> ",round(f1_score(predictions_test_lgbm, y_test,average='micro')*100),2) 

print('-*-'*20)






RF val F1 Score ->  87 2
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
RF test F1 Score ->  83 2
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-



# LightGBM barely outperformed the baseline model,   
as it there was no prunning or minimal leave size, Random Forest overfitted a bit, 
So I will do a mini hypeparameter search to find the optial parameters 



In [None]:
%%time

def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=3, random_seed=6,n_estimators=10000, output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)
    # parameters
    def lgb_eval(learning_rate,num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf,min_sum_hessian_in_leaf,subsample):
        params = {'application':'binary', 'metric':'auc'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['max_bin'] = int(round(max_depth))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)
        
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])
     
    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.01, 1.0),
                                            'num_leaves': (24, 80),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 30),
                                            'max_bin':(20,90),
                                            'min_data_in_leaf': (20, 80),
                                            'min_sum_hessian_in_leaf':(0,100),
                                           'subsample': (0.01, 1.0)}, random_state=200)

    
    #n_iter: How many steps of bayesian optimization you want to perform. The more steps the more likely to find a good maximum you are.
    #init_points: How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.
    
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    model_auc=[]
    for model in range(len( lgbBO.res)):
        model_auc.append(lgbBO.res[model]['target'])
    
    # return best parameters
    return lgbBO.res[pd.Series(model_auc).idxmax()]['target'],lgbBO.res[pd.Series(model_auc).idxmax()]['params']

opt_params = bayes_parameter_opt_lgb(x, y, init_round=5, opt_round=10, n_folds=3, random_seed=6,n_estimators=10000)

In [96]:
opt_params[1]['num_leaves'] = int(round(opt_params[1]['num_leaves'])) 
opt_params[1]['max_depth'] = int(round(opt_params[1]['max_depth']))  
opt_params[1]['max_bin'] = int(round(opt_params[1]['max_bin'])) 
opt_params[1]['min_data_in_leaf'] = int(round(opt_params[1]['min_data_in_leaf'])) 

opt_params[1]

{'bagging_fraction': 0.8333129803667263,
 'feature_fraction': 0.7175451796385642,
 'learning_rate': 0.06082589049721443,
 'max_bin': 87,
 'max_depth': 25,
 'min_data_in_leaf': 63,
 'min_sum_hessian_in_leaf': 32.902989847268,
 'num_leaves': 30,
 'subsample': 0.2623276224265793}

In [98]:
lgbm = lgb.LGBMClassifier( num_iterations =1000,
    **opt_params[1]
) 

lgbm.fit(x_train,y_train) 


predictions_test_lgbm = lgbm.predict(x_test)
# Use accuracy_score function to get the accuracy
print("RF test F1 Score -> ",round(f1_score(predictions_test_lgbm, y_test,average='micro')*100),2) 

print('-*-'*20)


predictions_val_lgbm = lgbm.predict(x_val)
# Use accuracy_score function to get the accuracy
print("RF val F1 Score -> ",round(f1_score(predictions_val_lgbm, y_val,average='micro')*100),2) 

print('-*-'*20)




RF train F1 Score ->  84 2
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
RF test F1 Score ->  84 2
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-


# Final Step!
As the model looks stable I will train with the whole data in order to put inside a pickle

In [104]:
lgbm = lgb.LGBMClassifier( num_iterations =1000,
    **opt_params[1]
) 

lgbm.fit(xtrain,labels) 



LGBMClassifier(bagging_fraction=0.8333129803667263,
               feature_fraction=0.7175451796385642,
               learning_rate=0.06082589049721443, max_bin=87, max_depth=25,
               min_data_in_leaf=63, min_sum_hessian_in_leaf=32.902989847268,
               num_iterations=1000, num_leaves=30,
               subsample=0.2623276224265793)

In [105]:
filename = 'final_model.pkl'

with open(filename, 'wb') as file:
    pickle.dump(lgbm, file)

In [None]:
filename = 'final_model2.pkl'

with open(filename, 'wb') as file:
    pickle.dump(rf, file)