# GBDT Modeling

# Import package and data

In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss,normalized_mutual_info_score,roc_auc_score,recall_score,f1_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn import preprocessing
#from imblearn.over_sampling import SMOTE
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import make_scorer,fbeta_score
f2_score=make_scorer(fbeta_score,beta=2)

In [6]:
data_jan=pd.read_csv('Jan_data.csv',index_col=0)
data_feb=pd.read_csv('Feb_data.csv',index_col=0)

In [7]:
X_train1=data_jan.drop('is_churn',1)
y_train=data_jan['is_churn']
scaler=preprocessing.StandardScaler()
X_train=scaler.fit_transform(X_train1)
X_train=pd.DataFrame(X_train,columns=X_train1.columns,index=X_train1.index)

In [9]:
X_val1, X_test1, y_val, y_test = train_test_split(data_feb.drop('is_churn',1),data_feb.is_churn,test_size=0.5)
X_val=scaler.fit_transform(X_val1)
X_test=scaler.fit_transform(X_test1)
X_val=pd.DataFrame(X_val,columns=X_val1.columns,index=X_val1.index)
X_test=pd.DataFrame(X_test,columns=X_test1.columns,index=X_test1.index)

In [11]:
from sklearn.model_selection import PredefinedSplit
X=X_train.append(X_val,ignore_index=True)
y=pd.concat([y_train,y_val])

#split_index = [-1 if x in X_train.index else 0]
split_index=np.zeros(len(X),dtype=int)
split_index[:len(X_train)]=-1
split_index=split_index.tolist()
# Use the list to create PredefinedSplit
pds = PredefinedSplit(test_fold = split_index)

# Fit Model in Default Hyperparameters

In [56]:
gbdt_clf1 = GradientBoostingClassifier()
gbdt_clf1.fit(X_train,y_train)
y_pred=gbdt_clf1.predict(X_val)
fbeta_score(y_val,y_pred)

0.23271328125551793

# Tuning n_estimator &max depth

In [14]:
def gd_search_zero(X,y):
    gbdt_clf0 = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300,
                                       min_samples_leaf=20,max_depth=8, subsample=0.8,random_state=10)
    param0=dict(n_estimators = [40,60,80,100], max_depth = [5,7,9])
    gd_clf0 = GridSearchCV(estimator=gbdt_clf0, cv=pds, param_grid=param0, 
                                n_jobs=-1,scoring=f2_score,verbose=10)
    gd_clf0.fit(X,y)
    best_param=gd_clf0.best_params_
    cv_results=gd_clf0.cv_results_
    return gd_clf0,best_param,cv_results

In [1]:
clf0,param0,cv_results0=gd_search_zero(X,y)

In [21]:
param0

{'max_depth': 9, 'n_estimators': 100}

# Tuning tree-related parameters

In [11]:
def gd_search2(X,y):
    gbdt_clf2 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,
                                        subsample=0.8,max_depth = 9, random_state=10)
    param2= {'min_samples_split':range(100,801,200),'min_samples_leaf':[20,40,60,80]}  
    gd_clf2 = GridSearchCV(estimator=gbdt_clf2, cv=pds, param_grid=param2, 
                                n_jobs=-1,scoring='neg_log_loss',verbose=10)
    gd_clf2.fit(X,y)
    best_param=gd_clf2.best_params_
    cv_results=gd_clf2.cv_results_
    return gd_clf2,best_param,cv_results

In [12]:
clf2,param2,cv_results2=gd_search2(X,y)

Fitting 1 folds for each of 16 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  16 | elapsed: 104.5min remaining: 452.9min
[Parallel(n_jobs=-1)]: Done   5 out of  16 | elapsed: 104.8min remaining: 230.5min
[Parallel(n_jobs=-1)]: Done   7 out of  16 | elapsed: 104.8min remaining: 134.8min
[Parallel(n_jobs=-1)]: Done   9 out of  16 | elapsed: 105.4min remaining: 82.0min
[Parallel(n_jobs=-1)]: Done  11 out of  16 | elapsed: 105.6min remaining: 48.0min
[Parallel(n_jobs=-1)]: Done  13 out of  16 | elapsed: 105.6min remaining: 24.4min
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed: 108.5min finished


In [13]:
param2

{'min_samples_leaf': 80, 'min_samples_split': 700}