In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split,KFold,RandomizedSearchCV,GridSearchCV
from sklearn import metrics,svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import lightgbm as lgb
from matplotlib.pyplot import figure
from sklearn.metrics import recall_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
import time

### ML Modeling Overview ###
In this section, I am going to test our various ML models, such as LightGBM, XGBoost, and Random Forest. To find out the best performing ML model, I will furst perform **nested random search** to find the overall performance model. Next, I will use **random search** to tune the hyperparameters of the best model. Lastly, I will evaluate the final model's performance on test data.

### Step1: Load the train and test data ###

In [2]:
train = pd.read_csv("/Users/haochunniu/Desktop/Kaggle Compatition/Bank Term Deposit Predictions NN ML Nested Random Search/data/train_preprocessed.csv")
test = pd.read_csv("/Users/haochunniu/Desktop/Kaggle Compatition/Bank Term Deposit Predictions NN ML Nested Random Search/data/test_preprocessed.csv")

y_train, y_test = train['y'], test['y']
x_train = train.drop(columns=['y'])
x_test = test.drop(columns=['y'])

### Step2: Compute class weight ###
Given that the data is extremely imbalanced ( y = 1 : 11.7%, y = 0 : 88.3% ), we need to modify the class weight to emphasize more on the classification ability on the minority ( y = 1 ).

In [3]:
class_weights = compute_class_weight(class_weight = 'balanced',classes = np.unique(y_train),y = y_train)
class_weights = dict(zip(np.unique(y_train), class_weights))
print(class_weights)

{0: 0.566241671258955, 1: 4.274059368500661}


In [4]:
# Unlike other models, XGBoost model use sample weights not class weights. So, this section is used to calculate sample weights
xgb_class_weights = class_weight.compute_sample_weight(class_weight='balanced',y=y_train)
print(xgb_class_weights)

[0.56624167 0.56624167 0.56624167 ... 4.27405937 0.56624167 0.56624167]


### Step3: Nested random search to find the overall best model ###
To find the overall best model, I use the nested random search method to find out which model with hyperparameter can be different on each cross validation split would be the best. Given that our data is inbalanced, I used **F1 score** as evalustion metrix.

In [17]:
# 1. Create the Classifier
xgb=XGBClassifier(objective="binary:logistic",seed=9,use_label_encoder =False,
                  scale_pos_weight=(88.3/11.7))

rf=RandomForestClassifier(random_state=9,class_weight=class_weights)

log = LogisticRegression(class_weight=class_weights,max_iter=500,random_state=9)

lgbm=lgb.LGBMClassifier(objective='binary',random_state=9,class_weight=class_weights)

##############################################################
# 2. Create the parameter grid
xgb_grid={'eta':np.arange(0.1,3,0.1),
          'max_depth':list(range(3,10)),
          'n_estimators':list(range(10,100,10)),
          'gamma':list(range(1,6)) }

rf_grid={'n_estimators':list(range(100,500,100)),
         'max_depth':list(range(3,10))}

log_grid={'penalty':['l2','none'],
          'C':[0.1,0.5,1.0,3.0,5.0]}

lgbm_grid={'learning_rate':np.arange(0.1,3,0.1),
           'max_depth':list(range(3,10)),
           'n_estimators':list(range(10,100,10))}

##############################################################
# 3. Create the CV
inner_cv = KFold(n_splits=3, shuffle=True, random_state=9)
outer_cv = KFold(n_splits=3, shuffle=True, random_state=9)

##############################################################
#In this case, because we are dealing with multi-class classification problem, we need to select a method to average our scoring metrics for the CVs
#So, in this case, we use both weighted f1 for both outer and inner CV
# 4-1-1. Random-search CV for XGBoost
t1 = time.time()
print("XGBoost start")
clf = RandomizedSearchCV(xgb,xgb_grid,cv=inner_cv,scoring='f1_weighted',n_iter=15,random_state=9)

# 4-1-2. Nested CV for XGBoost
nested_score = cross_val_score(clf,X=x_train, y=y_train, cv=outer_cv,scoring='f1_weighted') 

# 4-1-3. Result for Nested CV
xgb_result=nested_score.mean()
t2 = time.time()
print("XGboost ends and take {} sec.".format(round(t2-t1)))

##############################################################
# 4-2-1. Random-search CV for Random Forest
t1 = time.time()
print("Random Forest start")
clf = RandomizedSearchCV(rf,rf_grid,cv=inner_cv,scoring='f1_weighted',n_iter=15,random_state=9)

# 4-2-2. Nested CV for Random Forest
nested_score = cross_val_score(clf,X=x_train, y=y_train, cv=outer_cv,scoring='f1_weighted')

# 4-2-3. Result for Nested CV
rf_result=nested_score.mean()
t2 = time.time()
print("Random Forest ends and take {} sec.".format(round(t2-t1)))

##############################################################
# 4-3-1. Random-search CV for Logistic Classifier
t1 = time.time()
print("Logistic Regression start")
clf = RandomizedSearchCV(log,log_grid,cv=inner_cv,scoring='f1_weighted',n_iter=4,random_state=9)

# 4-3-2. Nested CV for LightGBM Classifier
nested_score = cross_val_score(clf,X=x_train, y=y_train, cv=outer_cv,scoring='f1_weighted')

# 4-3-3. Result for Nested CV
log_result=nested_score.mean()
t2 = time.time()
print("Logistic Regression ends and take {} sec.".format(round(t2-t1)))

##############################################################
# 4-4-1. Random-search CV for LightGBM Classifier
t1 = time.time()
print("LightGBM start")
clf = RandomizedSearchCV(lgbm,lgbm_grid,cv=inner_cv,scoring='f1_weighted',n_iter=15,random_state=9)

# 4-4-2. Nested CV for LightGBM Classifier
nested_score = cross_val_score(clf,X=x_train, y=y_train, cv=outer_cv,scoring='f1_weighted')

# 4-4-3. Result for Nested CV
lgbm_result=nested_score.mean()
t2 = time.time()
print("LightGBM ends and take {} sec.".format(round(t2-t1)))

##############################################################
print("XGBoost Nested CV F1 weighted:",round(xgb_result*100,2),"%")
print("Random Forest Nested CV F1 weighted:",round(rf_result*100,2),"%")
print("Logistic Regression Nested CV F1 weighted:",round(log_result*100,2),"%")
print("LightGBM Nested CV F1 weighted:",round(lgbm_result*100,2),"%")

XGBoost start
XGboost ends and take 63 sec.
Random Forest start
Random Forest ends and take 293 sec.
Logistic Regression start




Logistic Regression ends and take 6 sec.
LightGBM start
LightGBM ends and take 14 sec.
XGBoost Nested CV F1 weighted: 86.81 %
Random Forest Nested CV F1 weighted: 85.16 %
Logistic Regression Nested CV F1 weighted: 85.27 %
LightGBM Nested CV F1 weighted: 85.96 %


### Step4: Random search to find the best XGBoost model ###
To find the best XGboost model, I use the random search method to find out which hyperparameter combination is the bast for XGBoost model. Given that our data is inbalanced, I used **F1 score** as evalustion metrix.

In [5]:
# 1. Create the Classifier
xgb=XGBClassifier(objective="binary:logistic",seed=9,use_label_encoder =False,
                  scale_pos_weight=(88.3/11.7))

##############################################################
# 2. Create the parameter grid
xgb_grid={'eta':np.arange(0.1,3,0.1),
          'max_depth':list(range(3,10)),
          'n_estimators':list(range(10,100,10)),
          'gamma':list(range(1,6)) }

##############################################################
# 3. Start the random search
xgb_model = RandomizedSearchCV(xgb,xgb_grid,cv=3,scoring='f1_weighted',n_iter=15,random_state=9)

# 4. Fit the model
xgb_model.fit(x_train,y_train,sample_weight=xgb_class_weights)

# 5. Predict
y_pred=xgb_model.predict(x_test)



In [6]:
# 6. Result
print ("With CV random search, I found the best hyperparameter is eta={}, max_depth={}, n_estimators={}, and gamma={}.".format(xgb_model.best_params_['eta'],
                                                                                                                               xgb_model.best_params_['max_depth'],
                                                                                                                               xgb_model.best_params_['n_estimators'],
                                                                                                                               xgb_model.best_params_['gamma']))
print('----------------------------------------------------------------------------------------------------------------')
print(classification_report(y_test,y_pred))

With CV random search, I found the best hyperparameter is eta=1.5000000000000002, max_depth=9, n_estimators=60, and gamma=2.
----------------------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      4000
           1       0.80      1.00      0.89       521

    accuracy                           0.97      4521
   macro avg       0.90      0.98      0.94      4521
weighted avg       0.98      0.97      0.97      4521



In [7]:
#Save the final XGboost model
import joblib
joblib.dump(xgb_model.best_estimator_, 'Final_XGBoost.pkl')

#Load the model
#xgb_model = joblib.load("Final_XGBoost.pkl")

['Final_XGBoost.pkl']