# Predictive Model for Mailout Campaign
    (1 GridSearchCV for hyper-parameter tunning
    (2 Model Evaluation

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

In [2]:
from model_pipeline import (
    clean_data,
    build_model
)

## Data Preprocessing

In [3]:
#gather train dataset to train model
mailout_train = pd.read_csv('data/Udacity_MAILOUT_052018_TRAIN.csv', sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# use method in data_wranggling part to clean data
mailout_train_clean = clean_data(mailout_train)

Dataset has 0 missing value.


In [5]:
X = mailout_train_clean[[col for col in mailout_train_clean.columns if col not in ['LNR', 'RESPONSE' ]]]
y = mailout_train_clean['RESPONSE']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Handle imbalanced dataset

In [25]:
y_train.mean() # highly imbalanced

0.012263757643135075

In [7]:
# Use SMOTE to increase positive sample
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

## GridSearchCV for Hyper-parameter tunning
    - the pipeline will apply StandardScaler(), PCA(n_components=100) and XGBClassifier()
    - tunning paramter to reduce overfitting: gamma, max_depth, min_child_weight, subsample,colsample

In [8]:
model = build_model()
model.fit(X_train_sm, y_train_sm)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 96.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 174.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 290.9min
[Parallel(n_jobs=-1)]: Done 1280 out of 1280 | elapsed: 303.9min finished


Parameters: { colsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('pca', PCA(n_components=100)),
                                       ('clf',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                    

In [9]:
# Check best estimator
model.best_estimator_._final_estimator

XGBClassifier(base_score=0.5, booster='gbtree', colsample=0.8,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0.7, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.7, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [10]:
# Spend 18 hours to train!!!!!
# Save the model for reusing!!!
with open('clf_mdl.plk', 'wb') as file:
    pickle.dump(model, file)

In [11]:
model = pickle.load(open('clf_mdl.plk', 'rb'))

In [12]:
# check the top candidate model
pd.DataFrame(model.cv_results_).sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__colsample,param_clf__gamma,param_clf__max_depth,param_clf__min_child_weight,param_clf__subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
112,129.897083,9.182618,0.611651,0.121948,0.8,0.7,6,1,0.7,"{'clf__colsample': 0.8, 'clf__gamma': 0.7, 'cl...",0.989788,0.992558,0.992241,0.991855,0.992472,0.991783,0.001027,1
241,107.762462,0.391252,0.335711,0.009583,1.0,0.7,6,1,0.8,"{'clf__colsample': 1.0, 'clf__gamma': 0.7, 'cl...",0.989877,0.99186,0.991954,0.991968,0.992925,0.991717,0.000998,2
49,106.064866,0.769424,0.347098,0.012114,0.7,0.7,6,1,0.8,"{'clf__colsample': 0.7, 'clf__gamma': 0.7, 'cl...",0.989359,0.991737,0.992075,0.991952,0.992742,0.991573,0.001157,3
240,102.888601,0.290034,0.381831,0.052195,1.0,0.7,6,1,0.7,"{'clf__colsample': 1.0, 'clf__gamma': 0.7, 'cl...",0.989377,0.992715,0.991721,0.991276,0.992753,0.991568,0.001235,4
55,138.497884,10.476247,0.349164,0.024876,0.7,0.7,6,2,1.0,"{'clf__colsample': 0.7, 'clf__gamma': 0.7, 'cl...",0.989715,0.991834,0.992072,0.991459,0.992748,0.991566,0.001016,5


## Model Evaluation
    - tn, fp, fn, tp
    - auc score

### Insample Test

In [13]:
y_train_pred = model.predict(X_train)
tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()

In [14]:
tn, fp, fn, tp

(26378, 2053, 25, 328)

In [15]:
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn)/ (tn+ fp+ fn+ tp)
precision, recall, accuracy

(0.1377572448551029, 0.9291784702549575, 0.9278071150639244)

In [16]:
fpr, tpr, thresholds = roc_curve(y_train, y_train_pred, pos_label=1)
roc_auc = auc(fpr, tpr)
roc_auc

0.9284842792694363

### Out-of-sample Test

In [17]:
y_test_pred = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()

In [18]:
tn, fp, fn, tp

(12910, 1089, 161, 18)

In [19]:
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn)/ (tn+ fp+ fn+ tp)
precision, recall, accuracy

(0.016260162601626018, 0.1005586592178771, 0.9118352376921992)

In [20]:
fpr, tpr, thresholds = roc_curve(y_test, y_test_pred, pos_label=1)
roc_auc = auc(fpr, tpr)
roc_auc 

0.5113836942064098

**High in sample test score but low out-of-sample score shows overfitting  still significant in the model.**

### Feature importance

In [25]:
feature_importance = pd.DataFrame({
    'pca': range(100),
    'Feature importance': model.best_estimator_._final_estimator.feature_importances_
})

In [26]:
feature_importance.sort_values(by='Feature importance', ascending=False).head(10)

Unnamed: 0,pca,Feature importance
32,32,0.039935
18,18,0.029981
30,30,0.027008
99,99,0.023832
92,92,0.022611
94,94,0.022518
34,34,0.02229
87,87,0.022284
79,79,0.021951
93,93,0.021529


**The variable with max variance is not in top features, this may be the reason of overfitting**