**1. LOAD REQUIRED PACKAGES**

In [1]:
from joblib import dump 
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin

In [2]:
%load_ext autoreload
%autoreload 2

**2. LOAD THE PROCESSED DATA**

In [3]:
from src.data.jason_sets import load_sets

train_scaled, target, val_scaled, val_target, test_scaled, test_target, test_id = load_sets()

print(train_scaled[0])
print(train_scaled.shape)
print(target[0:5])
print(test_scaled.shape)
print(test_id[0:5])     

[ 1.00610018  0.6405738   0.1234033   0.11395867  0.04709033  0.17726855
 -0.42837351 -0.48691065  0.18849336  0.65595335  0.76038672  0.07039208
  1.42837673 -0.12103694  0.26604009  1.16194866  1.1072419  -0.05507101
  0.47321012]
(8000, 19)
[1 1 1 1 1]
(3799, 19)
[   1 8194    3 8196 8197]


**3. USE SMOTE TO OVERSAMPE THE LOADED DATA**

In [4]:
oversample = SMOTE()
X_smote, y_smote = oversample.fit_resample(train_scaled, target)

**4. CALCULATE MODEL BENCHMARK**

In [5]:
from src.models.jason_null import NullModel
from src.models.jason_performance import print_class_perf

base_model = NullModel(target_type="classification")
base_target = base_model.fit_predict(target)

print_class_perf(y_preds=base_target, y_actuals=target, set_name='Training', average='macro')

Accuracy Training: 0.833625
ROC AUC Training: 0.5


**5. MODEL TUNNING**

**Load the custom SearchCV() and set up value for CV** <br>
Because the data is imbalanced, we use `StratifiedKfold` for accurate comparision when tuning hyperparameters.

In [6]:
from src.models.jason_tune_model import searchCV

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

**5A. Search best hyperparameters for Logistic Regression (LG)**

In [7]:
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

grid = dict(solver=solvers, penalty=penalty, C=c_values)
logistic = LogisticRegression()

searchCV(logistic, grid, train_scaled, target, kfold)

Best params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Training AUC score: 0.701


**Fit a tuned LG with the SMOTE data** <br>
Then make prediction and save the tuned model and its prediciton result to drive.

In [8]:
from src.models.jason_predict_model import save_prediction

logistic = LogisticRegression(C=0.1, penalty='l2', solver='liblinear')
logistic.fit(X_smote, y_smote)
dump(logistic,  '../models/jason_logistic_tuned_smote.joblib')

save_prediction(logistic, test_scaled, test_id, 'logistic_tuned_smote_pred')

**5B. Search best hyperparameters for Random Forest (RF)**

In [11]:
grid = {'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05],
        'min_samples_split':[2, 0.01, 0.02, 0.03, 0.04, 0.06, 0.08, 0.1],
        'min_samples_leaf':[1,2,4,6,8,10,20,30],
        'min_impurity_decrease':[0.0, 0.01, 0.05, 0.10, 0.15, 0.2],
        'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],
        'max_features':['auto', 0.8, 0.7, 0.6, 0.5, 0.4],
        'max_depth':[None,2,4,6,8,10,20]
}

rf = RandomForestClassifier(random_state=2, n_jobs=-1)
searchCV(rf, grid, train_scaled, target, kfold, random=True)

Best params: {'min_weight_fraction_leaf': 0.01, 'min_samples_split': 0.08, 'min_samples_leaf': 20, 'min_impurity_decrease': 0.0, 'max_leaf_nodes': None, 'max_features': 0.7, 'max_depth': 8}
Training AUC score: 0.700


**Fit a tuned RF with the SMOTE data** <br>
Then make prediction and save the tuned model and its prediciton result to drive.

In [12]:
rf = RandomForestClassifier(min_weight_fraction_leaf=0.01, min_samples_split=0.08, 
                            min_samples_leaf=20, min_impurity_decrease=0.0, 
                            max_leaf_nodes=None, max_features=0.7, max_depth=8)
rf.fit(X_smote, y_smote)
dump(rf,  '../models/jason_rf_tuned_smote.joblib')

save_prediction(logistic, test_scaled, test_id, 'rf_tuned_smote_pred')

**6. TESTING NEW HYPOTHESIS: eXTREME GRADIENT BOOSTING (XGB)**

**6A. Fit an XGB model and evaluate its performance**

In [13]:
xgboost_default = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgboost_default.fit(train_scaled, target)
dump(xgboost_default,  '../models/jason_xgboost_default.joblib')

acc_scores = cross_val_score(xgboost_default, train_scaled, target, scoring='accuracy', cv=kfold)
auc_scores = cross_val_score(xgboost_default, train_scaled, target, scoring='roc_auc', cv=kfold)

print("Training accuracy score: {:.3f}".format(acc_scores.mean()))
print("Training AUC score: {:.3f}".format(auc_scores.mean()))

Training accuracy score: 0.822
Training AUC score: 0.664


**6B. Search best hyperparameters for XGB**

**Define an objective function**

In [14]:
def objective(space):
    from sklearn.model_selection import cross_val_score
    
    xgboost = xgb.XGBClassifier(
        max_depth = int(space['max_depth']),
        learning_rate = space['learning_rate'],
        min_child_weight = space['min_child_weight'],
        subsample = space['subsample'],
        colsample_bytree = space['colsample_bytree'],
        use_label_encoder=False, eval_metric='logloss'      
    )
    
    acc = cross_val_score(xgboost, train_scaled, target, cv=kfold, scoring="accuracy").mean()

    return{'loss': 1-acc, 'status': STATUS_OK }

**Specify a search space**

In [15]:
space = {
    'max_depth' : hp.choice('max_depth', range(2, 20, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.05),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.05),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.05)
}

**Use `hypyeropt` to find the best hyperparameters for XGB**

In [16]:
best_hypers = fmin(
    fn=objective,   
    space=space,       
    algo=tpe.suggest,       
    max_evals=5
)

100%|██████████| 5/5 [00:11<00:00,  2.33s/trial, best loss: 0.16549999999999998]


In [17]:
print("Best Hpyerparameters: ", best_hypers)

Best Hpyerparameters:  {'colsample_bytree': 0.45, 'learning_rate': 0.05, 'max_depth': 2, 'min_child_weight': 10.0, 'subsample': 0.25}


**Fit the tuned XGB with SMOTE data**

In [19]:
xgb_tuned = xgb.XGBClassifier(
    max_depth = best_hypers['max_depth'],
    learning_rate = best_hypers['learning_rate'],
    min_child_weight = best_hypers['min_child_weight'],
    subsample = best_hypers['subsample'],
    colsample_bytree = best_hypers['colsample_bytree'],
    use_label_encoder=False, eval_metric='logloss'
)
xgb_tuned.fit(X_smote, y_smote)
dump(xgb_tuned,  '../models/jason_xgboost_tuned_smote.joblib')

acc_scores = cross_val_score(xgb_tuned, train_scaled, target, scoring='accuracy', cv=kfold)
auc_scores = cross_val_score(xgb_tuned, train_scaled, target, scoring='roc_auc', cv=kfold)

print("Training accuracy score: {:.3f}".format(acc_scores.mean()))
print("Training AUC score: {:.3f}".format(auc_scores.mean()))

save_prediction(xgb_tuned, test_scaled, test_id, 'rf_tuned_smote_pred')

Training accuracy score: 0.834
Training AUC score: 0.698
