# Phase 3-2: Training and Testing for $t=2$
* Model : Stacking

## Setup

First, let's import a few common modules.

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
import random
seed_val = 43
np.random.seed(seed_val)
random.seed(seed_val)

## Data Loading

In [2]:
seed = 42
df = pd.read_csv(f"datasets/train_set_{seed}_t_2.csv")

In [3]:
X = df.drop(['is_drop'], axis=1)
y = df['is_drop'].copy()
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25406 entries, 0 to 25405
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index_o            25406 non-null  int64  
 1   year               25406 non-null  int64  
 2   semester           25406 non-null  int64  
 3   grade              25406 non-null  int64  
 4   sex                25406 non-null  object 
 5   gpa_last_seme      25406 non-null  float64
 6   credits_last_seme  25406 non-null  float64
 7   credits_tot        25406 non-null  float64
 8   n_seme             25406 non-null  int64  
 9   years_since        25406 non-null  int64  
 10  college            25406 non-null  object 
 11  adm_unit           25406 non-null  int64  
 12  nation             25406 non-null  int64  
 13  in_capa            25406 non-null  bool   
 14  leave              25406 non-null  bool   
dtypes: bool(2), float64(3), int64(8), object(2)
memory usage: 2.6+ MB


## Data Transformation

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attrs = [ 'grade', 'gpa_last_seme', 'credits_last_seme', 'credits_tot', 'n_seme', 'years_since']
cat_attrs = ['semester', 'sex', 'adm_unit', 'nation', 'in_capa', 'college', 'leave']

full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attrs),
    ('cat', OneHotEncoder(), cat_attrs)
])

cat_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attrs),
    ('cat', 'passthrough', cat_attrs)
])

## Model Evaluation


In [5]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_predict, cross_validate

cat_indices = [6, 7, 8, 9, 10, 11, 12]

estimators = [
    ('svm', make_pipeline(full_pipeline, SVC(C=10, gamma=0.1, random_state=42))),
    ('mlp', make_pipeline(full_pipeline, MLPClassifier(max_iter=10_000, hidden_layer_sizes=[10,10,10], random_state=42))),
    ('xgb', make_pipeline(full_pipeline, XGBClassifier(max_depth=4, n_estimators=70))),
    ('lgbm', make_pipeline(full_pipeline, LGBMClassifier(max_depth=8, n_estimators=100))),
    ('cat', make_pipeline(cat_pipeline, CatBoostClassifier(cat_features=cat_indices))),
]
stk_clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(C=10)
)

In [6]:
from sklearn.metrics import (
    precision_recall_curve, recall_score, precision_score, f1_score,
    average_precision_score, roc_auc_score, matthews_corrcoef
)
from pprint import pprint

# function to evaluate a classifier with 5-fold CV with "thereshold optimization".
def run_model(clf, X_train, y_train): 
    res = cross_validate(clf, X_train, y_train, cv=5, scoring=('average_precision', 'roc_auc', 'recall', 'precision', 'f1', 'matthews_corrcoef'))

    auc = res['test_roc_auc'].mean()
    ap = res['test_average_precision'].mean()
    recall_no = res['test_recall'].mean()
    preci_no = res['test_precision'].mean()
    f1_no = res['test_f1'].mean()
    mcc_no = res['test_matthews_corrcoef'].mean()

    # Threshold optimization
    if (type(clf).__name__ == 'SVC'):
            y_score = cross_val_predict(clf, X_train, y_train, cv=5, method='decision_function')
    else: 
        y_score = cross_val_predict(clf, X_train, y_train, cv=5, method='predict_proba')[:, 1]
    
    precision, recall, thresholds = precision_recall_curve(y_train, y_score)

    # Maximizing F1-score
    f1_val = (2 * precision * recall) / (precision + recall)
    f1_val[np.isinf(f1_val)] = np.nan

    i = np.nanargmax(f1_val)
    best_thre = thresholds[i]

    y_pred = (y_score >= best_thre)
    recall_opt = recall_score(y_train, y_pred)
    preci_opt = precision_score(y_train, y_pred)
    f1_opt = f1_score(y_train, y_pred)
    mcc_opt = matthews_corrcoef(y_train, y_pred)

    res = {'ap': ap, 'auc': auc, 'recall': recall_no, 'precision': preci_no, 
           'f1': f1_no, 'mcc': mcc_no, 'recall(opt)': recall_opt, 'precision(opt)': preci_opt,
           'f1(opt)': f1_opt, 'mcc(opt)': mcc_opt, 'threshold': best_thre}
    return res

Run the evaluation and optimize the threshold for the stacking model.

In [7]:
names = ['stk']

cvres_stk = {}

for name in names:
    clf = globals()[f'{name}_clf']
    globals()[f'cvres_{name}'] = run_model(clf, X, y)
    

[LightGBM] [Info] Number of positive: 1051, number of negative: 19273
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002743 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 502
[LightGBM] [Info] Number of data points in the train set: 20324, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.051712 -> initscore=-2.908963
[LightGBM] [Info] Start training from score -2.908963
Learning rate set to 0.037278
0:	learn: 0.6317425	total: 241ms	remaining: 4m
1:	learn: 0.5732797	total: 304ms	remaining: 2m 31s
2:	learn: 0.5289949	total: 366ms	remaining: 2m 1s
3:	learn: 0.4876714	total: 425ms	remaining: 1m 45s
4:	learn: 0.4522159	total: 491ms	remaining: 1m 37s
5:	learn: 0.4217353	total: 555ms	remaining: 1m 31s
6:	learn: 0.3986326	total: 595ms	remaining: 1m 24s
7:	learn: 0.3748378	total: 657ms	remaining: 1m 21s
8:	learn: 0

Results of 5-fold CV on the training set.

In [8]:
for name in names:
    clf = globals()[f'{name}_clf']
    print("model: ", type(clf).__name__)
    pprint(globals()[f'cvres_{name}'], sort_dicts=False)
    print()

model:  StackingClassifier
{'ap': 0.49933436848468454,
 'auc': 0.8971655639070277,
 'recall': 0.2930078657881752,
 'precision': 0.6736075668618744,
 'f1': 0.4081096463247714,
 'mcc': 0.4258382372768841,
 'recall(opt)': 0.5197869101978692,
 'precision(opt)': 0.5059259259259259,
 'f1(opt)': 0.5127627627627628,
 'mcc(opt)': 0.48585882264424435,
 'threshold': 0.13460067193714875}



## Model Training and Test

In [9]:
stk_clf.fit(X, y)

[LightGBM] [Info] Number of positive: 1314, number of negative: 24092
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001641 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 504
[LightGBM] [Info] Number of data points in the train set: 25406, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.051720 -> initscore=-2.908804
[LightGBM] [Info] Start training from score -2.908804
Learning rate set to 0.041005
0:	learn: 0.6263778	total: 84.5ms	remaining: 1m 24s
1:	learn: 0.5705002	total: 161ms	remaining: 1m 20s
2:	learn: 0.5159115	total: 242ms	remaining: 1m 20s
3:	learn: 0.4696864	total: 326ms	remaining: 1m 21s
4:	learn: 0.4308835	total: 422ms	remaining: 1m 24s
5:	learn: 0.3955065	total: 505ms	remaining: 1m 23s
6:	learn: 0.3721400	total: 579ms	remaining: 1m 22s
7:	learn: 0.3492917	total: 647ms	remaining: 1m 20s
8:	le

In [10]:
import joblib

model_dict = {
    'model': stk_clf,
    'threshold': cvres_stk['threshold']
}
joblib.dump(model_dict, f'models/stack_1_2.pkl')


['models/stack_1_2.pkl']

In [11]:
test_set = pd.read_csv(f"datasets/test_set_{seed}_t_2.csv")

X_te = test_set.drop(['is_drop'], axis=1)
y_te = test_set['is_drop'].copy()

In [12]:
# function to evaluate the performance with test set and given threshold

def test_model(clf, X_test, y_test, thre): 
    if (type(clf).__name__ == 'SVC'):
        y_score = clf.decision_function(X_test)
    else: y_score = clf.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_score)
    ap = average_precision_score(y_test, y_score)

    if (type(clf).__name__ == 'SVC'):
        y_pred = (y_score >= 0)
    else: y_pred = (y_score >= 0.5)
    recall_no = recall_score(y_test, y_pred)
    preci_no = precision_score(y_test, y_pred)
    f1_no = f1_score(y_test, y_pred)
    mcc_no = matthews_corrcoef(y_test, y_pred)

    # Optimizing threshold by maximizing F1-score
    y_pred = (y_score >= thre)
    recall_opt = recall_score(y_test, y_pred)
    preci_opt = precision_score(y_test, y_pred)
    f1_opt = f1_score(y_test, y_pred)
    mcc_opt = matthews_corrcoef(y_test, y_pred)


    res = {'auc': auc, 'ap': ap, 'recall': recall_no, 'precision': preci_no, 
           'f1': f1_no, 'mcc_no': mcc_no, 'recall(opt)': recall_opt, 
           'precision(opt)': preci_opt, 'f1(opt)': f1_opt, 'mcc(opt)': mcc_opt,
           'threshold': thre}
    print("model: ", type(clf).__name__)
    pprint(res, sort_dicts=False)
    print()
    return res

In [13]:
# Now we test the six models with the test set and given thresholds.

test_stk = {}

test_stk = test_model(stk_clf, X_te, y_te, cvres_stk['threshold'])

model:  StackingClassifier
{'auc': 0.9004465981033236,
 'ap': 0.48337223358392656,
 'recall': 0.29559748427672955,
 'precision': 0.6666666666666666,
 'f1': 0.4095860566448802,
 'mcc_no': 0.4260512453200593,
 'recall(opt)': 0.5094339622641509,
 'precision(opt)': 0.48502994011976047,
 'f1(opt)': 0.49693251533742333,
 'mcc(opt)': 0.4699411582045685,
 'threshold': 0.13460067193714875}

