# Robust Test for $t=2$ on temporal splits
* Models: SVM, MLP, LightGBM, XGBoost, and CatBoost

## Setup

In [82]:
# Python â‰¥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
import random
seed_val = 43
np.random.seed(seed_val)
random.seed(seed_val)

## Data Loading

In [83]:
seed = 42
df = pd.read_csv(f"datasets/train_set_t_2_tempo.csv")

In [84]:
X = df.drop(['state_now', 'state_next_1', 'state_next_2', 'state_next_3', 'is_drop'], axis=1)
y = df['is_drop'].copy()
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15764 entries, 0 to 15763
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index_o            15764 non-null  int64  
 1   year               15764 non-null  int64  
 2   semester           15764 non-null  int64  
 3   grade              15764 non-null  int64  
 4   sex                15764 non-null  object 
 5   gpa_last_seme      15764 non-null  float64
 6   credits_last_seme  15764 non-null  float64
 7   credits_tot        15764 non-null  float64
 8   n_seme             15764 non-null  int64  
 9   years_since        15764 non-null  int64  
 10  college            15764 non-null  object 
 11  adm_unit           15764 non-null  int64  
 12  nation             15764 non-null  int64  
 13  in_capa            15764 non-null  bool   
 14  leave              15764 non-null  bool   
dtypes: bool(2), float64(3), int64(8), object(2)
memory usage: 1.6+ MB


## Data Transformation

In [85]:
# Label encoding for categorical features
def sex_mapping(sex):
    sex_map = {'M': 0, 'F': 1}
    return sex_map[sex]

def seme_mapping(seme):
    seme_map = {1: 0, 2: 1}
    return seme_map[seme]

def college_mapping(college):
    college_map = {'TH': 0, 'HS': 1, 'BZ': 2, 'HT': 3, 'EG': 4, 'SW': 5, 'AT': 6}
    return college_map[college]


In [86]:
# Label encoding for categrical features
def label_encoding(data):
    data['sex'] = data['sex'].map(sex_mapping)
    data['semester'] = data['semester'].map(seme_mapping)
    data['college'] = data['college'].map(college_mapping)

In [87]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attrs = ['grade', 'gpa_last_seme', 'credits_last_seme', 'credits_tot', 'n_seme', 'years_since']
cat_attrs = ['semester', 'sex', 'adm_unit', 'nation', 'in_capa', 'college', 'leave']

full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attrs),
    ('cat', OneHotEncoder(), cat_attrs)
])

cat_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attrs),
    ('cat', 'passthrough', cat_attrs)
])

X_t = full_pipeline.fit_transform(X) # for SVM and MLP with one-hot encoding
X_cat = X.copy()    # for CatBoost without encoding
label_encoding(X)   # for XGBoost and LightGBM with label encoding
X_xgb = cat_pipeline.fit_transform(X)
X_cat = cat_pipeline.fit_transform(X) # for CatBoost

X = pd.DataFrame(X_t, columns=full_pipeline.get_feature_names_out())    # for SVM and MLP
X_xgb = pd.DataFrame(X_xgb, columns=cat_pipeline.get_feature_names_out())    # for XGBoost and LightGBM
X_cat = pd.DataFrame(X_cat, columns=cat_pipeline.get_feature_names_out())    # for CatBoost
cat_indices = [6, 7, 8, 9, 10, 11, 12]


In [88]:

# Typesetting
num_cols = ['num__grade', 'num__gpa_last_seme', 'num__credits_last_seme', 'num__credits_tot', 'num__n_seme', 'num__years_since']
cat_cols = list(set(X_cat.columns) - set(num_cols))

X_cat[num_cols] = X_cat[num_cols].astype(float)
X_cat[cat_cols] = X_cat[cat_cols].astype(int)
X_xgb[num_cols] = X_xgb[num_cols].astype(float)
X_xgb[cat_cols] = X_xgb[cat_cols].astype(int)

In [89]:
X_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15764 entries, 0 to 15763
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   num__grade              15764 non-null  float64
 1   num__gpa_last_seme      15764 non-null  float64
 2   num__credits_last_seme  15764 non-null  float64
 3   num__credits_tot        15764 non-null  float64
 4   num__n_seme             15764 non-null  float64
 5   num__years_since        15764 non-null  float64
 6   cat__semester           15764 non-null  int32  
 7   cat__sex                15764 non-null  int32  
 8   cat__adm_unit           15764 non-null  int32  
 9   cat__nation             15764 non-null  int32  
 10  cat__in_capa            15764 non-null  int32  
 11  cat__college            15764 non-null  int32  
 12  cat__leave              15764 non-null  int32  
dtypes: float64(6), int32(7)
memory usage: 1.1 MB


In [90]:
X_cat.rename(columns={'num__grade': 'year', 'num__gpa_last_seme': 'gpa_last', 'num__credits_last_seme': 'credits_last',
                  'num__credits_tot': 'credits_tot', 'num__n_seme': 'n_semesters', 'num__years_since': 'years_since',
                  'cat__semester': 'release', 'cat__sex': 'sex', 'cat__adm_unit': 'adm_type', 'cat__nation': 'nation',
                  'cat__in_capa': 'in_capa', 'cat__college': 'college', 'cat__leave': 'status'}, inplace=True)


In [91]:
X_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15764 entries, 0 to 15763
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          15764 non-null  float64
 1   gpa_last      15764 non-null  float64
 2   credits_last  15764 non-null  float64
 3   credits_tot   15764 non-null  float64
 4   n_semesters   15764 non-null  float64
 5   years_since   15764 non-null  float64
 6   release       15764 non-null  int32  
 7   sex           15764 non-null  int32  
 8   adm_type      15764 non-null  int32  
 9   nation        15764 non-null  int32  
 10  in_capa       15764 non-null  int32  
 11  college       15764 non-null  int32  
 12  status        15764 non-null  int32  
dtypes: float64(6), int32(7)
memory usage: 1.1 MB


In [103]:
X_cat.head()

Unnamed: 0,year,gpa_last,credits_last,credits_tot,n_semesters,years_since,release,sex,adm_type,nation,in_capa,college,status
0,1.193714,0.996123,0.601744,1.311427,0.868065,2.144077,0,0,0,0,1,2,0
1,-0.744857,0.558007,0.761871,-0.679349,-0.90186,0.071617,0,0,0,0,0,4,0
2,-0.744857,-0.136934,0.601744,-0.838612,-0.90186,0.071617,0,0,0,0,1,4,0
3,-0.744857,0.965908,0.441618,-0.785524,-0.90186,0.071617,0,0,0,0,1,4,0
4,-0.744857,-2.342619,-2.44066,-0.838612,-0.90186,0.071617,0,0,0,0,1,4,0


## 5-fold CV and Threshold Optimization


In [92]:
from sklearn.model_selection import cross_val_predict, cross_validate

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [93]:

svm_clf = SVC(kernel='rbf', C=10, gamma=0.1, random_state=42)
mlp_clf = MLPClassifier(max_iter=10_000, hidden_layer_sizes=[10,10,10], random_state=42)
xgb_clf = XGBClassifier(max_depth=5, n_estimators=40)
lgb_clf = LGBMClassifier(max_depth=8, n_estimators=90)
cat_clf = CatBoostClassifier(cat_features=cat_indices)


In [94]:
from sklearn.metrics import (
    precision_recall_curve, recall_score, precision_score, f1_score,
    average_precision_score, roc_auc_score, matthews_corrcoef
)
from pprint import pprint

def run_model(clf, X_train, y_train): 
    res = cross_validate(clf, X_train, y_train, cv=5, scoring=('average_precision','roc_auc',  'recall', 'precision', 'f1', 'matthews_corrcoef'))

    auc = res['test_roc_auc'].mean()
    ap = res['test_average_precision'].mean()
    recall_no = res['test_recall'].mean()
    preci_no = res['test_precision'].mean()
    f1_no = res['test_f1'].mean()
    mcc_no = res['test_matthews_corrcoef'].mean()


    # Threshold optimization
    if (type(clf).__name__ == 'SVC'):
            y_score = cross_val_predict(clf, X_train, y_train, cv=5, method='decision_function')
    else: 
        y_score = cross_val_predict(clf, X_train, y_train, cv=5, method='predict_proba')[:, 1]
    
    precision, recall, thresholds = precision_recall_curve(y_train, y_score)

    # Maximizing F1-score
    f1_val = (2 * precision * recall) / (precision + recall)
    f1_val[np.isinf(f1_val)] = np.nan

    i = np.nanargmax(f1_val)
    best_thre = thresholds[i]

    y_pred = (y_score >= best_thre)
    recall_opt = recall_score(y_train, y_pred)
    preci_opt = precision_score(y_train, y_pred)
    f1_opt = f1_score(y_train, y_pred)
    mcc_opt = matthews_corrcoef(y_train, y_pred)

    res = {'ap': ap, 'auc': auc, 'recall': recall_no, 'precision': preci_no, 
           'f1': f1_no, 'mcc': mcc_no, 'recall(opt)': recall_opt, 'precision(opt)': preci_opt,
           'f1(opt)': f1_opt, 'mcc(opt)': mcc_opt, 'threshold': best_thre}
    return res

In [95]:
X_xgb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15764 entries, 0 to 15763
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   num__grade              15764 non-null  float64
 1   num__gpa_last_seme      15764 non-null  float64
 2   num__credits_last_seme  15764 non-null  float64
 3   num__credits_tot        15764 non-null  float64
 4   num__n_seme             15764 non-null  float64
 5   num__years_since        15764 non-null  float64
 6   cat__semester           15764 non-null  int32  
 7   cat__sex                15764 non-null  int32  
 8   cat__adm_unit           15764 non-null  int32  
 9   cat__nation             15764 non-null  int32  
 10  cat__in_capa            15764 non-null  int32  
 11  cat__college            15764 non-null  int32  
 12  cat__leave              15764 non-null  int32  
dtypes: float64(6), int32(7)
memory usage: 1.1 MB


In [96]:
names = ['svm', 'mlp', 'lgb', 'xgb', 'cat']

cvres_svm = {}
cvres_mlp = {}
cvres_xgb = {}
cvres_lgb = {}
cvres_cat = {}

for name in names:
    clf = globals()[f'{name}_clf']
    if (name == 'cat'):
        globals()[f'cvres_{name}'] = run_model(clf, X_cat, y)
    elif (name in ['xgb', 'lgb']):
        globals()[f'cvres_{name}'] = run_model(clf, X_xgb, y)
    else:
        globals()[f'cvres_{name}'] = run_model(clf, X, y)
    

  f1_val = (2 * precision * recall) / (precision + recall)


[LightGBM] [Info] Number of positive: 619, number of negative: 11992
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000695 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 466
[LightGBM] [Info] Number of data points in the train set: 12611, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049084 -> initscore=-2.963890
[LightGBM] [Info] Start training from score -2.963890
[LightGBM] [Info] Number of positive: 618, number of negative: 11993
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 468
[LightGBM] [Info] Number of data points in the train set: 12611, number of used features: 13
[LightGBM] [Info] [binar

  f1_val = (2 * precision * recall) / (precision + recall)


Learning rate set to 0.030406
0:	learn: 0.6365644	total: 66ms	remaining: 1m 5s
1:	learn: 0.5969760	total: 128ms	remaining: 1m 3s
2:	learn: 0.5513625	total: 185ms	remaining: 1m 1s
3:	learn: 0.5173704	total: 242ms	remaining: 1m
4:	learn: 0.4827733	total: 298ms	remaining: 59.3s
5:	learn: 0.4551249	total: 359ms	remaining: 59.4s
6:	learn: 0.4252350	total: 429ms	remaining: 1m
7:	learn: 0.3997166	total: 484ms	remaining: 1m
8:	learn: 0.3796264	total: 546ms	remaining: 1m
9:	learn: 0.3562120	total: 606ms	remaining: 1m
10:	learn: 0.3352339	total: 677ms	remaining: 1m
11:	learn: 0.3217595	total: 749ms	remaining: 1m 1s
12:	learn: 0.3041144	total: 829ms	remaining: 1m 2s
13:	learn: 0.2913202	total: 904ms	remaining: 1m 3s
14:	learn: 0.2799213	total: 980ms	remaining: 1m 4s
15:	learn: 0.2697713	total: 1.05s	remaining: 1m 4s
16:	learn: 0.2614115	total: 1.12s	remaining: 1m 4s
17:	learn: 0.2535055	total: 1.18s	remaining: 1m 4s
18:	learn: 0.2431277	total: 1.25s	remaining: 1m 4s
19:	learn: 0.2330231	total: 1.

  f1_val = (2 * precision * recall) / (precision + recall)


In [97]:
for name in names:
    clf = globals()[f'{name}_clf']
    print("model: ", type(clf).__name__)
    pprint(globals()[f'cvres_{name}'], sort_dicts=False)
    print()

model:  SVC
{'ap': 0.3229277536942661,
 'auc': 0.802026241415121,
 'recall': 0.31877670716380396,
 'precision': 0.522638826863707,
 'f1': 0.2446302193034219,
 'mcc': 0.2893648576537621,
 'recall(opt)': 0.5213454075032341,
 'precision(opt)': 0.21044386422976502,
 'f1(opt)': 0.29985119047619047,
 'mcc(opt)': 0.27794690834173863,
 'threshold': -0.8778677742696765}

model:  MLPClassifier
{'ap': 0.3390383152734748,
 'auc': 0.8549023415449254,
 'recall': 0.3277251780477587,
 'precision': 0.5143190570661688,
 'f1': 0.2615344663049147,
 'mcc': 0.29679046824048155,
 'recall(opt)': 0.538163001293661,
 'precision(opt)': 0.19971195391262603,
 'f1(opt)': 0.2913165266106443,
 'mcc(opt)': 0.27226491545010084,
 'threshold': 0.13293726701746988}

model:  LGBMClassifier
{'ap': 0.35068929916672065,
 'auc': 0.8502170590403167,
 'recall': 0.33418516966904066,
 'precision': 0.5533761637163056,
 'f1': 0.25806821827740967,
 'mcc': 0.3005370695765478,
 'recall(opt)': 0.5653298835705045,
 'precision(opt)': 0.22

## Model Training and Test

In [98]:
svm_clf.fit(X, y)
mlp_clf.fit(X, y)
xgb_clf.fit(X_xgb, y)
lgb_clf.fit(X_xgb, y)
cat_clf.fit(X_cat, y, cat_features=cat_indices)

[LightGBM] [Info] Number of positive: 773, number of negative: 14991
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000807 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 475
[LightGBM] [Info] Number of data points in the train set: 15764, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049036 -> initscore=-2.964926
[LightGBM] [Info] Start training from score -2.964926
Learning rate set to 0.033445
0:	learn: 0.6385978	total: 70.5ms	remaining: 1m 10s
1:	learn: 0.5894931	total: 136ms	remaining: 1m 7s
2:	learn: 0.5428627	total: 205ms	remaining: 1m 8s
3:	learn: 0.4994808	total: 272ms	remaining: 1m 7s
4:	learn: 0.4609726	total: 339ms	remaining: 1m 7s
5:	learn: 0.4307180	total: 403ms	remaining: 1m 6s
6:	learn: 0.4085870	total: 462ms	remaining: 1m 5s
7:	learn: 0.3823934	total: 525ms	remaining: 1m 5s
8:	learn: 0.3

<catboost.core.CatBoostClassifier at 0x16ba9eafce0>

In [99]:
test_set = pd.read_csv(f"datasets/test_set_t_2_tempo.csv")

X_te = full_pipeline.transform(test_set)
label_encoding(test_set)
X_te_xgb = cat_pipeline.transform(test_set)
X_te_cat = cat_pipeline.transform(test_set)
y_te = test_set['is_drop'].copy()

X_te = pd.DataFrame(X_te, columns=full_pipeline.get_feature_names_out())
X_te_xgb = pd.DataFrame(X_te_xgb, columns=cat_pipeline.get_feature_names_out())
X_te_cat = pd.DataFrame(X_te_cat, columns=cat_pipeline.get_feature_names_out())

# Typesetting
X_te_xgb[num_cols] = X_te_xgb[num_cols].astype(float)
X_te_xgb[cat_cols] = X_te_xgb[cat_cols].astype(int)

X_te_cat[num_cols] = X_te_cat[num_cols].astype(float)
X_te_cat[cat_cols] = X_te_cat[cat_cols].astype(int)

In [100]:
X_te_cat.rename(columns={'num__grade': 'year', 'num__gpa_last_seme': 'gpa_last', 'num__credits_last_seme': 'credits_last',
                  'num__credits_tot': 'credits_tot', 'num__n_seme': 'n_semesters', 'num__years_since': 'years_since',
                  'cat__semester': 'release', 'cat__sex': 'sex', 'cat__adm_unit': 'adm_type', 'cat__nation': 'nation',
                  'cat__in_capa': 'in_capa', 'cat__college': 'college', 'cat__leave': 'status'}, inplace=True)

In [101]:
# function to evaluate the performance with test set and given threshold
def test_model(clf, X_test, y_test, thre): 
    if (type(clf).__name__ == 'SVC'):
        y_score = clf.decision_function(X_test)
    else: y_score = clf.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_score)
    ap = average_precision_score(y_test, y_score)

    if (type(clf).__name__ == 'SVC'):
        y_pred = (y_score >= 0)
    else: y_pred = (y_score >= 0.5)
    recall_no = recall_score(y_test, y_pred)
    preci_no = precision_score(y_test, y_pred)
    f1_no = f1_score(y_test, y_pred)
    mcc_no = matthews_corrcoef(y_test, y_pred)

    # Maximizing F1-score

    y_pred = (y_score >= thre)
    recall_opt = recall_score(y_test, y_pred)
    preci_opt = precision_score(y_test, y_pred)
    f1_opt = f1_score(y_test, y_pred)
    mcc_opt = matthews_corrcoef(y_test, y_pred)


    res = {'auc': auc, 'ap': ap, 'recall': recall_no, 'precision': preci_no, 
           'f1': f1_no, 'mcc_no': mcc_no, 'recall(opt)': recall_opt, 
           'precision(opt)': preci_opt, 'f1(opt)': f1_opt, 'mcc(opt)': mcc_opt,
           'threshold': thre}
    print("model: ", type(clf).__name__)
    pprint(res, sort_dicts=False)
    print()
    return res

In [102]:
# Now we test the six models with the test set and given thresholds.

test_svm = {}
test_mlp = {}
test_xgb = {}
test_lgb = {}
test_cat = {}

test_svm = test_model(svm_clf, X_te, y_te, cvres_svm['threshold'])
test_mlp = test_model(mlp_clf, X_te, y_te, cvres_mlp['threshold'])
test_lgb = test_model(lgb_clf, X_te_xgb, y_te, cvres_lgb['threshold'])
test_xgb = test_model(xgb_clf, X_te_xgb, y_te, cvres_xgb['threshold'])
test_cat = test_model(cat_clf, X_te_cat, y_te, cvres_cat['threshold'])


model:  SVC
{'auc': 0.8144950606301792,
 'ap': 0.39469113967164327,
 'recall': 0.19699812382739212,
 'precision': 0.6402439024390244,
 'f1': 0.3012912482065997,
 'mcc_no': 0.33476014451427644,
 'recall(opt)': 0.47842401500938087,
 'precision(opt)': 0.408,
 'f1(opt)': 0.44041450777202074,
 'mcc(opt)': 0.402478310216561,
 'threshold': -0.8778677742696765}

model:  MLPClassifier
{'auc': 0.8832203024029396,
 'ap': 0.4405087539027268,
 'recall': 0.24202626641651032,
 'precision': 0.6201923076923077,
 'f1': 0.3481781376518219,
 'mcc_no': 0.36488923994799327,
 'recall(opt)': 0.5909943714821764,
 'precision(opt)': 0.407503234152652,
 'f1(opt)': 0.48238897396630936,
 'mcc(opt)': 0.45108984107373046,
 'threshold': 0.13293726701746988}

model:  LGBMClassifier
{'auc': 0.89375119187776,
 'ap': 0.4721145303211691,
 'recall': 0.27204502814258913,
 'precision': 0.6744186046511628,
 'f1': 0.3877005347593583,
 'mcc_no': 0.40711537956260097,
 'recall(opt)': 0.600375234521576,
 'precision(opt)': 0.4035308