# Phase 4: Data Balancing for $t=2$ **with** Threshold Opt.

## Setup

In [29]:
# Python â‰¥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports
import numpy as np
import pandas as pd
 
 
from imblearn.over_sampling import (
    SMOTE, ADASYN, 
)
from imblearn.under_sampling import (
    CondensedNearestNeighbour,
    EditedNearestNeighbours,
    TomekLinks,
)
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier


## Data Loading

In [30]:
seed = 42
dataset = pd.read_csv(f"datasets/train_set_{seed}_t_2.csv")

Separating validation set from the original train-set.

In [31]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
for train_index, valid_index in split.split(dataset, dataset['is_drop']):
    train_set = dataset.loc[train_index]
    valid_set = dataset.loc[valid_index]

train_set.reset_index(inplace=True, drop=True)
valid_set.reset_index(inplace=True, drop=True)

Separate features and labels from the train-set and valid-set

In [32]:
X_tr = train_set.drop(['is_drop'], axis=1)
y_tr = train_set['is_drop'].copy()

X_va = valid_set.drop(['is_drop'], axis=1)
y_va = valid_set['is_drop'].copy()

X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19054 entries, 0 to 19053
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index_o            19054 non-null  int64  
 1   year               19054 non-null  int64  
 2   semester           19054 non-null  int64  
 3   grade              19054 non-null  int64  
 4   sex                19054 non-null  object 
 5   gpa_last_seme      19054 non-null  float64
 6   credits_last_seme  19054 non-null  float64
 7   credits_tot        19054 non-null  float64
 8   n_seme             19054 non-null  int64  
 9   years_since        19054 non-null  int64  
 10  college            19054 non-null  object 
 11  adm_unit           19054 non-null  int64  
 12  nation             19054 non-null  int64  
 13  in_capa            19054 non-null  bool   
 14  leave              19054 non-null  bool   
dtypes: bool(2), float64(3), int64(8), object(2)
memory usage: 1.9+ MB


## Data Transformation

In [33]:
# Label encoding for categorical features
def sex_mapping(sex):
    sex_map = {'M': 0, 'F': 1}
    return sex_map[sex]

def seme_mapping(seme):
    seme_map = {1: 0, 2: 1}
    return seme_map[seme]

def college_mapping(college):
    college_map = {'TH': 0, 'HS': 1, 'BZ': 2, 'HT': 3, 'EG': 4, 'SW': 5, 'AT': 6}
    return college_map[college]


In [34]:
# Label encoding for categrical features
def label_encoding(data):
    data['sex'] = data['sex'].map(sex_mapping)
    data['semester'] = data['semester'].map(seme_mapping)
    data['college'] = data['college'].map(college_mapping)

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attrs = ['grade', 'gpa_last_seme', 'credits_last_seme', 'credits_tot', 'n_seme', 'years_since']
cat_attrs = ['semester', 'sex', 'adm_unit', 'nation', 'in_capa', 'college', 'leave']

full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attrs),
    ('cat', OneHotEncoder(), cat_attrs)
])

cat_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attrs),
    ('cat', 'passthrough', cat_attrs)
])

X_t = full_pipeline.fit_transform(X_tr)   # for SVM and MLP with one-hot encoding
label_encoding(X_tr)
X_cat = cat_pipeline.fit_transform(X_tr)  # for tree-based models with label encoding


X_tr = pd.DataFrame(X_t, columns=full_pipeline.get_feature_names_out())
X_cat = pd.DataFrame(X_cat, columns=cat_pipeline.get_feature_names_out())
cat_indices = [6, 7, 8, 9, 10, 11, 12]

# Typesetting
num_cols = ['num__grade', 'num__gpa_last_seme', 'num__credits_last_seme', 'num__credits_tot', 'num__n_seme', 'num__years_since']
cat_cols_oh = list(set(X_tr.columns) - set(num_cols)) # for one-hot encoded categorical columns
cat_cols = list(set(X_cat.columns) - set(num_cols)) # for one-hot encoded categorical columns

X_tr[num_cols] = X_tr[num_cols].astype(float)   
X_tr[cat_cols_oh] = X_tr[cat_cols_oh].astype(int)
X_cat[num_cols] = X_cat[num_cols].astype(float)
X_cat[cat_cols] = X_cat[cat_cols].astype(int)

Encoding for validation set

In [36]:
X_t = full_pipeline.transform(X_va)   # for SVM and MLP with one-hot encoding
label_encoding(X_va)
X_va_cat = cat_pipeline.transform(X_va)

# for SVM and MLP with one-hot encoding
X_va= pd.DataFrame(X_t, columns=full_pipeline.get_feature_names_out())
X_va[num_cols] = X_va[num_cols].astype(float)
X_va[cat_cols_oh] = X_va[cat_cols_oh].astype(int)

# for tree-based models with label encoding
X_va_cat = pd.DataFrame(X_va_cat, columns=cat_pipeline.get_feature_names_out())
X_va_cat[num_cols] = X_va_cat[num_cols].astype(float)
X_va_cat[cat_cols] = X_va_cat[cat_cols].astype(int)

## Model Training with Resampling

In [37]:
sampler_map = {
    "smote": SMOTE(random_state=0),
    "adasyn": ADASYN(
        sampling_strategy="auto",  # samples only the minority class
        random_state=0,  # for reproducibility
        n_neighbors=5,
    ),
    "cnn": CondensedNearestNeighbour(
        sampling_strategy="auto", random_state=0, n_neighbors=1, n_jobs=4
    ),
    "enn": EditedNearestNeighbours(
        sampling_strategy="auto", n_neighbors=3, kind_sel="all", n_jobs=4
    ),
    "tomek": TomekLinks(sampling_strategy="auto", n_jobs=4),
    "smoteenn": SMOTEENN(random_state=0),
}

In [38]:
svm = {}
mlp = {}
xgb = {}
lgb = {}
cat = {}
for sampler in sampler_map.keys():
    print(sampler)
    X, y = sampler_map[sampler].fit_resample(X_tr, y_tr)

    # set up the classifier
    svm[sampler] = SVC(C=10, gamma=0.1)
    svm[sampler].fit(X, y)

    mlp[sampler] = MLPClassifier(max_iter=10_000, hidden_layer_sizes=[10,10,10], random_state=43)
    mlp[sampler].fit(X, y)
    
    X, y = sampler_map[sampler].fit_resample(X_cat, y_tr)

    xgb[sampler] = XGBClassifier(n_estimators=40, max_depth=5)
    xgb[sampler].fit(X, y)

    lgb[sampler] = LGBMClassifier(n_estimators=90, max_depth=8)
    lgb[sampler].fit(X, y)

    cat[sampler] = CatBoostClassifier(cat_features=cat_indices, max_depth=6, n_estimators=1000)
    cat[sampler].fit(X, y)

smote
[LightGBM] [Info] Number of positive: 18069, number of negative: 18069
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1542
[LightGBM] [Info] Number of data points in the train set: 36138, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.047663
0:	learn: 0.6620170	total: 168ms	remaining: 2m 47s
1:	learn: 0.6385912	total: 330ms	remaining: 2m 44s
2:	learn: 0.6198631	total: 467ms	remaining: 2m 35s
3:	learn: 0.6020440	total: 651ms	remaining: 2m 41s
4:	learn: 0.5861063	total: 825ms	remaining: 2m 44s
5:	learn: 0.5716217	total: 991ms	remaining: 2m 44s
6:	learn: 0.5538648	total: 1.15s	remaining: 2m 43s
7:	learn: 0.5412884	total: 1.32s	remaining: 2m 43s
8:	learn: 0.5266704	total: 1.49s	remaining: 2m 43s
9:	learn: 0.5123122	total: 1.65s	remaining: 2m 43s
10:	learn: 0.50

## Model Evaluation

In [39]:
from sklearn.metrics import (
    precision_recall_curve, roc_curve, roc_auc_score, average_precision_score,
    recall_score, precision_score, f1_score, matthews_corrcoef
)
from pprint import pprint

# function to train models and evaluate the performance

def run_model(clf, X_test, y_test): 
    if (type(clf).__name__ == 'SVC'):
        y_score = clf.decision_function(X_test)
    else: y_score = clf.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_score)
    ap = average_precision_score(y_test, y_score)

    if (type(clf).__name__ == 'SVC'):
        y_pred = (y_score >= 0)
    else: y_pred = (y_score >= 0.5)
    recall_no = recall_score(y_test, y_pred)
    preci_no = precision_score(y_test, y_pred)
    f1_no = f1_score(y_test, y_pred)
    mcc_no = matthews_corrcoef(y_test, y_pred)

    precision, recall, thresholds = precision_recall_curve(y_test, y_score)

    # Maximizing F1-score
    f1_val = (2 * precision * recall) / (precision + recall)
    f1_val[np.isinf(f1_val)] = np.nan

    i = np.nanargmax(f1_val)
    best_thre = thresholds[i]

    y_pred = (y_score >= best_thre)
    recall_opt = recall_score(y_test, y_pred)
    preci_opt = precision_score(y_test, y_pred)
    f1_opt = f1_score(y_test, y_pred)
    mcc_opt = matthews_corrcoef(y_test, y_pred)

    res = {'ap': ap, 'auc': auc, 'recall': recall_no, 'precision': preci_no, 
           'f1': f1_no, 'mcc': mcc_no, 'recall(opt)': recall_opt, 
           'precision(opt)': preci_opt, 'f1(opt)': f1_opt, 'mcc(opt)': mcc_opt, 
           'threshold': best_thre}
    print("model: ", type(clf).__name__)
    pprint(res)
    return res

In [40]:
X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19054 entries, 0 to 19053
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   num__grade              19054 non-null  float64
 1   num__gpa_last_seme      19054 non-null  float64
 2   num__credits_last_seme  19054 non-null  float64
 3   num__credits_tot        19054 non-null  float64
 4   num__n_seme             19054 non-null  float64
 5   num__years_since        19054 non-null  float64
 6   cat__semester_1         19054 non-null  int32  
 7   cat__semester_2         19054 non-null  int32  
 8   cat__sex_F              19054 non-null  int32  
 9   cat__sex_M              19054 non-null  int32  
 10  cat__adm_unit_0         19054 non-null  int32  
 11  cat__adm_unit_1         19054 non-null  int32  
 12  cat__adm_unit_2         19054 non-null  int32  
 13  cat__adm_unit_3         19054 non-null  int32  
 14  cat__adm_unit_4         19054 non-null

In [41]:
# now we train several models, with the different UnderSampling techniques and
# with cross-validation 

# save results

res_svm = {}
res_mlp = {}
res_cat = {}
res_lgb = {}
res_xgb = {}
res_cat = {}

for sampler in sampler_map.keys():
    print(sampler)
    
    # set up the classifier
    res_svm[sampler] = run_model(svm[sampler], X_va, y_va)
    res_mlp[sampler] = run_model(mlp[sampler], X_va, y_va)
    res_cat[sampler] = run_model(cat[sampler], X_va_cat, y_va)
    res_xgb[sampler] = run_model(xgb[sampler], X_va_cat, y_va)
    res_lgb[sampler] = run_model(lgb[sampler], X_va_cat, y_va)
    res_cat[sampler] = run_model(cat[sampler], X_va_cat, y_va)

    print()

smote
model:  SVC
{'ap': 0.31270572643524835,
 'auc': 0.8394939459528747,
 'f1': 0.3490073145245559,
 'f1(opt)': 0.37532133676092544,
 'mcc': 0.32004519850539764,
 'mcc(opt)': 0.34021028455041175,
 'precision': 0.2659235668789809,
 'precision(opt)': 0.32516703786191536,
 'recall': 0.5075987841945289,
 'recall(opt)': 0.44376899696048633,
 'threshold': 0.401014055922456}
model:  MLPClassifier
{'ap': 0.33987914992955676,
 'auc': 0.859522034834048,
 'f1': 0.335971855760774,
 'f1(opt)': 0.4154103852596315,
 'mcc': 0.31798871567568215,
 'mcc(opt)': 0.38914050446851034,
 'precision': 0.23638613861386137,
 'precision(opt)': 0.4626865671641791,
 'recall': 0.5805471124620061,
 'recall(opt)': 0.3768996960486322,
 'threshold': 0.8260169565770047}
model:  CatBoostClassifier
{'ap': 0.4146652556388025,
 'auc': 0.8866233137713739,
 'f1': 0.4178082191780822,
 'f1(opt)': 0.4573002754820937,
 'mcc': 0.3937098448091491,
 'mcc(opt)': 0.4268215839929742,
 'precision': 0.47843137254901963,
 'precision(opt)':

  f1_val = (2 * precision * recall) / (precision + recall)


model:  LGBMClassifier
{'ap': 0.4376270164336064,
 'auc': 0.8848027848667241,
 'f1': 0.4670487106017192,
 'f1(opt)': 0.48673946957878317,
 'mcc': 0.436975664598306,
 'mcc(opt)': 0.45966550539878687,
 'precision': 0.44173441734417346,
 'precision(opt)': 0.5,
 'recall': 0.49544072948328266,
 'recall(opt)': 0.47416413373860183,
 'threshold': 0.5480061477148856}
model:  CatBoostClassifier
{'ap': 0.4146652556388025,
 'auc': 0.8866233137713739,
 'f1': 0.4178082191780822,
 'f1(opt)': 0.4573002754820937,
 'mcc': 0.3937098448091491,
 'mcc(opt)': 0.4268215839929742,
 'precision': 0.47843137254901963,
 'precision(opt)': 0.4181360201511335,
 'recall': 0.3708206686930091,
 'recall(opt)': 0.5045592705167173,
 'threshold': 0.37091537361269455}

adasyn
model:  SVC
{'ap': 0.30740066908493385,
 'auc': 0.8368273694505409,
 'f1': 0.32618825722274,
 'f1(opt)': 0.3585147247119078,
 'mcc': 0.3014644297552649,
 'mcc(opt)': 0.3221570829300968,
 'precision': 0.23521505376344087,
 'precision(opt)': 0.30973451327

In [42]:
import joblib

names = ['svm', 'mlp', 'xgb', 'lgb', 'cat']
for name in names:
    model_dict = {
        'model': globals()[f'{name}'],
        'result': globals()[f'res_{name}']
    }
    joblib.dump(model_dict, f'models/imbal_opt_{name}_1_2.pkl')


## Final Test

In [43]:
test_set = pd.read_csv(f"datasets/test_set_{seed}_t_2.csv")

In [44]:
# Preprocessing test set
X_te = full_pipeline.transform(test_set)
label_encoding(test_set)
X_te_cat = cat_pipeline.transform(test_set)
y_te = test_set['is_drop'].copy()

X_te = pd.DataFrame(X_te, columns=full_pipeline.get_feature_names_out())
X_te_cat = pd.DataFrame(X_te_cat, columns=cat_pipeline.get_feature_names_out())

# Typesetting for LGBM
X_te_cat[num_cols] = X_te_cat[num_cols].astype(float)
X_te_cat[cat_cols] = X_te_cat[cat_cols].astype(int)

In [45]:
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    recall_score, precision_score, f1_score, matthews_corrcoef
)
from pprint import pprint

# function to evaluate the performance with test set and given threshold
def test_model(clf, X_test, y_test, thre): 
    if (type(clf).__name__ == 'SVC'):
        y_score = clf.decision_function(X_test)
    else: y_score = clf.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_score)
    ap = average_precision_score(y_test, y_score)

    if (type(clf).__name__ == 'SVC'):
        y_pred = (y_score >= 0)
    else: y_pred = (y_score >= 0.5)
    recall_no = recall_score(y_test, y_pred)
    preci_no = precision_score(y_test, y_pred)
    f1_no = f1_score(y_test, y_pred)
    mcc_no = matthews_corrcoef(y_test, y_pred)

    # Maximizing F1-score
    y_pred = (y_score >= thre)
    recall_opt = recall_score(y_test, y_pred)
    preci_opt = precision_score(y_test, y_pred)
    f1_opt = f1_score(y_test, y_pred)
    mcc_opt = matthews_corrcoef(y_test, y_pred)

    res = {'auc': auc, 'ap': ap, 'recall': recall_no, 'precision': preci_no, 
           'f1': f1_no, 'mcc_no': mcc_no, 'recall(opt)': recall_opt, 
           'precision(opt)': preci_opt, 'f1(opt)': f1_opt, 'mcc(opt)': mcc_opt,
           'threshold': thre}
    print("model: ", type(clf).__name__)
    pprint(res, sort_dicts=False)
    return res

In [46]:
# Now we test the six models with the test set and given thresholds.

test_svm = {}
test_mlp = {}
test_xgb = {}
test_lgb = {}
test_cat = {}

for sampler in sampler_map.keys():
    print(sampler)
    
    # set up the classifier
    test_svm[sampler] = test_model(svm[sampler], X_te, y_te, res_svm[sampler]['threshold'])
    test_mlp[sampler] = test_model(mlp[sampler], X_te, y_te, res_mlp[sampler]['threshold'])
    test_lgb[sampler] = test_model(lgb[sampler], X_te_cat, y_te, res_xgb[sampler]['threshold'])
    test_xgb[sampler] = test_model(xgb[sampler], X_te_cat, y_te, res_xgb[sampler]['threshold'])
    test_cat[sampler] = test_model(cat[sampler], X_te_cat, y_te, res_xgb[sampler]['threshold'])

    print()

smote
model:  SVC
{'auc': 0.8553045554567161,
 'ap': 0.34287905113074335,
 'recall': 0.5817610062893082,
 'precision': 0.2631578947368421,
 'f1': 0.3623898139079334,
 'mcc_no': 0.3448014687561386,
 'recall(opt)': 0.44654088050314467,
 'precision(opt)': 0.2897959183673469,
 'f1(opt)': 0.35148514851485146,
 'mcc(opt)': 0.31790893137511095,
 'threshold': 0.401014055922456}
model:  MLPClassifier
{'auc': 0.8651744515712867,
 'ap': 0.3648194509230858,
 'recall': 0.5974842767295597,
 'precision': 0.2334152334152334,
 'f1': 0.33568904593639576,
 'mcc_no': 0.322438036951767,
 'recall(opt)': 0.3490566037735849,
 'precision(opt)': 0.4188679245283019,
 'f1(opt)': 0.38078902229845624,
 'mcc(opt)': 0.3529214736121068,
 'threshold': 0.8260169565770047}
model:  LGBMClassifier
{'auc': 0.8888576764413647,
 'ap': 0.42249978372603986,
 'recall': 0.48742138364779874,
 'precision': 0.4305555555555556,
 'f1': 0.45722713864306785,
 'mcc_no': 0.4277226585812271,
 'recall(opt)': 0.39308176100628933,
 'precision