# Phase 4: Data Balancing for $t=2$ **without** Threshold Optimization
* Models: SVM, MLP, LightGBM, XGBoost, CatBoost

## Setup

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports
import numpy as np
import pandas as pd

from imblearn.over_sampling import (
    SMOTE, ADASYN
)
from imblearn.under_sampling import (
    CondensedNearestNeighbour,
    EditedNearestNeighbours,
    TomekLinks,
)
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier


## Data Loading

In [2]:
# Split seed; it is embedded in the dataset file names.
seed = 42
train_csv = f"datasets/train_set_{seed}_t_2.csv"
df = pd.read_csv(train_csv)

Separate the features and the labels (`is_drop`) of the train-set and valid-set.

In [3]:
# `is_drop` is the target; every remaining column is kept as a feature.
y_80 = df['is_drop'].copy()
X_80 = df.drop(columns=['is_drop'])

X_80.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25406 entries, 0 to 25405
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index_o            25406 non-null  int64  
 1   year               25406 non-null  int64  
 2   semester           25406 non-null  int64  
 3   grade              25406 non-null  int64  
 4   sex                25406 non-null  object 
 5   gpa_last_seme      25406 non-null  float64
 6   credits_last_seme  25406 non-null  float64
 7   credits_tot        25406 non-null  float64
 8   n_seme             25406 non-null  int64  
 9   years_since        25406 non-null  int64  
 10  college            25406 non-null  object 
 11  adm_unit           25406 non-null  int64  
 12  nation             25406 non-null  int64  
 13  in_capa            25406 non-null  bool   
 14  leave              25406 non-null  bool   
dtypes: bool(2), float64(3), int64(8), object(2)
memory usage: 2.6+ MB


## Data Transformation

In [4]:
# Label encoding for categorical features
def sex_mapping(sex):
    """Encode a sex label as an integer: 'M' -> 0, 'F' -> 1.

    Any other value raises KeyError.
    """
    return {'M': 0, 'F': 1}[sex]

def seme_mapping(seme):
    """Encode a semester number as a zero-based integer: 1 -> 0, 2 -> 1.

    Any other value raises KeyError.
    """
    return {1: 0, 2: 1}[seme]

def college_mapping(college):
    """Encode a college code as an integer.

    Codes are numbered by their position in the tuple below
    ('TH' -> 0, 'HS' -> 1, ..., 'AT' -> 6). Any other value raises KeyError.
    """
    ordered_codes = ('TH', 'HS', 'BZ', 'HT', 'EG', 'SW', 'AT')
    code_of = {label: position for position, label in enumerate(ordered_codes)}
    return code_of[college]


In [5]:
# Apply the label encoders to the categorical columns of a DataFrame (in place)
def label_encoding(data):
    """Label-encode the 'sex', 'semester' and 'college' columns of `data`.

    The frame is modified in place and nothing is returned. Each column is
    replaced by the result of mapping it through its encoder function, so an
    unmapped value raises KeyError from that encoder.
    """
    column_encoders = (
        ('sex', sex_mapping),
        ('semester', seme_mapping),
        ('college', college_mapping),
    )
    for column, encoder in column_encoders:
        data[column] = data[column].map(encoder)

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attrs = ['grade', 'gpa_last_seme', 'credits_last_seme', 'credits_tot', 'n_seme', 'years_since']
cat_attrs = ['semester', 'sex', 'adm_unit', 'nation', 'in_capa', 'college', 'leave']

# Scaled numeric features + one-hot encoded categorical features (SVM and MLP).
# sparse_threshold=0 forces a dense array, which the DataFrame construction
# below requires regardless of how many one-hot columns are produced.
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attrs),
    ('cat', OneHotEncoder(), cat_attrs)
], sparse_threshold=0)

# Scaled numeric features + categorical columns passed through unchanged
# (label-encoded beforehand; used for XGBoost, LightGBM and CatBoost).
cat_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attrs),
    ('cat', 'passthrough', cat_attrs)
])

# Always start from the untouched `df` and label-encode a copy, so this cell can be
# re-run: the original version mutated and then rebound `X_80`, which made a second
# execution fail on already-encoded data.
X_raw = df.drop(['is_drop'], axis=1)
X_label = X_raw.copy()
label_encoding(X_label)   # for XGBoost, LightGBM and CatBoost with label encoding

X_t = full_pipeline.fit_transform(X_raw)       # for SVM and MLP with one-hot encoding
X_cat = cat_pipeline.fit_transform(X_label)    # for the gradient-boosting models

X_80 = pd.DataFrame(X_t, columns=full_pipeline.get_feature_names_out())   # for SVM and MLP
X_cat = pd.DataFrame(X_cat, columns=cat_pipeline.get_feature_names_out()) # for CatBoost, LightGBM, and XGBoost

# Positional indices of the categorical columns in X_cat (passed to CatBoost).
# The 'num' transformer is listed first, so the categorical columns follow it.
cat_indices = list(range(len(num_attrs), len(num_attrs) + len(cat_attrs)))

# dtype casting: numeric columns as float, every other column as int
num_cols = [f'num__{attr}' for attr in num_attrs]
onehot_cols = [col for col in X_80.columns if col not in num_cols]
X_80[num_cols] = X_80[num_cols].astype(float)
X_80[onehot_cols] = X_80[onehot_cols].astype(int)

# `cat_cols` names the pass-through categorical columns of X_cat; it is reused
# when the test set is cast to the same dtypes.
cat_cols = [col for col in X_cat.columns if col not in num_cols]
X_cat[num_cols] = X_cat[num_cols].astype(float)
X_cat[cat_cols] = X_cat[cat_cols].astype(int)

## Model Training with Resampling

In [7]:
# Resamplers keyed by a short name; the insertion order is the order in which
# they are applied later. Over-samplers: smote, adasyn. Combined: smoteenn.
# Under-samplers: cnn, enn, tomek.
sampler_map = dict(
    smote=SMOTE(random_state=0),
    adasyn=ADASYN(sampling_strategy="auto", random_state=0, n_neighbors=5),
    smoteenn=SMOTEENN(random_state=0),
    cnn=CondensedNearestNeighbour(
        sampling_strategy="auto",
        random_state=0,
        n_neighbors=1,
        n_jobs=4,
    ),
    enn=EditedNearestNeighbours(
        sampling_strategy="auto",
        n_neighbors=3,
        kind_sel="all",
        n_jobs=4,
    ),
    tomek=TomekLinks(sampling_strategy="auto", n_jobs=4),
)

Resampling and training

In [None]:
# One dictionary per model family, keyed by sampler name.
svm, mlp, xgb, lgb, cat = {}, {}, {}, {}, {}

for sampler, resampler in sampler_map.items():
    print(sampler)

    # --- one-hot encoded features: SVM and MLP ---
    X, y = resampler.fit_resample(X_80, y_80)

    svm_clf = SVC(C=10, gamma=0.1)
    svm[sampler] = svm_clf
    svm_clf.fit(X, y)

    mlp_clf = MLPClassifier(max_iter=10_000, hidden_layer_sizes=[10,10,10], random_state=43)
    mlp[sampler] = mlp_clf
    mlp_clf.fit(X, y)

    # --- label-encoded features: XGBoost, LightGBM and CatBoost ---
    X, y = resampler.fit_resample(X_cat, y_80)

    xgb_clf = XGBClassifier(n_estimators=40, max_depth=5)
    xgb[sampler] = xgb_clf
    xgb_clf.fit(X, y)

    lgb_clf = LGBMClassifier(n_estimators=90, max_depth=8)
    lgb[sampler] = lgb_clf
    lgb_clf.fit(X, y)

    # CatBoost is told which column positions hold categorical features.
    cat_clf = CatBoostClassifier(cat_features=cat_indices, max_depth=6, iterations=1000)
    cat[sampler] = cat_clf
    cat_clf.fit(X, y)

smote


In [None]:
import joblib
from pathlib import Path

# Persist the per-sampler model dictionaries, one pickle file per model family.
# Make sure the target directory exists first, otherwise the dump fails on a
# fresh checkout.
Path('models').mkdir(parents=True, exist_ok=True)

# Explicit name -> dictionary mapping instead of looking names up in globals(),
# so a renamed or missing variable fails here, visibly, rather than by string lookup.
trained_models = {'svm': svm, 'mlp': mlp, 'xgb': xgb, 'lgb': lgb, 'cat': cat}

names = list(trained_models)
for name in names:
    model_dict = {
        'model': trained_models[name],
    }
    joblib.dump(model_dict, f'models/imbal_default_{name}_1_2.pkl')


## Final Test

In [None]:
# Test split produced with the same seed as the training file.
test_csv = f"datasets/test_set_{seed}_t_2.csv"
test_set = pd.read_csv(test_csv)

In [None]:
# Preprocessing test set
X_te = full_pipeline.transform(test_set)
label_encoding(test_set)
X_te_cat = cat_pipeline.transform(test_set)
y_te = test_set['is_drop'].copy()

X_te = pd.DataFrame(X_te, columns=full_pipeline.get_feature_names_out())
X_te_cat = pd.DataFrame(X_te_cat, columns=cat_pipeline.get_feature_names_out())

# Typesetting 
X_te_cat[num_cols] = X_te_cat[num_cols].astype(float)
X_te_cat[cat_cols] = X_te_cat[cat_cols].astype(int)

In [None]:
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    recall_score, precision_score, f1_score, matthews_corrcoef
)
from pprint import pprint

# Evaluate a fitted classifier on the test set at its default decision threshold
# (0 for SVC decision scores, 0.5 for predicted probabilities)
def test_model(clf, X_test, y_test):
    """Score a fitted classifier on a test set and print the metrics.

    For a classifier whose class name is 'SVC' the scores come from
    `decision_function` and predictions are `score >= 0`; for any other
    classifier the scores are the second column of `predict_proba` and
    predictions are `score >= 0.5`.

    Returns a dict with keys 'auc', 'ap', 'recall', 'precision', 'f1', 'mcc'.
    AUC and AP are computed from the scores, the rest from the predictions.
    """
    model_name = type(clf).__name__

    # Pick the scoring method and the matching cut-off in one place.
    if model_name == 'SVC':
        y_score = clf.decision_function(X_test)
        cutoff = 0
    else:
        y_score = clf.predict_proba(X_test)[:, 1]
        cutoff = 0.5

    # Threshold-free ranking metrics
    auc = roc_auc_score(y_test, y_score)
    ap = average_precision_score(y_test, y_score)

    # Metrics on the hard predictions
    y_pred = (y_score >= cutoff)
    res = {
        'auc': auc,
        'ap': ap,
        'recall': recall_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'mcc': matthews_corrcoef(y_test, y_pred),
    }

    print("model: ", model_name)
    pprint(res)
    return res

In [None]:
# Now we test the five models, for every sampler, on the test set with their default thresholds.

# One result dictionary per model family, keyed by sampler name.
test_svm, test_mlp, test_xgb, test_lgb, test_cat = {}, {}, {}, {}, {}

# (results, fitted models, test features) in the order the models are reported.
evaluation_plan = (
    (test_svm, svm, X_te),
    (test_mlp, mlp, X_te),
    (test_lgb, lgb, X_te_cat),
    (test_xgb, xgb, X_te_cat),
    (test_cat, cat, X_te_cat),
)

for sampler in sampler_map:
    print(sampler)

    # evaluate every model trained with this sampler
    for results, models, features in evaluation_plan:
        results[sampler] = test_model(models[sampler], features, y_te)

    print()

smote
model:  SVC
{'ap': 0.358933513148345,
 'auc': 0.854679786298776,
 'f1': 0.37792329279700654,
 'mcc': 0.3676647894968342,
 'precision': 0.26897470039946736,
 'recall': 0.6352201257861635}
model:  MLPClassifier
{'ap': 0.3982688867563325,
 'auc': 0.8849152842153452,
 'f1': 0.3506805444355484,
 'mcc': 0.3519767331229714,
 'precision': 0.23523093447905477,
 'recall': 0.6886792452830188}
model:  LGBMClassifier
{'ap': 0.4428977754949614,
 'auc': 0.8933533092997488,
 'f1': 0.4581005586592179,
 'mcc': 0.42923557341185975,
 'precision': 0.4120603015075377,
 'recall': 0.5157232704402516}
model:  XGBClassifier
{'ap': 0.4314913476363075,
 'auc': 0.8874156613657527,
 'f1': 0.4463840399002494,
 'mcc': 0.4211906920348153,
 'precision': 0.36983471074380164,
 'recall': 0.5628930817610063}
model:  CatBoostClassifier
{'ap': 0.44894525673614727,
 'auc': 0.8972985626667915,
 'f1': 0.45390070921985815,
 'mcc': 0.4328810188417063,
 'precision': 0.5203252032520326,
 'recall': 0.4025157232704403}

adasyn
