# Imbalanced Learn with Stacking for $Y(2)$

## Setup

In [1]:
# Python â‰¥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports
import numpy as np
import pandas as pd

from sklearn.model_selection import (
    cross_val_predict, cross_validate,
    RepeatedStratifiedKFold,
)

from imblearn.over_sampling import (
    RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE
)
from imblearn.under_sampling import (
    AllKNN,
    CondensedNearestNeighbour,
    EditedNearestNeighbours,
    InstanceHardnessThreshold,
    OneSidedSelection,
    RandomUnderSampler,
    NeighbourhoodCleaningRule,
    RepeatedEditedNearestNeighbours,
    TomekLinks,
)
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier


## Data Loading

In [2]:
seed = 42
dataset = pd.read_csv(f"datasets/train_set_{seed}_t_2.csv")

Separating validation set

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
for train_index, valid_index in split.split(dataset, dataset['is_drop']):
    train_set = dataset.loc[train_index]
    valid_set = dataset.loc[valid_index]

train_set.reset_index(inplace=True, drop=True)
valid_set.reset_index(inplace=True, drop=True)

Separate features and labels from the train-set and valid-set

In [5]:
X_tr = train_set.drop(['is_drop'], axis=1)
y_tr = train_set['is_drop'].copy()

X_va = valid_set.drop(['is_drop'], axis=1)
y_va = valid_set['is_drop'].copy()

X_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19054 entries, 0 to 19053
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index_o            19054 non-null  int64  
 1   year               19054 non-null  int64  
 2   semester           19054 non-null  int64  
 3   grade              19054 non-null  int64  
 4   sex                19054 non-null  object 
 5   gpa_last_seme      19054 non-null  float64
 6   credits_last_seme  19054 non-null  float64
 7   credits_tot        19054 non-null  float64
 8   n_seme             19054 non-null  int64  
 9   years_since        19054 non-null  int64  
 10  college            19054 non-null  object 
 11  adm_unit           19054 non-null  int64  
 12  nation             19054 non-null  int64  
 13  in_capa            19054 non-null  bool   
 14  leave              19054 non-null  bool   
dtypes: bool(2), float64(3), int64(8), object(2)
memory usage: 1.9+ MB


## Data Transformation

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attrs = ['grade', 'gpa_last_seme', 'credits_last_seme', 'credits_tot', 'n_seme', 'years_since']
cat_attrs = ['semester', 'sex', 'adm_unit', 'nation', 'in_capa', 'college', 'leave']

full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attrs),
#    ('cat', 'passthrough', cat_attrs)
    ('cat', OneHotEncoder(), cat_attrs)
])

cat_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attrs),
    ('cat', 'passthrough', cat_attrs)
])

X_t = full_pipeline.fit_transform(X_tr)
X_tr = pd.DataFrame(X_t, columns=full_pipeline.get_feature_names_out())


In [7]:
X_t = full_pipeline.transform(X_va)
X_va = pd.DataFrame(X_t, columns=full_pipeline.get_feature_names_out())

## Model Training with Resampling

In [8]:
sampler_map = {
    "smote": SMOTE(random_state=0),
    "adasyn": ADASYN(
        sampling_strategy="auto",  # samples only the minority class
        random_state=0,  # for reproducibility
        n_neighbors=5,
    ),
    "cnn": CondensedNearestNeighbour(
        sampling_strategy="auto", random_state=0, n_neighbors=1, n_jobs=4
    ),
    "enn": EditedNearestNeighbours(
        sampling_strategy="auto", n_neighbors=3, kind_sel="all", n_jobs=4
    ),
    "tomek": TomekLinks(sampling_strategy="auto", n_jobs=4),
    "smoteenn": SMOTEENN(random_state=0),
}

In [9]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_predict, cross_validate

cat_indices = [6, 7, 8, 9, 10, 11, 12]

estimators = [
    ('svm', SVC(C=10, gamma=0.1, random_state=42)),
    ('mlp', MLPClassifier(max_iter=10_000, hidden_layer_sizes=[10,10,10], random_state=42)),
    ('xgb', XGBClassifier(max_depth=4, n_estimators=70)),
    ('lgbm', LGBMClassifier(max_depth=8, n_estimators=100)),
    ('cat', CatBoostClassifier()),
]

In [10]:
stk = {}

for sampler in sampler_map.keys():
    print(sampler)
    X, y = sampler_map[sampler].fit_resample(X_tr, y_tr)

    # set up the classifier
    stk[sampler] = StackingClassifier(
        estimators=estimators, final_estimator=LogisticRegression(C=10)
    )

    stk[sampler].fit(X, y)


smote
[LightGBM] [Info] Number of positive: 18069, number of negative: 18069
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7115
[LightGBM] [Info] Number of data points in the train set: 36138, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.047663
0:	learn: 0.6661290	total: 148ms	remaining: 2m 28s
1:	learn: 0.6371245	total: 154ms	remaining: 1m 16s
2:	learn: 0.6085822	total: 159ms	remaining: 52.9s
3:	learn: 0.5856289	total: 164ms	remaining: 40.9s
4:	learn: 0.5659193	total: 169ms	remaining: 33.7s
5:	learn: 0.5465709	total: 175ms	remaining: 29s
6:	learn: 0.5315232	total: 180ms	remaining: 25.6s
7:	learn: 0.5189778	total: 185ms	remaining: 23s
8:	learn: 0.5052216	total: 191ms	remaining: 21s
9:	learn: 0.4919320	total: 196ms	remaining: 19.4s
10:	learn: 0.4781808	total: 2

## Model Evaluation

In [11]:
from sklearn.metrics import (
    precision_recall_curve, roc_curve, roc_auc_score, average_precision_score,
    recall_score, precision_score, f1_score, matthews_corrcoef
)

from pprint import pprint

# function to train models and evaluate the performance

def run_model(clf, X_test, y_test): 
    if (type(clf).__name__ == 'SVC'):
        y_score = clf.decision_function(X_test)
    else: y_score = clf.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_score)
    ap = average_precision_score(y_test, y_score)

    if (type(clf).__name__ == 'SVC'):
        y_pred = (y_score >= 0)
    else: y_pred = (y_score >= 0.5)
    recall_no = recall_score(y_test, y_pred)
    preci_no = precision_score(y_test, y_pred)
    f1_no = f1_score(y_test, y_pred)
    mcc_no = matthews_corrcoef(y_test, y_pred)

    precision, recall, thresholds = precision_recall_curve(y_test, y_score)

    # Maximizing F1-score
    f1_val = (2 * precision * recall) / (precision + recall)
    f1_val[np.isinf(f1_val)] = np.nan

    i = np.nanargmax(f1_val)
    best_thre = thresholds[i]

    y_pred = (y_score >= best_thre)
    recall_opt = recall_score(y_test, y_pred)
    preci_opt = precision_score(y_test, y_pred)
    f1_opt = f1_score(y_test, y_pred)
    mcc_opt = matthews_corrcoef(y_test, y_pred)


    res = {'ap': ap, 'auc': auc, 'recall': recall_no, 'precision': preci_no, 
           'f1': f1_no, 'mcc_no': mcc_no, 'recall(opt)': recall_opt, 
           'precision(opt)': preci_opt, 'f1(opt)': f1_opt, 'mcc(opt)': mcc_opt,
           'threshold': best_thre}
    print("model: ", type(clf).__name__)
    pprint(res, sort_dicts=False)
    return res

In [12]:
# now we train several models, with the different UnderSampling techniques and


res_stk = {}

for sampler in sampler_map.keys():
    print(sampler)
    
    # set up the classifier
    res_stk[sampler] = run_model(stk[sampler], X_va, y_va)
    print()

smote
model:  StackingClassifier
{'ap': 0.4244705918436847,
 'auc': 0.8728438150211424,
 'recall': 0.45592705167173253,
 'precision': 0.4658385093167702,
 'f1': 0.4608294930875576,
 'mcc_no': 0.43173938978641857,
 'recall(opt)': 0.5106382978723404,
 'precision(opt)': 0.44680851063829785,
 'f1(opt)': 0.4765957446808511,
 'mcc(opt)': 0.4471024044466587,
 'threshold': 0.3895774348149278}

adasyn
model:  StackingClassifier
{'ap': 0.43340063889712205,
 'auc': 0.8740549777019904,
 'recall': 0.49544072948328266,
 'precision': 0.47246376811594204,
 'f1': 0.4836795252225519,
 'mcc_no': 0.4549120168329319,
 'recall(opt)': 0.5015197568389058,
 'precision(opt)': 0.47413793103448276,
 'f1(opt)': 0.4874446085672083,
 'mcc(opt)': 0.4588184954999951,
 'threshold': 0.49523789504047855}

cnn
model:  StackingClassifier
{'ap': 0.4606036368996149,
 'auc': 0.8929617318011454,
 'recall': 0.40425531914893614,
 'precision': 0.5757575757575758,
 'f1': 0.475,
 'mcc_no': 0.45930525848051795,
 'recall(opt)': 0.495

In [13]:
import joblib

names = ['stk']
for name in names:
    model_dict = {
        'model': globals()[f'{name}'],
        'result': globals()[f'res_{name}']
    }
    joblib.dump(model_dict, f'models/imbal_opt_{name}_1_2.pkl')


## Final Test

In [14]:
test_set = pd.read_csv(f"datasets/test_set_{seed}_t_2.csv")

In [15]:
X_te = full_pipeline.transform(test_set)
y_te = test_set['is_drop'].copy()

X_te = pd.DataFrame(X_te, columns=full_pipeline.get_feature_names_out())


In [16]:
# function to train models and evaluate the performance

def test_model(clf, X_test, y_test, thre): 
    if (type(clf).__name__ == 'SVC'):
        y_score = clf.decision_function(X_test)
    else: y_score = clf.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_score)
    ap = average_precision_score(y_test, y_score)

    if (type(clf).__name__ == 'SVC'):
        y_pred = (y_score >= 0)
    else: y_pred = (y_score >= 0.5)
    recall_no = recall_score(y_test, y_pred)
    preci_no = precision_score(y_test, y_pred)
    f1_no = f1_score(y_test, y_pred)
    mcc_no = matthews_corrcoef(y_test, y_pred)

    # Maximizing F1-score
    y_pred = (y_score >= thre)
    recall_opt = recall_score(y_test, y_pred)
    preci_opt = precision_score(y_test, y_pred)
    f1_opt = f1_score(y_test, y_pred)
    mcc_opt = matthews_corrcoef(y_test, y_pred)

    res = {'ap': ap, 'auc': auc, 'recall': recall_no, 'precision': preci_no, 
           'f1': f1_no, 'mcc_no': mcc_no, 'recall(opt)': recall_opt, 
           'precision(opt)': preci_opt, 'f1(opt)': f1_opt, 'mcc(opt)': mcc_opt,
           'threshold': thre}
    print("model: ", type(clf).__name__)
    pprint(res, sort_dicts=False)
    return res

In [17]:
# now we train several models, with the different UnderSampling techniques and
# with cross-validation 

# save results

test_stk = {}

for sampler in sampler_map.keys():
    print(sampler)
    
    # set up the classifier
    test_stk[sampler] = test_model(stk[sampler], X_te, y_te, res_stk[sampler]['threshold'])

    print()

smote
model:  StackingClassifier
{'ap': 0.43240986505260165,
 'auc': 0.8870226446306787,
 'recall': 0.44339622641509435,
 'precision': 0.4577922077922078,
 'f1': 0.4504792332268371,
 'mcc_no': 0.422116689202981,
 'recall(opt)': 0.4716981132075472,
 'precision(opt)': 0.4155124653739612,
 'f1(opt)': 0.4418262150220913,
 'mcc(opt)': 0.41142058530372727,
 'threshold': 0.3895774348149278}

adasyn
model:  StackingClassifier
{'ap': 0.4469783258422672,
 'auc': 0.890631843979379,
 'recall': 0.46855345911949686,
 'precision': 0.4501510574018127,
 'f1': 0.4591679506933744,
 'mcc_no': 0.43021256896454807,
 'recall(opt)': 0.4716981132075472,
 'precision(opt)': 0.4491017964071856,
 'f1(opt)': 0.4601226993865031,
 'mcc(opt)': 0.43113187859009117,
 'threshold': 0.49523789504047855}

cnn
model:  StackingClassifier
{'ap': 0.45784273626721267,
 'auc': 0.9024665636655899,
 'recall': 0.37735849056603776,
 'precision': 0.5607476635514018,
 'f1': 0.45112781954887216,
 'mcc_no': 0.4373018191528589,
 'recall(o