In [64]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [65]:
data_train = pd.read_csv('train_features.csv', index_col='id')

In [66]:
X, y = data_train.drop(columns=['y']), data_train['y']

In [67]:
X

Unnamed: 0_level_0,PRinterm,PRinterstd,PRsegm,PRsegstd,QRSmean,QRSstd,QTinterm,QTinterstd,STsegm,STsegstd,...,dfa_alpha2,HRV_DFA_alpha2,HRV_MFDFA_alpha2_Width,HRV_MFDFA_alpha2_Peak,HRV_MFDFA_alpha2_Mean,HRV_MFDFA_alpha2_Max,HRV_MFDFA_alpha2_Delta,HRV_MFDFA_alpha2_Asymmetry,HRV_MFDFA_alpha2_Fluctuation,HRV_MFDFA_alpha2_Increment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,40.500000,19.447044,33.234375,46.176201,24.156250,2.985479,108.031250,5.177985,66.446152,5.378106,...,,,,,,,,,,
1,39.529411,19.336395,32.787880,40.742176,30.000000,3.880570,91.794121,18.339573,62.085712,53.645596,...,,,,,,,,,,
2,37.241379,5.028575,18.344828,4.137931,26.206896,1.185525,114.758621,23.717672,73.733330,21.568392,...,,,,,,,,,,
3,24.369230,9.272903,28.169231,39.208542,29.888889,16.502151,62.044445,15.884241,21.777779,7.983006,...,-0.294153,,,,,,,,,
4,36.333332,4.979960,18.000000,5.781196,25.533333,1.654623,82.511108,13.590229,44.413044,26.781382,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5112,63.047619,23.304356,60.809525,55.594296,27.285715,4.072473,91.190475,29.805111,52.181820,21.579948,...,,,,,,,,,,
5113,29.299999,16.087574,61.344826,98.975189,27.333334,4.093355,113.133331,23.779169,69.806450,21.645859,...,,,,,,,,,,
5114,34.294117,5.102072,16.794117,4.927590,28.029411,2.854154,103.088234,3.184223,56.714287,5.079772,...,,,,,,,,,,
5115,37.866665,19.527302,58.354839,88.032822,27.032259,2.117270,110.193550,6.712624,67.468750,6.219849,...,,,,,,,,,,


In [68]:
y

id
0       0
1       0
2       0
3       1
4       2
       ..
5112    3
5113    0
5114    0
5115    0
5116    2
Name: y, Length: 5117, dtype: int64

In [69]:
X_ = X.replace(to_replace=np.inf, value=np.nan)

# <span style ="color: lightgreen; font-weight: bold"> Class Imbalance </span>

In [70]:
# We need to deal with class imbalance somehow
value_counts = (y.value_counts() / y.shape[-1])
value_counts * 100

y
0    59.214383
2    28.805941
1     8.657416
3     3.322259
Name: count, dtype: float64

In [80]:
class_weights = (1 / value_counts).to_dict()
class_weights[2] = 30
class_weights

{0: 1.6887788778877888, 2: 30, 1: 11.550790067720092, 3: 30.099999999999998}

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.2, shuffle=True)

# <span style ="color: pink; font-weight: bold"> Model Tuning </span>

In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance

# <span style ="color: Lightgreen; font-weight: bold"> No Class Imbalance HistGradientBoosting </span>

In [76]:
import optuna 
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

def objective(trial):
    # Define the hyperparameter search space
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 10, 50),
        "l2_regularization": trial.suggest_float("l2_regularization", 1e-4, 1.0, log=True),
        "max_bins": trial.suggest_int("max_bins", 50, 255),
        "early_stopping": True,  # Early stopping is enabled by default in this classifier
        "random_state": 42,
    }
    
    # Create the model with the trial's parameters
    model = HistGradientBoostingClassifier(**params)
    
    # Perform cross-validation
    scores = cross_val_score(
        model, X_train, y_train, cv=5, scoring="f1_micro", n_jobs=-1,
    )
    return scores.mean()

# Run the optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=360)

# Best hyperparameters
print("Best hyperparameters:")
print(study.best_params)

[I 2024-11-30 14:49:58,726] A new study created in memory with name: no-name-d6093a30-7c04-412b-b530-3ab0e6ab6506
[I 2024-11-30 14:50:06,583] Trial 0 finished with value: 0.8003919742306052 and parameters: {'learning_rate': 0.018937110250783252, 'max_leaf_nodes': 50, 'max_depth': 8, 'min_samples_leaf': 28, 'l2_regularization': 0.005261985932010886, 'max_bins': 127}. Best is trial 0 with value: 0.8003919742306052.
[I 2024-11-30 14:50:11,467] Trial 1 finished with value: 0.7957509754575769 and parameters: {'learning_rate': 0.014819133021774445, 'max_leaf_nodes': 46, 'max_depth': 7, 'min_samples_leaf': 31, 'l2_regularization': 0.023148057682310035, 'max_bins': 76}. Best is trial 0 with value: 0.8003919742306052.
[I 2024-11-30 14:50:17,056] Trial 2 finished with value: 0.8025900749617133 and parameters: {'learning_rate': 0.01777143526086296, 'max_leaf_nodes': 45, 'max_depth': 6, 'min_samples_leaf': 11, 'l2_regularization': 0.00014111320706613768, 'max_bins': 80}. Best is trial 2 with value

Best hyperparameters:
{'learning_rate': 0.06152081637304603, 'max_leaf_nodes': 50, 'max_depth': 5, 'min_samples_leaf': 16, 'l2_regularization': 0.023916592075196863, 'max_bins': 125}


In [77]:
best_model_hist_no_imb = HistGradientBoostingClassifier(**study.best_params)
best_model_hist_no_imb.fit(X_train, y_train)
y_pred_best_model_hist_no_imb = best_model_hist_no_imb.predict(X_test)
print("\n\nAccuracy:{:,.2f}%".format(accuracy_score(y_test, y_pred_best_model_hist_no_imb)*100))
print("Precision:{:,.2f}%".format(precision_score(y_test, y_pred_best_model_hist_no_imb, average="micro")*100))
print("Recall:{:,.2f}%".format(recall_score(y_test, y_pred_best_model_hist_no_imb, average="micro")*100))
print("F1-Score:{:,.2f}%".format(f1_score(y_test, y_pred_best_model_hist_no_imb, average="micro")*100))
print(classification_report(y_test, y_pred_best_model_hist_no_imb))



Accuracy:81.15%
Precision:81.15%
Recall:81.15%
F1-Score:81.15%
              precision    recall  f1-score   support

           0       0.85      0.92      0.88       611
           1       0.82      0.74      0.78        90
           2       0.72      0.64      0.68       286
           3       0.71      0.54      0.62        37

    accuracy                           0.81      1024
   macro avg       0.78      0.71      0.74      1024
weighted avg       0.81      0.81      0.81      1024



In [113]:
y_pred_best_model_hist_no_imb_train = best_model_hist_no_imb.predict(X_train)

# <span style ="color: Lightgreen; font-weight: bold"> Class Imbalance HistGradientBoosting </span>

In [81]:
import optuna 
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler

def objective(trial):
    # Define the hyperparameter search space
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 10, 50),
        "l2_regularization": trial.suggest_float("l2_regularization", 1e-4, 1.0, log=True),
        "max_bins": trial.suggest_int("max_bins", 50, 255),
        "early_stopping": True,  # Early stopping is enabled by default in this classifier
        "class_weight": class_weights,
        "random_state": 42,
    }
    
    # Create the model with the trial's parameters
    model = HistGradientBoostingClassifier(**params)
    
    # Perform Bootstrapping
    X_train_, y_train_ = RandomOverSampler(sampling_strategy='not majority').fit_resample(X_train, y_train)
    
    # Perform cross-validation
    scores = cross_val_score(
        model, X_train_, y_train_, cv=5, scoring="f1_micro", n_jobs=-1,
    )
    return scores.mean()

# Run the optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=360)

# Best hyperparameters
print("Best hyperparameters:")
print(study.best_params)

[I 2024-11-30 14:58:06,220] A new study created in memory with name: no-name-1417da1e-2321-49e5-b78a-8072876e2e33
[I 2024-11-30 14:58:08,953] Trial 0 finished with value: 0.8886939692912209 and parameters: {'learning_rate': 0.19163494552375182, 'max_leaf_nodes': 48, 'max_depth': 3, 'min_samples_leaf': 11, 'l2_regularization': 0.04551758515722914, 'max_bins': 85}. Best is trial 0 with value: 0.8886939692912209.
[I 2024-11-30 14:58:13,721] Trial 1 finished with value: 0.8854912230123647 and parameters: {'learning_rate': 0.21400913566637722, 'max_leaf_nodes': 11, 'max_depth': 10, 'min_samples_leaf': 13, 'l2_regularization': 0.00011857554787115282, 'max_bins': 253}. Best is trial 0 with value: 0.8886939692912209.
[I 2024-11-30 14:58:16,945] Trial 2 finished with value: 0.9169095820787152 and parameters: {'learning_rate': 0.19630123696194213, 'max_leaf_nodes': 47, 'max_depth': 9, 'min_samples_leaf': 11, 'l2_regularization': 0.09907840430154884, 'max_bins': 69}. Best is trial 2 with value: 0

Best hyperparameters:
{'learning_rate': 0.30660104698319857, 'max_leaf_nodes': 47, 'max_depth': 10, 'min_samples_leaf': 35, 'l2_regularization': 0.16596445704862006, 'max_bins': 72}


In [82]:
X_train.shape, X_test.shape

((4093, 211), (1024, 211))

In [83]:
best_model_hist_w_imb = HistGradientBoostingClassifier(**study.best_params, class_weight=class_weights)
X_train_, y_train_ = RandomOverSampler(sampling_strategy='not majority').fit_resample(X_train, y_train)
best_model_hist_w_imb.fit(X_train_, y_train_)
y_pred_hist_w_imb = best_model_hist_w_imb.predict(X_test)
print("\n\nAccuracy:{:,.2f}%".format(accuracy_score(y_test, y_pred_hist_w_imb)*100))
print("Precision:{:,.2f}%".format(precision_score(y_test, y_pred_hist_w_imb, average="micro")*100))
print("Recall:{:,.2f}%".format(recall_score(y_test, y_pred_hist_w_imb, average="micro")*100))
print("F1-Score:{:,.2f}%".format(f1_score(y_test, y_pred_hist_w_imb, average="micro")*100))
print(classification_report(y_test, y_pred_hist_w_imb))



Accuracy:80.57%
Precision:80.57%
Recall:80.57%
F1-Score:80.57%
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       611
           1       0.80      0.72      0.76        90
           2       0.69      0.72      0.71       286
           3       0.63      0.70      0.67        37

    accuracy                           0.81      1024
   macro avg       0.75      0.75      0.75      1024
weighted avg       0.81      0.81      0.81      1024



In [114]:
y_pred_hist_w_imb_train = best_model_hist_w_imb.predict(X_train)

# <span style ="color: Lightgreen; font-weight: bold"> No Class Imbalance LightGBM </span>

In [90]:
from lightgbm import LGBMClassifier

def objective_lgbm(trial):
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "n_estimators": 1000,
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1.0, log=True),
        "random_state": 42,
        'verbose': -1,
        "boosting_type": "gbdt", 
    }
    model = LGBMClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1_micro", n_jobs=-1)
    return scores.mean()

study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(objective_lgbm, n_trials=100, timeout=360)

[I 2024-11-30 15:20:59,247] A new study created in memory with name: no-name-3b068d74-49a5-429e-9b09-69ee0a91400c
[I 2024-11-30 15:21:26,802] Trial 0 finished with value: 0.8167560773917742 and parameters: {'num_leaves': 147, 'max_depth': 7, 'learning_rate': 0.3112342365788408, 'min_child_samples': 40, 'reg_alpha': 0.0018778758185213993, 'reg_lambda': 0.11752878259779626}. Best is trial 0 with value: 0.8167560773917742.
[I 2024-11-30 15:23:29,455] Trial 1 finished with value: 0.8160243722590911 and parameters: {'num_leaves': 23, 'max_depth': 13, 'learning_rate': 0.015722031331468236, 'min_child_samples': 21, 'reg_alpha': 0.02786619428607257, 'reg_lambda': 0.00022841341718033364}. Best is trial 0 with value: 0.8167560773917742.
[I 2024-11-30 15:26:48,878] Trial 2 finished with value: 0.8145600663938073 and parameters: {'num_leaves': 103, 'max_depth': 12, 'learning_rate': 0.011485838252489977, 'min_child_samples': 25, 'reg_alpha': 0.0008147138948980087, 'reg_lambda': 0.24339220392801844}

In [91]:
best_model_lgbm_no_imb = LGBMClassifier(verbose=-1, **study_lgbm.best_params)
best_model_lgbm_no_imb.fit(X_train, y_train)

y_pred_lgbm_no_imb = best_model_lgbm_no_imb.predict(X_test)
print(confusion_matrix(y_test, y_pred_lgbm_no_imb))
print("\n\nAccuracy:{:,.2f}%".format(accuracy_score(y_test, y_pred_lgbm_no_imb)*100))
print("Precision:{:,.2f}%".format(precision_score(y_test, y_pred_lgbm_no_imb, average="micro")*100))
print("Recall:{:,.2f}%".format(recall_score(y_test, y_pred_lgbm_no_imb, average="micro")*100))
print("F1-Score:{:,.2f}%".format(f1_score(y_test, y_pred_lgbm_no_imb, average="micro")*100))
print(classification_report(y_test, y_pred_lgbm_no_imb))

[[567   2  38   4]
 [  3  66  20   1]
 [ 88   8 186   4]
 [  7   5   6  19]]


Accuracy:81.84%
Precision:81.84%
Recall:81.84%
F1-Score:81.84%
              precision    recall  f1-score   support

           0       0.85      0.93      0.89       611
           1       0.81      0.73      0.77        90
           2       0.74      0.65      0.69       286
           3       0.68      0.51      0.58        37

    accuracy                           0.82      1024
   macro avg       0.77      0.71      0.73      1024
weighted avg       0.81      0.82      0.81      1024



In [115]:
y_pred_lgbm_no_imb_train = best_model_lgbm_no_imb.predict(X_train)

# <span style ="color: Lightgreen; font-weight: bold"> Class Imbalance LightGBM </span>

In [92]:
from lightgbm import LGBMClassifier

def objective_lgbm(trial):
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "n_estimators": 1000,
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1.0, log=True),
        "random_state": 42,
        'verbose': -1,
        "boosting_type": "gbdt", 
        "class_weight": class_weights
    }
    X_train_, y_train_ = RandomOverSampler(sampling_strategy='not majority').fit_resample(X_train, y_train)
    model = LGBMClassifier(**params)
    scores = cross_val_score(model, X_train_, y_train_, cv=5, scoring="f1_micro", n_jobs=-1)
    return scores.mean()

study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(objective_lgbm, n_trials=100, timeout=360)

[I 2024-11-30 15:31:21,642] A new study created in memory with name: no-name-cc8c004e-88d6-4983-9c93-869d0638894b
[I 2024-11-30 15:32:08,241] Trial 0 finished with value: 0.9508071732120357 and parameters: {'num_leaves': 44, 'max_depth': 5, 'learning_rate': 0.1571417496408064, 'min_child_samples': 16, 'reg_alpha': 0.05581992611846067, 'reg_lambda': 0.00399244312370178}. Best is trial 0 with value: 0.9508071732120357.
[I 2024-11-30 15:34:36,224] Trial 1 finished with value: 0.9485333781792556 and parameters: {'num_leaves': 125, 'max_depth': 13, 'learning_rate': 0.02892945061886817, 'min_child_samples': 50, 'reg_alpha': 0.3605506296669571, 'reg_lambda': 0.0018066900754530016}. Best is trial 0 with value: 0.9508071732120357.
[I 2024-11-30 15:36:49,347] Trial 2 finished with value: 0.9508077604800649 and parameters: {'num_leaves': 133, 'max_depth': 5, 'learning_rate': 0.05283139506205304, 'min_child_samples': 35, 'reg_alpha': 0.0008256413658411637, 'reg_lambda': 0.7426942201044413}. Best i

In [93]:
best_model_lgbm_imb = LGBMClassifier(verbose=-1, **study_lgbm.best_params)
X_train_, y_train_ = RandomOverSampler(sampling_strategy='not majority').fit_resample(X_train, y_train)
best_model_lgbm_imb.fit(X_train_, y_train_)

y_pred_lgbm_imb = best_model_lgbm_imb.predict(X_test)
print(confusion_matrix(y_test, y_pred_lgbm_imb))
print("\n\nAccuracy:{:,.2f}%".format(accuracy_score(y_test, y_pred_lgbm_imb)*100))
print("Precision:{:,.2f}%".format(precision_score(y_test, y_pred_lgbm_imb, average="micro")*100))
print("Recall:{:,.2f}%".format(recall_score(y_test, y_pred_lgbm_imb, average="micro")*100))
print("F1-Score:{:,.2f}%".format(f1_score(y_test, y_pred_lgbm_imb, average="micro")*100))
print(classification_report(y_test, y_pred_lgbm_imb))

[[558   3  41   9]
 [  2  64  18   6]
 [ 80  12 187   7]
 [  4   1   5  27]]


Accuracy:81.64%
Precision:81.64%
Recall:81.64%
F1-Score:81.64%
              precision    recall  f1-score   support

           0       0.87      0.91      0.89       611
           1       0.80      0.71      0.75        90
           2       0.75      0.65      0.70       286
           3       0.55      0.73      0.63        37

    accuracy                           0.82      1024
   macro avg       0.74      0.75      0.74      1024
weighted avg       0.82      0.82      0.81      1024



In [116]:
y_pred_lgbm_imb_train = best_model_lgbm_imb.predict(X_train)

# <span style ="color: Lightgreen; font-weight: bold"> No Class Imbalance Xgboost </span>

In [95]:
from xgboost import XGBClassifier

def objective_xgb(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "n_estimators": 1000,
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "random_state": 42,
        "eval_metric": "logloss"
    }
    model = XGBClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy", n_jobs=-1)
    return scores.mean()

study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=100, timeout=360)

[I 2024-11-30 15:43:09,809] A new study created in memory with name: no-name-0c6197eb-a48b-4794-947e-2c50827ddb1d
[I 2024-11-30 15:43:38,161] Trial 0 finished with value: 0.8177367593015517 and parameters: {'max_depth': 12, 'learning_rate': 0.06325863157028469, 'gamma': 1.0508475247094695, 'min_child_weight': 6.595240426678172, 'subsample': 0.7550270475014992, 'colsample_bytree': 0.8647617979582115}. Best is trial 0 with value: 0.8177367593015517.
[I 2024-11-30 15:43:48,860] Trial 1 finished with value: 0.7974576903672259 and parameters: {'max_depth': 13, 'learning_rate': 0.4900434766663554, 'gamma': 1.3544195382012387, 'min_child_weight': 8.809306097485582, 'subsample': 0.6513829385795483, 'colsample_bytree': 0.562240063415747}. Best is trial 0 with value: 0.8177367593015517.
[I 2024-11-30 15:44:19,142] Trial 2 finished with value: 0.8126007923073939 and parameters: {'max_depth': 14, 'learning_rate': 0.13501895822455273, 'gamma': 0.31096833876964824, 'min_child_weight': 3.636365582066

In [96]:
best_model_xgb_imb = XGBClassifier(**study_xgb.best_params)
best_model_xgb_imb.fit(X_train, y_train)

y_pred_xgb_imb = best_model_xgb_imb.predict(X_test)
print(confusion_matrix(y_test, y_pred_xgb_imb))
print("\n\nAccuracy:{:,.2f}%".format(accuracy_score(y_test, y_pred_xgb_imb)*100))
print("Precision:{:,.2f}%".format(precision_score(y_test, y_pred_xgb_imb, average="micro")*100))
print("Recall:{:,.2f}%".format(recall_score(y_test, y_pred_xgb_imb, average="micro")*100))
print("F1-Score:{:,.2f}%".format(f1_score(y_test, y_pred_xgb_imb, average="micro")*100))
print(classification_report(y_test, y_pred_xgb_imb))

[[570   3  34   4]
 [  3  65  20   2]
 [ 92   9 181   4]
 [  6   3   5  23]]


Accuracy:81.93%
Precision:81.93%
Recall:81.93%
F1-Score:81.93%
              precision    recall  f1-score   support

           0       0.85      0.93      0.89       611
           1       0.81      0.72      0.76        90
           2       0.75      0.63      0.69       286
           3       0.70      0.62      0.66        37

    accuracy                           0.82      1024
   macro avg       0.78      0.73      0.75      1024
weighted avg       0.81      0.82      0.81      1024



In [117]:
y_pred_xgb_imb_train = best_model_xgb_imb.predict(X_train)

# <span style ="color: Lightgreen; font-weight: bold"> Class Imbalance Xgboost </span>

In [107]:
from xgboost import XGBClassifier

def objective_xgb(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "n_estimators": 1000,
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "random_state": 42,
        "eval_metric": "logloss", 
    }
    X_train_, y_train_ = RandomOverSampler(sampling_strategy='not majority').fit_resample(X_train, y_train)
    sample_weights = np.array([class_weights[label] for label in y_train_])
    model = XGBClassifier(**params)
    scores = cross_val_score(model, X_train_, y_train_, cv=5, scoring="accuracy", n_jobs=-1, fit_params={'sample_weight':sample_weights})
    return scores.mean()

study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=100, timeout=360)

[I 2024-11-30 15:59:25,767] A new study created in memory with name: no-name-1522f3eb-f54f-4b57-985a-e24938150ca6
[I 2024-11-30 16:02:04,739] Trial 0 finished with value: 0.9327215602109895 and parameters: {'max_depth': 13, 'learning_rate': 0.025551733771846767, 'gamma': 2.2061414288074137, 'min_child_weight': 9.985466540489178, 'subsample': 0.9936121251803753, 'colsample_bytree': 0.8368257773082239}. Best is trial 0 with value: 0.9327215602109895.
[I 2024-11-30 16:03:30,677] Trial 1 finished with value: 0.9333417152497491 and parameters: {'max_depth': 10, 'learning_rate': 0.06138205302717386, 'gamma': 1.5190144084254582, 'min_child_weight': 2.3976919163508428, 'subsample': 0.8898572428793571, 'colsample_bytree': 0.7367884769017841}. Best is trial 1 with value: 0.9333417152497491.
[I 2024-11-30 16:05:48,778] Trial 2 finished with value: 0.9254867384201422 and parameters: {'max_depth': 10, 'learning_rate': 0.02283837363005125, 'gamma': 3.9595444590270894, 'min_child_weight': 2.578115379

In [108]:
best_model_xgb_w_imb = XGBClassifier(**study_xgb.best_params)
X_train_, y_train_ = RandomOverSampler(sampling_strategy='not majority').fit_resample(X_train, y_train)
sample_weights = np.array([class_weights[label] for label in y_train_])
best_model_xgb_w_imb.fit(X_train_, y_train_)

y_pred_xgb_w_imb = best_model_xgb_w_imb.predict(X_test)
print(confusion_matrix(y_test, y_pred_xgb_w_imb))
print("\n\nAccuracy:{:,.2f}%".format(accuracy_score(y_test, y_pred_xgb_w_imb)*100))
print("Precision:{:,.2f}%".format(precision_score(y_test, y_pred_xgb_w_imb, average="micro")*100))
print("Recall:{:,.2f}%".format(recall_score(y_test, y_pred_xgb_w_imb, average="micro")*100))
print("F1-Score:{:,.2f}%".format(f1_score(y_test, y_pred_xgb_w_imb, average="micro")*100))
print(classification_report(y_test, y_pred_xgb_w_imb))

[[556   4  41  10]
 [  2  68  15   5]
 [ 82  12 180  12]
 [  4   2   3  28]]


Accuracy:81.25%
Precision:81.25%
Recall:81.25%
F1-Score:81.25%
              precision    recall  f1-score   support

           0       0.86      0.91      0.89       611
           1       0.79      0.76      0.77        90
           2       0.75      0.63      0.69       286
           3       0.51      0.76      0.61        37

    accuracy                           0.81      1024
   macro avg       0.73      0.76      0.74      1024
weighted avg       0.81      0.81      0.81      1024



In [118]:
y_pred_xgb_w_imb_train = best_model_xgb_w_imb.predict(X_train)

# <span style ="color: yellow; font-weight: bold"> Ensemble -- Stacking </span>

In [128]:
base_predictions_train = np.column_stack(
    [
        y_pred_best_model_hist_no_imb_train,
        y_pred_hist_w_imb_train, 
        y_pred_lgbm_no_imb_train, 
        y_pred_lgbm_imb_train, 
        y_pred_xgb_imb_train, 
        y_pred_xgb_w_imb_train
    ]
)

base_predictions_test = np.column_stack(
    [
        y_pred_best_model_hist_no_imb, 
        y_pred_hist_w_imb, 
        y_pred_lgbm_no_imb, 
        y_pred_lgbm_imb, 
        y_pred_xgb_imb, 
        y_pred_xgb_w_imb
    ]
)

stacking_set_train, stacking_set_test, stacking_y_train, stacking_y_test = train_test_split(base_predictions_train, y_train, test_size=0.1, shuffle=True)

In [130]:
from sklearn.ensemble import ExtraTreesClassifier

def objective_et(trial):
    params = {
        "n_estimators": 2000,
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "max_features": trial.suggest_float("max_features", 0.1, 1.0),
        "random_state": 42,
    }
    model = ExtraTreesClassifier(**params)
    scores = cross_val_score(model, stacking_set_train, stacking_y_train, cv=5, scoring="f1_micro", n_jobs=-1)
    return scores.mean()

study_et = optuna.create_study(direction="maximize")
study_et.optimize(objective_et, n_trials=10, timeout=360)

[I 2024-11-30 16:24:58,224] A new study created in memory with name: no-name-07202186-a36b-4c30-b140-6bf87c43ffeb
[I 2024-11-30 16:25:01,136] Trial 0 finished with value: 1.0 and parameters: {'max_depth': 4, 'min_samples_split': 15, 'min_samples_leaf': 9, 'max_features': 0.4452559722403103}. Best is trial 0 with value: 1.0.
[I 2024-11-30 16:25:03,675] Trial 1 finished with value: 1.0 and parameters: {'max_depth': 7, 'min_samples_split': 19, 'min_samples_leaf': 12, 'max_features': 0.7170681174927501}. Best is trial 0 with value: 1.0.
[I 2024-11-30 16:25:06,131] Trial 2 finished with value: 1.0 and parameters: {'max_depth': 13, 'min_samples_split': 7, 'min_samples_leaf': 14, 'max_features': 0.71251530870194}. Best is trial 0 with value: 1.0.
[I 2024-11-30 16:25:08,729] Trial 3 finished with value: 1.0 and parameters: {'max_depth': 4, 'min_samples_split': 8, 'min_samples_leaf': 7, 'max_features': 0.7814009786889513}. Best is trial 0 with value: 1.0.
[I 2024-11-30 16:25:10,301] Trial 4 fin

In [133]:
best_et = ExtraTreesClassifier(n_estimators=2000, **study_et.best_params)
best_et = best_et.fit(base_predictions_train, y_train)
y_et_pred = best_et.predict(base_predictions_test)
print(f1_score(y_test, y_et_pred, average='micro'))
print(classification_report(y_test, y_et_pred))

0.8203125
              precision    recall  f1-score   support

           0       0.86      0.91      0.89       611
           1       0.81      0.73      0.77        90
           2       0.74      0.67      0.70       286
           3       0.65      0.65      0.65        37

    accuracy                           0.82      1024
   macro avg       0.77      0.74      0.75      1024
weighted avg       0.82      0.82      0.82      1024



# <span style ="color: orange; font-weight: bold"> Feature Selection </span>

In [84]:
perm_importance = permutation_importance(best_model_hist_w_imb, X_test, y_test, scoring='f1_micro', n_repeats=30)

for i in perm_importance.importances_mean.argsort()[::-1]:
    print(f"Feature {i}: Importance = {perm_importance.importances_mean[i]:.4f} ± {perm_importance.importances_std[i]:.4f}")

Feature 16: Importance = 0.0431 ± 0.0062
Feature 21: Importance = 0.0322 ± 0.0052
Feature 1: Importance = 0.0162 ± 0.0057
Feature 125: Importance = 0.0141 ± 0.0037
Feature 0: Importance = 0.0129 ± 0.0043
Feature 44: Importance = 0.0114 ± 0.0032
Feature 141: Importance = 0.0072 ± 0.0047
Feature 35: Importance = 0.0069 ± 0.0039
Feature 56: Importance = 0.0067 ± 0.0022
Feature 42: Importance = 0.0056 ± 0.0031
Feature 11: Importance = 0.0056 ± 0.0036
Feature 8: Importance = 0.0055 ± 0.0041
Feature 2: Importance = 0.0051 ± 0.0026
Feature 134: Importance = 0.0050 ± 0.0023
Feature 23: Importance = 0.0050 ± 0.0032
Feature 22: Importance = 0.0050 ± 0.0024
Feature 47: Importance = 0.0046 ± 0.0016
Feature 48: Importance = 0.0044 ± 0.0029
Feature 3: Importance = 0.0043 ± 0.0019
Feature 36: Importance = 0.0040 ± 0.0038
Feature 37: Importance = 0.0038 ± 0.0021
Feature 144: Importance = 0.0036 ± 0.0026
Feature 174: Importance = 0.0035 ± 0.0017
Feature 189: Importance = 0.0033 ± 0.0025
Feature 149: Im

In [85]:
(perm_importance.importances_mean + perm_importance.importances_std >= 0.001).sum()

np.int64(132)

In [86]:
cutoff = perm_importance.importances_mean + perm_importance.importances_std >= 0.001
X_important_features = X_.loc[:, cutoff]
X_important_features

Unnamed: 0_level_0,PRinterm,PRinterstd,PRsegm,PRsegstd,QRSmean,QRSstd,QTinterm,QTinterstd,STsegm,QSsegm,...,HRV_MFDFA_alpha1_Increment,HRV_ApEn,HRV_ShanEn,HRV_FuzzyEn,HRV_MSEn,HRV_CMSEn,HRV_RCMSEn,HRV_CD,HRV_KFD,HRV_LZC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,40.500000,19.447044,33.234375,46.176201,24.156250,2.985479,108.031250,5.177985,66.446152,13.390625,...,0.019108,0.503064,4.153968,0.385152,0.215587,0.378403,0.351607,0.791140,1.192584,0.457909
1,39.529411,19.336395,32.787880,40.742176,30.000000,3.880570,91.794121,18.339573,62.085712,16.794117,...,0.156955,0.324642,4.614997,1.160175,0.000000,0.000000,0.000000,1.554685,2.183989,0.879306
2,37.241379,5.028575,18.344828,4.137931,26.206896,1.185525,114.758621,23.717672,73.733330,14.620689,...,0.003791,0.726988,4.215061,0.877344,0.000000,0.000000,0.000000,1.529594,1.327522,0.981378
3,24.369230,9.272903,28.169231,39.208542,29.888889,16.502151,62.044445,15.884241,21.777779,16.235294,...,0.979570,0.597544,4.738070,1.222190,1.374677,1.226913,1.200117,1.392254,2.997132,1.028321
4,36.333332,4.979960,18.000000,5.781196,25.533333,1.654623,82.511108,13.590229,44.413044,14.266666,...,0.073113,0.346309,4.533644,0.587937,0.240248,0.290699,0.292302,1.182244,1.498349,0.610206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5112,63.047619,23.304356,60.809525,55.594296,27.285715,4.072473,91.190475,29.805111,52.181820,16.863636,...,3.191356,0.183728,4.262692,1.042608,,,,0.906263,2.703842,1.376736
5113,29.299999,16.087574,61.344826,98.975189,27.333334,4.093355,113.133331,23.779169,69.806450,14.266666,...,0.041655,0.104728,3.252426,1.140114,,,,2.149990,1.757852,1.278502
5114,34.294117,5.102072,16.794117,4.927590,28.029411,2.854154,103.088234,3.184223,56.714287,14.147058,...,0.180135,0.296399,4.843569,0.674839,0.000000,0.000000,0.000000,1.442123,1.426302,0.439653
5115,37.866665,19.527302,58.354839,88.032822,27.032259,2.117270,110.193550,6.712624,67.468750,15.741936,...,0.000950,0.146087,4.090018,1.426594,,,,1.776100,3.111096,1.093750


In [None]:
# from sklearn.impute import KNNImputer

# imputer = KNNImputer(n_neighbors=5, weights='distance')
# X_important_features_imputed = imputer.fit_transform(X_important_features)
# X_important_features_imputed = pd.DataFrame(X_important_features_imputed, columns=X_important_features.columns)

In [None]:
# def filter_multicolinearity(X_train, sigma): 
#     corr_matrix = np.triu(np.corrcoef(X_train, rowvar=False))
#     np.fill_diagonal(corr_matrix, val=0)
#     mask_multicorr = (corr_matrix < sigma).all(axis=1)
#     return mask_multicorr

In [None]:
# mask_multicorr = filter_multicolinearity(X_important_features_imputed, 0.95)
# cols = ~mask_multicorr
# X_important_features_imputed.columns[cols].to_list()

['RRmean',
 'mean_nni',
 'range_nni',
 'std_hr',
 'ratio_sd2_sd1',
 'fft_total',
 'lomb_ratio',
 'lomb_total',
 'fft_rel2',
 'lomb_abs1',
 'HRV_MeanNN',
 'HRV_SDNN',
 'HRV_RMSSD',
 'HRV_SDSD',
 'HRV_CVNN',
 'HRV_CVSD',
 'HRV_MadNN',
 'HRV_SDRMSSD',
 'HRV_SD2',
 'HRV_PIP',
 'HRV_GI',
 'HRV_SD1d',
 'HRV_SD1a',
 'HRV_SD2d',
 'HRV_SD2a',
 'HRV_SDNNd',
 'HRV_MFDFA_alpha1_Fluctuation',
 'HRV_CMSEn']

In [None]:
# X_important_features_imputed_filtered = X_important_features_imputed.loc[:, mask_multicorr]

In [None]:
# X_important_features_filtered = X_important_features.loc[:, mask_multicorr]

In [None]:
# X_important_features_filtered.isna().sum().sum()

np.int64(12905)

# <span style ="color: skyblue; font-weight: bold"> Model Tuning 2 </span>

In [87]:
X_train_imp, X_test_imp, y_train_imp, y_test_imp = train_test_split(X_important_features, y, test_size=0.2, shuffle=True)
# over_sampler = RandomOverSampler()
# X_train, y_train = over_sampler.fit_resample(X_train, y_train)

In [88]:
import optuna 
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

def objective(trial):
    # Define the hyperparameter search space
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 10, 50),
        "l2_regularization": trial.suggest_float("l2_regularization", 1e-4, 1.0, log=True),
        "max_bins": trial.suggest_int("max_bins", 50, 255),
        "early_stopping": True,  # Early stopping is enabled by default in this classifier
        "class_weight": class_weights,
        "random_state": 42,
    }
    
    # Create the model with the trial's parameters
    model = HistGradientBoostingClassifier(**params)
    
    # Oversample
    X_train_imp_, y_train_imp_ = RandomOverSampler(sampling_strategy='not majority').fit_resample(X_train_imp, y_train_imp)
    
    # Perform cross-validation
    scores = cross_val_score(
        model, X_train_imp_, y_train_imp_, cv=5, scoring="f1_micro", n_jobs=-1,
    )
    return scores.mean()

# Run the optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=360)

# Best hyperparameters
print("Best hyperparameters:")
print(study.best_params)

[I 2024-11-30 15:11:27,127] A new study created in memory with name: no-name-711abcb7-ff55-4441-932c-7a1e1fd26350
[I 2024-11-30 15:11:30,440] Trial 0 finished with value: 0.9290854152553354 and parameters: {'learning_rate': 0.3204007910698376, 'max_leaf_nodes': 50, 'max_depth': 8, 'min_samples_leaf': 47, 'l2_regularization': 0.040528147782842174, 'max_bins': 81}. Best is trial 0 with value: 0.9290854152553354.
[I 2024-11-30 15:11:36,352] Trial 1 finished with value: 0.8976813255468239 and parameters: {'learning_rate': 0.041101994081603206, 'max_leaf_nodes': 28, 'max_depth': 8, 'min_samples_leaf': 43, 'l2_regularization': 0.8924333971753383, 'max_bins': 104}. Best is trial 0 with value: 0.9290854152553354.
[I 2024-11-30 15:11:38,247] Trial 2 finished with value: 0.8033690735943141 and parameters: {'learning_rate': 0.4508551802207669, 'max_leaf_nodes': 18, 'max_depth': 9, 'min_samples_leaf': 38, 'l2_regularization': 0.000640356268510833, 'max_bins': 103}. Best is trial 0 with value: 0.92

Best hyperparameters:
{'learning_rate': 0.23784167533637476, 'max_leaf_nodes': 42, 'max_depth': 10, 'min_samples_leaf': 41, 'l2_regularization': 0.00018159914444358367, 'max_bins': 240}


In [89]:
best_model_hist = HistGradientBoostingClassifier(**study.best_params, class_weight=class_weights)
X_tain_imp_, y_train_imp_ = RandomOverSampler().fit_resample(X_train_imp, y_train_imp)
best_model_hist.fit(X_tain_imp_, y_train_imp_)

y_pred = best_model_hist.predict(X_test_imp)
print(confusion_matrix(y_test_imp, y_pred))
print("\n\nAccuracy:{:,.2f}%".format(accuracy_score(y_test_imp, y_pred)*100))
print("Precision:{:,.2f}%".format(precision_score(y_test_imp, y_pred, average="micro")*100))
print("Recall:{:,.2f}%".format(recall_score(y_test_imp, y_pred, average="micro")*100))
print("F1-Score:{:,.2f}%".format(f1_score(y_test_imp, y_pred, average="micro")*100))
print(classification_report(y_test_imp, y_pred))

[[526   3  63   2]
 [  2  75  21   0]
 [ 75   9 203   6]
 [  7   2   9  21]]


Accuracy:80.57%
Precision:80.57%
Recall:80.57%
F1-Score:80.57%
              precision    recall  f1-score   support

           0       0.86      0.89      0.87       594
           1       0.84      0.77      0.80        98
           2       0.69      0.69      0.69       293
           3       0.72      0.54      0.62        39

    accuracy                           0.81      1024
   macro avg       0.78      0.72      0.75      1024
weighted avg       0.80      0.81      0.80      1024



In [232]:
from lightgbm import LGBMClassifier

def objective_lgbm(trial):
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "n_estimators": 1000,
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1.0, log=True),
        "random_state": 42,
        'verbose': -1,
        "boosting_type": "gbdt",
    }
    model = LGBMClassifier(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1_micro", n_jobs=-1)
    return scores.mean()

study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(objective_lgbm, n_trials=100, timeout=360)

[I 2024-11-28 14:12:50,232] A new study created in memory with name: no-name-a09d9840-1fa4-4c1f-ab81-03cdc1a23258
[I 2024-11-28 14:12:59,852] Trial 0 finished with value: 0.9585411759701834 and parameters: {'num_leaves': 29, 'max_depth': 14, 'learning_rate': 0.26507176841644836, 'min_child_samples': 28, 'reg_alpha': 0.24950593459045367, 'reg_lambda': 0.00014925998393184374}. Best is trial 0 with value: 0.9585411759701834.
[I 2024-11-28 14:14:26,493] Trial 1 finished with value: 0.9623565659841665 and parameters: {'num_leaves': 38, 'max_depth': 9, 'learning_rate': 0.035546559967970744, 'min_child_samples': 27, 'reg_alpha': 0.0011088353758905668, 'reg_lambda': 0.0014238702269269404}. Best is trial 1 with value: 0.9623565659841665.
[I 2024-11-28 14:15:44,095] Trial 2 finished with value: 0.9595720506372187 and parameters: {'num_leaves': 59, 'max_depth': 6, 'learning_rate': 0.012988832753313324, 'min_child_samples': 50, 'reg_alpha': 0.00038647477109247865, 'reg_lambda': 0.00530163392979447

In [235]:
best_model_lgbm = LGBMClassifier(verbose=-1, **study_lgbm.best_params)
best_model_lgbm.fit(X_train, y_train)

y_pred = best_model_lgbm.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("\n\nAccuracy:{:,.2f}%".format(accuracy_score(y_test, y_pred)*100))
print("Precision:{:,.2f}%".format(precision_score(y_test, y_pred, average="micro")*100))
print("Recall:{:,.2f}%".format(recall_score(y_test, y_pred, average="micro")*100))
print("F1-Score:{:,.2f}%".format(f1_score(y_test, y_pred, average="micro")*100))
print(classification_report(y_test, y_pred))

[[528   5  64   9]
 [  3  68  16   1]
 [ 73  25 193   6]
 [  9   3   3  18]]


Accuracy:78.81%
Precision:78.81%
Recall:78.81%
F1-Score:78.81%
              precision    recall  f1-score   support

           0       0.86      0.87      0.87       606
           1       0.67      0.77      0.72        88
           2       0.70      0.65      0.67       297
           3       0.53      0.55      0.54        33

    accuracy                           0.79      1024
   macro avg       0.69      0.71      0.70      1024
weighted avg       0.79      0.79      0.79      1024



# <span style ="color: Lime; font-weight: bold"> Submission </span>

In [146]:
data_test = pd.read_csv('test_features.csv', index_col='id')
X_test = data_test.replace(to_replace=np.inf, value=np.nan)
# best_model_hist.fit(X_, y)
# y_test = best_model_hist.predict(X_test.drop(columns=['id']))
# y_test = pd.DataFrame(y_test, index=X_test.index)
# y_test.to_csv('new_submission.csv', index_label='id')

In [147]:
X_over, y_over = RandomOverSampler(sampling_strategy='not majority').fit_resample(X_, y)
sample_weights = np.array([class_weights[label] for label in y_over])
y_pred_best_model_hist_no_imb_all = best_model_hist_no_imb.fit(X_, y).predict(X_)
y_pred_hist_w_imb_all = best_model_hist_w_imb.fit(X_over, y_over).predict(X_)
y_pred_lgbm_no_imb_all = best_model_lgbm_no_imb.fit(X_, y).predict(X_)
y_pred_lgbm_imb_all = best_model_lgbm_imb.fit(X_over, y_over).predict(X_)
y_pred_xgb_imb_all = best_model_xgb_imb.fit(X_, y).predict(X_)
y_pred_xgb_w_imb_all = best_model_xgb_w_imb.fit(X_over, y_over, sample_weight=sample_weights).predict(X_)

In [148]:
base_predictions_all = np.column_stack(
    [
        y_pred_best_model_hist_no_imb_all, 
        y_pred_hist_w_imb_all, 
        y_pred_lgbm_no_imb_all, 
        y_pred_lgbm_imb_all, 
        y_pred_xgb_imb_all, 
        y_pred_xgb_w_imb_all
    ]
)

best_et = best_et.fit(base_predictions_all, y)

In [149]:
X_over, y_over = RandomOverSampler(sampling_strategy='not majority').fit_resample(X_, y)
sample_weights = np.array([class_weights[label] for label in y_over])
y_pred_best_model_hist_no_imb_test = best_model_hist_no_imb.fit(X_, y).predict(X_test)
y_pred_hist_w_imb_test = best_model_hist_w_imb.fit(X_over, y_over).predict(X_test)
y_pred_lgbm_no_imb_test = best_model_lgbm_no_imb.fit(X_, y).predict(X_test)
y_pred_lgbm_imb_test = best_model_lgbm_imb.fit(X_over, y_over).predict(X_test)
y_pred_xgb_imb_test = best_model_xgb_imb.fit(X_, y).predict(X_test)
y_pred_xgb_w_imb_test = best_model_xgb_w_imb.fit(X_over, y_over, sample_weight=sample_weights).predict(X_test)

In [151]:
base_predictions_test = np.column_stack(
    [
        y_pred_best_model_hist_no_imb_test, 
        y_pred_hist_w_imb_test, 
        y_pred_lgbm_no_imb_test, 
        y_pred_lgbm_imb_test, 
        y_pred_xgb_imb_test, 
        y_pred_xgb_w_imb_test
    ]
)

y_pred_final = best_et.predict(base_predictions_test)
y_pred_final_csv = pd.DataFrame(y_pred_final, index=X_test.index)
y_pred_final_csv.to_csv('stacking.csv', index_label='id')

# <span style ="color: Red; font-weight: bold"> Legacy </span>

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [24]:
tree_clf1 = DecisionTreeClassifier(criterion='entropy', splitter='random').fit(X_train, y_train)

y_pred = tree_clf1.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print("\n\nAccuracy:{:,.2f}%".format(accuracy_score(y_test, y_pred)*100))
print("Precision:{:,.2f}%".format(precision_score(y_test, y_pred, average="micro")*100))
print("Recall:{:,.2f}%".format(recall_score(y_test, y_pred, average="micro")*100))
print("F1-Score:{:,.2f}%".format(f1_score(y_test, y_pred, average="micro")*100))

feature_importances = pd.DataFrame(tree_clf1.feature_importances_,
                                index = X_train.columns,
                                columns=['importance']).sort_values('importance', 
                                                                    ascending=False)

print(feature_importances)

[[500   8 115   9]
 [ 10  34   9   4]
 [ 99  21 128  12]
 [  9   1  10   7]]


Accuracy:68.55%
Precision:68.55%
Recall:68.55%
F1-Score:68.55%
                              importance
HRV_SD1a                        0.111625
HRV_pNN50                       0.062601
mean_hr                         0.025993
HRV_MedianNN                    0.025499
HRV_CD                          0.022990
...                                  ...
HRV_MFDFA_alpha1_Increment      0.002360
HRV_MFDFA_alpha1_Fluctuation    0.001934
HRV_MFDFA_alpha1_Asymmetry      0.001790
HRV_MFDFA_alpha2_Asymmetry      0.001091
HRV_AI                          0.000671

[105 rows x 1 columns]


In [25]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(X_train, y_train)
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X_train)


y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print("\n\nAccuracy:{:,.2f}%".format(accuracy_score(y_test, y_pred)*100))
print("Precision:{:,.2f}%".format(precision_score(y_test, y_pred, average="micro")*100))
print("Recall:{:,.2f}%".format(recall_score(y_test, y_pred, average="micro")*100))
print("F1-Score:{:,.2f}%".format(f1_score(y_test, y_pred, average="micro")*100))

feature_importances = pd.DataFrame(tree_clf1.feature_importances_,
                                index = X_train.columns,
                                columns=['importance']).sort_values('importance', 
                                                                    ascending=False)

csum = feature_importances.cumsum()
relevant_features = list(csum[csum['importance'] <= 0.9].index)

[[589   3  40   0]
 [  3  39  15   0]
 [108   9 142   1]
 [  7   3   9   8]]


Accuracy:79.71%
Precision:79.71%
Recall:79.71%
F1-Score:79.71%




In [26]:
X_selected_ftr = X_filtered_multicorr[relevant_features]


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_selected_ftr, y, test_size=0.2, shuffle=True)

In [28]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_estimators=2000, n_jobs=-1)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001865 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18777
[LightGBM] [Info] Number of data points in the train set: 3900, number of used features: 75
[LightGBM] [Info] Start training from score -0.478449
[LightGBM] [Info] Start training from score -2.648331
[LightGBM] [Info] Start training from score -1.276635
[LightGBM] [Info] Start training from score -3.489608


In [29]:
f1_score(y_test, y_pred, average='micro')

np.float64(0.8165983606557377)

<span style ="color: yellow; font-weight: bold; font-size: 20px">GPC</span>

In [30]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import *
from scipy.optimize import minimize
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
import tqdm

In [58]:
from gpytorch.models import ExactGP
from gpytorch.likelihoods import DirichletClassificationLikelihood
from gpytorch.means import ConstantMean
from gpytorch.kernels import ScaleKernel, RBFKernel
import torch
import math
import torch
import numpy as np
import gpytorch
from matplotlib import pyplot as plt
from torch.optim import LBFGS
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
output_device = torch.device('cuda:0')
X_tr , y_tr = torch.from_numpy(X_train.values).to(output_device), torch.from_numpy(y_train.values).to(output_device)

n_devices = torch.cuda.device_count()
print('Planning to run on {} GPUs.'.format(n_devices))


In [None]:
# We will use the simplest form of GP model, exact inference
class DirichletGPModel(ExactGP):
    def __init__(self, train_x, train_y, likelihood, num_classes, n_devices):
        super(DirichletGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = ConstantMean(batch_shape=torch.Size((num_classes,)))
        base_covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

        self.covar_module = gpytorch.kernels.MultiDeviceKernel(
            base_covar_module, device_ids=range(n_devices),
            output_device=output_device
        )

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


# initialize likelihood and model
# we let the DirichletClassificationLikelihood compute the targets for us
likelihood = DirichletClassificationLikelihood(y_tr, learn_additional_noise=True).to(output_device)
model = DirichletGPModel(X_tr, likelihood.transformed_targets, likelihood, num_classes=4 ).to(output_device)

In [None]:
# this is for running the notebook in our testing framework
import os
smoke_test = ('CI' in os.environ)
training_iter = 2 if smoke_test else 1000


# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # Includes GaussianLikelihood parameters

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

for i in range(training_iter):
    # Zero gradients from previous iteration
    optimizer.zero_grad()
    # Output from model
    output = model(X_tr)
    # Calc loss and backprop gradients
    loss = -mll(output, likelihood.transformed_targets).sum()
    loss.backward()
    if i % 10 == 0:
        print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (
            i + 1, training_iter, loss.item(),
            model.covar_module.base_kernel.lengthscale.mean().item(),
            model.likelihood.second_noise_covar.noise.mean().item()
        ))
    optimizer.step()

NotImplementedError: The operator 'aten::linalg_qr.out' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

In [None]:
from xgboost import XGBClassifier

xgboost_classification_model = XGBClassifier(n_estimators=2000,n_jobs=-1)
xgboost_classification_model.fit(X_train, y_train, verbose=0)

In [None]:
y_pred = xgboost_classification_model.predict(X_test)

In [None]:
f1_score(y_test, y_pred, average='micro')

np.float64(0.8203125)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

hist_classifier = HistGradientBoostingClassifier()
hist_classifier.fit(X_train, y_train)
y_pred = hist_classifier.predict(X_test)
f1_score(y_test, y_pred, average='micro')

np.float64(0.8349609375)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_filtered_multicorr, y, test_size=0.2, shuffle=True)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer

adaboost = AdaBoostClassifier(n_estimators=2000, algorithm='SAMME')
adaboost_pipeline = Pipeline(
    [('impute', KNNImputer(n_neighbors=5)), 
    ('adaboost', adaboost)]
)
adaboost_pipeline.fit(X_train, y_train)

In [None]:
y_pred = adaboost_pipeline.predict(X_test)
f1_score(y_test, y_pred, average='micro')

np.float64(0.7879537953795379)

In [None]:
from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(estimators=[
    ('xgb_class', xgboost_classification_model), 
    ('hist_class', hist_classifier),
    ('lgbm_class', lgbm)
], voting='soft', weights=[0.33, 0.33, 0.33])

voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
f1_score(y_test, y_pred, average='micro')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25162
[LightGBM] [Info] Number of data points in the train set: 4093, number of used features: 113
[LightGBM] [Info] Start training from score -0.521799
[LightGBM] [Info] Start training from score -2.461962
[LightGBM] [Info] Start training from score -1.247159
[LightGBM] [Info] Start training from score -3.382560


np.float64(0.8310546875)

In [126]:
data_test = pd.read_csv('test_features.csv', header=0, index_col='id')

In [127]:
X_test = data_test 
X_test.replace(to_replace=np.inf, value=np.nan, inplace=True)

In [128]:
X_test.columns

Index(['PRinterm', 'PRinterstd', 'PRsegm', 'PRsegstd', 'QRSmean', 'QRSstd',
       'QTinterm', 'QTinterstd', 'STsegm', 'STsegstd',
       ...
       'HRV_LZC', 'HRV_DFA_alpha2', 'HRV_MFDFA_alpha2_Width',
       'HRV_MFDFA_alpha2_Peak', 'HRV_MFDFA_alpha2_Mean',
       'HRV_MFDFA_alpha2_Max', 'HRV_MFDFA_alpha2_Delta',
       'HRV_MFDFA_alpha2_Asymmetry', 'HRV_MFDFA_alpha2_Fluctuation',
       'HRV_MFDFA_alpha2_Increment'],
      dtype='object', length=140)

In [129]:
X_test = X_test.loc[:, mask_multicorr]

In [None]:
X_filtered_multicorr, y = RandomOverSampler().fit_resample(X_filtered_multicorr, y)

In [279]:
voting.fit(X_filtered_multicorr, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25393
[LightGBM] [Info] Number of data points in the train set: 12120, number of used features: 113
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294


In [281]:
y_test_pred = voting.predict(X_test)

In [282]:
sample_submission = pd.DataFrame(y_test_pred, index=data_test.index, columns=['y'])

In [283]:
sample_submission.to_csv('new_submission.csv')