In [156]:
import pandas as pd 
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN

In [157]:
data_train = pd.read_csv('features_extracted_more.csv', index_col='id')

In [158]:
X, y = data_train.drop(columns=['y']), data_train['y']

In [159]:
X_ = X.replace(to_replace=np.inf, value=np.nan)

In [160]:
imputer = KNNImputer(n_neighbors=5, weights='distance')
X_imputed = imputer.fit_transform(X_)

In [161]:
def filter_multicolinearity(X_train, sigma): 
    corr_matrix = np.triu(np.corrcoef(X_train, rowvar=False))
    np.fill_diagonal(corr_matrix, val=0)
    mask_multicorr = (corr_matrix < sigma).all(axis=1)
    return mask_multicorr

In [162]:
mask_multicorr = filter_multicolinearity(X_imputed, 0.95)

In [163]:
X_columns = X.columns 
multi_colinear_columns = X_columns[~mask_multicorr]

In [164]:
multi_colinear_columns

Index(['HeartRatem', 'RRmean', 'mean_nni', 'sdnn', 'sdsd', 'rmssd',
       'range_nni', 'ratio_sd2_sd1', 'HRV_MeanNN', 'HRV_SDNN', 'HRV_RMSSD',
       'HRV_SDSD', 'HRV_CVNN', 'HRV_CVSD', 'HRV_MadNN', 'HRV_SDRMSSD',
       'HRV_SD1', 'HRV_SD2', 'HRV_PIP', 'HRV_GI', 'HRV_SI', 'HRV_SD1d',
       'HRV_SD2d', 'HRV_SD2a', 'HRV_SDNNd', 'HRV_MFDFA_alpha1_Fluctuation',
       'HRV_CMSEn'],
      dtype='object')

In [165]:
X_filtered_multicorr = X_.loc[:, mask_multicorr]

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X_filtered_multicorr, y, test_size=0.2, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, shuffle=True)

In [169]:
import xgboost as xgb 
from xgboost import XGBClassifier

from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler

# tree_method : hist vs approx
# max depth: 3-12
# min_child_weight: [1, 200]
# subsample: [0.7, 1.0]
#  colsample_bylevel: [0.5, 1]
# reg_lambda = [0, 10] or reg_alpha = [0, 10]
# learning_rate = [0, 1]
# num_boost_round = N
# early_stopping_rounds: how many non improved boosting rounds until stop training

learning_rate = 0.3

xgboost_classification_model = XGBClassifier(n_estimators=1000, objective='multi:softprob', \
    eval_metric='auc', early_stopping_rounds=50, n_jobs=-1)

X_train, y_train = RandomOverSampler().fit_resample(X_train, y_train)
xgboost_classification_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
y_pred = xgboost_classification_model.predict(X_val)

In [170]:
import optuna
import time

def objective(trial):
    params = {
        'tree_method': trial.suggest_categorical('tree_method', ['approx', 'hist']), 
        'max_depth': trial.suggest_int('max_depth', 3, 12), 
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 250), 
        'subsample': trial.suggest_float('colsample_bynode', 0.1, 1.0), 
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 25, log=True), 
        'learning_rate': learning_rate
    }
    
    model = xgboost_classification_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
    return f1_score(y_val, model.predict(X_val), average='micro')


sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)

tic = time.time()
while time.time() - tic < 300:
    study.optimize(objective, n_trials=1)

[I 2024-11-20 20:47:22,529] A new study created in memory with name: no-name-d7dfecce-080b-4691-b004-4422a1e048b7
[I 2024-11-20 20:47:30,891] Trial 0 finished with value: 0.7975609756097561 and parameters: {'tree_method': 'hist', 'max_depth': 10, 'min_child_weight': 150, 'colsample_bynode': 0.24041677639819287, 'reg_lambda': 0.004853494503398151}. Best is trial 0 with value: 0.7975609756097561.
[I 2024-11-20 20:47:35,966] Trial 1 finished with value: 0.7975609756097561 and parameters: {'tree_method': 'hist', 'max_depth': 9, 'min_child_weight': 178, 'colsample_bynode': 0.1185260448662222, 'reg_lambda': 18.43339560109507}. Best is trial 0 with value: 0.7975609756097561.
[I 2024-11-20 20:47:39,992] Trial 2 finished with value: 0.7975609756097561 and parameters: {'tree_method': 'approx', 'max_depth': 4, 'min_child_weight': 46, 'colsample_bynode': 0.373818018663584, 'reg_lambda': 0.20316425758330103}. Best is trial 0 with value: 0.7975609756097561.
[I 2024-11-20 20:47:44,509] Trial 3 finish

In [171]:
study.best_trial.params

{'tree_method': 'hist',
 'max_depth': 10,
 'min_child_weight': 150,
 'colsample_bynode': 0.24041677639819287,
 'reg_lambda': 0.004853494503398151}

In [172]:
X_train, X_val, y_train, y_val = train_test_split(X_filtered_multicorr, y, test_size=0.2)

In [173]:
X_train, y_train = RandomOverSampler().fit_resample(X_train, y_train)

In [178]:
xgboost_classification_model = XGBClassifier(n_estimators=2000, tree_method='hist', max_depth=10, min_child_weight=105, colsample_bynode=0.24, reg_lambda=0.00485, \
        n_jobs=-1, eta=0.01)
xgboost_classification_model.fit(X_train, y_train, verbose=0)

In [179]:
y_pred = xgboost_classification_model.predict(X_val)

In [180]:
f1_score(y_val, y_pred, average='micro')

np.float64(0.8115234375)

In [181]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.89      0.87       570
           1       0.77      0.82      0.79       101
           2       0.77      0.69      0.73       311
           3       0.56      0.69      0.62        42

    accuracy                           0.81      1024
   macro avg       0.74      0.77      0.75      1024
weighted avg       0.81      0.81      0.81      1024



In [182]:
xgboost_c = XGBClassifier(n_estimators=2000, tree_method='hist', max_depth=10)
xgboost_c.fit(X_train, y_train)
y_pred = xgboost_c.predict(X_val)
f1_score(y_val, y_pred, average='micro')

np.float64(0.814453125)

In [183]:
from sklearn.ensemble import HistGradientBoostingClassifier

hist_classifier = HistGradientBoostingClassifier()
hist_classifier.fit(X_train, y_train)
y_pred = hist_classifier.predict(X_val)
f1_score(y_val, y_pred, average='micro')

np.float64(0.8193359375)

In [184]:
from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(estimators=[
    ('xgb_class', xgboost_classification_model), 
    ('hist_class', hist_classifier)
], voting='soft', weights=[0.5, 0.5])

voting.fit(X_train, y_train)
y_pred = voting.predict(X_val)
f1_score(y_val, y_pred, average='micro')

np.float64(0.826171875)

In [191]:
data_test = pd.read_csv('test_features.csv', header=0, index_col='id')

In [192]:
X_test = data_test 
X_test.replace(to_replace=np.inf, value=np.nan, inplace=True)

In [193]:
X_test.columns

Index(['PRinterm', 'PRinterstd', 'PRsegm', 'PRsegstd', 'QRSmean', 'QRSstd',
       'QTinterm', 'QTinterstd', 'STsegm', 'STsegstd',
       ...
       'HRV_LZC', 'HRV_DFA_alpha2', 'HRV_MFDFA_alpha2_Width',
       'HRV_MFDFA_alpha2_Peak', 'HRV_MFDFA_alpha2_Mean',
       'HRV_MFDFA_alpha2_Max', 'HRV_MFDFA_alpha2_Delta',
       'HRV_MFDFA_alpha2_Asymmetry', 'HRV_MFDFA_alpha2_Fluctuation',
       'HRV_MFDFA_alpha2_Increment'],
      dtype='object', length=140)

In [194]:
X_test = X_test.loc[:, mask_multicorr]

In [196]:
X_filtered_multicorr, y = RandomOverSampler().fit_resample(X_filtered_multicorr, y)

In [197]:
voting.fit(X_filtered_multicorr, y)

In [198]:
y_test_pred = voting.predict(X_test)

In [202]:
sample_submission = pd.DataFrame(y_test_pred, index=data_test.index, columns=['y'])

In [203]:
sample_submission.to_csv('new_submission.csv')