In [1]:
import pandas as pd 
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN

In [2]:
data_train = pd.read_csv('features_extracted_more.csv', index_col='id')

In [250]:
X, y = data_train.drop(columns=['y']), data_train['y']

In [251]:
X_ = X.replace(to_replace=np.inf, value=np.nan)

In [252]:
imputer = KNNImputer(n_neighbors=5, weights='distance')
X_imputed = imputer.fit_transform(X_)

In [253]:
def filter_multicolinearity(X_train, sigma): 
    corr_matrix = np.triu(np.corrcoef(X_train, rowvar=False))
    np.fill_diagonal(corr_matrix, val=0)
    mask_multicorr = (corr_matrix < sigma).all(axis=1)
    return mask_multicorr

In [254]:
mask_multicorr = filter_multicolinearity(X_imputed, 0.95)

In [255]:
X_columns = X.columns 
multi_colinear_columns = X_columns[~mask_multicorr]

In [256]:
multi_colinear_columns

Index(['HeartRatem', 'RRmean', 'mean_nni', 'sdnn', 'sdsd', 'rmssd',
       'range_nni', 'ratio_sd2_sd1', 'HRV_MeanNN', 'HRV_SDNN', 'HRV_RMSSD',
       'HRV_SDSD', 'HRV_CVNN', 'HRV_CVSD', 'HRV_MadNN', 'HRV_SDRMSSD',
       'HRV_SD1', 'HRV_SD2', 'HRV_PIP', 'HRV_GI', 'HRV_SI', 'HRV_SD1d',
       'HRV_SD2d', 'HRV_SD2a', 'HRV_SDNNd', 'HRV_MFDFA_alpha1_Fluctuation',
       'HRV_CMSEn'],
      dtype='object')

In [257]:
X_filtered_multicorr = X_.loc[:, mask_multicorr]

In [258]:
from sklearn.metrics import f1_score

In [266]:
X_train, X_test, y_train, y_test = train_test_split(X_filtered_multicorr, y, test_size=0.2, shuffle=True)

In [267]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_estimators=2000, n_jobs=-1)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25162
[LightGBM] [Info] Number of data points in the train set: 4093, number of used features: 113
[LightGBM] [Info] Start training from score -0.521799
[LightGBM] [Info] Start training from score -2.461962
[LightGBM] [Info] Start training from score -1.247159
[LightGBM] [Info] Start training from score -3.382560


In [268]:
f1_score(y_test, y_pred, average='micro')

np.float64(0.833984375)

In [269]:
from xgboost import XGBClassifier

xgboost_classification_model = XGBClassifier(n_estimators=2000,n_jobs=-1)
xgboost_classification_model.fit(X_train, y_train, verbose=0)

In [270]:
y_pred = xgboost_classification_model.predict(X_test)

In [271]:
f1_score(y_test, y_pred, average='micro')

np.float64(0.8203125)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

hist_classifier = HistGradientBoostingClassifier()
hist_classifier.fit(X_train, y_train)
y_pred = hist_classifier.predict(X_test)
f1_score(y_test, y_pred, average='micro')

np.float64(0.8349609375)

In [290]:
X_train, X_test, y_train, y_test = train_test_split(X_filtered_multicorr, y, test_size=0.2, shuffle=True)

In [291]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer

adaboost = AdaBoostClassifier(n_estimators=2000, algorithm='SAMME')
adaboost_pipeline = Pipeline(
    [('impute', KNNImputer(n_neighbors=5)), 
    ('adaboost', adaboost)]
)
adaboost_pipeline.fit(X_train, y_train)

In [292]:
y_pred = adaboost_pipeline.predict(X_test)
f1_score(y_test, y_pred, average='micro')

np.float64(0.7879537953795379)

In [273]:
from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(estimators=[
    ('xgb_class', xgboost_classification_model), 
    ('hist_class', hist_classifier),
    ('lgbm_class', lgbm)
], voting='soft', weights=[0.33, 0.33, 0.33])

voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
f1_score(y_test, y_pred, average='micro')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25162
[LightGBM] [Info] Number of data points in the train set: 4093, number of used features: 113
[LightGBM] [Info] Start training from score -0.521799
[LightGBM] [Info] Start training from score -2.461962
[LightGBM] [Info] Start training from score -1.247159
[LightGBM] [Info] Start training from score -3.382560


np.float64(0.8310546875)

In [274]:
data_test = pd.read_csv('test_features.csv', header=0, index_col='id')

In [275]:
X_test = data_test 
X_test.replace(to_replace=np.inf, value=np.nan, inplace=True)

In [276]:
X_test.columns

Index(['PRinterm', 'PRinterstd', 'PRsegm', 'PRsegstd', 'QRSmean', 'QRSstd',
       'QTinterm', 'QTinterstd', 'STsegm', 'STsegstd',
       ...
       'HRV_LZC', 'HRV_DFA_alpha2', 'HRV_MFDFA_alpha2_Width',
       'HRV_MFDFA_alpha2_Peak', 'HRV_MFDFA_alpha2_Mean',
       'HRV_MFDFA_alpha2_Max', 'HRV_MFDFA_alpha2_Delta',
       'HRV_MFDFA_alpha2_Asymmetry', 'HRV_MFDFA_alpha2_Fluctuation',
       'HRV_MFDFA_alpha2_Increment'],
      dtype='object', length=140)

In [277]:
X_test = X_test.loc[:, mask_multicorr]

In [None]:
X_filtered_multicorr, y = RandomOverSampler().fit_resample(X_filtered_multicorr, y)

In [279]:
voting.fit(X_filtered_multicorr, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25393
[LightGBM] [Info] Number of data points in the train set: 12120, number of used features: 113
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294


In [281]:
y_test_pred = voting.predict(X_test)

In [282]:
sample_submission = pd.DataFrame(y_test_pred, index=data_test.index, columns=['y'])

In [283]:
sample_submission.to_csv('new_submission.csv')