In [None]:
import pandas as pd
import numpy as np
from scipy.stats import uniform, loguniform
import matplotlib.pyplot as plt
import json

from sklearn.ensemble import StackingClassifier, VotingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.gaussian_process.kernels import ConstantKernel, RBF, RationalQuadratic
from sklearn.svm import SVC, LinearSVC

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import lightgbm as lgbm
from catboost import CatBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from xgboost import XGBClassifier

In [3]:
# Training labels, indexed by the 'id' column of the CSV.
y_train_raw = pd.read_csv('data/y_train.csv', index_col='id')

# Train / test feature tables: read each CSV and discard its first column.
X_train_raw, X_test_raw = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in ('data/X_train_mega.csv', 'data/X_test_mega.csv')
)

# Working name for the labels; this is an alias of the raw frame, not a copy.
y_train = y_train_raw

In [4]:
# Load the mapping "feature group name -> column names".
# NOTE(review): assumes every group maps to a list of column labels present in
# both feature tables -- confirm against feature_groups.json.
with open('feature_groups.json') as f:
    feature_groups = json.load(f)

# Feature groups used by the models, in the order their columns should appear.
selected_groups = [
    'wavelets',
    'hos_and_sign',
    'robust_peaks',
    'intervals',
    'PQST_hrv',
    'R_features',
]

# `df[a, b, c]` indexes with ONE tuple key `(a, b, c)`, it does not select the
# union of several column lists.  Flatten the groups into a single list of
# column labels instead; dict.fromkeys() removes labels that occur in more
# than one group while preserving first-seen order.
selected_features = list(dict.fromkeys(
    column
    for group in selected_groups
    for column in feature_groups[group]
))

# .copy() gives independent frames, so later modifications neither touch the
# raw tables nor trigger pandas' chained-assignment warnings.
X_train = X_train_raw[selected_features].copy()
X_test = X_test_raw[selected_features].copy()

In [None]:
# remove all-NaN features ----------------------------------------
# Decide which columns to drop on the TRAINING set only, then give the test
# set exactly the same columns in the same order.  Dropping independently on
# each frame can leave train and test with different feature sets, which
# breaks prediction.  Rebinding (instead of inplace=True) keeps the cell
# re-runnable and avoids mutating a frame that may be a slice of another one.
X_train = X_train.dropna(axis=1, how='all')
X_test = X_test.loc[:, X_train.columns]

In [7]:
# GBDT ----------------------------------------------------
# LightGBM classifier using the 'gbdt' boosting type.
params = dict(
    boosting_type='gbdt',
    # tree shape
    max_depth=17,
    num_leaves=1827,
    # boosting schedule
    n_estimators=817,
    learning_rate=0.056804,
    # regularisation and row subsampling
    reg_lambda=0.799360,
    subsample=0.794530,
)

# Per-class weights, keyed by class label.
gbdt_class_weight = {
    0: 1.688779,
    2: 3.471506,
    1: 11.550790,
    3: 30.100000,
}

lgbm_classifier = lgbm.LGBMClassifier(
    objective='multiclass',
    num_class=4,
    class_weight=gbdt_class_weight,
    max_bin=100,
    subsample_freq=1,
    verbose=-1,
    **params,
)

In [8]:
# DART --------------------------------------------------
# LightGBM classifier using the 'dart' boosting type.
params = {
    'boosting_type': 'dart',
    'max_depth': 25,
    'num_leaves': 8150,
    'n_estimators': 522,
    'learning_rate': 0.293195,
    'reg_lambda': 0.070163,
    'subsample': 0.842788,
}

# Settings that are not part of the tuned `params` dictionary above.
dart_fixed_settings = dict(
    class_weight={0: 1.688779, 2: 3.471506, 1: 11.550790, 3: 30.100000},
    objective='multiclass',
    num_class=4,
    max_bin=100,
    subsample_freq=1,
    verbose=-1,
)

dart_classifier = lgbm.LGBMClassifier(**params, **dart_fixed_settings)

In [9]:
# XGBoost --------------------------------------------------
from sklearn.utils import compute_sample_weight

def convert_params(params: pd.Series) -> pd.Series:
    """Convert a row of stored hyperparameters into native Python numbers.

    Values that can be parsed as numbers become ``int`` when they are
    integral (e.g. ``7.0`` -> ``7``) and ``float`` otherwise, at full
    precision.  Values that cannot be parsed as numbers are passed through
    unchanged.

    Parameters
    ----------
    params : pd.Series
        Hyperparameter values indexed by parameter name.  It is not modified.

    Returns
    -------
    pd.Series
        A new object-dtype Series with the same index, suitable for
        ``**``-unpacking into an estimator constructor.
    """
    converted = {}
    for key, val in params.items():
        try:
            number = float(val)
        except (TypeError, ValueError):
            # Not numeric (e.g. a string option): keep the value as it is.
            converted[key] = val
            continue
        # Integral values are emitted as int so integer-typed parameters do
        # not receive a float such as 7.0.
        converted[key] = int(number) if number.is_integer() else number
    # dtype=object stops pandas from coercing the ints back to float64.
    return pd.Series(converted, dtype=object)

params = pd.read_csv("data/xg_opt_params.csv")

# Class-balanced per-sample weights for the training labels.  Kept as a
# module-level name for inspection; the classifier below derives its own
# weights from whatever labels it is fitted on.
weights = compute_sample_weight("balanced", y_train)


class BalancedXGBClassifier(XGBClassifier):
    """XGBClassifier that applies class-balanced sample weights by default.

    ``sample_weight`` is an argument of ``fit``; handing it to the constructor
    only stores it as an unrecognised booster parameter and the weights are
    never used for training.  This subclass computes balanced weights inside
    ``fit`` whenever the caller does not supply any, so the weighting also
    works when the model is cloned and fitted by a meta-estimator that calls
    ``fit(X, y)`` without weights.
    """

    def fit(self, X, y, sample_weight=None, **fit_kwargs):
        """Fit the model, defaulting ``sample_weight`` to balanced weights."""
        if sample_weight is None:
            sample_weight = compute_sample_weight("balanced", y)
        return super().fit(X, y, sample_weight=sample_weight, **fit_kwargs)


# First row of the CSV, skipping its first column, holds the hyperparameters.
xgb_classifier = BalancedXGBClassifier(
    n_estimators=200,
    **convert_params(params.iloc[0, 1:]),
)

In [10]:
# CatBoost -------------------------------------------------
# CatBoost classifier constructed without arguments, i.e. with the library's
# default hyperparameters.
catb_classifier = CatBoostClassifier()

In [11]:
# Ensemble -------------------------------------------------
# (name, estimator) pairs for the ensemble, in this order:
# dart, lgbm, xgb, cat_boost.
estimators = list({
    'dart': dart_classifier,
    'lgbm': lgbm_classifier,
    'xgb': xgb_classifier,
    'cat_boost': catb_classifier,
}.items())

In [None]:
# Soft-voting ensemble over the base estimators, fitted on the 'y' column of
# the label frame.
clf = VotingClassifier(
    voting='soft',
    estimators=estimators,
)
clf.fit(X_train, y_train['y'])

In [13]:
# Predict the test labels and write them out as a two-column submission:
# "id" is the 0-based row position, "y" the predicted label.
predicted_labels = clf.predict(X_test)
y_pred = pd.DataFrame({
    "id": pd.RangeIndex(len(predicted_labels)),
    "y": predicted_labels,
})
y_pred.to_csv("data/submission_tuned_ensemble.csv", index=False)