In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
import json

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, atpe
import hyperopt

from seaborn import histplot

from sklearn.metrics import f1_score

# Data Import

In [None]:
# Load the mapping from feature-group name to the list of column names in it.
with open('feature_groups.json') as f:
    feature_groups = json.load(f)

# Feature groups fed to the model, in column order. The list is built once so
# that the train and test matrices are always selected from the same columns.
selected_feature_groups = [
    'wavelets',
    'robust_peaks',
    'hos_and_sign',
    'R_features',
    'PQST_hrv',
    'intervals',
]
selected_features = [
    column
    for group in selected_feature_groups
    for column in feature_groups[group]
]

# Target labels: only the 'y' column of the label file is used.
y_train = pd.read_csv("data/y_train.csv")
y_train = y_train['y']

X_train = pd.read_csv("data/X_train_mega.csv")
X_train = X_train.iloc[:, 1:]  # skip the first column (presumably a saved index/id -- confirm)
# Drop features that are entirely NaN in the training data. Non-inplace form
# avoids mutating a freshly sliced frame (SettingWithCopyWarning).
X_train = X_train[selected_features].dropna(axis=1, how='all')

X_test = pd.read_csv("data/X_test_mega.csv")
X_test = X_test.iloc[:, 1:]  # same leading column is skipped as for the training file
# Keep exactly the columns that survived in X_train, in the same order.
# Dropping all-NaN columns independently on the test set could remove a
# different set of columns and leave the two matrices with mismatched features.
X_test = X_test[X_train.columns]

# Hyperparameter Tuning 

In [None]:
def get_hyp_results_as_df(trials):
    """Flatten the trials recorded in `trials` into a DataFrame sorted by loss.

    Each row combines the trial's result dict (e.g. loss, status) with the
    hyperparameter values sampled for that trial. Rows are ordered by
    ascending 'loss', so the first row is the best trial.
    """
    rows = []
    for trial in trials.trials:
        # Every sampled value is stored as a one-element list; unwrap it.
        sampled = {name: values[0] for name, values in trial['misc']['vals'].items()}
        row = dict(trial['result'])
        row.update(sampled)  # hyperparameter values win on a key clash
        rows.append(row)
    return pd.DataFrame(rows).sort_values(by='loss')

In [None]:
# Hyperopt search space for the LightGBM classifier.
# hp.loguniform takes its bounds in log space, i.e. it samples
# exp(uniform(low, high)); the approximate value ranges are noted per line.
space = {
    'max_depth': hp.uniform('max_depth', 1, 30),                # cast to int in objective()
    'num_leaves': hp.loguniform('num_leaves', 1, 10),           # ~2.7 .. ~22026, cast to int
    'n_estimators': hp.loguniform('n_estimators', 5, 7),        # ~148 .. ~1097, cast to int
    'learning_rate': hp.loguniform('learning_rate', -10, -1),   # ~4.5e-5 .. ~0.37
    'reg_lambda': hp.loguniform('reg_lambda', -8, 3),           # ~3.4e-4 .. ~20
    # NOTE(review): the lower bound 0 allows very small bagging fractions -- confirm this is intended.
    'subsample': hp.uniform('subsample', 0, 1),
}

def objective(space):
    """Hyperopt objective: cross-validated micro-F1 of a LightGBM classifier.

    Parameters
    ----------
    space : dict
        One sample from the search space with the keys 'max_depth',
        'num_leaves', 'n_estimators', 'learning_rate', 'reg_lambda' and
        'subsample'. The first three are truncated to integers.

    Returns
    -------
    dict
        'loss' is the negated mean micro-F1 over the folds (hyperopt
        minimises), 'loss_variance' is the sample variance (ddof=1) of the
        per-fold scores, and 'status' is STATUS_OK.

    Reads the module-level `X_train` and `y_train`.
    """
    params = {
        'max_depth': int(space['max_depth'])
        ,'num_leaves': int(space['num_leaves'])
        ,'n_estimators': int(space['n_estimators'])
        ,'learning_rate': space['learning_rate']
        ,'reg_lambda': space['reg_lambda']
        ,'subsample': space['subsample']
    }

    N_SPLITS = 5
    # Fixed seed: every hyperparameter candidate is scored on the same folds,
    # so losses are comparable between trials and reproducible across runs.
    kf_cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)

    scores = np.zeros(N_SPLITS)
    for idx, (train_idx, val_idx) in enumerate(kf_cv.split(X_train, y_train)):

        # split() yields positional indices, so select by position for both
        # the feature frame and the label series.
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        lgbm_classifier = lgbm.LGBMClassifier(
            **params,
            # Hard-coded per-class weights (presumably derived from the class
            # frequencies -- confirm).
            class_weight = {
                0: 1.69,
                2: 3.47,
                1: 11.55,
                3: 30.10
            },
            objective = 'multiclass',
            num_class = 4,
            max_bin = 100,
            subsample_freq = 1,
            verbose = -1,
            boosting_type = 'dart'
        )

        # No `verbose` keyword here: it only controlled eval_set logging
        # (none is passed) and was removed from fit() in LightGBM 4.0.
        lgbm_classifier.fit(
            X_train_fold,
            y_train_fold
        )

        preds = lgbm_classifier.predict(X_val_fold)
        scores[idx] = f1_score(y_true = y_val_fold, y_pred = preds, average='micro')

    avg_loss = np.mean(scores)
    return {'loss': -avg_loss, 'loss_variance': np.var(scores, ddof=1), 'status': STATUS_OK }

In [None]:
# Hyperopt Trials object that fmin fills with every evaluated configuration;
# it is read back afterwards to build the results table.
trials = Trials()

In [None]:
# Run the hyperparameter search: 50 evaluations of `objective` over `space`,
# with every evaluation recorded in `trials`.
best_hyperparams = fmin(
    fn=objective,
    space=space,
    algo=atpe.suggest,  # alternative: hyperopt.rand.suggest
    max_evals=50,
    trials=trials,
)

In [None]:
# Let pandas show up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
# One row per trial, sorted by ascending loss (best trial first).
hyp_results = get_hyp_results_as_df(trials)

# Training and Prediction

In [None]:
def get_kth_best_hyperparams(hyp_results, k):
    """Return LightGBM keyword arguments for the k-th row of `hyp_results`.

    `hyp_results` is expected to be sorted best-first, so k=0 yields the best
    model. The bookkeeping columns ('loss', 'loss_variance', 'status') are
    removed, the fixed training settings are added, and the integer-valued
    hyperparameters are cast to int.
    """
    hyperparam = dict(hyp_results.iloc[k])

    # Strip the columns that describe the trial outcome rather than the model.
    for bookkeeping_key in ('loss', 'loss_variance', 'status'):
        hyperparam.pop(bookkeeping_key)

    # Settings that were held constant during the search.
    hyperparam.update(
        max_bin=100,
        subsample_freq=1,
        boosting_type='dart',
    )

    # These were sampled as floats but must be passed as integers.
    for integer_key in ('max_depth', 'n_estimators', 'num_leaves'):
        hyperparam[integer_key] = int(hyperparam[integer_key])

    return hyperparam

In [None]:
# Train one classifier per top-K hyperparameter configuration on the full
# training set and collect each model's test predictions for voting.
K_BEST_MODELS = 5
# One column of predicted labels per model; rows align with X_test.
y_pred_matrix = np.zeros((X_test.shape[0], K_BEST_MODELS))
for k in range(K_BEST_MODELS):
    params = get_kth_best_hyperparams(hyp_results, k)

    lgbm_classifier = lgbm.LGBMClassifier(
        **params,
        # Same hard-coded per-class weights as used during the search.
        class_weight = {
                0: 1.69,
                2: 3.47,
                1: 11.55,
                3: 30.10
        },
        objective = 'multiclass',
        num_class = 4
    )

    # No `verbose` keyword here: it only controlled eval_set logging (none is
    # passed) and was removed from fit() in LightGBM 4.0, where passing it
    # raises a TypeError.
    lgbm_classifier.fit(
        X_train,
        y_train
    )

    y_test_pred = lgbm_classifier.predict(X_test)
    y_pred_matrix[:,k] = y_test_pred

In [None]:
# Majority vote across the K models: take the row-wise mode of the predicted
# labels ([0] picks the first mode returned for each row) and cast to int.
model_votes = pd.DataFrame(y_pred_matrix)
majority_label = model_votes.mode(axis=1)[0].astype(int)

# Submission table: a running id starting at 0 plus the voted label.
y_majority = pd.DataFrame({
    'id': np.arange(0, len(majority_label)),
    'y': majority_label,
})
y_majority.to_csv("data/predictions.csv", index=False)