In [1]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from lightgbm.callback import early_stopping, log_evaluation
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
# Training labels, one row per exam. The file's two columns are renamed to
# the names the rest of the notebook joins on ('exam_id') and trains on
# ('classe').
labels_path = './input/train_dccweek2023-labels.csv'
train_labels = pd.read_csv(labels_path)
train_labels.columns = ['exam_id', 'classe']

In [3]:
# v1 processed feature tables (train / test).
train_df, test_df = (
    pd.read_csv('./input/v1processed_train.csv'),
    pd.read_csv('./input/v1processed_test.csv'),
)

# v2 processed feature tables (train / test); merged onto v1 in the next cell.
train_df2, test_df2 = (
    pd.read_csv('./input/v2processed_train.csv'),
    pd.read_csv('./input/v2processed_test.csv'),
)

In [4]:
# Append the v2 columns to the v1 table, matching rows on exam_id.
# how='left' keeps every v1 row even if v2 has no entry for that exam.
# NOTE: this rebinds train_df/test_df, so re-running the cell merges again.
train_df = train_df.merge(train_df2, how='left', on='exam_id')
test_df = test_df.merge(test_df2, how='left', on='exam_id')

In [5]:
# Quick visual check of the merged feature table (first five rows).
train_df.head(n=5)

Unnamed: 0,exam_id,DI_raw_diff_avg,DI_raw_diff_std,DI_raw_diff_var,DI_raw_diff_min,DI_raw_diff_max,DI_raw_diff_q25,DI_raw_diff_q50,DI_raw_diff_q75,DI_avg,...,lead11_average_qrs_durations,lead11_average_qt_interval,lead11_lf_power,lead11_hf_power,lead11_vlf_power,lead11_spectral_centroid,lead11_spectral_bandwidth,lead11_spectral_entropy,lead11_average_energy,lead11_average_std
0,3123252,-0.000562,0.14728,0.021691,-0.168918,0.52729,-0.112966,0.0,0.000376,-2.5e-05,...,20.333333,129.833333,0.107051,0.076603,0.002111,14.095755,6.642195,1.306952,145.658532,0.835313
1,2762516,0.00035,0.08768,0.007688,-0.191038,0.281778,-0.06956,0.0,0.034447,1.5e-05,...,20.777778,104.222222,0.019713,0.01279,0.000423,13.854123,6.872359,0.320708,25.316333,0.353215
2,526403,0.001344,0.072196,0.005212,-0.139662,0.309241,-0.048029,0.0,0.017225,5.9e-05,...,21.3,103.3,0.008862,0.008203,0.000108,15.087476,6.812584,0.187361,13.615546,0.241184
3,1359082,0.923394,0.144,0.020736,0.641558,1.314132,0.802376,0.939534,1.037409,0.040774,...,21.8125,108.1875,0.027494,0.016564,0.000418,13.500782,6.276943,0.409348,48.248568,0.400993
4,1140892,0.230409,0.187352,0.035101,-0.160194,0.693695,0.079976,0.263178,0.378977,0.009977,...,21.375,133.875,0.006901,0.005257,0.000205,14.00855,6.500195,0.141729,14.808883,0.225577


In [6]:
# Attach the target column ('classe') to the feature table by exam_id.
# With how='left', an exam missing from train_labels would get NaN here.
train_df = train_df.merge(train_labels, how='left', on='exam_id')

In [7]:
def get_integer_preds(preds):
    """Convert per-class scores into hard class labels.

    Parameters
    ----------
    preds : array-like of shape (n_samples, n_classes)
        One row of class scores / probabilities per sample.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Index of the highest-scoring class in each row; on ties the lowest
        index wins (numpy argmax semantics).
    """
    # Vectorised argmax over the class axis. Unlike np.apply_along_axis with a
    # Python callback, this avoids a per-row Python call and also accepts an
    # input with zero rows (returning an empty array) instead of raising.
    return np.argmax(np.asarray(preds), axis=1)

In [8]:
# Target vector, and the model's feature list: every column except the
# exam identifier and the label itself.
y = train_df['classe']
non_feature_cols = {'exam_id', 'classe'}
train_columns = [col for col in train_df.columns if col not in non_feature_cols]

# Accumulators filled by the cross-validation cell (second dim = 7 classes):
#   score_folds     - macro-F1 of each validation fold
#   oof_preds       - out-of-fold class probabilities for each training row
#   test_preds_fold - test-set class probabilities averaged over the folds
score_folds = []
n_train_rows, n_test_rows = len(train_df), len(test_df)
oof_preds = np.zeros((n_train_rows, 7))
test_preds_fold = np.zeros((n_test_rows, 7))

In [9]:
# Per-class sample weights.
# Step 1: inverse frequency -> total number of labelled rows / rows in class.
w = y.value_counts()
n_labelled = np.sum(w)
weights = {label: n_labelled / w[label] for label in w.index}

# Step 2: manual multiplier on top; classes 1-6 count double relative to 0.
classes = [0, 1, 2, 3, 4, 5, 6]
class_weight = {0:1, 1:2, 2:2, 3:2, 4:2, 5:2, 6:2}
for value in classes:
    weights[value] *= class_weight[value]

In [10]:
# Stratified K-fold training of a LightGBM multiclass model.
#
# For each fold: train with per-sample class weights, early-stop on the
# validation fold's multi_logloss, store out-of-fold probabilities in
# `oof_preds`, add this fold's share of the test probabilities to
# `test_preds_fold`, and record the fold's macro-F1 in `score_folds`.

n_folds = 5
execs = 1                        # training runs (different seeds) averaged per fold
n_classes = oof_preds.shape[1]   # keep in sync with the accumulator buffers

# Make the cell safe to re-run: these two accumulate across folds, so stale
# values from a previous execution would otherwise be added/appended to.
# (`oof_preds` needs no reset: every row is overwritten by exactly one fold.)
test_preds_fold.fill(0)
score_folds.clear()

# Slice the feature matrices once instead of once per fold / per run.
X = train_df[train_columns]
X_test = test_df[train_columns]

params = {
    'boost': 'gbdt',
    'num_class': n_classes,
    'max_depth': -1,
    'num_leaves': 10,
    'objective': 'multiclass',
    'min_data_in_leaf': 15,
    'learning_rate': 0.025,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.85,
    'bagging_freq': 10,
    'metric': 'multi_logloss',
    'num_threads': -1,
    'verbosity': -1,
    'seed': 159
}

# n_splits is driven by n_folds so the split count and the 1/n_folds test
# averaging below can never disagree.
folds = StratifiedKFold(n_splits=n_folds, random_state=123, shuffle=True)
for fold_, (trn_, val_) in enumerate(folds.split(X, y)):
    print("Fold: {}".format(fold_+1))

    # split() yields positional indices, so index positionally (.iloc); this
    # stays correct even if the frame's index is not a default RangeIndex.
    trn_x, trn_y = X.iloc[trn_], y.iloc[trn_]
    val_x, val_y = X.iloc[val_], y.iloc[val_]

    # Per-row training weight looked up from the per-class `weights` dict.
    trn_w = trn_y.map(weights)
    dtrain = lgb.Dataset(trn_x, trn_y, weight=trn_w)
    dvalid = lgb.Dataset(val_x, val_y)

    preds = np.zeros((len(val_x), n_classes))
    test_preds_exec = np.zeros((len(X_test), n_classes))

    for p in range(execs):
        print("Iteration: {}".format(p+1))
        # Derive the run's seed from the base seed without mutating `params`
        # (base, base+1, base+2, ...). Identical to before for execs == 1.
        run_params = dict(params, seed=params['seed'] + p)
        model = lgb.train(run_params,
                          dtrain,
                          num_boost_round=100000,   # upper bound; early stopping ends training
                          valid_sets=[dtrain, dvalid],
                          callbacks=[early_stopping(stopping_rounds=200),
                                     log_evaluation(period=500)])

        # Average the predicted probabilities over the `execs` runs.
        preds += model.predict(val_x) / execs
        test_preds_exec += model.predict(X_test) / execs

    #lgb.plot_importance(model, importance_type='split', max_num_features=30)
    #lgb.plot_importance(model, importance_type='gain', max_num_features=20)

    # Each fold contributes 1/n_folds of the final test prediction.
    test_preds_fold += test_preds_exec / n_folds
    oof_preds[val_] = preds

    oof_int_preds = get_integer_preds(preds)
    fold_score = f1_score(val_y, oof_int_preds, average='macro')
    score_folds.append(fold_score)
    print("FOLD MACRO F1 = {}".format(fold_score))

Fold: 1
Iteration: 1
Training until validation scores don't improve for 200 rounds
[500]	training's multi_logloss: 0.0268263	valid_1's multi_logloss: 0.327813
[1000]	training's multi_logloss: 0.00729567	valid_1's multi_logloss: 0.254005
[1500]	training's multi_logloss: 0.00286937	valid_1's multi_logloss: 0.249147
Early stopping, best iteration is:
[1357]	training's multi_logloss: 0.00360267	valid_1's multi_logloss: 0.24801
FOLD MACRO F1 = 0.7108069154742143
Fold: 2
Iteration: 1
Training until validation scores don't improve for 200 rounds
[500]	training's multi_logloss: 0.0272029	valid_1's multi_logloss: 0.31745
[1000]	training's multi_logloss: 0.0075814	valid_1's multi_logloss: 0.237739
[1500]	training's multi_logloss: 0.00326863	valid_1's multi_logloss: 0.229727
Early stopping, best iteration is:
[1452]	training's multi_logloss: 0.00376807	valid_1's multi_logloss: 0.229505
FOLD MACRO F1 = 0.7269506519821011
Fold: 3
Iteration: 1
Training until validation scores don't improve for 200 r

In [11]:
# Two summaries of cross-validated performance:
#   - the mean of the per-fold macro-F1 scores, and
#   - macro-F1 computed once over all out-of-fold predictions.
mean_fold_f1 = np.mean(score_folds)
oof_labels = get_integer_preds(oof_preds)
oof_f1 = f1_score(y, oof_labels, average='macro')

print("MEAN MACRO F1 = {}".format(mean_fold_f1))
print("OOF MACRO F1 = {}".format(oof_f1))
print(score_folds)

MEAN MACRO F1 = 0.7186793036678542
OOF MACRO F1 = 0.7187284495402017
[0.7108069154742143, 0.7269506519821011, 0.721117374762872, 0.7195400382608126, 0.7149815378592713]


In [12]:
# Submission frame: one hard class label per test exam, taken as the argmax
# of the fold-averaged class probabilities.
test_labels = get_integer_preds(test_preds_fold)
sub = pd.DataFrame({'exam_id': test_df['exam_id'], 'classe': test_labels})

In [13]:
# Index the submission by exam_id. Guarded so that re-running this cell does
# not raise KeyError once 'exam_id' has already been moved into the index.
if sub.index.name != 'exam_id':
    sub = sub.set_index('exam_id')

# Reload the training labels keyed by exam_id. Note that this rebinds
# `train_labels` to an exam_id-indexed frame.
train_labels = pd.read_csv('./input/train_dccweek2023-labels.csv', index_col='exam_id')

# Exams that appear in both the test set and the training labels get their
# known label instead of the model's prediction.
both = sub.index.intersection(train_labels.index)
if len(both) > 0:
    # Select the same rows on both sides so the assignment is explicit rather
    # than relying on alignment against the full label series.
    sub.loc[both, 'classe'] = train_labels.loc[both, 'classe']

In [14]:
# Persist the fold-averaged class probabilities for the test set.

# DataFrame.to_csv does not create missing directories; make sure the target
# exists so a long training run is not lost at the save step.
os.makedirs('./output', exist_ok=True)

# One 'class_<k>' column per probability column actually present.
n_prob_cols = test_preds_fold.shape[1]
sub_probs = pd.DataFrame(test_preds_fold, columns=['class_' + str(s) for s in range(n_prob_cols)])

# Rows of test_preds_fold are in test_df row order, so attach the ids by
# position rather than by index alignment.
sub_probs['exam_id'] = test_df['exam_id'].to_numpy()
sub_probs.to_csv('./output/lgb_v2000_probs.csv', index=False)

In [15]:
# Write the final hard-label submission. The frame's index is written as the
# first column (exam_id, once the submission has been indexed by it).
# DataFrame.to_csv does not create missing directories, so ensure it exists.
os.makedirs('./output', exist_ok=True)
sub.to_csv('./output/lgb_v2000_final.csv')