In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import neurokit2 as nk
import biosppy.signals.ecg as ecg

In [2]:
data_train = pd.read_csv('train.csv', header=0, index_col='id')

X = data_train.drop(columns=['y'])
y = data_train.loc[:, 'y']

In [5]:
from joblib import Parallel, delayed
import scipy
from scipy.signal import welch

def seq_to_signal(seq):
    return np.argwhere(seq == 1)

def process_interval(first_signal, second_signal):
    if first_signal.shape[0] == 0 or second_signal.shape[0] == 0:
        return np.nan, np.nan 
    
    if first_signal.shape[0] > second_signal.shape[0]:
        reduce_axis = 0
    else:
        reduce_axis = 1
    
    diff = (second_signal.squeeze(-1)[None, :] - first_signal.squeeze(-1)[:, None]).astype(np.float32)
    diff[diff < 0] = np.inf 
    min_diff =  np.min(diff, axis=reduce_axis)
    return min_diff[min_diff != np.inf]

def interv_mstd(intervals):
    return np.mean(intervals), np.std(intervals)

def unroll_list(x):
    if isinstance(x, np.ndarray):
        while isinstance(x, np.ndarray) and len(x) > 0:
            x = x[0]
        return x if np.isscalar(x) else np.nan
    return x if np.isscalar(x) else np.nan

def signal_features(filtered, rpeaks):
    frequencies, psd = welch(filtered, fs=300, nperseg=None)
    dom_freq_id = np.argmax(psd)
    dominating_frequency = frequencies[dom_freq_id]
    psd_mode = psd[dom_freq_id]
    df_welch = frequencies[1] - frequencies[0]
    e_welch = np.sum(psd) * df_welch
    beats = ecg.extract_heartbeats(filtered, rpeaks, 300)['templates']
    mu = np.mean(beats, axis=0)
    mu_m, mu_std, mu_sk, mu_krt = np.mean(mu), np.std(mu), scipy.stats.skew(mu), scipy.stats.kurtosis(mu)
    return dominating_frequency, psd_mode, e_welch, mu_m, mu_std, mu_sk, mu_krt
    

def process_ecg(ecg_signal, freq, hrv_columns):
    ecg_signal = ecg_signal.dropna().to_numpy(dtype='float32')
    ts, filtered, rpeaks, template_ts, template, heart_rate_ts, heart_rate = ecg.ecg(signal=ecg_signal, sampling_rate=freq, show=False)
    signals, waves = nk.ecg_delineate(filtered,
                                        rpeaks, 
                                        sampling_rate=freq, 
                                        method="dwt", 
                                        show=False)

    ECG_P_Peaks, ECG_P_Onsets, ECG_P_Offsets, ECG_Q_Peaks, ECG_R_Onsets, ECG_R_Offsets, ECG_S_Peaks, ECG_T_Peaks, ECG_T_Onsets, ECG_T_Offsets = \
        seq_to_signal(signals['ECG_P_Peaks']), seq_to_signal(signals['ECG_P_Onsets']), seq_to_signal(signals['ECG_P_Offsets']), seq_to_signal(signals['ECG_Q_Peaks']), \
            seq_to_signal(signals['ECG_R_Onsets']), seq_to_signal(signals['ECG_R_Offsets']), seq_to_signal(signals['ECG_S_Peaks']), seq_to_signal(signals['ECG_T_Peaks']), \
                seq_to_signal(signals['ECG_T_Onsets']), seq_to_signal(signals['ECG_T_Offsets'])
    # interval based features
    PR_inter_mean, PR_inter_std = interv_mstd(process_interval(ECG_P_Onsets, ECG_R_Onsets))
    PR_seg_mean, PR_seg_std = interv_mstd(process_interval(ECG_P_Offsets, ECG_R_Onsets))
    QRS_comp_mean, QRS_comp_std = interv_mstd(process_interval(ECG_R_Onsets, ECG_R_Offsets))
    QT_inter_mean, QT_inter_std = interv_mstd(process_interval(ECG_R_Onsets, ECG_T_Offsets))
    ST_seg_mean, ST_seg_std = interv_mstd(process_interval(ECG_R_Offsets, ECG_T_Onsets))
    QS_seg_mean, QS_seg_std = interv_mstd(process_interval(ECG_Q_Peaks, ECG_S_Peaks))
    PT_inter_mean, PT_inter_std = interv_mstd(process_interval(ECG_P_Peaks, ECG_T_Peaks))
    mean_hrt = np.mean(np.array(heart_rate))
    RR_inter = rpeaks[1:] - rpeaks[:-1]
    RR_inter_mean, RR_inter_std = np.mean(RR_inter), np.std(RR_inter)
    domfreq, psdmode, ewelch, mum, mustd, musk, mukrt = signal_features(filtered, rpeaks)

    features = []
    features += [PR_inter_mean, PR_inter_std]
    features += [PR_seg_mean, PR_seg_std]
    features += [QRS_comp_mean, QRS_comp_std]
    features += [QT_inter_mean, QT_inter_std]
    features += [ST_seg_mean, ST_seg_std]
    features += [QS_seg_mean, QS_seg_std]
    features += [PT_inter_mean, PT_inter_std]
    features += [mean_hrt]
    features += [RR_inter_mean, RR_inter_std]
    features += [domfreq, psdmode, ewelch, mum, mustd, musk, mukrt]

    features = np.array(features)

    cols = ['PRinterm', 'PRinterstd', 'PRsegm', 'PRsegstd', 'QRSmean', 'QRSstd', 'QTinterm', \
        'QTinterstd', 'STsegm', 'STsegstd', 'QSsegm', 'QSsegstd', 'PTinterm', 'PTinterstd', 'HeartRatem', 'RRmean', 'RRstd', \
            'DOMFREQ', 'PSDMODE', 'EWELCH', 'MUM', 'MUSTD', 'MUSK', 'MUKRT']+hrv_columns
    
    try:
        df_sig, info = nk.ecg_process(filtered, sampling_rate=300)
        hrv_features = np.apply_along_axis(unroll_list, arr=np.array(nk.ecg_intervalrelated(df_sig, sampling_rate=300)), axis=0)
        hrv_features[np.logical_or(hrv_features == np.inf, hrv_features == -np.inf)] = np.nan
        features_concat = np.concatenate([features, hrv_features])
        df = pd.DataFrame(features_concat[None, :], columns=cols)
    except Exception as e:
        df = pd.DataFrame(np.append(features, [np.nan] * (len(cols) - features.shape[0]))[None, :], columns=cols)

    return df
    
def process_all_ecg(signals, freq=300, n_jobs=-1):
    ts, filtered, rpeaks, template_ts, template, heart_rate_ts, heart_rate = ecg.ecg(signal=signals.iloc[0, :].dropna().to_numpy(dtype='float32'), sampling_rate=freq, show=False)
    df, info = nk.ecg_process(filtered, sampling_rate=300)
    hrv_columns = nk.ecg_intervalrelated(df, sampling_rate=300).columns
    hrv_columns = hrv_columns.to_list()
    
    results = Parallel(n_jobs=n_jobs, backend='multiprocessing')(delayed(process_ecg)(signals.iloc[i, :], freq, hrv_columns) for i in range(signals.shape[0]))
    
    if results:
        return pd.concat(results, axis=0, ignore_index=True)
    else:
        return None

In [6]:
feature_df_train = process_all_ecg(X)

  warn(
  warn(
  mse = np.trapz(mse) / len(mse)
  normalization = (n - 1) / (np.floor((n - k_subrange) / k).astype(int) * k)
  warn(
  sets = (np.nansum(np.abs(np.diff(sig_values)), axis=1) * normalization) / k
  warn(
  warn(
  warn(
  warn(
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  normalization = (n - 1) / (np.floor((n - k_subrange) / k).astype(int) * k)
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  sets = (np.nansum(np.abs(np.diff(sig_values)), axis=1) * normalization) / k
  mse = np.trapz(mse) / len(mse)
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  mse = np.trapz(mse) / len(mse)
  warn(
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  warn(
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  normalization = (n - 1) / (np.floor((n - k_subrange) / k).astype(int) * k)
  mse = np.trapz(mse) / len(mse)
  sets = (np.nansum(np.abs(np.d

In [7]:
feature_df_train

Unnamed: 0,PRinterm,PRinterstd,PRsegm,PRsegstd,QRSmean,QRSstd,QTinterm,QTinterstd,STsegm,STsegstd,...,HRV_SampEn,HRV_ShanEn,HRV_FuzzyEn,HRV_MSEn,HRV_CMSEn,HRV_RCMSEn,HRV_CD,HRV_HFD,HRV_KFD,HRV_LZC
0,34.444443,18.956741,106.625000,113.602173,35.984375,11.725363,125.281250,18.186077,55.833332,7.842355,...,0.474685,4.169807,0.390579,0.218803,0.407305,0.387922,0.799496,1.703878,1.196940,0.457909
1,40.090908,25.174980,109.562500,110.992661,40.939392,14.240232,113.818184,21.168093,44.742859,36.855194,...,1.609438,4.650572,1.161852,0.000000,0.000000,0.000000,1.734227,1.663353,2.173917,0.879306
2,30.703703,11.674013,72.074074,107.649261,37.444443,9.397924,131.222229,25.088486,54.799999,17.617037,...,2.397895,4.237291,1.425803,0.000000,0.000000,0.000000,1.929675,1.732052,2.358072,1.340133
3,48.923077,101.152939,70.199997,135.794540,122.142860,100.780426,158.166672,104.881538,562.428589,640.197021,...,1.740466,4.868124,1.394744,0.000000,0.993329,1.242779,1.650555,1.881967,2.419543,1.090850
4,36.162792,9.757359,35.279068,58.047825,29.767443,7.944098,97.000000,13.397327,34.755554,32.962036,...,0.464306,4.381316,0.591076,0.318809,0.273941,0.265536,1.292900,1.417458,1.487845,0.610206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5112,41.727272,22.785923,63.285713,81.729607,36.272728,9.031167,95.454544,37.664459,62.160000,45.627777,...,0.367725,4.201841,0.980824,,,,0.793415,1.875295,2.602427,1.464106
5113,26.600000,16.239048,186.172409,111.438576,33.599998,5.817216,120.566666,27.121557,63.258064,27.730230,...,,3.252426,1.140114,,,,2.149990,1.822316,1.757852,1.278502
5114,41.333332,18.743349,55.625000,88.965843,36.151516,10.954787,114.121216,10.545019,45.142857,46.063400,...,,,,,,,,,,
5115,34.464287,21.182005,131.214279,115.514488,48.206898,15.232406,139.068970,15.787230,57.406250,45.099098,...,,4.051109,1.405816,,,,1.828850,2.037834,3.045494,1.093750


In [8]:
droped_cols = feature_df_train.isna().all(axis=0)
droped_cols

PRinterm      False
PRinterstd    False
PRsegm        False
PRsegstd      False
QRSmean       False
              ...  
HRV_RCMSEn    False
HRV_CD        False
HRV_HFD       False
HRV_KFD       False
HRV_LZC       False
Length: 107, dtype: bool

In [9]:
feature_df_train = feature_df_train.dropna(axis=1, how='all')

In [10]:
feature_df_train

Unnamed: 0,PRinterm,PRinterstd,PRsegm,PRsegstd,QRSmean,QRSstd,QTinterm,QTinterstd,STsegm,STsegstd,...,HRV_SampEn,HRV_ShanEn,HRV_FuzzyEn,HRV_MSEn,HRV_CMSEn,HRV_RCMSEn,HRV_CD,HRV_HFD,HRV_KFD,HRV_LZC
0,34.444443,18.956741,106.625000,113.602173,35.984375,11.725363,125.281250,18.186077,55.833332,7.842355,...,0.474685,4.169807,0.390579,0.218803,0.407305,0.387922,0.799496,1.703878,1.196940,0.457909
1,40.090908,25.174980,109.562500,110.992661,40.939392,14.240232,113.818184,21.168093,44.742859,36.855194,...,1.609438,4.650572,1.161852,0.000000,0.000000,0.000000,1.734227,1.663353,2.173917,0.879306
2,30.703703,11.674013,72.074074,107.649261,37.444443,9.397924,131.222229,25.088486,54.799999,17.617037,...,2.397895,4.237291,1.425803,0.000000,0.000000,0.000000,1.929675,1.732052,2.358072,1.340133
3,48.923077,101.152939,70.199997,135.794540,122.142860,100.780426,158.166672,104.881538,562.428589,640.197021,...,1.740466,4.868124,1.394744,0.000000,0.993329,1.242779,1.650555,1.881967,2.419543,1.090850
4,36.162792,9.757359,35.279068,58.047825,29.767443,7.944098,97.000000,13.397327,34.755554,32.962036,...,0.464306,4.381316,0.591076,0.318809,0.273941,0.265536,1.292900,1.417458,1.487845,0.610206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5112,41.727272,22.785923,63.285713,81.729607,36.272728,9.031167,95.454544,37.664459,62.160000,45.627777,...,0.367725,4.201841,0.980824,,,,0.793415,1.875295,2.602427,1.464106
5113,26.600000,16.239048,186.172409,111.438576,33.599998,5.817216,120.566666,27.121557,63.258064,27.730230,...,,3.252426,1.140114,,,,2.149990,1.822316,1.757852,1.278502
5114,41.333332,18.743349,55.625000,88.965843,36.151516,10.954787,114.121216,10.545019,45.142857,46.063400,...,,,,,,,,,,
5115,34.464287,21.182005,131.214279,115.514488,48.206898,15.232406,139.068970,15.787230,57.406250,45.099098,...,,4.051109,1.405816,,,,1.828850,2.037834,3.045494,1.093750


In [11]:
X_, y_ = feature_df_train, y

In [12]:
from sklearn.impute import KNNImputer

knn_impute = KNNImputer(n_neighbors=15)
X_ = knn_impute.fit_transform(X_)


In [13]:
from imblearn.over_sampling import SMOTE, ADASYN

X_resampled, y_resampled = SMOTE().fit_resample(X_, y_)

In [14]:
y_resampled.value_counts()

y
0    3030
1    3030
2    3030
3    3030
Name: count, dtype: int64

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2)

In [42]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_estimators=1000, n_jobs=-1)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_val)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003979 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25245
[LightGBM] [Info] Number of data points in the train set: 9696, number of used features: 99
[LightGBM] [Info] Start training from score -1.387946
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.378896
[LightGBM] [Info] Start training from score -1.392087


In [43]:
f1_score(y_val, y_pred, average='micro')

np.float64(0.9286303630363036)

In [44]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, labels=[0, 1, 2, 3]))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88       610
           1       0.96      0.97      0.97       606
           2       0.89      0.86      0.88       588
           3       0.97      1.00      0.98       620

    accuracy                           0.93      2424
   macro avg       0.93      0.93      0.93      2424
weighted avg       0.93      0.93      0.93      2424



In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline 

pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('class', KNeighborsClassifier())
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)

In [47]:
f1_score(y_val, y_pred, average='micro')

np.float64(0.8234323432343235)

In [48]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, labels=[0, 1, 2, 3]))

              precision    recall  f1-score   support

           0       0.82      0.63      0.71       610
           1       0.82      0.98      0.90       606
           2       0.74      0.67      0.70       588
           3       0.90      1.00      0.95       620

    accuracy                           0.82      2424
   macro avg       0.82      0.82      0.81      2424
weighted avg       0.82      0.82      0.82      2424



In [20]:
data_test = pd.read_csv('test.csv')


In [21]:
X_test = data_test


In [22]:
X_test_id = X_test['id']
X_test.drop(columns=['id'])

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x17797,x17798,x17799,x17800,x17801,x17802,x17803,x17804,x17805,x17806
0,73,73,73,72,70,68,66,63,60,56,...,,,,,,,,,,
1,-100,-97,-95,-94,-94,-93,-93,-93,-92,-92,...,,,,,,,,,,
2,-571,-656,-756,-833,-893,-952,-1000,-1061,-1112,-1128,...,,,,,,,,,,
3,-44,-41,-39,-37,-35,-34,-33,-32,-32,-31,...,,,,,,,,,,
4,-68,-70,-73,-76,-78,-80,-81,-82,-83,-87,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3406,8,8,9,11,13,13,13,13,13,13,...,,,,,,,,,,
3407,104,73,42,17,0,-12,-17,-20,-24,-26,...,,,,,,,,,,
3408,-42,-38,-34,-30,-27,-24,-21,-18,-15,-12,...,,,,,,,,,,
3409,12,8,4,0,-3,-8,-12,-15,-19,-23,...,,,,,,,,,,


In [23]:
feature_df_test = process_all_ecg(X_test)

  warn(
  warn(
  mse = np.trapz(mse) / len(mse)
  normalization = (n - 1) / (np.floor((n - k_subrange) / k).astype(int) * k)
  sets = (np.nansum(np.abs(np.diff(sig_values)), axis=1) * normalization) / k
  warn(
  mse = np.trapz(mse) / len(mse)
  normalization = (n - 1) / (np.floor((n - k_subrange) / k).astype(int) * k)
  warn(
  sets = (np.nansum(np.abs(np.diff(sig_values)), axis=1) * normalization) / k
  warn(
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  warn(
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  warn(
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  warn(
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  warn(
  mse = np.trapz(mse) / len(mse)
  warn(
  warn(
  warn(
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  warn(
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  mse = np.trapz(mse) / len(mse)
  warn(
  normalization =

In [24]:
feature_df_test

Unnamed: 0,PRinterm,PRinterstd,PRsegm,PRsegstd,QRSmean,QRSstd,QTinterm,QTinterstd,STsegm,STsegstd,...,HRV_SampEn,HRV_ShanEn,HRV_FuzzyEn,HRV_MSEn,HRV_CMSEn,HRV_RCMSEn,HRV_CD,HRV_HFD,HRV_KFD,HRV_LZC
0,41.571430,23.498371,122.500000,113.015007,57.357143,48.100502,136.964279,54.204960,50.133335,43.006771,...,1.216395,4.220168,0.656577,0.000000,0.000000,0.000000,1.488919,1.297254,1.477121,0.479438
1,47.769230,29.574295,186.538467,153.789368,110.384613,86.042519,174.000000,9.742847,79.076920,67.432541,...,,3.690117,1.364004,,,,1.479603,,2.340867,1.683073
2,43.096775,17.966555,102.709679,118.960327,48.750000,27.871357,114.031250,51.510002,61.878788,30.672026,...,0.379490,4.548394,1.051182,0.000000,0.000000,0.000000,1.295233,1.823698,2.190935,1.144941
3,42.153847,27.995457,160.360001,132.999512,41.615383,10.510350,133.653839,9.050893,53.392857,8.981486,...,,3.767376,1.609551,,,,1.922481,1.909094,2.347472,1.373530
4,38.185184,19.427048,73.384613,109.204491,48.074074,17.659174,140.407410,14.722770,68.714287,16.713676,...,,4.430632,1.572925,,,,1.739370,2.016442,3.609488,1.056642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3406,40.921051,19.508947,58.289474,72.172279,33.026318,9.753506,99.447365,25.311426,43.256409,16.266558,...,0.346523,4.366091,0.534205,0.178562,0.242684,0.243057,0.776082,2.012522,2.145491,1.242930
3407,45.888889,36.242325,184.740738,129.507202,47.592594,34.526707,131.518524,43.935448,59.392857,60.229527,...,1.386294,4.478232,1.048295,0.000000,0.000000,0.000000,1.591612,1.961068,2.605917,0.958877
3408,28.421053,17.777323,99.803574,100.803993,42.719299,13.456564,112.701752,13.325829,40.534248,49.961376,...,2.047693,5.431162,1.376602,0.825114,0.998401,1.111397,2.127661,1.769890,3.429638,1.039394
3409,20.055555,9.336143,157.000000,88.869408,38.555557,7.369021,126.888885,8.002701,58.342106,9.248830,...,1.223775,3.146245,0.900516,0.508676,0.708303,0.753884,1.530041,1.596430,2.062229,0.563184


In [25]:
feature_df_test_bis = feature_df_test.loc[:, ~droped_cols]

In [26]:
feature_df_test_bis

Unnamed: 0,PRinterm,PRinterstd,PRsegm,PRsegstd,QRSmean,QRSstd,QTinterm,QTinterstd,STsegm,STsegstd,...,HRV_SampEn,HRV_ShanEn,HRV_FuzzyEn,HRV_MSEn,HRV_CMSEn,HRV_RCMSEn,HRV_CD,HRV_HFD,HRV_KFD,HRV_LZC
0,41.571430,23.498371,122.500000,113.015007,57.357143,48.100502,136.964279,54.204960,50.133335,43.006771,...,1.216395,4.220168,0.656577,0.000000,0.000000,0.000000,1.488919,1.297254,1.477121,0.479438
1,47.769230,29.574295,186.538467,153.789368,110.384613,86.042519,174.000000,9.742847,79.076920,67.432541,...,,3.690117,1.364004,,,,1.479603,,2.340867,1.683073
2,43.096775,17.966555,102.709679,118.960327,48.750000,27.871357,114.031250,51.510002,61.878788,30.672026,...,0.379490,4.548394,1.051182,0.000000,0.000000,0.000000,1.295233,1.823698,2.190935,1.144941
3,42.153847,27.995457,160.360001,132.999512,41.615383,10.510350,133.653839,9.050893,53.392857,8.981486,...,,3.767376,1.609551,,,,1.922481,1.909094,2.347472,1.373530
4,38.185184,19.427048,73.384613,109.204491,48.074074,17.659174,140.407410,14.722770,68.714287,16.713676,...,,4.430632,1.572925,,,,1.739370,2.016442,3.609488,1.056642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3406,40.921051,19.508947,58.289474,72.172279,33.026318,9.753506,99.447365,25.311426,43.256409,16.266558,...,0.346523,4.366091,0.534205,0.178562,0.242684,0.243057,0.776082,2.012522,2.145491,1.242930
3407,45.888889,36.242325,184.740738,129.507202,47.592594,34.526707,131.518524,43.935448,59.392857,60.229527,...,1.386294,4.478232,1.048295,0.000000,0.000000,0.000000,1.591612,1.961068,2.605917,0.958877
3408,28.421053,17.777323,99.803574,100.803993,42.719299,13.456564,112.701752,13.325829,40.534248,49.961376,...,2.047693,5.431162,1.376602,0.825114,0.998401,1.111397,2.127661,1.769890,3.429638,1.039394
3409,20.055555,9.336143,157.000000,88.869408,38.555557,7.369021,126.888885,8.002701,58.342106,9.248830,...,1.223775,3.146245,0.900516,0.508676,0.708303,0.753884,1.530041,1.596430,2.062229,0.563184


In [27]:
X_test = knn_impute.transform(feature_df_test_bis)

In [28]:
lgbm.fit(X_resampled, y_resampled)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003665 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25245
[LightGBM] [Info] Number of data points in the train set: 12120, number of used features: 99
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294


In [29]:
y_test_pred = lgbm.predict(X_test)

In [30]:
submission = pd.DataFrame({'id': X_test_id, 'y': y_test_pred})
submission.to_csv('lgbm_submission.csv', index=False)