In [None]:
# Install the third-party packages this notebook relies on.
# NOTE(review): no versions are pinned here, so a later re-run may resolve different releases.
!pip install tsfel
!pip install catboost
!pip install optuna
!pip install shap

In [None]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 250)
pd.set_option('display.width', 1000)

import math

from scipy.fft import fft, fftfreq, fftshift

import tsfel

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostRegressor, CatBoostClassifier

from scipy.stats import hmean
from scipy.stats import gmean
from scipy.stats import sem 
from scipy.signal import wiener 
from scipy.stats import iqr
from scipy.stats import differential_entropy
from scipy.stats import median_abs_deviation

from sklearn.metrics import f1_score

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training table, round every value to three decimals, then replace
# missing values with 0 (the original notes tagged the rounding step "best2").
df = (
    pd.read_csv('/content/train.csv')
    .round(3)
    .fillna(0)
)

In [None]:
# Feature configuration passed to every tsfel.time_series_features_extractor call in this notebook.
# No domain argument is given — presumably that selects all tsfel feature domains; confirm against the tsfel docs.
cfg = tsfel.get_features_by_domain()

In [None]:
# Empty frame whose columns are the tsfel feature names, discovered by running
# the extractor once on the first training row (label column excluded).
# The earlier `dfx = pd.DataFrame()` was a dead store and has been removed.
dfx = pd.DataFrame(
    columns=tsfel.time_series_features_extractor(
        cfg, df.drop('label', axis=1).loc[0].values, verbose=0
    ).columns
)

In [None]:
# Extract tsfel features for every training row.
# `fs` is not passed, so tsfel's default sampling frequency applies.
signal_values = df.drop('label', axis=1).values  # hoisted: dropping the label once instead of once per row

feature_rows = []
for row in tqdm(signal_values):
    feature_rows.append(tsfel.time_series_features_extractor(cfg, row, verbose=0))

# Assemble all rows in one concat instead of growing the frame row by row;
# columns are pinned to those of the first extracted row.
if feature_rows:
    dfx = pd.concat(feature_rows, ignore_index=True).reindex(columns=feature_rows[0].columns)

# Later cells read the target from the merged tsfel frame (`ev_th['label']`),
# so attach it here; without this a fresh top-to-bottom run fails with a KeyError.
dfx['label'] = df['label'].values

100%|██████████| 3792/3792 [04:50<00:00, 13.06it/s]


In [None]:
# Print a summary (index, column count, dtypes, memory) of the tsfel feature frame.
dfx.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3792 entries, 0 to 3791
Columns: 285 entries, 0_FFT mean coefficient_0 to label
dtypes: float64(284), int64(1)
memory usage: 8.3 MB


In [None]:
def feat_ing(train_df):
    """Build row-wise summary features from the ``data_*`` columns.

    Every column whose name contains ``"data_"`` is treated as one sample of a
    per-row signal. Three groups of features are computed for each row:
    statistics of the raw signal, statistics of the FFT magnitude spectrum
    (``fur_*``), and statistics of the first difference along the row.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame holding the ``data_*`` signal columns. Any other columns are
        carried through unchanged. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns first, then the raw-signal and
        spectrum features, then the first-difference features. The two
        difference features whose names collide with raw-signal ones are
        suffixed: ``iqr_15_85diff`` and ``iqr_35_65diff``.
    """
    only_dat = train_df.columns[train_df.columns.str.contains("data_")]
    signal = train_df[only_dat]

    # Work on a copy so the caller's frame does not silently gain feature
    # columns (which would leak into anything that later reads the raw frame).
    out = train_df.copy()

    # --- descriptive statistics of the raw signal ---------------------------
    out['mean'] = signal.mean(axis=1, skipna=True)
    out['max'] = signal.max(axis=1, skipna=True)
    out['min'] = signal.min(axis=1, skipna=True)
    out['median'] = signal.median(axis=1, skipna=True)
    out['sum'] = signal.sum(axis=1, skipna=True)
    out['std'] = signal.std(axis=1, skipna=True)
    # Standard error of the mean. This used to be recomputed further down with
    # scipy.stats.sem (same ddof=1 definition); the duplicate pass was removed.
    out['sem'] = signal.sem(axis=1, skipna=True)
    out['skew'] = signal.skew(axis=1, skipna=True)  # unbiased skewness
    out['var'] = signal.var(axis=1, skipna=True)

    # Number of samples in the row that exceed 0.15.
    out['gt_015_sum'] = signal.gt(0.15, axis=1).sum(axis=1)

    out['dif_ent'] = signal.apply(lambda x: differential_entropy(x.dropna()), axis=1)
    # The tiny offset keeps exact zeros out of the harmonic / geometric means.
    out['hmean'] = signal.apply(lambda x: hmean(abs(x + 0.0000001), nan_policy='omit'), axis=1)
    out['gmean'] = signal.apply(lambda x: gmean(abs(x.dropna() + 0.0000001)), axis=1)
    out['iqr'] = signal.apply(lambda x: iqr(x.dropna()), axis=1)  # 75th minus 25th percentile
    out['wiener'] = signal.apply(lambda x: wiener(x.dropna()).mean(), axis=1)  # mean of the Wiener-filtered row

    # --- inter-percentile ranges of the raw signal --------------------------
    # NOTE(review): (45, 65) breaks the otherwise symmetric pattern and may have
    # been meant as (45, 55); name and range are kept so the feature set stays
    # compatible with existing models.
    raw_ranges = (
        ('iqr_05_95', (5, 95)),
        ('iqr_10_90', (10, 90)),
        ('iqr_15_85', (15, 85)),
        ('iqr_20_80', (20, 80)),
        ('iqr_30_70', (30, 70)),
        ('iqr_35_65', (35, 65)),
        ('iqr_40_60', (40, 60)),
        ('iqr_45_65', (45, 65)),
    )
    for name, bounds in raw_ranges:
        out[name] = signal.apply(lambda x, b=bounds: iqr(x.dropna(), rng=b), axis=1)

    # Value at the given quantile of each row (NaNs are skipped).
    out['q15'] = signal.quantile(q=0.15, axis=1)
    out['q45'] = signal.quantile(q=0.45, axis=1)
    out['q75'] = signal.quantile(q=0.75, axis=1)

    # --- FFT magnitude spectrum ---------------------------------------------
    # One FFT per row (NaNs replaced by 0), converted to magnitudes *before* any
    # order statistic is taken. Quantile / max / median of the raw complex
    # coefficients are ill-defined (complex values sort lexicographically) and
    # recent NumPy versions refuse to compute percentiles of complex input.
    magnitude = pd.DataFrame(
        np.absolute(fft(signal.fillna(0).values, axis=1)),
        index=train_df.index,
    )
    out['fur_85'] = magnitude.quantile(q=0.85, axis=1)
    out['fur_75'] = magnitude.quantile(q=0.75, axis=1)
    out['fur_95'] = magnitude.quantile(q=0.95, axis=1)

    out['fur_max'] = magnitude.max(axis=1)
    out['fur_med'] = magnitude.median(axis=1)

    # NOTE(review): 'fur_45_65' actually spans the 45th-55th percentiles; the
    # column name is kept because downstream cells select features by name.
    spectrum_ranges = (
        ('fur_05_95', (5, 95)),
        ('fur_10_90', (10, 90)),
        ('fur_15_85', (15, 85)),
        ('fur_20_80', (20, 80)),
        ('fur_25_75', (25, 75)),
        ('fur_30_70', (30, 70)),
        ('fur_35_65', (35, 65)),
        ('fur_40_60', (40, 60)),
        ('fur_45_65', (45, 55)),
    )
    for name, bounds in spectrum_ranges:
        out[name] = iqr(magnitude.values, rng=bounds, axis=1)

    # --- first difference along the row -------------------------------------
    delta = signal.diff(axis=1)
    diff_df = pd.DataFrame(index=train_df.index)
    diff_df['mean_dif'] = delta.mean(axis=1, skipna=True)
    diff_df['max_dif'] = delta.max(axis=1, skipna=True)
    diff_df['min_dif'] = delta.min(axis=1, skipna=True)
    diff_df['median_dif'] = delta.median(axis=1, skipna=True)
    diff_df['sum_dif'] = delta.sum(axis=1, skipna=True)
    diff_df['std_dif'] = delta.std(axis=1, skipna=True)
    # Despite the 'std_' prefix these are the sem / skew / var of the differences.
    diff_df['std_sem'] = delta.sem(axis=1, skipna=True)
    diff_df['std_skew'] = delta.skew(axis=1, skipna=True)
    diff_df['std_var'] = delta.var(axis=1, skipna=True)

    diff_df['iqr_15_85'] = delta.apply(lambda x: iqr(x.dropna(), rng=(15, 85)), axis=1)
    diff_df['iqr_35_65'] = delta.apply(lambda x: iqr(x.dropna(), rng=(35, 65)), axis=1)

    # rsuffix renames only the colliding difference columns (-> '...diff').
    return out.join(diff_df, rsuffix='diff')

In [None]:
# Hand-crafted row-wise features for the training frame (original columns plus the feat_ing features).
feat_df = feat_ing(df)

In [None]:
# Partition the column names: raw signal columns ("data_*") versus everything else
# (label plus engineered features).
is_signal_col = feat_df.columns.str.contains("data_")
only_dat = feat_df.columns[is_signal_col]
else_dat = feat_df.columns[~is_signal_col]

In [None]:
# Combine the hand-crafted features (without their label column) with the tsfel
# frame, matching rows on the index.
ev_th = pd.merge(
    feat_df[else_dat].drop(columns='label'),
    dfx,
    left_index=True,
    right_index=True,
)

In [None]:
# Separate the target from the feature matrix.
y = ev_th['label']
X = ev_th.drop(columns='label')

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=47, shuffle=True, stratify=y
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3033, 335), (759, 335), (3033,), (759,))

In [None]:
# Search space: a single candidate, the configured CatBoost classifier.
# The pipeline step is called "regressor" although it holds a classifier; the
# name is kept because it determines the parameter key used by the search.
param_grid = [
    {
        'regressor': [
            CatBoostClassifier(
                random_state=47,
                eval_metric='F1',
                silent=True,
                early_stopping_rounds=80,
            ),
        ],
    },
]

# One-step pipeline; the placeholder estimator is swapped for the candidate above.
pipe = Pipeline(steps=[("regressor", CatBoostClassifier())])

# With one candidate and n_iter=1 the search is a 5-fold cross-validation scored by F1.
grid = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    n_iter=1,
    cv=5,
    scoring='f1',
    verbose=3,
    random_state=47,
)

In [None]:
%%time
# Run the cross-validated search on the training split; %%time reports the cell's run time.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END regressor=<catboost.core.CatBoostClassifier object at 0x7f15301f3670>;, score=0.919 total time= 1.4min
[CV 2/5] END regressor=<catboost.core.CatBoostClassifier object at 0x7f15301f3670>;, score=0.940 total time= 1.4min
[CV 3/5] END regressor=<catboost.core.CatBoostClassifier object at 0x7f15301f3670>;, score=0.937 total time= 1.5min
[CV 4/5] END regressor=<catboost.core.CatBoostClassifier object at 0x7f15301f3670>;, score=0.912 total time= 1.4min
[CV 5/5] END regressor=<catboost.core.CatBoostClassifier object at 0x7f15301f3670>;, score=0.952 total time= 1.4min
CPU times: user 13min 55s, sys: 8.18 s, total: 14min 3s
Wall time: 8min 45s


In [None]:
# Cross-validation results as a table, best mean test score first.
result = pd.DataFrame(grid.cv_results_).sort_values(by='mean_test_score', ascending=False)
result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,86.19494,0.653257,0.062342,0.070441,<catboost.core.CatBoostClassifier object at 0x...,{'regressor': <catboost.core.CatBoostClassifie...,0.918699,0.939891,0.936937,0.912442,0.951872,0.931968,0.014429,1


In [None]:
# Score the fitted search object on the held-out split with F1.
y_pred = grid.predict(X_test)
f1_score(y_test, y_pred)

0.9696969696969696

In [None]:
# Load the test table with the same preprocessing as the training data:
# round to three decimals, then replace missing values with 0.
test_df = (
    pd.read_csv("test.csv")
    .round(3)
    .fillna(0)
)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Columns: 300 entries, data_1 to data_300
dtypes: float64(300)
memory usage: 1.0 MB


In [None]:
# Empty frame whose columns are the tsfel feature names, obtained by running the extractor on the first test row.
test_fx = pd.DataFrame(columns=tsfel.time_series_features_extractor(cfg, test_df.loc[0].values, verbose=0).columns)

In [None]:
# Extract tsfel features for every test row, using the same configuration as
# for the training rows.
test_rows = []
for row in tqdm(test_df.values):
    test_rows.append(tsfel.time_series_features_extractor(cfg, row, verbose=0))

# Assemble all rows in one concat instead of growing the frame row by row;
# columns are pinned to those of the first extracted row.
if test_rows:
    test_fx = pd.concat(test_rows, ignore_index=True).reindex(columns=test_rows[0].columns)

100%|██████████| 450/450 [00:33<00:00, 13.47it/s]


In [None]:
# Hand-crafted row-wise features for the test frame, followed by a summary of the result.
test_df_feat = feat_ing(test_df)
test_df_feat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Columns: 351 entries, data_1 to iqr_35_65diff
dtypes: float64(350), int64(1)
memory usage: 1.2 MB


In [None]:
# Build the test feature matrix: the non-signal training columns (minus the
# label) taken from the test features, merged with the tsfel frame on the index.
ev_th_test = pd.merge(
    test_df_feat[else_dat.drop('label')],
    test_fx,
    left_index=True,
    right_index=True,
)

In [None]:
# Predict labels for the test feature matrix with the fitted search object.
predicted_label = grid.predict(ev_th_test)

In [None]:
# Display the predicted labels.
predicted_label

array([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,

In [None]:
# Write the predictions to submission.csv as a single "label" column, without the index.
result_df = pd.DataFrame({"label":predicted_label})
result_df.to_csv("submission.csv", index=False) 