In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np



sns.set_style('whitegrid')

In [2]:
from scipy.special import kl_div

def kl_divergence(solution: np.ndarray, submission: np.ndarray, epsilon: float=10**-15, micro_average: bool=True, sample_weights: pd.Series =None):
    
     # Prevent issue with populating int columns with floats
        
    solution= solution.astype(float)

        # Clip both the min and max following Kaggle conventions for related metrics like log loss
        # Clipping the max avoids cases where the loss would be infinite or undefined, clipping the min
        # prevents users from playing games with the 20th decimal place of predictions.
    submission = np.clip(submission, epsilon, 1 - epsilon)

    if micro_average:
        return np.average(kl_div(solution, submission).sum(axis=1), weights=sample_weights)
    else:
        return np.average(kl_div(solution, submission).mean(axis=0))

In [3]:
#the train dataframe with minimal filtering and no features
df=pd.read_csv('train_final_less_filtering.csv')

#dataframe with eeg-derived features, 
#setting index of the one saved in train.csv to the one in train_final_less_filtering.csv
#to allow fast dropping of the row in spec derived feature file
df_train=pd.read_csv('features_folder/merged_votes_no_overlap_filter_before_split/train.csv',index_col=0)
df_train=df[['Unnamed: 0', 'label_id']].merge(df_train, how='inner', left_on='label_id', right_on='label_id')
df_train=df_train.set_index('Unnamed: 0')
df_train.index.name=None

#dataframe with kaggle spec derived features
df_train2=pd.read_parquet('train_features_from_kaggle_spec.parquet').loc[df_train.index]

#dataframe with spec from eeg features
df_train3=pd.read_parquet('features_folder/features_from_eeg_spectrograms_without_kaggle_spec_data/train_mrgd_votes_no_ovlp_bfsplt_feats_from_eeg_to_spec.parquet')
df_train3=df[['Unnamed: 0', 'label_id']].merge(df_train3, how='inner', left_on='label_id', right_on='label_id')
df_train3=df_train3.set_index('Unnamed: 0')
df_train3.index.name=None
df_train3=df_train3[df_train3.columns[465:]]


#not all yet>>merging all features together
df_train=df_train.join(other=df_train2[df_train2.columns[3:]], how='left')
df_train=df_train.join(other=df_train3, how='left')
df_train

Unnamed: 0,label_id,eeg_id,offset_bins,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,patient_id,expert_consensus,...,eeg_std_f502_10s,eeg_std_f503_10s,eeg_std_f504_10s,eeg_std_f505_10s,eeg_std_f506_10s,eeg_std_f507_10s,eeg_std_f508_10s,eeg_std_f509_10s,eeg_std_f510_10s,eeg_std_f511_10s
0,127492639,1628180742,0,0,0.0,353733,0,0.0,42516,Seizure,...,0.097148,0.101822,0.107496,0.117469,0.123991,0.124876,0.122735,0.118822,0.115565,0.112540
4,3080632009,1628180742,2,4,24.0,353733,4,24.0,42516,Seizure,...,0.087977,0.084165,0.079340,0.079363,0.078445,0.081192,0.088005,0.093877,0.094662,0.096880
8,3388718494,1628180742,4,8,40.0,353733,8,40.0,42516,Seizure,...,0.103731,0.103387,0.105315,0.109041,0.108415,0.108729,0.110379,0.112730,0.115688,0.117514
11,557980729,722738444,0,0,0.0,999431,0,0.0,56885,LRDA,...,0.092119,0.095282,0.095436,0.098361,0.095192,0.093605,0.092762,0.090578,0.088483,0.088205
20,4101058765,722738444,2,9,22.0,999431,9,22.0,56885,LRDA,...,0.091724,0.090160,0.088409,0.087728,0.086762,0.086047,0.084729,0.077754,0.076896,0.078420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106781,2587113091,3938393892,0,0,0.0,2146798838,1,60.0,28488,LPD,...,0.088470,0.085234,0.081271,0.078118,0.077326,0.078201,0.080196,0.084141,0.088271,0.086192
106783,2394534310,1850739625,0,0,0.0,2146798838,3,162.0,28488,LPD,...,0.082822,0.084246,0.088201,0.094187,0.095871,0.098783,0.106579,0.096520,0.094196,0.093533
106784,1216355904,1306668185,0,0,0.0,2147312808,0,0.0,57480,LPD,...,0.088233,0.091125,0.095469,0.095419,0.096099,0.098046,0.099099,0.096894,0.098195,0.095179
106789,429140316,351917269,0,0,0.0,2147388374,0,0.0,10351,LRDA,...,0.092753,0.093040,0.090667,0.092633,0.090093,0.087803,0.086154,0.086293,0.091146,0.091187


In [5]:
#some rows have NaN values - these come from kaggle spec-derived features. Total 30
df.set_index('Unnamed: 0').loc[df_train[df_train.isna().sum(axis=1)>0].index]

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,eeg_50sec_nan_row_count,eeg_10sec_nan_row_count,spectrogram_600sec_nan_row_count,spectrogram_10sec_nan_row_count,total_votes
1701,2536169515,0,0.0,19384736,55,1394.0,1305119316,56450,LPD,0,3,0,0,0,0,0,0,92,6,3
1707,2536169515,6,20.0,19384736,61,1414.0,2772427326,56450,LPD,0,3,0,0,0,0,0,0,92,6,3
11571,4046938588,0,0.0,225347743,2,632.0,4062091881,29441,Other,0,0,0,0,0,1,0,0,150,6,1
11573,1225901968,0,0.0,225347743,4,824.0,2380909638,29441,LPD,0,3,0,0,0,0,0,0,150,6,3
11575,1225901968,2,22.0,225347743,6,846.0,2283132019,29441,LPD,0,3,0,0,0,0,0,0,150,6,3
11576,1225901968,3,42.0,225347743,7,866.0,2361116046,29441,LPD,0,3,0,0,0,0,0,0,150,6,3
11578,1225901968,5,80.0,225347743,9,904.0,18660663,29441,LPD,0,3,0,0,0,0,0,0,150,6,3
11579,1225901968,6,106.0,225347743,10,930.0,1960506997,29441,LPD,0,3,0,0,0,0,0,0,144,4,3
24017,191408832,1,8.0,470580984,15,683.0,414130193,56450,LPD,0,3,0,0,0,0,0,0,124,6,3
24385,3350961119,0,0.0,480127814,17,717.0,713604191,56450,LPD,0,3,0,0,0,0,0,0,142,6,3


In [4]:
#features names. all features from kaggle spec and eeg 
feature_cols=list(df_train.columns[17:])
#features from kaggle spec
spec_features=df_train2.columns[3:]
#features from eeg
eeg_features=feature_cols[:448]
#features from specs from eegs
mels_features=df_train3.columns
#columns with probability values
vote_cats=['seizure_vote', 'lpd_vote',	'gpd_vote',	'lrda_vote',	'grda_vote',	'other_vote']
# encoding of the expert consensus into integers
codes = {'Seizure':0, 'LPD':1, 'GPD':2, 'LRDA':3, 'GRDA':4, 'Other':5}


In [5]:
select_features=['T5-O1.skewness',
 'F4-C4.samp_en',
 'C4-P4.samp_en',
 'T5-O1.rel_bp_delta',
 'C4-P4.kurtosis',
 'Fp1-F3.abs_bp_delta',
 'LP_10.16_mean_20s',
 'F3-C3.bp_delta_theta',
 'F3-C3.rel_bp_delta',
 'T5-O1.abs_bp_beta',
 'F4-C4.abs_bp_delta',
 'T6-O2.skewness',
 'RL_1.56_min_10m',
 'T5-O1.bp_delta_beta',
 'eeg_std_f302_10s',
 'P4-O2.samp_en',
 'F4-C4.abs_bp_theta',
 'RP_19.92_min_10m',
 'T4-T6.rel_bp_theta',
 'LP_3.52_min_10m',
 'C4-P4.rel_bp_delta',
 'T3-T5.rel_bp_delta']

In [6]:
#logistic regression on all features 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler


n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'], 
                        groups=df_train['patient_id'])
#features=select_features.feature_names.values
features=feature_cols

df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):

    n_classes = 6

    print(f'running fold {i}')
    X_train=df_train.iloc[train_index].dropna()[features]
    Y_train=df_train.iloc[train_index].dropna().expert_consensus.map(codes)
    Y_train_proba=df_train.iloc[train_index].dropna()[vote_cats]

    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)
    
    weight=np.array(df_train.iloc[train_index].dropna().total_votes.apply(lambda x: min(x/3, 1.0)))
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(6, axis=0)
    
    X_test=df_train.iloc[test_index].dropna()[features]
    Y_test_cat=df_train.iloc[test_index].dropna().expert_consensus.map(codes)
    Y_test_proba=df_train.iloc[test_index].dropna()[vote_cats]

    pipel=Pipeline([('scale', StandardScaler()), ('logreg',  LogisticRegression(penalty=None,max_iter=1000))])
    pipel.fit(X_upsampled,Y_direct, **{'logreg__sample_weight': sample_weight})
    pred=pipel.predict_proba(X_test.values)

    weight=np.array(df_train.iloc[test_index].dropna().total_votes.apply(lambda x: min(x/3, 1.0)))
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred, sample_weights=weight)
    df_scores.loc[i,'accuracy']=pipel.score(X_test.values, Y_test_cat.values)

display(df_scores)
df_feature_imp=pd.DataFrame({'importance': pipel['logreg'].coef_[0], 'feature':features})
display(df_feature_imp.sort_values(by='importance', ascending=False))

running fold 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


running fold 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


running fold 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


running fold 3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


running fold 4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,kl_div,accuracy
0,1.493109,0.450411
1,1.812877,0.490979
2,1.795182,0.439418
3,1.491921,0.441056
4,1.638241,0.489429


Unnamed: 0,importance,feature
127,1.471220,F8-T4.bp_theta_alpha
2074,1.165701,LL_0.98_min_10m
242,1.128340,C3-P3.bp_theta_alpha
2895,0.995582,eeg_mean_f23_10s
0,0.962255,Fp1-F7.abs_bp_alpha
...,...,...
622,-0.836299,RL_10.35_max_10m
2083,-0.842163,LL_2.73_min_10m
554,-0.961705,LL_16.6_max_10m
2178,-0.967552,RL_1.76_min_10m


In [7]:
df_scores.mean()

kl_div      1.646266
accuracy    0.462259
dtype: object

In [92]:
#logistic regression on kaggle spec+ mels spec features
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

df_train=pd.read_parquet('all_features.parquet')
n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'], 
                        groups=df_train['patient_id'])
#features=select_features.feature_names.values
features=list(mels_features)+list(spec_features)

df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):

    n_classes = 6

    print(f'running fold {i}')
    X_train=df_train.iloc[train_index].dropna()[features]
    Y_train=df_train.iloc[train_index].dropna().expert_consensus.map(codes)
    Y_train_proba=df_train.iloc[train_index].dropna()[vote_cats]

    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)
    
    weight=np.array(df_train.iloc[train_index].dropna().total_votes.apply(lambda x: min(x/3, 1.0)))
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(6, axis=0)
    
    X_test=df_train.iloc[test_index].dropna()[features]
    Y_test_cat=df_train.iloc[test_index].dropna().expert_consensus.map(codes)
    Y_test_proba=df_train.iloc[test_index].dropna()[vote_cats]

    pipel=Pipeline([('scale', StandardScaler()), ('logreg',  LogisticRegression(penalty=None,max_iter=1000))])
    pipel.fit(X_upsampled,Y_direct, **{'logreg__sample_weight': sample_weight})
    pred=pipel.predict_proba(X_test.values)

    weight=np.array(df_train.iloc[test_index].dropna().total_votes.apply(lambda x: min(x/3, 1.0)))
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred, sample_weights=weight)
    df_scores.loc[i,'accuracy']=pipel.score(X_test.values, Y_test_cat.values)

display(df_scores)
display(df_scores.mean())
df_feature_imp=pd.DataFrame({'importance': pipel['logreg'].coef_[0], 'feature':features})
#display(df_feature_imp.sort_values(by='importance', ascending=False))

running fold 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


running fold 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


running fold 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


running fold 3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


running fold 4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,kl_div,accuracy
0,2.313131,0.382382
1,2.237772,0.400648
2,1.909278,0.400102
3,2.14868,0.360982
4,2.277866,0.378493


Unnamed: 0,importance,feature
3982,2.789671,RP_2.54_min_10m
2281,1.952764,LP_2.34_max_10m
23,1.887749,eeg_mean_f23_10s
3772,1.655559,RL_0.59_min_10m
1170,1.633624,eeg_max_f146_10s
...,...,...
401,-1.584107,eeg_mean_f401_10s
3977,-1.745224,RP_1.56_min_10m
3781,-1.802254,RL_2.34_min_10m
402,-1.933468,eeg_mean_f402_10s


In [93]:
display(df_scores.mean())

kl_div      2.177345
accuracy    0.384521
dtype: object

In [77]:
#logistic regression on select features
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

df_train=pd.read_parquet('all_features.parquet')
n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'], 
                        groups=df_train['patient_id'])
#features=select_features.feature_names.values
features=select_features

df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):

    n_classes = 6

    print(f'running fold {i}')
    X_train=df_train.iloc[train_index].dropna()[features]
    Y_train=df_train.iloc[train_index].dropna().expert_consensus.map(codes)
    Y_train_proba=df_train.iloc[train_index].dropna()[vote_cats]

    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)
    
    weight=np.array(df_train.iloc[train_index].dropna().total_votes.apply(lambda x: min(x/3, 1.0)))
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(6, axis=0)
    
    X_test=df_train.iloc[test_index].dropna()[features]
    Y_test_cat=df_train.iloc[test_index].dropna().expert_consensus.map(codes)
    Y_test_proba=df_train.iloc[test_index].dropna()[vote_cats]

    pipel=Pipeline([('scale', StandardScaler()), ('logreg',  LogisticRegression(penalty=None,max_iter=1000, verbose=True))])
    pipel.fit(X_upsampled,Y_direct, **{'logreg__sample_weight': sample_weight})
    pred=pipel.predict_proba(X_test.values)

    weight=np.array(df_train.iloc[test_index].dropna().total_votes.apply(lambda x: min(x/3, 1.0)))
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred, sample_weights=weight)
    df_scores.loc[i,'accuracy']=pipel.score(X_test.values, Y_test_cat.values)

display(df_scores)
df_feature_imp=pd.DataFrame({'importance': pipel['logreg'].coef_[0], 'feature':features})
display(df_feature_imp.sort_values(by='importance', ascending=False))

running fold 0


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          138     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.79176D+00    |proj g|=  1.37050D-01

At iterate   50    f=  1.42234D+00    |proj g|=  3.92396D-04

At iterate  100    f=  1.42203D+00    |proj g|=  1.82796D-04

At iterate  150    f=  1.42181D+00    |proj g|=  4.74556D-04

At iterate  200    f=  1.42131D+00    |proj g|=  6.68031D-04

At iterate  250    f=  1.42102D+00    |proj g|=  2.02689D-04

At iterate  300    f=  1.42056D+00    |proj g|=  2.25387D-04

At iterate  350    f=  1.42031D+00    |proj g|=  3.31149D-04

At iterate  400    f=  1.42005D+00    |proj g|=  2.25993D-04

At iterate  450    f=  1.41979D+00    |proj g|=  1.72655D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = nu

 This problem is unconstrained.



At iterate   50    f=  1.43508D+00    |proj g|=  1.62594D-03

At iterate  100    f=  1.43220D+00    |proj g|=  5.32433D-04

At iterate  150    f=  1.43117D+00    |proj g|=  6.69817D-04

At iterate  200    f=  1.43033D+00    |proj g|=  5.33898D-04

At iterate  250    f=  1.43000D+00    |proj g|=  3.25628D-04

At iterate  300    f=  1.42966D+00    |proj g|=  3.21942D-04

At iterate  350    f=  1.42936D+00    |proj g|=  1.42587D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  138    383    399      1     0     0   9.098D-05   1.429D+00
  F =   1.4292636612242398     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL  

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          138     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.79176D+00    |proj g|=  1.18280D-01

At iterate   50    f=  1.44578D+00    |proj g|=  1.10801D-03

At iterate  100    f=  1.44217D+00    |proj g|=  6.78953D-04

At iterate  150    f=  1.44140D+00    |proj g|=  7.98055D-04

At iterate  200    f=  1.44080D+00    |proj g|=  2.84965D-04

At iterate  250    f=  1.44056D+00    |proj g|=  5.83236D-04

At iterate  300    f=  1.44012D+00    |proj g|=  2.91332D-04

At iterate  350    f=  1.43997D+00    |proj g|=  3.87568D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function 

 This problem is unconstrained.



At iterate   50    f=  1.40949D+00    |proj g|=  9.73308D-04

At iterate  100    f=  1.40547D+00    |proj g|=  7.09378D-04

At iterate  150    f=  1.40469D+00    |proj g|=  3.27807D-04

At iterate  200    f=  1.40404D+00    |proj g|=  8.18772D-04

At iterate  250    f=  1.40378D+00    |proj g|=  4.62122D-04

At iterate  300    f=  1.40339D+00    |proj g|=  2.59677D-04

At iterate  350    f=  1.40306D+00    |proj g|=  4.42375D-04

At iterate  400    f=  1.40273D+00    |proj g|=  4.54420D-04

At iterate  450    f=  1.40257D+00    |proj g|=  2.78199D-04

At iterate  500    f=  1.40235D+00    |proj g|=  1.45362D-04

At iterate  550    f=  1.40220D+00    |proj g|=  3.24699D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F   

 This problem is unconstrained.



At iterate   50    f=  1.42862D+00    |proj g|=  9.14674D-04

At iterate  100    f=  1.42613D+00    |proj g|=  4.62708D-04

At iterate  150    f=  1.42536D+00    |proj g|=  4.03325D-04

At iterate  200    f=  1.42505D+00    |proj g|=  3.92440D-04

At iterate  250    f=  1.42475D+00    |proj g|=  3.45073D-04

At iterate  300    f=  1.42433D+00    |proj g|=  2.09959D-04

At iterate  350    f=  1.42420D+00    |proj g|=  3.04150D-04

At iterate  400    f=  1.42408D+00    |proj g|=  2.11131D-04

At iterate  450    f=  1.42377D+00    |proj g|=  6.31291D-04

At iterate  500    f=  1.42347D+00    |proj g|=  1.78322D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tn

Unnamed: 0,kl_div,accuracy
0,1.129247,0.399606
1,1.177647,0.415767
2,1.120705,0.477064
3,1.280833,0.418443
4,1.141157,0.434378


Unnamed: 0,importance,feature
9,8.288939,T5-O1.abs_bp_beta
5,0.858171,Fp1-F3.abs_bp_delta
16,0.607915,F4-C4.abs_bp_theta
6,0.462118,LP_10.16_mean_20s
15,0.306838,P4-O2.samp_en
4,0.234005,C4-P4.kurtosis
18,0.23054,T4-T6.rel_bp_theta
12,0.206549,RL_1.56_min_10m
11,0.151871,T6-O2.skewness
1,0.119545,F4-C4.samp_en


In [78]:
df_scores.mean()

kl_div      1.169918
accuracy    0.429052
dtype: object

In [8]:
#naive bayes eeg+kaggle spec features
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'], 
                        groups=df_train['patient_id'])
features=list(eeg_features)+list(spec_features)
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):
    n_classes=6
    print(f'running fold {i}')
    X_train=df_train.iloc[train_index].dropna()[features]
    Y_train=df_train.iloc[train_index].dropna().expert_consensus.map(codes)
    Y_train_proba=df_train.iloc[train_index].dropna()[vote_cats]

    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)
    
    weight=np.array(df_train.iloc[train_index].dropna().total_votes.apply(lambda x: min(x/3, 1.0)))
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(6, axis=0)
    
    X_test=df_train.iloc[test_index].dropna()[features]
    Y_test_cat=df_train.iloc[test_index].dropna().expert_consensus.map(codes)
    Y_test_proba=df_train.iloc[test_index].dropna()[vote_cats]

    pipe=Pipeline([ ('naive',  GaussianNB())])
    pipe.fit(X_upsampled,Y_direct, **{'naive__sample_weight': sample_weight})
    pred=pipe.predict_proba(X_test.values)
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred)
    df_scores.loc[i,'accuracy']=pipe.score(X_test.values, Y_test_cat.values)

display(df_scores)
display(df_scores.mean())
#df_feature_imp=pd.DataFrame({'importance': pipe['naive'].coef_[0], 'feature':features})
#display(df_feature_imp.sort_values(by='importance', ascending=False))

running fold 0
running fold 1
running fold 2
running fold 3
running fold 4


Unnamed: 0,kl_div,accuracy
0,30.588705,0.100693
1,31.440666,0.079348
2,28.196223,0.121873
3,26.995905,0.21848
4,31.661713,0.069591


kl_div      29.776642
accuracy     0.117997
dtype: object

In [88]:
#naive bayes on kaggle spec +mels features
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'], 
                        groups=df_train['patient_id'])
features=list(mels_features)+list(spec_features)
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):
    n_classes=6
    print(f'running fold {i}')
    X_train=df_train.iloc[train_index].dropna()[features]
    Y_train=df_train.iloc[train_index].dropna().expert_consensus.map(codes)
    Y_train_proba=df_train.iloc[train_index].dropna()[vote_cats]

    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)
    
    weight=np.array(df_train.iloc[train_index].dropna().total_votes.apply(lambda x: min(x/3, 1.0)))
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(6, axis=0)
    
    X_test=df_train.iloc[test_index].dropna()[features]
    Y_test_cat=df_train.iloc[test_index].dropna().expert_consensus.map(codes)
    Y_test_proba=df_train.iloc[test_index].dropna()[vote_cats]

    pipe=Pipeline([ ('naive',  GaussianNB())])
    pipe.fit(X_upsampled,Y_direct,**{'naive__sample_weight': sample_weight})
    pred=pipe.predict_proba(X_test.values)
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred)
    df_scores.loc[i,'accuracy']=pipe.score(X_test.values, Y_test_cat.values)

display(df_scores)
display(df_scores.mean())

running fold 0
running fold 1
running fold 2
running fold 3
running fold 4


Unnamed: 0,kl_div,accuracy
0,29.741907,0.133858
1,31.494096,0.069114
2,31.635889,0.070846
3,28.62301,0.15987
4,30.122617,0.127011


kl_div      30.323504
accuracy      0.11214
dtype: object

In [89]:
#naive bayes select features 
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'], 
                        groups=df_train['patient_id'])
features=select_features
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):
    n_classes=6
    print(f'running fold {i}')
    X_train=df_train.iloc[train_index].dropna()[features]
    Y_train=df_train.iloc[train_index].dropna().expert_consensus.map(codes)
    Y_train_proba=df_train.iloc[train_index].dropna()[vote_cats]

    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)
    
    weight=np.array(df_train.iloc[train_index].dropna().total_votes.apply(lambda x: min(x/3, 1.0)))
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(6, axis=0)
    
    X_test=df_train.iloc[test_index].dropna()[features]
    Y_test_cat=df_train.iloc[test_index].dropna().expert_consensus.map(codes)
    Y_test_proba=df_train.iloc[test_index].dropna()[vote_cats]

    pipe=Pipeline([ ('naive',  GaussianNB())])
    pipe.fit(X_upsampled,Y_direct, **{'naive__sample_weight': sample_weight})
    pred=pipe.predict_proba(X_test.values)
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred)
    df_scores.loc[i,'accuracy']=pipe.score(X_test.values, Y_test_cat.values)

display(df_scores)
display(df_scores.mean())

running fold 0
running fold 1
running fold 2
running fold 3
running fold 4


Unnamed: 0,kl_div,accuracy
0,7.778208,0.220472
1,10.100291,0.106911
2,9.739156,0.227829
3,12.960481,0.186747
4,9.275047,0.213802


kl_div      9.970637
accuracy    0.191152
dtype: object

In [90]:
#naive bayes on select features trained on expert consensus
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'], 
                        groups=df_train['patient_id'])
features=select_features
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):
    print(f'running fold {i}')
    X_train=df_train.iloc[train_index].dropna()[features]
    Y_train=df_train.iloc[train_index].dropna().expert_consensus.map(codes)
    X_test=df_train.iloc[test_index].dropna()[features]
    Y_test_cat=df_train.iloc[test_index].dropna().expert_consensus.map(codes)
    Y_test_proba=df_train.iloc[test_index].dropna()[vote_cats]

    weight=np.array(df_train.iloc[train_index].dropna().total_votes.apply(lambda x: min(x/3, 1.0)))

    pipe=Pipeline([ ('naive',  GaussianNB())])
    pipe.fit(X_train,Y_train, **{'naive__sample_weight': weight})
    pred=pipe.predict_proba(X_test)
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred, )
    df_scores.loc[i,'accuracy']=pipe.score(X_test, Y_test_cat)

display(df_scores)
display(df_scores.mean())
#df_feature_imp=pd.DataFrame({'importance': pipe['naive'].coef_[0], 'feature':features})
#display(df_feature_imp.sort_values(by='importance', ascending=False))

running fold 0
running fold 1
running fold 2
running fold 3
running fold 4


Unnamed: 0,kl_div,accuracy
0,14.829157,0.166339
1,17.520649,0.166847
2,17.896336,0.151886
3,16.205136,0.15987
4,17.140304,0.146486


kl_div      16.718316
accuracy     0.158285
dtype: object

In [91]:
#baseline model: predicting probability 1/6 for each category
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold


n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'], 
                        groups=df_train['patient_id'])
features=feature_cols
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):
    print(f'running fold {i}')

    Y_test_proba=df_train.iloc[test_index][vote_cats]
    pred=np.full(Y_test_proba.shape, 1/6, dtype=float)
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred)

display(df_scores)
display(df_scores.mean())

running fold 0
running fold 1
running fold 2
running fold 3
running fold 4


Unnamed: 0,kl_div,accuracy
0,1.42668,
1,1.486069,
2,1.504954,
3,1.455384,
4,1.457921,


kl_div      1.466202
accuracy         NaN
dtype: object

In [176]:
#catboost
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold


df_train=pd.read_parquet('all_features.parquet')
n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'], 
                        groups=df_train['patient_id'])
features=feature_cols
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):
    print(f'running fold {i}')
    X_train=df_train.iloc[train_index].dropna()[features]
    Y_train=df_train.iloc[train_index].dropna().expert_consensus.map(codes)
    X_test=df_train.iloc[test_index].dropna()[features]
    Y_test_cat=df_train.iloc[test_index].dropna().expert_consensus.map(codes)
    Y_test_proba=df_train.iloc[test_index].dropna()[vote_cats]

    pipe=Pipeline([ ('catboost',  CatBoostClassifier( loss_function='MultiClass'))])
    pipe.fit(X_train,Y_train)
    pred=pipe.predict_proba(X_test)
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred)
    df_scores.loc[i,'accuracy']=pipe.score(X_test, Y_test_cat)

display(df_scores)
df_feature_imp=pd.DataFrame({'importance': pipe['catboost'].get_feature_importance(), 'feature':features})
display(df_feature_imp.sort_values(by='importance', ascending=False))

running fold 0
Learning rate set to 0.088163
0:	learn: 1.7203624	total: 2.39s	remaining: 39m 45s
1:	learn: 1.6630793	total: 4.07s	remaining: 33m 50s
2:	learn: 1.6086344	total: 5.82s	remaining: 32m 15s
3:	learn: 1.5631464	total: 7.5s	remaining: 31m 8s
4:	learn: 1.5252434	total: 9.03s	remaining: 29m 56s
5:	learn: 1.4912376	total: 10.6s	remaining: 29m 9s
6:	learn: 1.4594607	total: 12.1s	remaining: 28m 40s
7:	learn: 1.4337732	total: 13.7s	remaining: 28m 21s
8:	learn: 1.4076658	total: 15.3s	remaining: 28m 2s
9:	learn: 1.3834512	total: 16.8s	remaining: 27m 45s
10:	learn: 1.3597124	total: 18.2s	remaining: 27m 19s
11:	learn: 1.3374807	total: 19.6s	remaining: 26m 57s
12:	learn: 1.3214106	total: 21.2s	remaining: 26m 49s
13:	learn: 1.3037892	total: 22.5s	remaining: 26m 27s
14:	learn: 1.2902312	total: 24.1s	remaining: 26m 25s
15:	learn: 1.2740490	total: 25.6s	remaining: 26m 17s
16:	learn: 1.2597554	total: 27.1s	remaining: 26m 4s
17:	learn: 1.2469098	total: 28.6s	remaining: 26m 1s
18:	learn: 1.2364

Unnamed: 0,kl_div,accuracy
0,0.926177,0.536417
1,0.863135,0.603132
2,0.932071,0.560143
3,0.919606,0.545412
4,0.88216,0.596952


Unnamed: 0,importance,feature
403,0.728428,T5-O1.skewness
329,0.678110,C4-P4.rel_bp_delta
321,0.567030,F4-C4.samp_en
430,0.560838,C4-P4.kurtosis
2177,0.481407,RL_1.56_min_10m
...,...,...
1752,0.000000,LL_16.21_mean_20s
1751,0.000000,LL_16.02_mean_20s
1749,0.000000,LL_15.63_mean_20s
1748,0.000000,LL_15.43_mean_20s


In [16]:
df_scores.mean()

kl_div      0.860031
accuracy    0.588024
dtype: object