In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np



sns.set_style('whitegrid')

In [19]:
from scipy.special import rel_entr

def kl_divergence(solution: np.ndarray, submission: np.ndarray, epsilon: float=10**-15, micro_average: bool=True, sample_weights: pd.Series =None):
    """Kullback-Leibler divergence of predicted from target class distributions.

    Parameters
    ----------
    solution : array-like, shape (n_samples, n_classes)
        Target distributions. May be an ndarray, a pandas DataFrame or any
        nested sequence; it is cast to float before use.
    submission : array-like, shape (n_samples, n_classes)
        Predicted distributions. Values are clipped to ``[epsilon, 1]``.
    epsilon : float
        Lower clipping bound for predictions; keeps the divergence finite when
        a prediction is exactly 0 for a class with non-zero target mass.
    micro_average : bool
        If True, sum the element-wise relative entropy over classes (axis 1)
        and average over samples, optionally weighted by ``sample_weights``.
        If False, average over samples per class (axis 0) and then average the
        per-class values; ``sample_weights`` is NOT used in this branch.
    sample_weights : array-like of shape (n_samples,), optional
        Per-sample weights for the micro-averaged score.

    Returns
    -------
    float
        The averaged KL divergence.
    """
    # Cast to float (prevents issues with integer-typed targets). Objects
    # without ``.astype`` (e.g. nested lists) are converted to an ndarray first;
    # ndarrays and pandas objects keep their original type.
    if hasattr(solution, "astype"):
        solution = solution.astype(float)
    else:
        solution = np.asarray(solution, dtype=float)

    # Clipping the min prevents users from playing games with the 20th decimal
    # place of predictions and avoids an infinite loss for zero predictions.
    submission = np.clip(submission, epsilon, 1)

    # rel_entr(x, y) = x * log(x / y), and 0 where x == 0 (scipy convention).
    pointwise = rel_entr(solution, submission)

    if micro_average:
        return np.average(pointwise.sum(axis=1), weights=sample_weights)
    return np.average(pointwise.mean(axis=0))

In [None]:
# Base training table (minimal filtering, no engineered features).
df = pd.read_csv('../../train_final_less_filtering.csv')


def _index_by_base_row_id(feature_frame):
    """Re-key `feature_frame` by the 'Unnamed: 0' row id of the base table `df`.

    Rows are matched on 'label_id' with an inner merge; the resulting frame is
    indexed by the base table's 'Unnamed: 0' values and the index name is cleared.
    """
    keyed = df[['Unnamed: 0', 'label_id']].merge(
        feature_frame, how='inner', left_on='label_id', right_on='label_id'
    )
    keyed = keyed.set_index('Unnamed: 0')
    keyed.index.name = None
    return keyed


# EEG-derived features, re-indexed so they share the row ids used by the
# kaggle-spectrogram feature file loaded below.
df_train = _index_by_base_row_id(
    pd.read_csv('../../features_folder/merged_votes_no_overlap_filter_before_split/train.csv', index_col=0)
)

# Kaggle-spectrogram-derived features, restricted to the rows present in df_train.
df_train2 = pd.read_parquet('../../train_features_from_kaggle_spec.parquet').loc[df_train.index]

# Features from spectrograms computed from the EEGs; only the columns from
# position 465 onward are kept.
df_train3 = _index_by_base_row_id(
    pd.read_parquet('../../features_folder/features_from_eeg_spectrograms_without_kaggle_spec_data/train_mrgd_votes_no_ovlp_bfsplt_feats_from_eeg_to_spec.parquet')
)
df_train3 = df_train3[df_train3.columns[465:]]

# Merge everything into one frame: kaggle-spec columns (skipping the first 3)
# and the EEG-spectrogram columns are left-joined onto df_train by index.
df_train = (
    df_train
    .join(other=df_train2[df_train2.columns[3:]], how='left')
    .join(other=df_train3, how='left')
)
df_train

In [None]:
# Show the base-table rows whose merged feature row contains at least one NaN.
# (Author's note: these come from the kaggle spec-derived features; 30 rows in total.)
has_nan = df_train.isna().sum(axis=1) > 0
nan_row_ids = df_train[has_nan].index
df.set_index('Unnamed: 0').loc[nan_row_ids]

In [21]:
# Names of all feature columns: every column of df_train from position 17 onward
# (the leading 17 are presumably id / vote / metadata columns — TODO confirm).
feature_cols = list(df_train.columns[17:])

# Kaggle-spectrogram features: df_train2 without its first 3 columns.
spec_features = df_train2.columns[3:]

# EEG features: the first 448 entries of feature_cols.
eeg_features = feature_cols[:448]

# Features computed from the EEG-derived spectrograms.
mels_features = df_train3.columns

# Columns holding the per-class vote probabilities.
vote_cats = [
    'seizure_vote',
    'lpd_vote',
    'gpd_vote',
    'lrda_vote',
    'grda_vote',
    'other_vote',
]

# Integer encoding of the expert consensus label; the order matches vote_cats.
codes = {
    'Seizure': 0,
    'LPD': 1,
    'GPD': 2,
    'LRDA': 3,
    'GRDA': 4,
    'Other': 5,
}


In [6]:
# Hand-picked subset of feature column names used by the "select features" models.
select_features = [
    "T5-O1.skewness",
    "F4-C4.samp_en",
    "C4-P4.samp_en",
    "T5-O1.rel_bp_delta",
    "C4-P4.kurtosis",
    "Fp1-F3.abs_bp_delta",
    "LP_10.16_mean_20s",
    "F3-C3.bp_delta_theta",
    "F3-C3.rel_bp_delta",
    "T5-O1.abs_bp_beta",
    "F4-C4.abs_bp_delta",
    "T6-O2.skewness",
    "RL_1.56_min_10m",
    "T5-O1.bp_delta_beta",
    "eeg_std_f302_10s",
    "P4-O2.samp_en",
    "F4-C4.abs_bp_theta",
    "RP_19.92_min_10m",
    "T4-T6.rel_bp_theta",
    "LP_3.52_min_10m",
    "C4-P4.rel_bp_delta",
    "T3-T5.rel_bp_delta",
]

In [None]:
#logistic regression on all features 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler


# 5-fold cross-validation, stratified on expert_consensus and grouped by patient_id.
n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'],
               groups=df_train['patient_id'])

features=list(mels_features)+list(eeg_features)+list(spec_features)

# One row per fold; filled in inside the loop.
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):

    print(f'running fold {i}')

    # Drop incomplete rows once per split instead of once per derived variable.
    # Note: dropna() removes rows with a NaN in ANY column, not only in `features`.
    train_fold=df_train.iloc[train_index].dropna()
    test_fold=df_train.iloc[test_index].dropna()

    X_train=train_fold[features]
    Y_train=train_fold.expert_consensus.map(codes)  # hard labels; not used by the soft-label fit below
    Y_train_proba=train_fold[vote_cats]

    # Soft-label training: repeat every row once per class and weight copy k of a
    # row by that row's vote probability for class k.
    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)

    # Row confidence: total_votes / 3, capped at 1.0.
    weight=np.minimum(train_fold.total_votes.to_numpy()/3, 1.0)
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(n_classes, axis=0)

    X_test=test_fold[features]
    Y_test_cat=test_fold.expert_consensus.map(codes)
    Y_test_proba=test_fold[vote_cats]

    pipel=Pipeline([('scale', StandardScaler()), ('logreg',  LogisticRegression(max_iter=1000))])
    pipel.fit(X_upsampled, Y_direct, logreg__sample_weight=sample_weight)
    pred=pipel.predict_proba(X_test.values)

    # The same vote-count weighting is applied to the test rows when scoring KL.
    weight=np.minimum(test_fold.total_votes.to_numpy()/3, 1.0)
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred, sample_weights=weight)
    df_scores.loc[i,'accuracy']=pipel.score(X_test.values, Y_test_cat.values)

display(df_scores)
display(df_scores.mean())

In [None]:
#logistic regression on kaggle spec+ mels spec features
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler


# 5-fold cross-validation, stratified on expert_consensus and grouped by patient_id.
n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'],
               groups=df_train['patient_id'])

features=list(mels_features)+list(spec_features)

# One row per fold; filled in inside the loop.
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):

    print(f'running fold {i}')

    # Drop incomplete rows once per split instead of once per derived variable.
    # Note: dropna() removes rows with a NaN in ANY column, not only in `features`.
    train_fold=df_train.iloc[train_index].dropna()
    test_fold=df_train.iloc[test_index].dropna()

    X_train=train_fold[features]
    Y_train=train_fold.expert_consensus.map(codes)  # hard labels; not used by the soft-label fit below
    Y_train_proba=train_fold[vote_cats]

    # Soft-label training: repeat every row once per class and weight copy k of a
    # row by that row's vote probability for class k.
    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)

    # Row confidence: total_votes / 3, capped at 1.0.
    weight=np.minimum(train_fold.total_votes.to_numpy()/3, 1.0)
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(n_classes, axis=0)

    X_test=test_fold[features]
    Y_test_cat=test_fold.expert_consensus.map(codes)
    Y_test_proba=test_fold[vote_cats]

    # Unlike the other logistic-regression cells, this model is fit without a penalty.
    pipel=Pipeline([('scale', StandardScaler()), ('logreg',  LogisticRegression(penalty=None,max_iter=1000))])
    pipel.fit(X_upsampled, Y_direct, logreg__sample_weight=sample_weight)
    pred=pipel.predict_proba(X_test.values)

    # The same vote-count weighting is applied to the test rows when scoring KL.
    weight=np.minimum(test_fold.total_votes.to_numpy()/3, 1.0)
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred, sample_weights=weight)
    df_scores.loc[i,'accuracy']=pipel.score(X_test.values, Y_test_cat.values)

display(df_scores)
display(df_scores.mean())

In [None]:
#logistic regression on select features
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler


# 5-fold cross-validation, stratified on expert_consensus and grouped by patient_id.
n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'],
               groups=df_train['patient_id'])

features=select_features

# One row per fold; filled in inside the loop.
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):

    print(f'running fold {i}')

    # Drop incomplete rows once per split instead of once per derived variable.
    # Note: dropna() removes rows with a NaN in ANY column, so rows can be dropped
    # for a missing value in a column that is not part of `features`.
    train_fold=df_train.iloc[train_index].dropna()
    test_fold=df_train.iloc[test_index].dropna()

    X_train=train_fold[features]
    Y_train=train_fold.expert_consensus.map(codes)  # hard labels; not used by the soft-label fit below
    Y_train_proba=train_fold[vote_cats]

    # Soft-label training: repeat every row once per class and weight copy k of a
    # row by that row's vote probability for class k.
    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)

    # Row confidence: total_votes / 3, capped at 1.0.
    weight=np.minimum(train_fold.total_votes.to_numpy()/3, 1.0)
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(n_classes, axis=0)

    X_test=test_fold[features]
    Y_test_cat=test_fold.expert_consensus.map(codes)
    Y_test_proba=test_fold[vote_cats]

    pipel=Pipeline([('scale', StandardScaler()), ('logreg',  LogisticRegression(max_iter=1000))])
    pipel.fit(X_upsampled, Y_direct, logreg__sample_weight=sample_weight)
    pred=pipel.predict_proba(X_test.values)

    # The same vote-count weighting is applied to the test rows when scoring KL.
    weight=np.minimum(test_fold.total_votes.to_numpy()/3, 1.0)
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred, sample_weights=weight)
    df_scores.loc[i,'accuracy']=pipel.score(X_test.values, Y_test_cat.values)

display(df_scores)
display(df_scores.mean())

In [None]:
#naive bayes eeg+kaggle spec features
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

# 5-fold cross-validation, stratified on expert_consensus and grouped by patient_id.
n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'],
               groups=df_train['patient_id'])
features=list(eeg_features)+list(spec_features)
# One row per fold; filled in inside the loop.
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):
    print(f'running fold {i}')

    # Drop incomplete rows once per split instead of once per derived variable.
    # Note: dropna() removes rows with a NaN in ANY column, not only in `features`.
    train_fold=df_train.iloc[train_index].dropna()
    test_fold=df_train.iloc[test_index].dropna()

    X_train=train_fold[features]
    Y_train=train_fold.expert_consensus.map(codes)  # hard labels; not used by the soft-label fit below
    Y_train_proba=train_fold[vote_cats]

    # Soft-label training: repeat every row once per class and weight copy k of a
    # row by that row's vote probability for class k.
    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)

    # Row confidence: total_votes / 3, capped at 1.0.
    weight=np.minimum(train_fold.total_votes.to_numpy()/3, 1.0)
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(n_classes, axis=0)

    X_test=test_fold[features]
    Y_test_cat=test_fold.expert_consensus.map(codes)
    Y_test_proba=test_fold[vote_cats]

    pipe=Pipeline([ ('naive',  GaussianNB())])
    pipe.fit(X_upsampled, Y_direct, naive__sample_weight=sample_weight)
    pred=pipe.predict_proba(X_test.values)
    # NOTE(review): KL is unweighted here, whereas the logistic-regression cells pass
    # vote-count sample_weights; the kl_div columns are therefore not on the same scale.
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred)
    df_scores.loc[i,'accuracy']=pipe.score(X_test.values, Y_test_cat.values)

display(df_scores)
display(df_scores.mean())

In [None]:
#naive bayes on kaggle spec +mels features
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

# 5-fold cross-validation, stratified on expert_consensus and grouped by patient_id.
n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'],
               groups=df_train['patient_id'])
features=list(mels_features)+list(spec_features)
# One row per fold; filled in inside the loop.
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):
    print(f'running fold {i}')

    # Drop incomplete rows once per split instead of once per derived variable.
    # Note: dropna() removes rows with a NaN in ANY column, not only in `features`.
    train_fold=df_train.iloc[train_index].dropna()
    test_fold=df_train.iloc[test_index].dropna()

    X_train=train_fold[features]
    Y_train=train_fold.expert_consensus.map(codes)  # hard labels; not used by the soft-label fit below
    Y_train_proba=train_fold[vote_cats]

    # Soft-label training: repeat every row once per class and weight copy k of a
    # row by that row's vote probability for class k.
    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)

    # Row confidence: total_votes / 3, capped at 1.0.
    weight=np.minimum(train_fold.total_votes.to_numpy()/3, 1.0)
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(n_classes, axis=0)

    X_test=test_fold[features]
    Y_test_cat=test_fold.expert_consensus.map(codes)
    Y_test_proba=test_fold[vote_cats]

    pipe=Pipeline([ ('naive',  GaussianNB())])
    pipe.fit(X_upsampled, Y_direct, naive__sample_weight=sample_weight)
    pred=pipe.predict_proba(X_test.values)
    # NOTE(review): KL is unweighted here, whereas the logistic-regression cells pass
    # vote-count sample_weights; the kl_div columns are therefore not on the same scale.
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred)
    df_scores.loc[i,'accuracy']=pipe.score(X_test.values, Y_test_cat.values)

display(df_scores)
display(df_scores.mean())

In [None]:
#naive bayes select features 
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

# 5-fold cross-validation, stratified on expert_consensus and grouped by patient_id.
n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'],
               groups=df_train['patient_id'])
features=select_features
# One row per fold; filled in inside the loop.
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):
    print(f'running fold {i}')

    # Drop incomplete rows once per split instead of once per derived variable.
    # Note: dropna() removes rows with a NaN in ANY column, so rows can be dropped
    # for a missing value in a column that is not part of `features`.
    train_fold=df_train.iloc[train_index].dropna()
    test_fold=df_train.iloc[test_index].dropna()

    X_train=train_fold[features]
    Y_train=train_fold.expert_consensus.map(codes)  # hard labels; not used by the soft-label fit below
    Y_train_proba=train_fold[vote_cats]

    # Soft-label training: repeat every row once per class and weight copy k of a
    # row by that row's vote probability for class k.
    n_samples, n_classes=Y_train_proba.shape
    X_upsampled = np.array(X_train).repeat(n_classes, axis=0)
    Y_direct = np.tile(range(n_classes), n_samples)

    # Row confidence: total_votes / 3, capped at 1.0.
    weight=np.minimum(train_fold.total_votes.to_numpy()/3, 1.0)
    sample_weight = np.array(Y_train_proba).ravel()*weight.repeat(n_classes, axis=0)

    X_test=test_fold[features]
    Y_test_cat=test_fold.expert_consensus.map(codes)
    Y_test_proba=test_fold[vote_cats]

    pipe=Pipeline([ ('naive',  GaussianNB())])
    pipe.fit(X_upsampled, Y_direct, naive__sample_weight=sample_weight)
    pred=pipe.predict_proba(X_test.values)
    # NOTE(review): KL is unweighted here, whereas the logistic-regression cells pass
    # vote-count sample_weights; the kl_div columns are therefore not on the same scale.
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred)
    df_scores.loc[i,'accuracy']=pipe.score(X_test.values, Y_test_cat.values)

display(df_scores)
display(df_scores.mean())

In [None]:
#naive bayes on select features trained on expert consensus
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

# 5-fold cross-validation, stratified on expert_consensus and grouped by patient_id.
n_splits=5
sgkf=StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen=sgkf.split(X=df_train, y=df_train['expert_consensus'],
               groups=df_train['patient_id'])
features=select_features
# One row per fold; filled in inside the loop.
df_scores=pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)


for i, (train_index, test_index) in enumerate(gen):
    print(f'running fold {i}')

    # Drop incomplete rows once per split instead of once per derived variable.
    # Note: dropna() removes rows with a NaN in ANY column, so rows can be dropped
    # for a missing value in a column that is not part of `features`.
    train_fold=df_train.iloc[train_index].dropna()
    test_fold=df_train.iloc[test_index].dropna()

    # Hard-label training: the target is the integer-encoded expert consensus.
    X_train=train_fold[features]
    Y_train=train_fold.expert_consensus.map(codes)
    X_test=test_fold[features]
    Y_test_cat=test_fold.expert_consensus.map(codes)
    Y_test_proba=test_fold[vote_cats]

    # Row confidence: total_votes / 3, capped at 1.0.
    weight=np.minimum(train_fold.total_votes.to_numpy()/3, 1.0)

    pipe=Pipeline([ ('naive',  GaussianNB())])
    pipe.fit(X_train, Y_train, naive__sample_weight=weight)
    # NOTE(review): predict_proba has one column per class seen in Y_train; the KL
    # call below assumes every fold's training part contains all six classes.
    pred=pipe.predict_proba(X_test)
    # NOTE(review): KL is unweighted here, whereas the logistic-regression cells pass
    # vote-count sample_weights; the kl_div columns are therefore not on the same scale.
    df_scores.loc[i,'kl_div']=kl_divergence(Y_test_proba, pred)
    df_scores.loc[i,'accuracy']=pipe.score(X_test, Y_test_cat)

display(df_scores)
display(df_scores.mean())

In [None]:
#baseline model: predicting probability 1/6 for each category
from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold


# Same 5-fold split as the model cells: stratified on expert_consensus,
# grouped by patient_id, fixed seed.
n_splits = 5
sgkf = StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen = sgkf.split(
    X=df_train,
    y=df_train['expert_consensus'],
    groups=df_train['patient_id'],
)
features = feature_cols
df_scores = pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)

# NOTE(review): rows with NaN features are not dropped here (the model cells call
# dropna()), and the 'accuracy' column is never filled for this baseline.
for i, (train_index, test_index) in enumerate(gen):
    print(f'running fold {i}')

    # Uniform prediction: the same probability 1/6 for every class of every test row.
    Y_test_proba = df_train.iloc[test_index][vote_cats]
    pred = np.full(Y_test_proba.shape, 1/6, dtype=float)
    fold_kl = kl_divergence(Y_test_proba, pred)
    df_scores.loc[i, 'kl_div'] = fold_kl

display(df_scores)
display(df_scores.mean())

In [None]:
#baseline model: predicting probability 
# Other      0.29
# Seizure    0.23
# GRDA       0.15
# LPD        0.15
# GPD        0.11
# LRDA       0.07
#according to the data distribution


from sklearn.pipeline import Pipeline, FunctionTransformer, FeatureUnion
from sklearn.model_selection import StratifiedGroupKFold


# Same 5-fold split as the model cells: stratified on expert_consensus,
# grouped by patient_id, fixed seed.
n_splits = 5
sgkf = StratifiedGroupKFold(n_splits=n_splits, random_state=216, shuffle=True)
gen = sgkf.split(
    X=df_train,
    y=df_train['expert_consensus'],
    groups=df_train['patient_id'],
)
features = feature_cols
df_scores = pd.DataFrame(index=list(range(n_splits)), columns=['kl_div', 'accuracy'], data=None)

# NOTE(review): rows with NaN features are not dropped here (the model cells call
# dropna()), and the 'accuracy' column is never filled for this baseline.
for i, (train_index, test_index) in enumerate(gen):
    print(f'running fold {i}')

    # Constant prediction: the fixed class distribution below, listed in vote_cats
    # order (seizure, lpd, gpd, lrda, grda, other), is broadcast to every test row.
    Y_test_proba = df_train.iloc[test_index][vote_cats]
    pred = np.full(
        Y_test_proba.shape,
        [0.23, 0.15, 0.11, 0.07, 0.15, 0.29],
        dtype=float,
    )
    fold_kl = kl_divergence(Y_test_proba, pred)
    df_scores.loc[i, 'kl_div'] = fold_kl

display(df_scores)
display(df_scores.mean())