In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

In [4]:
# Five-fold cross-validation; shuffling with a fixed seed keeps the fold
# assignment reproducible between runs.
n_splits = 5
kfold = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=7)

In [5]:
# Load the training table and preview its first five rows.
eeg_train = pd.read_parquet("train_eeg_plus_kaggle_spec.parquet")
eeg_train.head()

Unnamed: 0,eeg_id,offset_bins,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,...,RP_18.55_min_20s,RP_18.75_min_20s,RP_18.95_min_20s,RP_19.14_min_20s,RP_19.34_min_20s,RP_19.53_min_20s,RP_19.73_min_20s,RP_19.92_min_20s,expert_consensus_to_num,max_index_vote_cols
0,568657,0,0,0.0,789577333,0,0.0,1825637311,20654,Other,...,1.932,2.129,2.472,2.743,2.827,2.875,2.877,2.708,5,5
1,642382,0,0,0.0,14960202,12,1008.0,3254468733,5955,Other,...,0.121,0.124,0.119,0.114,0.093,0.101,0.099,0.092,5,5
2,642382,2,1,24.0,14960202,13,1032.0,2552357208,5955,Other,...,0.1,0.096,0.094,0.133,0.111,0.12,0.111,0.078,5,5
3,751790,0,0,0.0,618728447,4,908.0,2898467035,38549,GPD,...,0.119,0.117,0.113,0.111,0.095,0.097,0.099,0.091,2,2
4,778705,0,0,0.0,52296320,0,0.0,3255875127,40955,Other,...,0.226,0.256,0.252,0.244,0.272,0.281,0.27,0.23,5,5


In [6]:
# Feature columns start at position 17 and run up to (but not including) the
# encoded label column `expert_consensus_to_num`. The end of the slice is
# anchored on that column's name rather than on a negative offset so that the
# selection stays the same if this cell is re-run after more columns (such as
# `weight`) have been appended to `eeg_train`; with a fixed `-2` offset a
# re-run would pull the encoded label into the feature set.
# NOTE(review): the start position 17 is still positional - presumably the
# id / label / vote columns occupy positions 0-16; confirm against the schema.
feature_cols = eeg_train.columns[17:eeg_train.columns.get_loc('expert_consensus_to_num')]

In [7]:
# Per-class vote columns, listed explicitly rather than matched on the '_vote'
# substring, which would also pick up `total_votes` and `max_index_vote_cols`.
# NOTE(review): the order here is compared column-by-column against
# `predict_proba` output further down, so it presumably has to follow the
# classifier's class order - confirm before reordering.
vote_cols = [
    'gpd_vote',
    'grda_vote',
    'lpd_vote',
    'lrda_vote',
    'other_vote',
    'seizure_vote',
]

In [8]:
from scipy.stats import entropy

In [9]:
# Per-row weight: total_votes / 3, capped at 1, so rows with three or more
# votes get full weight. Computed column-wise instead of through a per-row
# Python lambda; the result is always a float column.
eeg_train['weight'] = (eeg_train['total_votes'] / 3).clip(upper=1)
eeg_train['weight'].head()

0    1.000000
1    0.333333
2    0.333333
3    0.333333
4    0.666667
Name: weight, dtype: float64

In [16]:
# Hyper-parameter grid: tree depths 14, 17 and 20 crossed with two forest sizes.
max_depths = range(14, 21, 3)
n_trees = [500, 1000]

# Per-fold scores, indexed [fold, depth index, forest-size index].
# rf_accs holds hold-out accuracy; rf_ces holds the weighted divergence between
# the vote distribution and the predicted class probabilities.
rf_accs = np.zeros((n_splits, len(max_depths), len(n_trees)))
rf_ces = np.zeros((n_splits, len(max_depths), len(n_trees)))

# Splits are grouped by patient_id and stratified on expert_consensus.
fold_splits = kfold.split(eeg_train, eeg_train.expert_consensus, groups=eeg_train.patient_id)
for i, (train_index, test_index) in enumerate(fold_splits):
    eeg_tt = eeg_train.iloc[train_index]
    eeg_ho = eeg_train.iloc[test_index]

    # Everything below depends only on the fold, not on the hyper-parameters,
    # so it is sliced once per fold instead of once per model.
    X_tt, y_tt = eeg_tt[feature_cols], eeg_tt.expert_consensus
    X_ho, y_ho = eeg_ho[feature_cols], eeg_ho.expert_consensus
    votes_ho = eeg_ho[vote_cols]
    weight_ho = eeg_ho['weight']
    # Number of rows drawn for each tree: 80% of the training fold.
    samples_per_tree = int(.8*len(eeg_tt))

    for j, max_depth in enumerate(max_depths):
        for k, n_estimators in enumerate(n_trees):
            print(i,j,k)
            rf = RandomForestClassifier(n_estimators = n_estimators,
                                        max_depth = max_depth,
                                        max_samples = samples_per_tree,
                                        random_state = 7,
                                        n_jobs = 16)

            rf.fit(X_tt, y_tt)

            # Run the forest over the hold-out rows once. The hard labels are
            # the arg-max class of these probabilities, which is what
            # `rf.predict` computes internally, so a second full pass over the
            # forest is avoided.
            pcs = rf.predict_proba(X_ho)
            pred = rf.classes_[pcs.argmax(axis=1)]

            rf_accs[i,j,k] = accuracy_score(y_ho, pred)

            # With a second argument, scipy's `entropy` returns the relative
            # entropy (KL divergence) of the votes from the predictions, row by
            # row; both inputs are normalised to sum to 1. Clipping keeps a
            # zero predicted probability from producing an infinite value.
            # NOTE(review): assumes the columns of `pcs` (ordered as
            # `rf.classes_`) line up with `vote_cols` - confirm.
            ent = entropy(votes_ho, pcs.clip(min=1e-15), axis=1)
            # NOTE(review): this is mean(ent * weight), i.e. divided by the
            # number of rows rather than by the sum of the weights - confirm
            # that this is the intended normalisation.
            rf_ces[i,j,k] = np.mean(ent*weight_ho)


0 0 0
0 0 1
0 1 0
0 1 1
0 2 0
0 2 1
1 0 0
1 0 1
1 1 0
1 1 1
1 2 0
1 2 1
2 0 0
2 0 1
2 1 0
2 1 1
2 2 0
2 2 1
3 0 0
3 0 1
3 1 0
3 1 1
3 2 0
3 2 1
4 0 0
4 0 1
4 1 0
4 1 1
4 2 0
4 2 1


In [17]:
# Accuracy averaged over the folds; rows follow max_depths, columns follow n_trees.
rf_accs.mean(axis=0)

array([[0.54319736, 0.54609077],
       [0.5489751 , 0.55082008],
       [0.55155956, 0.5533279 ]])

In [18]:
# Weighted divergence averaged over the folds; rows follow max_depths, columns follow n_trees.
rf_ces.mean(axis=0)

array([[0.86000861, 0.85853278],
       [0.85626742, 0.85371872],
       [0.8551223 , 0.85296949]])