In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

In [5]:
# Cross-validation setup: 5 stratified, group-aware folds, shuffled with a
# fixed seed so the fold assignment is repeatable between runs.
n_splits = 5
kfold = StratifiedGroupKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=7,
)

In [20]:
# Read the training table from the parquet file in the working directory.
eeg_train = pd.read_parquet(path="train_all_data.parquet")

In [26]:
# Model input columns: everything from positional column 465 onward.
# NOTE(review): 465 is a magic offset; presumably the columns before it hold
# ids, labels and vote counts -- confirm against the parquet schema.
#
# A later cell appends a 'weight' column (computed from total_votes) to
# eeg_train. Dropping it here is a no-op on a fresh run, but keeps this cell
# idempotent: re-running it after the weight cell must not silently turn that
# derived weight into a model feature.
feature_cols = eeg_train.columns[465:].drop('weight', errors='ignore')

In [27]:
# Per-class vote columns used as the target distribution when scoring.
# NOTE(review): the scoring cell pairs these columns positionally with the
# classifier's predict_proba output, so this order presumably has to match the
# classifier's class order -- confirm against the expert_consensus labels.
vote_cols = [
    'gpd_vote',
    'grda_vote',
    'lpd_vote',
    'lrda_vote',
    'other_vote',
    'seizure_vote',
]

In [28]:
from scipy.stats import entropy

In [29]:
# Per-row weight: total_votes / 3, capped at 1 (rows with 3 or more votes get
# full weight).
eeg_train['weight'] = (eeg_train['total_votes'] / 3).clip(upper=1)
eeg_train['weight'].head(5)

index
0    1.000000
1    0.333333
2    0.333333
3    0.333333
4    0.666667
Name: weight, dtype: float64

In [31]:
# Hyper-parameter grid for the random forest.
max_depths = range(17, 21, 3)  # yields depths 17 and 20
n_trees = [1000]

# Result grids indexed as [fold, max_depth, n_estimators].
grid_shape = (n_splits, len(max_depths), len(n_trees))
rf_accs = np.zeros(grid_shape)
rf_ces = np.zeros(grid_shape)

for fold_idx, (train_index, test_index) in enumerate(
    kfold.split(eeg_train, eeg_train.expert_consensus, groups=eeg_train.patient_id)
):
    # Training rows and hold-out rows for this fold.
    eeg_tt = eeg_train.iloc[train_index]
    eeg_ho = eeg_train.iloc[test_index]

    # Fold-level values that do not depend on the hyper-parameters.
    ho_features = eeg_ho[feature_cols]
    bootstrap_size = int(.8*len(eeg_tt))

    for depth_idx, max_depth in enumerate(max_depths):
        for tree_idx, n_estimators in enumerate(n_trees):
            # Progress marker: fold / depth / tree-count position in the grid.
            print(fold_idx, depth_idx, tree_idx)

            rf = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                max_samples=bootstrap_size,
                random_state=7,
                n_jobs=16,
            )
            rf.fit(eeg_tt[feature_cols], eeg_tt.expert_consensus)

            # Hard labels for accuracy, class probabilities for the
            # distribution-based score.
            pred = rf.predict(ho_features)
            pcs = rf.predict_proba(ho_features)

            rf_accs[fold_idx, depth_idx, tree_idx] = accuracy_score(eeg_ho.expert_consensus, pred)

            # entropy(pk, qk) is the relative entropy (KL divergence) of the
            # vote distribution from the predicted one, computed row by row.
            # Probabilities are clipped away from zero so the result stays finite.
            ent = entropy(eeg_ho[vote_cols], pcs.clip(min=1e-15), axis=1)

            # Scale each row's divergence by its weight, then average over the
            # hold-out rows (divides by the row count, not the weight sum).
            weighted_ent = ent * eeg_ho['weight']
            rf_ces[fold_idx, depth_idx, tree_idx] = np.mean(weighted_ent)


0 0 0
0 1 0
1 0 0
1 1 0
2 0 0
2 1 0
3 0 0
3 1 0
4 0 0
4 1 0


In [32]:
# Hold-out accuracy averaged over the folds (axis 0); one row per max_depth,
# one column per n_estimators value.
rf_accs.mean(axis=0)

array([[0.55763171],
       [0.55944496]])

In [33]:
# Weighted divergence score averaged over the folds (axis 0); one row per
# max_depth, one column per n_estimators value. Lower is better.
rf_ces.mean(axis=0)

array([[0.82170219],
       [0.82080242]])