Goal: Creating train and test sets 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
import matplotlib.pyplot as plt

Load the dataset containing the features that were extracted and saved by extract_time_frequency_univariate_features.py

In [2]:
# Read the table of extracted features; the path is relative to the notebook's working directory.
train = pd.read_csv('../../train_with_extracted_features.csv')

In [4]:
# (number of rows, number of columns) of the loaded table
train.shape

(106800, 464)

## Removing cases with NAN in extracted features

Checking for possible NaNs in the extracted features, caused by NaNs in the processed EEG signals

In [3]:
# Count the missing values in each row and store the count next to the features.
missing_per_row = train.isna().sum(axis=1)
train['na_count'] = missing_per_row

In [5]:
# Index labels of the rows that contain at least one NaN
has_missing = train['na_count'] > 0
idx_na = train.index[has_missing].tolist()

In [13]:
# Keep only the rows whose index label is not flagged as containing NaNs
is_complete_row = ~train.index.isin(idx_na)
train_filtered = train.loc[is_complete_row].copy()

In [14]:
# (number of rows, number of columns) left after dropping the rows with NaNs
train_filtered.shape

(103256, 464)

In [15]:
# Names of the expert vote columns: every column whose name contains '_vote'
vote_cols = list(filter(lambda name: '_vote' in name, train_filtered.columns))
vote_cols

['seizure_vote',
 'lpd_vote',
 'gpd_vote',
 'lrda_vote',
 'grda_vote',
 'other_vote']

## Select rows with non-overlapping 10s eeg windows.

We do this to avoid over-representing data points that have multiple time offsets close to one another. To achieve this, we first bin the offset seconds into 10s bins, and then select the first row in every other bin, thus ensuring the offset seconds are at least 10s apart.

In [None]:
# Assign every row to a 10-second offset bin: offsets in (-1, 9] -> bin 0, (9, 19] -> bin 1, ...
# The edges sit at -1, 9, 19, ... because pd.cut builds right-closed intervals.
bin_edges = [i*10 - 1 for i in range(339)]

# labels=False makes pd.cut return the integer position of each interval directly,
# so the bin number no longer has to be parsed out of the interval's string form.
offset_bin_codes = pd.cut(train_filtered['eeg_label_offset_seconds'], bins=bin_edges, labels=False)

# Offsets outside the bin edges come back as NaN; fail loudly instead of losing those rows later.
if offset_bin_codes.isna().any():
    raise ValueError('eeg_label_offset_seconds contains values outside the range covered by the offset bins')

train_filtered['offset_bins'] = offset_bin_codes.astype('int64')

Summing up expert votes within offset_bins for each eeg_id

In [None]:
# Build the (eeg_id, offset_bins) group index once and read both levels from it.
# The two results are positionally aligned: entry i of each describes the same group.
bin_index = (
    train_filtered[['eeg_id', 'offset_bins', 'seizure_vote']]
    .groupby(['eeg_id', 'offset_bins'])
    .seizure_vote
    .agg('sum')
    .index
)
eeg_ids = bin_index.get_level_values('eeg_id')
offsets = bin_index.get_level_values('offset_bins')

In [None]:
# Replace each row's vote counts with the summed votes of its (eeg_id, offset_bins) group,
# so every row of a bin carries the same totals.
# transform('sum') returns one row per original row, already aligned on the index, and only
# touches the vote columns (no other column of the frame is summed).
# NOTE: this cell is not idempotent - running it twice sums the already-summed votes again.
train_filtered[vote_cols] = (
    train_filtered
    .groupby(['eeg_id', 'offset_bins'])[vote_cols]
    .transform('sum')
)

In [None]:
# Sanity check: for every vote category, print how many (eeg_id, offset_bins) groups
# still contain differing vote counts (max - min > 0 within the group).
for cat in vote_cols:
    per_bin = train_filtered[['eeg_id', 'offset_bins', cat]].groupby(['eeg_id', 'offset_bins'])[cat]
    spread = per_bin.agg('max') - per_bin.agg('min')
    print((spread > 0).sum())

Adding votes from the odd offset_bins to prior even bins

In [None]:
# Fold the votes of every odd offset bin into the bin that precedes it for the same eeg_id.
# `eeg_ids` and `offsets` are positionally aligned: entry i of each describes one
# (eeg_id, offset_bins) group, ordered by offset bin within an eeg_id.
# NOTE: this cell is not idempotent - running it twice adds the odd-bin votes twice.

# Number of distinct offset bins per eeg_id, computed once instead of once per eeg_id
bins_per_eeg = eeg_ids.value_counts()

for eeg_id in set(eeg_ids):
    # an eeg_id with a single offset bin has nothing to merge
    if bins_per_eeg[eeg_id] > 1:
        # get the offset_bins of this eeg_id
        id_offsets = offsets[np.where(eeg_ids == eeg_id)[0]]
        rows_of_eeg = train_filtered['eeg_id'] == eeg_id
        # Start at position 1: the first bin has no prior bin. Starting at 0 would make
        # id_offsets[i-1] wrap around to the LAST bin whenever the first bin is odd.
        for i in range(1, len(id_offsets)):
            # The parity of the bin value is tested (not the parity of the position) because
            # an even bin can directly follow another even bin
            # (eg: eeg_id = 2428433259, 40 followed by 46 causing 47 to be at even index)
            if id_offsets[i] % 2 == 1:
                # vote counts of the odd bin (identical on all of its rows, so take the first)
                in_odd_bin = rows_of_eeg & (train_filtered['offset_bins'] == id_offsets[i])
                votes_to_add = train_filtered.loc[in_odd_bin, vote_cols].iloc[0, :]
                # add them to every row of the preceding bin
                # NOTE(review): the preceding bin is the previous bin present for this eeg_id,
                # which is not necessarily bin - 1 (e.g. bins 40 and 43) - confirm this is intended.
                in_prior_bin = rows_of_eeg & (train_filtered['offset_bins'] == id_offsets[i-1])
                train_filtered.loc[in_prior_bin, vote_cols] += votes_to_add

Remove overlap cases

In [None]:
# Keep only the rows that fall in even-numbered offset bins, then show how many remain
in_even_bin = (train_filtered['offset_bins'] % 2) == 0
train_filtered = train_filtered.loc[in_even_bin]
len(train_filtered)

In [None]:
# Collapse each (eeg_id, offset_bins) group to a single row (first non-null value per column)
# and turn the group keys back into ordinary columns; then show the resulting row count.
one_row_per_bin = train_filtered.groupby(['eeg_id', 'offset_bins']).first()
train_filtered = one_row_per_bin.reset_index()
len(train_filtered)

Calculate the total number of expert votes, and normalize votes to percentages.

In [None]:
# Total number of expert votes per row, kept as its own column
train_filtered['total_votes'] = train_filtered[vote_cols].sum(axis=1)
# Divide every vote column by the row total so the votes of a row become fractions
train_filtered[vote_cols] = train_filtered[vote_cols].div(train_filtered['total_votes'], axis=0)

## Add the features extracted from 10 min Kaggle and 50 sec EEG Spectrograms

In [None]:
# Load the features extracted from the 10 min Kaggle spectrograms.
# NOTE(review): both parquet paths are relative to the notebook's working directory,
# unlike the '../../' prefix used for the CSV above - confirm the files live here.
kaggle_spec = pd.read_parquet("train_features_from_kaggle_spec.parquet")
# Load the features extracted from the 50 sec EEG spectrograms.
eeg_spec = pd.read_parquet("train_features_from_eeg_spec.parquet")

In [None]:
# Left-join the EEG spectrogram features onto the univariate feature table,
# matching rows on the EEG id and the EEG label offset.
eeg_keys = ['eeg_id','eeg_label_offset_seconds']
df = train_filtered.merge(right=eeg_spec, on=eeg_keys, how='left')

In [None]:
# Left-join the Kaggle spectrogram features onto `df`, which already holds the univariate
# features plus the 50 sec EEG spectrogram features from the previous merge.
# Merging into `train_filtered` here would silently discard the EEG spectrogram features.
# NOTE: run this cell once after the EEG spectrogram merge; re-running it on its own would
# join the Kaggle features a second time.
df = df.merge(right=kaggle_spec, on=['spectrogram_id','spectrogram_label_offset_seconds'], how = 'left')

## StratifiedGroupKFold

Perform train test split stratified on expert consensus. Additionally, we make sure that the patient IDs in train and test sets are disjoint, and that the test set does not contain any row with fewer than 3 expert votes.

We use the first split of StratifiedGroupKFold in order to stratify on the expert consensus and separate patient IDs between the train and test sets.

First separate out the entries with total_votes == 1 or total_votes == 2, which are considered weak samples

In [None]:
# Rows with two or fewer expert votes in total, followed by their count
has_few_votes = df['total_votes'] <= 2
df_low_total_votes = df.loc[has_few_votes]
len(df_low_total_votes)

In [None]:
# Rows with more than two expert votes in total, followed by their count
has_enough_votes = df['total_votes'] > 2
df_high = df.loc[has_enough_votes]
len(df_high)

Use the set with total_votes >=3 for train/test split

In [None]:
# Stratify on expert_consensus and group by patient_id; only the first of the 10 folds
# is taken, and its held-out part becomes the test set.
sgkf = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=216)
fold_iter = sgkf.split(X = df_high, y=df_high['expert_consensus'], groups=df_high['patient_id'])
# positional (not label) row indices into df_high
train_idx_full, test_idx_full = next(fold_iter)

In [None]:
# Materialise the two sides of the split by position and report their shapes
df_high_train, df_high_test = (df_high.iloc[rows] for rows in (train_idx_full, test_idx_full))
print(df_high_train.shape, df_high_test.shape, df_high.shape)

Making sure that none of the patient IDs of rows with total votes < 3 are in the test set

In [None]:
# Patient IDs that ended up in the test set
bpids = df_high_test['patient_id'].tolist()
# Drop the low-vote rows belonging to those patients so they cannot leak into the train set
from_test_patient = df_low_total_votes['patient_id'].isin(bpids)
df_low_total_votes = df_low_total_votes.loc[~from_test_patient]
# Should display an empty set: no patient may appear on both sides
set(df_low_total_votes.patient_id.values) & set(df_high_test.patient_id.values)

Add back the rows with low total vote counts to only the train set, still keeping the patient IDs in the two sets disjoint.

In [None]:
# Train set: the remaining low-vote rows stacked on top of the high-vote train rows
df_train = pd.concat([df_low_total_votes, df_high_train])
# Test set: only the high-vote test rows (same object as df_high_test, not a copy)
df_test = df_high_test

In [None]:
# Restore the original row order (the concat above put the low-vote rows first)
df_train = df_train.sort_index()
df_train.shape

## Overview of train and test sets

In [None]:
# Class proportions of expert_consensus for the full data, the train set and the test set,
# separated by a blank line
splits = (df, df_train, df_test)
for position, frame in enumerate(splits):
    distribution = frame['expert_consensus'].value_counts(normalize=True)
    if position < len(splits) - 1:
        print(distribution, '\n')
    else:
        print(distribution)

In [None]:
# Bar charts of the expert_consensus class proportions in the full data, train and test sets.

# Proportions in the full data, ordered by class name; this fixes the class order of all panels.
full_props = df['expert_consensus'].value_counts(normalize=True).sort_index()
classes = list(full_props.index)

# Align every split to the same class list so each bar sits under the right label;
# a class that is absent from a split is drawn as 0 instead of shifting or breaking the plot.
train_props = df_train['expert_consensus'].value_counts(normalize=True).reindex(classes, fill_value=0.0)
test_props = df_test['expert_consensus'].value_counts(normalize=True).reindex(classes, fill_value=0.0)

panels = [
    ('Full data set', full_props),
    (f'Train set - {(df_train.shape[0] / df.shape[0]) * 100 : .2f}% of data', train_props),
    (f'Test set- {(df_test.shape[0] / df.shape[0]) * 100 : .2f}% of data', test_props),
]

fig, axs = plt.subplots(3, figsize=(8,10))
fig.subplots_adjust(hspace=0.3)

for ax, (title, props) in zip(axs, panels):
    ax.bar(classes, props.to_numpy())
    ax.set_title(title)
    # write the percentage on top of each bar
    for i, p in enumerate(props):
        ax.text(i, p, f'{p*100 : .2f}%', ha='center', va='bottom')

plt.show()

## Save the train and test sets

In [None]:
# Write both splits as gzip-compressed parquet files without the index
outputs = (
    (df_train, 'data/train.parquet'),
    (df_test, 'data/test.parquet'),
)
for frame, path in outputs:
    frame.to_parquet(path, index = False, compression = 'gzip')