In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Apply seaborn's "whitegrid" theme so that every figure drawn later in the
# notebook shares the same look.
sns.set_theme(style="whitegrid")

In [2]:
# Folder prefixes (trailing slash included) for the per-recording parquet files.
# File names are built later as f'{PREFIX}{id}.parquet'.
EEG_PATH = 'train_eegs/'
SPEC_PATH = 'train_spectrograms/'

# Label metadata: one row per labelled window, referencing an EEG file and a
# spectrogram file by id.
train = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Load the EEG recording referenced by the first metadata row and preview it.
first_eeg_id = train.iloc[0].eeg_id
first_eeg_file = f'{EEG_PATH}{first_eeg_id}.parquet'
eeg = pd.read_parquet(first_eeg_file)
eeg.head()

Unnamed: 0,Fp1,F3,C3,P3,F7,T3,T5,O1,Fz,Cz,Pz,Fp2,F4,C4,P4,F8,T4,T6,O2,EKG
0,-80.519997,-70.540001,-80.110001,-108.750000,-120.330002,-88.620003,-101.750000,-104.489998,-99.129997,-90.389999,-97.040001,-77.989998,-88.830002,-112.120003,-108.110001,-95.949997,-98.360001,-121.730003,-106.449997,7.920000
1,-80.449997,-70.330002,-81.760002,-107.669998,-120.769997,-90.820000,-104.260002,-99.730003,-99.070000,-92.290001,-96.019997,-84.500000,-84.989998,-115.610001,-103.860001,-97.470001,-89.290001,-115.500000,-102.059998,29.219999
2,-80.209999,-75.870003,-82.050003,-106.010002,-117.500000,-87.489998,-99.589996,-96.820000,-119.680000,-99.360001,-91.110001,-99.440002,-104.589996,-127.529999,-113.349998,-95.870003,-96.019997,-123.879997,-105.790001,45.740002
3,-84.709999,-75.339996,-87.480003,-108.970001,-121.410004,-94.750000,-105.370003,-100.279999,-113.839996,-102.059998,-95.040001,-99.230003,-101.220001,-125.769997,-111.889999,-97.459999,-97.180000,-128.940002,-109.889999,83.870003
4,-90.570000,-80.790001,-93.000000,-113.870003,-129.960007,-102.860001,-118.599998,-101.099998,-107.660004,-102.339996,-98.510002,-95.300003,-88.930000,-115.639999,-99.800003,-97.500000,-88.730003,-114.849998,-100.250000,97.769997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17995,-144.660004,-147.809998,-129.820007,-129.460007,-157.509995,-124.000000,-124.570000,-94.820000,-153.070007,-121.110001,-86.459999,-132.520004,-138.339996,-128.970001,-71.300003,-114.480003,-86.709999,-114.959999,-81.500000,-20.070000
17996,-140.880005,-153.000000,-129.529999,-129.020004,-154.059998,-131.220001,-128.380005,-95.000000,-140.820007,-114.639999,-84.379997,-115.339996,-119.230003,-114.709999,-70.989998,-92.129997,-79.639999,-116.139999,-81.879997,10.600000
17997,-133.729996,-141.770004,-121.900002,-122.370003,-158.750000,-123.550003,-127.730003,-93.089996,-125.230003,-106.489998,-83.419998,-112.720001,-103.209999,-107.629997,-61.869999,-97.910004,-77.150002,-106.500000,-75.339996,-2.060000
17998,-141.449997,-151.139999,-127.190002,-128.699997,-163.460007,-124.309998,-129.479996,-94.419998,-140.869995,-113.339996,-83.519997,-129.300003,-118.650002,-117.589996,-71.879997,-99.279999,-83.900002,-116.160004,-81.410004,2.820000


Find EEG recordings that contain NaN values

In [22]:
# Collect the ids of all EEG recordings that contain at least one NaN.
#
# Many rows of `train` point at the same EEG file, so we iterate over the
# *unique* ids: every parquet file is read exactly once instead of once per
# metadata row (the per-row version was too slow to finish).
eeg_na = set()  # a set keeps each EEG id only once
for eeg_id in tqdm(train.eeg_id.unique()):
    # .any().any(): first collapse rows per column, then collapse the columns
    if pd.read_parquet(f'{EEG_PATH}{eeg_id}.parquet').isnull().any().any():
        eeg_na.add(int(eeg_id))  # store plain Python ints rather than numpy scalars

KeyboardInterrupt: 

In [73]:
# Count, for every labelled window in `train`, how many EEG rows contain NaNs
#   * in the full 50 sec window that starts at the label offset, and
#   * in the middle 10 sec of that window (seconds 20-30).
# Both lists are positionally aligned with the rows of `train`; windows without
# NaNs (and windows of NaN-free recordings) keep the initial value 0.

EEG_ROWS_PER_SEC = 200    # rows per second assumed by the window arithmetic
EEG_WINDOW_SEC = 50       # length of the labelled EEG window
EEG_CENTER_SLICE = slice(20 * EEG_ROWS_PER_SEC, 30 * EEG_ROWS_PER_SEC)  # rows 4000:6000

sec50_nan_row_count = [0]*len(train)    # number of rows with NaNs in the 50 sec windows
sec10_nan_row_count = [0]*len(train)    # number of rows with NaNs in the middle 10 sec windows

eeg_offsets_sec = train.eeg_label_offset_seconds.to_numpy()

# One groupby pass yields, per EEG id, the *positions* of its rows in `train`.
# This replaces a full boolean-mask scan of `train` per id and stays correct
# even when `train` does not carry a default 0..n-1 index.
for eeg_id, train_positions in tqdm(train.groupby('eeg_id').indices.items()):
    # get the eeg file
    eeg = pd.read_parquet(f'{EEG_PATH}{eeg_id}.parquet')
    # per-row NaN flag, computed once per recording and reused for every window
    eeg_row_has_nan = eeg.isnull().any(axis=1).to_numpy()
    if not eeg_row_has_nan.any():
        continue    # recording has no NaNs: all its windows keep the count 0

    for pos in train_positions:
        # first row of the 50 sec window for this label
        first_row = int(eeg_offsets_sec[pos]) * EEG_ROWS_PER_SEC
        window_flags = eeg_row_has_nan[first_row:first_row + EEG_WINDOW_SEC * EEG_ROWS_PER_SEC]
        # note number of rows with NaNs in the whole window
        sec50_nan_row_count[pos] = int(window_flags.sum())
        # note number of rows with NaNs in the middle 10 sec
        sec10_nan_row_count[pos] = int(window_flags[EEG_CENTER_SLICE].sum())


100%|██████████| 17089/17089 [23:43<00:00, 12.00it/s] 


Spectrograms

In [98]:
spectrogram = pd.read_parquet(f'{SPEC_PATH}{train.iloc[0].spectrogram_id}.parquet')
spectrogram.head()

Unnamed: 0,time,LL_0.59,LL_0.78,LL_0.98,LL_1.17,LL_1.37,LL_1.56,LL_1.76,LL_1.95,LL_2.15,...,RP_18.16,RP_18.36,RP_18.55,RP_18.75,RP_18.95,RP_19.14,RP_19.34,RP_19.53,RP_19.73,RP_19.92
0,1,4.26,10.98,9.05,13.65,11.49,8.93,18.84,19.26,19.24,...,0.31,0.17,0.28,0.19,0.24,0.27,0.29,0.16,0.22,0.19
1,3,2.65,3.97,12.18,13.26,14.21,13.23,9.65,8.11,11.28,...,0.15,0.13,0.14,0.24,0.24,0.36,0.35,0.31,0.36,0.4
2,5,4.18,4.53,8.77,14.26,13.36,16.559999,19.219999,17.51,22.65,...,0.29,0.21,0.16,0.25,0.28,0.28,0.34,0.48,0.44,0.48
3,7,2.41,3.21,4.92,8.07,5.97,12.42,10.82,14.96,21.809999,...,0.33,0.51,0.49,0.64,0.58,0.42,0.32,0.31,0.32,0.33
4,9,2.29,2.44,2.77,4.62,5.39,7.08,9.84,12.27,14.41,...,0.44,0.38,0.48,0.63,0.45,0.45,0.49,0.33,0.31,0.34


In [99]:
# Count, for every labelled window in `train`, how many spectrogram rows contain
# NaNs
#   * in the full 600 sec window that starts at the label offset, and
#   * in the 6 central rows of that window.
# The offset arithmetic treats one spectrogram row as 2 sec, so the 6 central
# rows span 12 sec (294-306 s), i.e. the whole rows that cover the central
# 10 sec. The "10sec" list name is kept because later cells refer to it.
# Both lists are positionally aligned with the rows of `train`; windows without
# NaNs keep the initial value 0.

SPEC_SEC_PER_ROW = 2                 # seconds per spectrogram row used by the offset arithmetic
SPEC_WINDOW_ROWS = 300               # 600 sec window / 2 sec per row
SPEC_CENTER_SLICE = slice(147, 153)  # 6 central rows of the 300-row window

spectrogram_600sec_nan_row_count = [0]*len(train)    # number of rows with NaNs in the 600 sec windows
spectrogram_10sec_nan_row_count = [0]*len(train)     # number of rows with NaNs in the 6 central rows (12 sec)

spec_offsets_sec = train.spectrogram_label_offset_seconds.to_numpy()

# One groupby pass yields, per spectrogram id, the *positions* of its rows in
# `train`. This replaces a full boolean-mask scan of `train` per id and stays
# correct even when `train` does not carry a default 0..n-1 index.
for spec_id, train_positions in tqdm(train.groupby('spectrogram_id').indices.items()):
    # get the spectrogram file
    spec = pd.read_parquet(f'{SPEC_PATH}{spec_id}.parquet')
    # per-row NaN flag, computed once per spectrogram and reused for every window
    spec_row_has_nan = spec.isnull().any(axis=1).to_numpy()
    if not spec_row_has_nan.any():
        continue    # spectrogram has no NaNs: all its windows keep the count 0

    for pos in train_positions:
        # first row of the 600 sec window for this label
        first_row = int(spec_offsets_sec[pos] / SPEC_SEC_PER_ROW)
        window_flags = spec_row_has_nan[first_row:first_row + SPEC_WINDOW_ROWS]
        # note number of rows with NaNs in the whole window
        spectrogram_600sec_nan_row_count[pos] = int(window_flags.sum())
        # note number of rows with NaNs in the 6 central rows
        spectrogram_10sec_nan_row_count[pos] = int(window_flags[SPEC_CENTER_SLICE].sum())

100%|██████████| 11138/11138 [22:05<00:00,  8.40it/s] 


Add NaN counts to the train metadata

In [100]:
# Attach the per-window NaN counts to the metadata, one new column per list.
# The dict preserves insertion order, so the columns are appended in this order.
nan_count_columns = {
    "eeg_50sec_nan_row_count": sec50_nan_row_count,
    "eeg_10sec_nan_row_count": sec10_nan_row_count,
    "spectrogram_600sec_nan_row_count": spectrogram_600sec_nan_row_count,
    "spectrogram_10sec_nan_row_count": spectrogram_10sec_nan_row_count,
}
for column_name, nan_counts in nan_count_columns.items():
    train[column_name] = nan_counts

In [102]:
# Persist the enriched metadata so the slow NaN scans need not be repeated;
# the DataFrame index is not written to the file.
train.to_csv(
    path_or_buf="train_with_nan_info.csv",
    index=False,
)

In [4]:
# Reload the metadata, including the NaN-count columns, from the saved CSV
# and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='train_with_nan_info.csv')
train.head(n=5)

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,eeg_50sec_nan_row_count,eeg_10sec_nan_row_count,spectrogram_600sec_nan_row_count,spectrogram_10sec_nan_row_count
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0,0,0,0,0
