In [None]:
import librosa
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import mne
from mne.filter import filter_data, notch_filter

Read in the original training data set.

In [None]:
# Load the training metadata and discard the 'Unnamed: 0' column
# (presumably a leftover saved index -- confirm against how train.csv was written).
df = pd.read_csv('../../data/train.csv').drop(columns=['Unnamed: 0'])

The following functions, `make_eeg_diffs` and `get_filtered_eeg_diffs`, perform the same task as `preprocessing()` in `eeg_preprocessing.py`, except that they retain the full 50 seconds of EEG data (instead of keeping only the middle 10 seconds).

In [None]:
def make_eeg_diffs(df):
    """Build the 18 electrode-difference channels from raw electrode columns.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample. Must contain the electrode columns
        Fp1, F7, T3, T5, O1, Fp2, F8, T4, T6, O2, F3, C3, P3, F4, C4, P4,
        Fz, Cz and Pz. The caller's frame is never modified.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(df), 18)``. Column ``j`` holds
        ``df[a] - df[b]`` for the j-th ``(a, b)`` pair listed below.

    Notes
    -----
    If the frame contains any NaN, values are linearly interpolated along the
    rows, first forward and then backward (the backward pass takes care of
    leading NaNs). A column without a single valid sample cannot be
    interpolated and stays NaN.
    """
    # (minuend, subtrahend) electrode pairs, in output-column order.
    electrode_pairs = (
        ('Fp1', 'F7'), ('F7', 'T3'), ('T3', 'T5'), ('T5', 'O1'),
        ('Fp2', 'F8'), ('F8', 'T4'), ('T4', 'T6'), ('T6', 'O2'),
        ('Fp1', 'F3'), ('F3', 'C3'), ('C3', 'P3'), ('P3', 'O1'),
        ('Fp2', 'F4'), ('F4', 'C4'), ('C4', 'P4'), ('P4', 'O2'),
        ('Fz', 'Cz'), ('Cz', 'Pz'),
    )

    if df.isna().any(axis=None):
        # Interpolate into a new frame instead of using inplace=True: the
        # caller frequently passes a slice of a larger frame, and mutating it
        # would either leak into the caller's data or trigger pandas'
        # SettingWithCopy machinery.
        df = (
            df.interpolate(method='linear', limit_direction='forward', axis=0)
              .interpolate(method='linear', limit_direction='backward', axis=0)
        )

    # Subtract in the columns' native dtype, then cast once at the end.
    raw_eeg = pd.DataFrame({f'{a} - {b}': df[a] - df[b] for a, b in electrode_pairs})

    return raw_eeg.to_numpy(dtype=np.float64)

In [None]:
def get_filtered_eeg_diffs(eeg_df, low_cut=0.5, high_cut=40, notch_cut=60, sampling_rate=200):
    """Compute the electrode-difference channels and filter them along time.

    Parameters
    ----------
    eeg_df : pandas.DataFrame
        Raw EEG samples (one row per time step) as accepted by
        ``make_eeg_diffs``.
    low_cut, high_cut : float
        Band-pass edges in Hz passed to ``mne.filter.filter_data``.
    notch_cut : float
        Frequency in Hz removed by ``mne.filter.notch_filter``.
    sampling_rate : float
        Sampling frequency of ``eeg_df`` in Hz.

    Returns
    -------
    pandas.DataFrame
        float32 frame with one row per time step and the 18 difference
        channels as columns, clipped to the range [-500, 500].
    """
    final_columns = ['Fp1 - F7', 'F7 - T3', 'T3 - T5', 'T5 - O1',
                 'Fp2 - F8', 'F8 - T4', 'T4 - T6', 'T6 - O2', 
                 'Fp1 - F3', 'F3 - C3', 'C3 - P3', 'P3 - O1',
                 'Fp2 - F4', 'F4 - C4', 'C4 - P4', 'P4 - O2',
                 'Fz - Cz', 'Cz - Pz']

    # make_eeg_diffs returns (n_times, n_channels), but MNE's filter functions
    # operate along the LAST axis. Without this transpose the filters would run
    # across the 18 channels of each time step instead of along time (and the
    # resulting "filter longer than signal" warning is hidden by verbose='ERROR').
    channels_by_time = np.ascontiguousarray(make_eeg_diffs(eeg_df).T)

    channels_by_time = notch_filter(channels_by_time, sampling_rate, notch_cut, n_jobs=-1, verbose='ERROR')
    channels_by_time = filter_data(channels_by_time, sampling_rate, low_cut, high_cut, n_jobs=-1, verbose='ERROR')
    channels_by_time = np.clip(channels_by_time, -500, 500)

    # Back to (n_times, n_channels) so that columns line up with final_columns.
    raw_eeg = channels_by_time.T.astype(np.float32)

    raw_eeg_df = pd.DataFrame(raw_eeg, columns=final_columns)

    return raw_eeg_df

`spec_from_eeg` is the main function: it uses librosa to create the Mel spectrograms from the raw EEG data.

In [None]:
path_to_raw_eeg_data = '~/Harmful-Brain/hms-harmful-brain-activity-classification/train_eegs/'

def spec_from_eeg(eeg_id, eeg_offset, display=False):
    """Create a 4-channel Mel-spectrogram image from a 10000-sample EEG window.

    Parameters
    ----------
    eeg_id : int
        Identifier of the recording; ``<eeg_id>.parquet`` is read from
        ``path_to_raw_eeg_data``.
    eeg_offset : float
        Offset of the window start in seconds (converted at 200 samples/s).
    display : bool, default False
        If True, plot the four averaged spectrograms.

    Returns
    -------
    numpy.ndarray or None
        float32 array of shape (4, 128, 256): one averaged spectrogram per
        group of four consecutive difference channels. ``None`` (after
        printing 'size error') when the requested window does not lie fully
        inside the recording.
    """
    parquet_path = path_to_raw_eeg_data + str(eeg_id) + '.parquet'
    eeg = pd.read_parquet(parquet_path)

    mid = int(eeg_offset * 200 + 5000)
    start, stop = mid - 5000, mid + 5000
    # Both ends must be checked: a negative start would be interpreted by
    # iloc as "count from the end" and silently select the wrong window.
    if start < 0 or stop > eeg.shape[0]:
        print('size error')
        return

    # extract the 10000 samples (50 s at 200 Hz) starting at the offset
    eeg = eeg.iloc[start:stop]

    eeg = get_filtered_eeg_diffs(eeg, low_cut=0.5, high_cut=40, notch_cut=60, sampling_rate=200)

    # drop the two center channels (last two columns) of the eegs
    eeg = eeg.iloc[:, :-2]

    img = np.zeros((4,128,256), dtype='float32')

    # k = 0 ---> cols 0 : 3
    # k = 1 ---> cols 4 : 7
    # k = 2 ---> cols 8 : 11
    # k = 3 ---> cols 12 : 15
    for k in range(4):
        for kk in range(4*k, 4*k+4):
            x = eeg.iloc[:,kk].values
            # hop_length = 10000 // 256 = 39; with librosa's default centered
            # framing this yields slightly more than 256 frames.
            mel_spec = librosa.feature.melspectrogram(y=x, sr=200, hop_length=len(x)//256, 
                                                     n_fft=1024, n_mels=128, fmin=0, fmax=20, 
                                                     win_length=128)
            # Trim the frame count down to a multiple of 32 (256 here) so it
            # matches the width of img.
            width = (mel_spec.shape[1]//32) * 32
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).astype(np.float32)[:,:width]
            # Shift/scale the dB values (<= 0 because ref=np.max); maps -40 dB to 0.
            mel_spec_db = (mel_spec_db + 40)/40

            img[k,:,:] += mel_spec_db

        # average over the four channels of the group
        img[k,:,:] /= 4.0

    NAMES = ['LL','RR','LP','RP']
    if display:
        plt.figure(figsize=(10,7))
        for k in range(4):
            plt.subplot(2,2,k+1)
            plt.imshow(img[k,:,:], aspect='auto', origin='lower')
            plt.title(f'EEG {eeg_id} - Spectrogram {NAMES[k]}')
        plt.show()

    return img

Loop over all (EEG id, EEG offset) pairs to create the Mel spectrograms for the corresponding 50 seconds of EEG data. 

In [None]:
# One (eeg_id, eeg_offset) tuple per row of the training metadata.
# itertuples avoids the slow row-wise apply(tuple, axis=1).
EEG_EEGOFFSET = list(
    df[['eeg_id', 'eeg_label_offset_seconds']].itertuples(index=False, name=None)
)

for i, (eeg_id, eeg_offset) in enumerate(EEG_EEGOFFSET):
    if i%500 == 0: 
        print(f'processing row {i}')
    # plot every 5000th spectrogram as a visual sanity check
    img = spec_from_eeg(int(eeg_id), eeg_offset, i%5000 == 0)
    if img is None:
        # spec_from_eeg returns None when the window does not fit inside the
        # recording; np.save(None) would write a pickled object array that
        # looks like a spectrogram file but is not one.
        continue
    np.save(f'train_eegs_specs/{int(eeg_id)}_{int(eeg_offset)}',img)