# Generate HMS-HBA Spectrogram files
* Process and save spectrograms as images for training


# Install libraries

In [1]:
!pip install scienceplots

Collecting scienceplots
  Obtaining dependency information for scienceplots from https://files.pythonhosted.org/packages/51/2a/debffdd7061c7a65ab46c51e671b895dc8231c4a14950849c40699eb2070/SciencePlots-2.1.1-py3-none-any.whl.metadata
  Downloading SciencePlots-2.1.1-py3-none-any.whl.metadata (11 kB)
Downloading SciencePlots-2.1.1-py3-none-any.whl (16 kB)
Installing collected packages: scienceplots
Successfully installed scienceplots-2.1.1


# Import libraries

In [2]:
import os
import cv2
import glob
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import scienceplots
plt.style.use(['science','no-latex'])

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Load data

In [3]:
kaggle = True
root_dir = '.'
if kaggle:
    root_dir = '/kaggle/input/hms-harmful-brain-activity-classification'

train = pd.read_csv(root_dir + '/train.csv')

# set spect path variable
spect_dir = root_dir + '/train_spectrograms/'


# Top level Stats

In [4]:
# print all sets - assuming train,test, ss
print(f'Training Data Shape: {train.shape}')
display(train.head())


Training Data Shape: (106800, 15)


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


## Add path to the spectrogram

In [5]:
# add path to training data
train['spect_path'] = train.spectrogram_id.apply(lambda x: f'{spect_dir}{int(x)}.parquet')
train.head(2)

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,spect_path
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0,/kaggle/input/hms-harmful-brain-activity-class...
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0,/kaggle/input/hms-harmful-brain-activity-class...


In [6]:
train.spect_path[0]

'/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/353733.parquet'

## Target Variable

In [7]:
targets = [col for col in train.columns if '_vote' in col]
print(targets)

['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']


In [8]:
# # use glob to make a list of files in the spectrogram directory and verify same number
# spectrograms = glob.glob(train_spectrograms + '/*.parquet')
# len(spectrograms)


# Open a spectrogram file
* Look at format of spectrogram files


In [9]:
# open the first spectrogram file
spect1 = pd.read_parquet(train.spect_path[0])
print(f'Spectrogram shape: {spect1.shape}')
spect1.head()

Spectrogram shape: (320, 401)


Unnamed: 0,time,LL_0.59,LL_0.78,LL_0.98,LL_1.17,LL_1.37,LL_1.56,LL_1.76,LL_1.95,LL_2.15,LL_2.34,LL_2.54,LL_2.73,LL_2.93,LL_3.13,LL_3.32,LL_3.52,LL_3.71,LL_3.91,LL_4.1,LL_4.3,LL_4.49,LL_4.69,LL_4.88,LL_5.08,...,RP_15.23,RP_15.43,RP_15.63,RP_15.82,RP_16.02,RP_16.21,RP_16.41,RP_16.6,RP_16.8,RP_16.99,RP_17.19,RP_17.38,RP_17.58,RP_17.77,RP_17.97,RP_18.16,RP_18.36,RP_18.55,RP_18.75,RP_18.95,RP_19.14,RP_19.34,RP_19.53,RP_19.73,RP_19.92
0,1,4.26,10.98,9.05,13.65,11.49,8.93,18.84,19.26,19.24,19.049999,8.82,5.4,6.47,6.08,3.94,7.67,4.23,5.86,5.23,3.69,2.46,2.32,2.57,3.32,...,0.16,0.17,0.06,0.08,0.42,0.6,0.95,1.07,1.09,1.13,0.46,0.54,0.43,0.32,0.39,0.31,0.17,0.28,0.19,0.24,0.27,0.29,0.16,0.22,0.19
1,3,2.65,3.97,12.18,13.26,14.21,13.23,9.65,8.11,11.28,8.46,5.48,4.17,5.55,3.96,4.71,5.09,3.99,3.6,3.7,1.9,1.88,2.17,1.91,2.5,...,0.17,0.25,0.31,0.61,0.86,1.03,1.28,1.11,0.87,0.66,0.59,0.32,0.27,0.22,0.18,0.15,0.13,0.14,0.24,0.24,0.36,0.35,0.31,0.36,0.4
2,5,4.18,4.53,8.77,14.26,13.36,16.559999,19.219999,17.51,22.65,21.719999,17.75,13.57,5.59,4.79,3.26,2.91,2.93,2.68,4.23,5.22,6.22,6.21,5.79,6.1,...,0.37,0.27,0.75,0.74,1.53,1.51,0.99,1.02,0.53,0.29,0.2,0.2,0.26,0.25,0.28,0.29,0.21,0.16,0.25,0.28,0.28,0.34,0.48,0.44,0.48
3,7,2.41,3.21,4.92,8.07,5.97,12.42,10.82,14.96,21.809999,19.629999,17.43,13.14,7.44,5.39,3.93,4.47,3.41,2.4,7.16,5.56,7.59,9.23,5.28,5.09,...,0.55,0.46,0.39,0.92,0.9,0.92,1.0,0.88,0.71,0.65,0.61,0.63,0.44,0.42,0.41,0.33,0.51,0.49,0.64,0.58,0.42,0.32,0.31,0.32,0.33
4,9,2.29,2.44,2.77,4.62,5.39,7.08,9.84,12.27,14.41,13.31,11.46,12.32,6.97,8.5,7.07,3.98,3.54,2.72,3.57,5.45,5.19,6.39,8.67,7.47,...,0.47,0.63,0.39,0.23,0.52,0.79,1.12,1.12,1.13,0.98,0.38,0.74,0.53,0.55,0.59,0.44,0.38,0.48,0.63,0.45,0.45,0.49,0.33,0.31,0.34


In [10]:
# how many columns have 'LL', 'LR', etc. in them?
print(f'LL cols: {len([col for col in spect1.columns if "LL" in col])}')
print(f'LP cols: {len([col for col in spect1.columns if "LP" in col])}')
print(f'RL cols: {len([col for col in spect1.columns if "RL" in col])}')
print(f'RP cols: {len([col for col in spect1.columns if "LP" in col])}')

LL cols: 100
LP cols: 100
RL cols: 100
RP cols: 100


### Takeaways
* Have 1 time column and 100 columns each for LL, LP, RL, RP
* Try both training on single image and splitting into 4 separate images

## Create Dataset

In [11]:
# create a new subdirectory called /spectrograms if it doesn't exist
if not os.path.exists('./spectrograms'):
    os.mkdir('./spectrograms')

 # create subdirectories for each label
for target in targets:
    tar_path = f'./spectrograms/{target}'
    if not os.path.exists(tar_path):
        os.mkdir(tar_path)

In [12]:
# fig,ax = plt.subplots(3,4,figsize=(15,10))
# ax = ax.flatten()
for i in tqdm(range(len(train))):
    
    spect_id = train.spectrogram_id[i]
    spect_sub_id = train.spectrogram_sub_id[i]
    spect_offset_sec = train.spectrogram_label_offset_seconds[i]
    start = int(spect_offset_sec * .5)
    end = start + 300
    
    df = pd.read_parquet(train.spect_path[i])
    # get the name of the spectrogram file
    #spect_id = int(spectrograms[i].split('/')[-1].split('.')[0])
    #consensus = train_unique[train_unique.spectrogram_id == spect_id].expert_consensus.values[0]
    # normalize the spectrogram between 0 and 1
    spectrogram = df.values / df.values.max()
    # swap x and y axis
    spectrogram = spectrogram.T
    # replace 0 values with 10e-9
    spectrogram = np.where(spectrogram == 0, 10e-9,spectrogram)
    # take the log of the spectrogram
    spectrogram = np.log(spectrogram)
    # make minimum value 0
    spectrogram = spectrogram + np.abs(np.min(spectrogram))
    # scale the spectrogram to have a max of 255
    spectrogram = 255 * (spectrogram / spectrogram.max())
    # convert to uint8
    spectrogram = spectrogram.astype(np.uint8)

    #img = cv2.imread(f'spectrograms/{spect_id}.png')
    spect_crop = spectrogram[:,start:end]
    if np.sum(spect_crop) > 0.0:
        # loop through nonzero targets
        for tar in targets:
            if train[tar][i] > 0.0:
                # save the cropped image to the appropriate label folder
                cv2.imwrite(f'./spectrograms/{tar}/{spect_id}_{spect_sub_id}.png',spect_crop)
    
    # save the spectrogram as a png file in the folder /spectrograms

    #cv2.imwrite(f'spectrograms/{spect_id}.png',spectrogram)
#     ax[i].imshow(spectrogram,cmap='viridis')
#     ax[i].set_title(f"{spectrograms[i].split('/')[-1].split('.')[0]}, {consensus}")
# plt.tight_layout();


100%|██████████| 106800/106800 [1:42:06<00:00, 17.43it/s]


In [13]:
from datetime import datetime
today = datetime.today().strftime('%y-%m-%d')
!zip -r -q f'HBA_spectrogram_images_dataset_{today}' /kaggle/working/spectrograms

zip I/O error: No space left on device
zip error: Output file write failure (write error on zip file)
