In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports

In [2]:
import numpy as np
import pickle
import os
import random

## Define methods for extraction

In [3]:
def extract_test_data( test_data_path):
  '''
  extract_test_data reads all of the spectrograms saved as npy and transforms into shape that can be used as input in training model
  param test_data_path: path of the test dataset
  return: test_data, test_label
  '''
  test_data = []
  test_label = []
  i = 0
  for file in os.listdir(test_data_path):
      if file.endswith(".npy"):
        test_data.append(os.path.join(test_data_path, file))
        test_label.append(int(file.split('_')[3]))
        i += 1
  print(f'There are {test_label.count(0)} interictal files')
  print(f'There are {test_label.count(1)} preictal files')

  return test_data, test_label

In [4]:
def split_training_data(train_dataset_dict, partition):
  '''
  split_training_data splits the training dataset into training and validation data sets
  param train_dataset_dict: dictionary with training data, blocks as keys and file paths as values
  param partition: percentage of data to be used for training
  '''
  ## Randomize blocks and split train:hold-out
  random.seed(2021)
  
  print(f'{round(partition,2)} used for training and {round(1 - partition, 2)} for validation')
  blocks = list(train_dataset_dict.keys())
  random.shuffle(blocks)
  # print(blocks)
  train_dataset_dict_shuffled = [(block, train_dataset_dict[block]) for block in blocks]

  # split the data
  train_shuffled = train_dataset_dict_shuffled[:int(partition*len(train_dataset_dict_shuffled))]
  devtest_shuffled =  train_dataset_dict_shuffled[ int(partition*len(train_dataset_dict_shuffled)):]

  # print(f'training data is {len(train_shuffled)} blocks, {get_num_of_files(train_shuffled)} files')

  # print(f'devtest data is {len(devtest_shuffled)} blocks and {get_num_of_files(devtest_shuffled)} files')

  return train_shuffled, devtest_shuffled

In [5]:
def extract_train_data(train_data_path):
  '''
  extract_data reads all of the spectrograms saved as npy and transforms into shape that can be used as input in training model
  param train_data_path: path of the training dataset
  return: trainDict
  '''
  trainDict = {}
  preictal_count = 0
  interictal_count = 0
  prevBlock = 0
  interictalFiles = [] 
  preictalFiles = []
  i = 0
  for file in os.listdir(train_data_path):
    if file.endswith(".npy"):
      if file.split('_')[1] != '':
        block = int(file.split('_')[1]) # extract block
        trainDict[block] = {}
  print(f'There are {len(trainDict)} blocks in this dataset')

  # Extract the class labels
  for file in os.listdir(train_data_path):
    if file.endswith(".npy"):
      if file.split('_')[1] != '':
        block = int(file.split('_')[1]) # extract block
        label = int(file.split('_')[3]) # extract label
        if label == 1: # k = 1 for preictal
          preictalFiles.append(os.path.join(train_data_path, file))
          trainDict[block][1] = preictalFiles
          preictal_count += 1
          # trainDict[block]['preictal'] = preictalFiles
        elif label == 0: # k = 0 for interictal
          interictalFiles.append(os.path.join(train_data_path, file))
          trainDict[block][0] = interictalFiles
          interictal_count += 1
          # trainDict[block]['interictal'] = interictalFiles
        if block != prevBlock:
          prevBlock = block # update prevBlock
          # reset the file list
          preictalFiles = [] 
          interictalFiles = [] 
  npys_count = preictal_count + interictal_count
  print(f'There are {npys_count} files in this dataset. {interictal_count} interictal, {preictal_count} preictal')
  return npys_count, trainDict



In [6]:
def create_data_list(data_dict_list):
  '''
  create_data_list creates a flat list of the dataset and extracts the labels into a separate variable
  param data_dict_list: a list containing a tuple of blocks and dictionary of class and path to data
  return: data_list, label_list
  '''
  data_list = []
  label_list = []
  # Train data list
  for key in data_dict_list:
    for k1, v1 in key[1].items():
      for item in v1:
        data_list.append(item)
        label_list.append(k1)
  print(f'There are {label_list.count(0)} interictal files')
  print(f'There are {label_list.count(1)} preictal files') 

  return data_list, label_list

## Create Datasets

In [7]:
# VALID OPTIONS ARE ['Pat1','Pat2','Pat3']
subId = 'Pat2'
train_data_path = '/content/drive/MyDrive/analysis/spectograms/'+subId+'Train_129x48_stacked'
test_data_path = '/content/drive/MyDrive/analysis/spectograms/'+subId+'Test_129x48_stacked'

print('Test dataset:')
test_data, test_label = extract_test_data(test_data_path)

print('')
print('Train Validation Split:')
npys_count, trainDict = extract_train_data(train_data_path)
partition = 1 - (len(test_data) / npys_count)

train_shuffled, devtest_shuffled = split_training_data(trainDict, partition)
print('')
print('Training dataset:')
train_data, train_label = create_data_list(train_shuffled)

print('')
print('Validation dataset:')
devtest_data, devtest_label = create_data_list(devtest_shuffled)

Test dataset:
There are 2280 interictal files
There are 149 preictal files

Train Validation Split:
There are 209 blocks in this dataset
There are 10007 files in this dataset. 8855 interictal, 1152 preictal
0.76 used for training and 0.24 for validation

Training dataset:
There are 6703 interictal files
There are 936 preictal files

Validation dataset:
There are 2029 interictal files
There are 198 preictal files


## Save Labels to Drive

In [8]:
np.save('/content/drive/MyDrive/analysis/spectograms/cnn_model_input/'+ subId +'_train_label.npy', train_label)
np.save('/content/drive/MyDrive/analysis/spectograms/cnn_model_input/'+ subId +'_devtest_label.npy', devtest_label)
np.save('/content/drive/MyDrive/analysis/spectograms/cnn_model_input/'+ subId +'_test_label.npy', test_label)


## Extract data from path and stack into 4D arrray

In [9]:
def stack_4D(train_data_path_list, devtest_data_path_list, test_data_path_list):
  '''
  stack_4D extracts each file in the list, stacks into a 4D array and saves to Drive
  param data_path_list: list of the data file paths
  return: None
  '''
  result = []
  for i in range(len(train_data_path_list)):
      x = np.load(train_data_path_list[i])
      result.append(x)
  train_data_4D = np.stack(result)
  print(np.shape(train_data_4D))
  np.save('/content/drive/MyDrive/analysis/spectograms/cnn_model_input/'+ subId +'_train_data_4D.npy', train_data_4D)

  result = []
  for i in range(len(devtest_data_path_list)):
    x = np.load(devtest_data_path_list[i])
    result.append(x)
  devtest_data_4D = np.stack(result)
  print(np.shape(devtest_data_4D))
  np.save('/content/drive/MyDrive/analysis/spectograms/cnn_model_input/'+ subId +'_devtest_data_4D.npy', devtest_data_4D)

  result = []
  for i in range(len(test_data_path_list)):
      x = np.load(test_data_path_list[i])
      result.append(x)
  test_data_4D = np.stack(result)

  print(np.shape(test_data_4D))
  np.save('/content/drive/MyDrive/analysis/spectograms/cnn_model_input/'+ subId +'_data_4D.npy', test_data_4D)

In [10]:
stack_4D(train_data, devtest_data, test_data)

(7639, 129, 48, 16)
(2227, 129, 48, 16)
(2429, 129, 48, 16)
