In [0]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import csv    
import gzip
from IPython.core.pylabtools import figsize
from IPython.display import clear_output
figsize(8, 8)
import re
import time
import os
import glob
%matplotlib inline               
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from mpl_toolkits.mplot3d import Axes3D
plt.style.use('ggplot')

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
''' UNZIPPING MAIN FILE AND ITS CONTENTS '''

import tarfile

filename = os.path.join('/content/drive/My Drive/SMNI_CMI_TRAIN.tar.gz')
tar = tarfile.open(filename)
tar.extractall(path='/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped')
tar.close()

In [0]:
'''NAMING THE LOCATIONS'''

main_dir = "/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/"
eeg_data = os.path.join(main_dir+'SMNI_CMI_TRAIN/')  # unzipped subjects data file location

subject_files = os.listdir(eeg_data) # list all the subject folders

In [0]:
print(subject_files)

['co2a0000364', 'co2a0000365', 'co2a0000368', 'co2a0000369', 'co2a0000370', 'co2a0000371', 'co2a0000372', 'co2a0000375', 'co2c0000338', 'co2a0000377', 'co2a0000378', 'co2c0000337', 'co2c0000339', 'co2c0000340', 'co2c0000341', 'co2c0000342', 'co2c0000344', 'co2c0000345', 'co2c0000346', 'co2c0000347', 'README']


## Parse Data

The following function will extract relevant parameters from each of the trial files belonging to the subject. The extracted files are stored in .csv format.

In [0]:
def extract_info(filename):
    """ Function to parse individual trial details for a subject.
        Args:
        filename - The name of file to be parsed
        
        Return:
        subject_id - The subject ID of the subject
        alcoholic - Return True for alcoholic subject
        stimulus  - Returns the stimulus for a particular trial
        trial_num - Trial number of the subject. On average there is 90 trial files per 
                    subject"""

    with gzip.open(filename, 'rt') as f:
        reader = csv.reader(f)
        for idx, row in enumerate(reader):
            if idx == 0:
                m = re.match('# (co\d(a|c)\d\d\d\d\d\d\d)', row[0])  #match pattern for the subject name
                                                                     #contains 'a' for alcoholics 
                                                                     #and 'c' for control
                subject_id = m.group(1)
                alcoholic = (m.group(2) == 'a')
            if idx == 3:
                m = re.match('# (.*?) trial (\d+)', row[0] + row[1]) #matching trial number 
                stimulus = re.sub('\W', ' ', m.group(1))
                trial_num = int(m.group(2))
#             if idx == 4:
#                 m = re.match('# (.*?) chan (\d+)', row[0])
#                 chan_name = re.sub('\W', '', m.group(1))
#                 chan_no = int(m.group(2))
    return subject_id, alcoholic, stimulus,trial_num

In [0]:
''' Converts files into CSV'''

def create_csv(subject_folder, main_dir, verbosity):
    """
        This function reads in individual trial files of subject folder, stacks the data
        in the form of a dataframe object and writes them out as CSV files.
        
        Args:
        subject_folder: The folder containing all the unzipped trial files of a subject
        main_dir: The main directory path to the data location
        verbosity: Pass a number greater than 1 for more output details
        
        Return:
        None 
    """
    data_dir = main_dir + subject_folder
    print('##########################################################')
    print('Beginning extraction of subject ', str(subject_folder))
    trial_dir = os.listdir(data_dir)
    trial_files = [file_i for file_i in trial_dir if file_i.endswith('.gz')]
    print('There are ', len(trial_files), 'trials under this subject')
    for file_no, file in enumerate(trial_files):
        tic = time.time()
        filename = data_dir + file
        df = pd.DataFrame()
        if verbosity == 0:
            print('Beginning extraction from file ', filename)

        sub_id, whether_alc, what_stimulus, trial_no = extract_info(filename)
        if verbosity > 1:
            print('Beginning extraction of file: ', filename)
            print('Subject id:', sub_id, ' Trial No: ', trial_no)
        with gzip.open(filename, 'rt') as f:
            reader = csv.reader(f, delimiter=' ')
            for idx, row in enumerate(reader):
                if row[0] != '#':
                    df = df.append(pd.DataFrame([row],columns=list(['trial_num', 'chan_name', 'epoch','voltage'])))
                        
                
            df = pd.concat([df,pd.DataFrame(columns=['subject_id', 'stimulus', 'alcoholic'])],
                           ignore_index=True)
            df[['subject_id', 'stimulus','alcoholic']] = [sub_id, what_stimulus, whether_alc] 
                
            out_name = main_dir + 'SMNI_CMI_TRAIN/' + sub_id + '_' + str(
                    trial_no) + '.csv'
            df.to_csv(out_name, index=False, sep='\t', encoding='utf-8')
            toc = time.time()
            
            if verbosity == 0:
                print('CSV file saved as ', out_name)
            elif verbosity > 1:
                print('There are ', df.shape[0], ' rows and ', df.shape[1],
                        ' columns in this dataframe object')
                print('CSV file successully saved as ', out_name)
                print('It will take %f mins more for this subject!' %
                        round(((len(trial_files) - float(file_no+1)) * float((toc - tic) / 60.0)),2))
                print('-------------------------------------------------------------------------------')
    print('All files extracted and saved.')
    print('###########################################################')
    return None

Start extraction of files for each subject. The extracted files will saved as csv files in the disk. 
The code proceeds only if files are missing. If all the files are satisfied the code prints the information with the number of trial files per subject. 
### NOTE: Subject co2c0000367 has error flags in the trial files and hence is avoided from the study.
 
### IMP: DO NOT RUN THIS CELL IF YOU DON'T WANT TO PARSE THE DATA(OR IF YOU DON'T HAVE THE FOLDER WITH ALL THE PARSED FILE IN THE DISK). 
THIS WILL TAKE A LONG TIME TO EXECUTE.

In [0]:
len_trialfiles = []
for folder in subject_files:
    if(folder[0]!='c'):
      continue
    os.chdir(eeg_data+folder)
    zipped_files = [zipfile for zipfile in glob.glob(folder+'*.gz')]
    os.chdir(main_dir+'SMNI_CMI_TRAIN/')
    files = [file for file in glob.glob(folder+'*.csv')]
    len_trialfiles.append(len(zipped_files))
    ## check if all files are already extracted or not.
    if len(files) < len(zipped_files):
        create_csv("SMNI_CMI_TRAIN/"+folder+"/", main_dir, verbosity=5)
    else:
        print('All files satisfied for subject',folder,'; There are',len(files),'files.')

##########################################################
Beginning extraction of subject  SMNI_CMI_TRAIN/co2a0000364/
There are  30 trials under this subject
Beginning extraction of file:  /content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2a0000364/co2a0000364.rd.002.gz
Subject id: co2a0000364  Trial No:  2


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




There are  16384  rows and  7  columns in this dataframe object
CSV file successully saved as  /content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2a0000364_2.csv
It will take 12.180000 mins more for this subject!
-------------------------------------------------------------------------------
Beginning extraction of file:  /content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2a0000364/co2a0000364.rd.000.gz
Subject id: co2a0000364  Trial No:  0


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




There are  16384  rows and  7  columns in this dataframe object
CSV file successully saved as  /content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2a0000364_0.csv
It will take 11.640000 mins more for this subject!
-------------------------------------------------------------------------------
Beginning extraction of file:  /content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2a0000364/co2a0000364.rd.015.gz
Subject id: co2a0000364  Trial No:  15
There are  16384  rows and  7  columns in this dataframe object
CSV file successully saved as  /content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2a0000364_15.csv
It will take 11.860000 mins more for this subject!
-------------------------------------------------------------------------------
Beginning extraction of file:  /content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2a0000364/co2a0000364.rd.019.gz
Subject id: co2a0000364  Trial No:  19
There are  16384  rows and  7  columns in this data

## Functions for Concatenating Data for Single Subject

In [0]:
def concat_df(sorted_files,verbosity):
    
    """
        Function to join together trial files belonging to a single subject
        and returns a dataframe object containing all the trials."""
    
    result = pd.DataFrame()
    for sorted_file in sorted_files:
        df = pd.read_csv(sorted_file, delim_whitespace=False, sep='\t', 
                         index_col = False)
        chan_names = df.chan_name.unique()    
        if verbosity >1:
            print('File loaded {}'.format((sorted_file.split('/')[5])))
        trial_voltage = []
        norm_voltage = []
        Norm_voltage =[]
        for chan in chan_names:
            #norm_voltage = []
            voltages = []  
            chan_df = pd.DataFrame()
            chan_df = df.loc[df['chan_name'] == chan]
            voltages = chan_df[['voltage']].values
            max_volt = max(voltages)
            min_volt = min(voltages)
            voltages = [x/abs(max_volt) if (x>0) else x for x in voltages]            #rescale voltage values to range [-1,1]
            voltages = [x/abs(min_volt) if (x<0) else x for x in voltages]
            norm_voltage.extend(voltages)
            Norm_voltage = np.asarray(norm_voltage)
        trial_voltage = np.concatenate(Norm_voltage, axis=0)
        df['norm_voltage'] = pd.Series(trial_voltage, index=df.index)
        result = result.append(df, ignore_index=True)
            
    return result

In [0]:
def sort_files(dir_path,sub_id):
    """
        Function sorts trial files under a subject folder in ascending order of trial number.
    """
    os.chdir('/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/')
    files = [file for file in glob.glob(dir_path+sub_id+'_[0-9]*.csv') if file]
    print(files)
    # sorting assumes that the file directory structure for a single trial file is of the format
    # '/home/user/eeg_data/eeg_csv/co2a0000372_107.csv'. If not change the position
    # and character used in the following line.
    sorted_files = sorted(files, key=lambda x: int((x.split('.')[0]).split('_')[6]))
    
    return sorted_files

In [0]:
from pathlib import Path
import os.path
def create_SubCSV(sub_files):
    """
        This function calls the concat_df function and joins together indivdual trial files
        of the subject if the file does not exist. The dataframe object is written to disk as
        a pickle object.
    """
    
    
    out_dir = eeg_data

    for subject in sub_files:
        print(subject)
        if(subject[0]!='c'):
          continue
        outname = subject+'_full.pkl' 
        # file = Path(out_dir+outname)
        # try:
        #     file_path = file.resolve()
        #     print('File exists '+outname+'. Continue.')
                       
        # except FileNotFoundError:
        print('File does not exists '+outname+'. Begin.')
        sorted_files = sort_files(out_dir,subject)
        sub_df = concat_df(sorted_files,5)
        sub_df.to_pickle(outname)

## Create Subject files and store as pickle files
Create full subject files and store them to disk as pickle files for easy and fast access later.

In [0]:
main_dir = "/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/"
eeg_data = os.path.join(main_dir+'SMNI_CMI_TRAIN/')  # unzipped subjects data file location

subject_files = os.listdir(eeg_data) # list all the subject folders
os.chdir(eeg_data)
alcoholics = [file.replace('/', '') for file in glob.glob('co?a*/')]
controls = [file.replace('/', '') for file in glob.glob('co?c*/')]
create_SubCSV(controls)
create_SubCSV(alcoholics)

co2c0000338
File does not exists co2c0000338_full.pkl. Begin.
['/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_0.csv', '/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_4.csv', '/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_8.csv', '/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_6.csv', '/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_2.csv', '/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_1.csv', '/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_18.csv', '/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_13.csv', '/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_16.csv', '/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_5.csv', '/content/drive/My Drive/SMNI_CMI_TRAIN_unzipped/SMNI_CMI_TRAIN/co2c0000338_19.csv', '/content

In [0]:
print("Number of alcoholics = {} and controls = {}".format(len(alcoholics),len(controls)))

Number of alcoholics = 10 and controls = 10
