In [1]:
## Erdos Bootcamp
## Josimar A. Silva, May 2021
## This script reads the EEG dataset and creates a dataframe and a numpy matrix for data analysis
import re
import numpy as np
import pandas as pd

In [2]:
## Reading the EEG dataset (full data). Note that it is assumed the user has downloaded and extracted the files
## using the GetData.sh bash script
# Read every line of the full dataset into memory. The context manager closes the
# file handle as soon as the read finishes (the original left it open).
with open('./data/FullData/fullData.dat','r') as file:
    lines = file.readlines()

In [3]:
## Defining a set of helper functions for reading the data

def GetGroupAndSubjectNumber(line):
    """Extract the group code and subject identifier from a header line.

    The subject identifier is the second space-separated token of ``line``
    with everything from its first '.' onward removed. The group code is the
    fourth character of that identifier.

    Returns:
        (group, subject) as a tuple of strings.
    """
    tokens = line.split(" ")
    subject, _, _ = tokens[1].partition('.')
    return subject[3], subject

def GetConditionAndTrialNumber(line):
    """Extract the condition label and trial number from a header line.

    The condition is the text between the first '#' and the first ',' of
    ``line``, stripped of surrounding whitespace. The trial number is the
    integer value of whatever follows the last space in ``line``.

    Returns:
        (condition, trial) where condition is a str and trial is an int.
    """
    beforeComma = line.partition(',')[0]
    condition = beforeComma.split('#')[1].strip()
    lastToken = line.rsplit(' ', 1)[-1]
    return condition, int(lastToken)

def GetChannelInfo(line):
    """Extract the channel name and channel number from a header line.

    The channel name is the second space-separated token of ``line``; the
    channel number is the last token with trailing whitespace removed. The
    number is returned as a string, not converted to int.

    Returns:
        (channel, chanNumber) as a tuple of strings.
    """
    fields = line.split(' ')
    return fields[1], fields[-1].rstrip()

def IncreaseArraySize(numberOfCurrentRows, sensorValue):
    """Return a zero-padded copy of ``sensorValue`` with twice as many rows.

    Args:
        numberOfCurrentRows: number of leading rows of ``sensorValue`` that
            hold data and must be carried over.
        sensorValue: 2-D numpy array; its column count is preserved.

    Returns:
        A new float array of shape
        ``(max(2 * numberOfCurrentRows, 1), sensorValue.shape[1])`` whose first
        ``numberOfCurrentRows`` rows equal those of ``sensorValue`` and whose
        remaining rows are zero. The input array is not modified.
    """
    numberOfCols = sensorValue.shape[1]
    # Doubling zero rows would yield zero rows again, so always allocate at
    # least one row; otherwise a caller starting from an empty array never grows.
    numberOfNewRows = max(2 * numberOfCurrentRows, 1)
    grown = np.zeros([numberOfNewRows, numberOfCols])
    # Slice assignment already copies the values into the new buffer, so no
    # extra .copy() calls are needed.
    grown[0:numberOfCurrentRows, :] = sensorValue[0:numberOfCurrentRows, :]
    return grown

In [None]:
## Read data and create numpy matrix and dataframe information

## Parse the file line by line. Header lines (containing '#') update the current
## group/subject/condition/trial/channel; every other line carries one sample that is
## written to the current row of sensorValue. When the last sample of a signal is read,
## the current header fields are recorded and the next row is started.

numberOfSamples = 256 ## samples per signal; the last sample on a row has index numberOfSamples - 1
sensorValue = np.zeros([4 , numberOfSamples]) ## sensor values (nLines x nSamples). Note that nLines will increase adaptively
headerInfo=[] # flat list holding six header fields for each signal in the file
numberOfDfLines = 0
iRow=0; iCol=0
subjectNumber=1
subjectName='tst' ## so as to print the subject that it is currently working on
for line in lines:

    if '#' in line: #reading header info

        # NOTE(review): 'co' is a plain substring test; it assumes no other header
        # line contains 'co' -- confirm against the data format.
        if 'co' in line:
            #get group and subject number
            group, subject = GetGroupAndSubjectNumber(line)
        if re.search(r'\btrial\b',line):
            condition, trial = GetConditionAndTrialNumber(line)
        if re.search(r'\bchan\b', line):
            channel, chanNumber = GetChannelInfo(line)

    else: ## reading channel samples

        ## Blank lines carry no sample; skip them instead of failing in float()
        if not line.strip():
            continue

        ## Split once; the last field is the sensor value, the one before it the sample number
        fields = line.split(' ')

        ## Checking if the numpy array size should be increased
        if iRow > sensorValue.shape[0] - 1:
            sensorValue = IncreaseArraySize(iRow, sensorValue)

        ## Saving sensor value to numpy array
        sensorValue[iRow, iCol] = float(fields[-1])
        iCol = iCol + 1

        ## Checking to see if a new line to the dataframe needs to be created
        sampleNumber = float(fields[-2])
        if sampleNumber == numberOfSamples - 1:
            ## extend() appends in place; 'headerInfo = headerInfo + [...]' rebuilt the
            ## whole list for every signal, which is quadratic in the number of rows
            headerInfo.extend([group,subject,condition,trial,channel,chanNumber])
            numberOfDfLines = numberOfDfLines + 1
            iCol = 0
            iRow = iRow + 1
            if subjectName != subject:
                print('Working on subject # '+str(subjectNumber)+'\t'+subject)
                subjectName = subject
                subjectNumber=subjectNumber+1

## Keeping only the rows that were filled (drops the unused pre-allocated rows)
sensorValue = sensorValue[0:iRow,:]
        
   

Working on subject # 1	co2a0000364
Working on subject # 2	co2a0000365
Working on subject # 3	co2a0000368
Working on subject # 4	co2a0000369
Working on subject # 5	co2a0000370
Working on subject # 6	co2a0000371
Working on subject # 7	co2a0000372
Working on subject # 8	co2a0000375
Working on subject # 9	co2a0000377
Working on subject # 10	co2a0000378
Working on subject # 11	co2a0000379
Working on subject # 12	co2a0000380
Working on subject # 13	co2a0000381
Working on subject # 14	co2a0000382
Working on subject # 15	co2a0000384
Working on subject # 16	co2a0000385
Working on subject # 17	co2a0000386
Working on subject # 18	co2a0000387
Working on subject # 19	co2a0000388
Working on subject # 20	co2a0000390
Working on subject # 21	co2a0000392


In [None]:
## Convert the flat header list into a DataFrame with one row per signal and six columns
headerColumns = ['group' ,'subject','condition','trial','channel','chanNumber']
tmp = np.array(headerInfo)
tmp2 = np.reshape(tmp, (numberOfDfLines, 6))
df = pd.DataFrame(data=tmp2, columns=headerColumns)

In [None]:
## Show the distinct values in the 'subject' column
df['subject'].unique()

In [None]:
## Preview the first rows of the header dataframe
df.head()

In [None]:
## Write the header dataframe to CSV (without the index column) and the sensor
## matrix to a text file using the '%f' number format
df.to_csv(path_or_buf='./data/FullData/dataHeader.csv', index=False)
np.savetxt(fname='./data/FullData/sensorValue.dat', X=sensorValue, fmt='%f')