In [1]:
import pandas as pd
import numpy as np
import time
import cv2
import matplotlib.pyplot as plt

# ASC to GSSP
This script uses data from the `asc/` and `results/` folders to create GSSP files (as images) for each reading longer than 15 seconds, and stores them in the `gssp/` folder.

In [2]:
##
# Find types for lines
def find_types(fileTxt0):
    """Classify every line of an ASC file and collect START/END timestamps.

    Parameters
    ----------
    fileTxt0 : sequence of str
        The lines of the file, in file order.

    Returns
    -------
    lineType : np.ndarray (dtype=object), one entry per line
        'EMPTY'   - line shorter than 3 characters or containing only whitespace
        'COMMENT' - line starting with '*' or '>>>>>'
        'SAMPLE'  - first token starts with a digit or with '-'
        otherwise - the first whitespace-separated token of the line
    lineTS : np.ndarray of float, one entry per line
        The second token of every line whose type contains 'START' or 'END';
        0 for all other lines.
    """
    print('Sorting lines...')
    nLines = len(fileTxt0)
    lineType = np.array(['OTHER']*nLines,dtype='object')
    lineTS = np.zeros(nLines)
    t = time.time()
    # NOTE: the previous version also remembered the line after the last
    # '!CAL' message (iStartRec) but never used it; that dead code was removed.
    for iLine, line in enumerate(fileTxt0):
        tokens = line.split()  # split once, reuse below
        if len(line) < 3 or not tokens:
            # `not tokens` covers whitespace-only lines, which used to raise
            # IndexError when their (missing) first token was accessed.
            lineType[iLine] = 'EMPTY'
        elif line.startswith('*') or line.startswith('>>>>>'):
            lineType[iLine] = 'COMMENT'
        elif tokens[0][0].isdigit() or tokens[0].startswith('-'):
            lineType[iLine] = 'SAMPLE'
        else:
            lineType[iLine] = tokens[0]
            if 'START' in tokens[0] or 'END' in tokens[0]:
                lineTS[iLine] = float(tokens[1])
    print('Done! Took %f seconds.'%(time.time()-t))
    return lineType, lineTS

In [3]:
##
# Extract STARTs and ENDs of trials
def find_starts_and_ends(part,filename,lineType):
    """Read the START and END marker lines of a file into one DataFrame.

    Parameters
    ----------
    part : str
        Participant label. Not used by this function; kept so existing
        callers keep working.
    filename : str
        Path of the whitespace-separated text file to read (latin-1 encoded).
    lineType : np.ndarray
        Per-line type labels for `filename`; lines labelled 'START' and 'END'
        are the ones that get parsed.

    Returns
    -------
    pd.DataFrame
        One row per reading with columns 'tStart' (field 1 of the START line)
        and 'tEnd', 'xRes', 'yRes' (fields 1, 5 and 6 of the END line).
        START and END rows are paired by position.
    """
    print('Parsing recording markers...')
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    whitespace = r'\s+'

    # read_csv is told to skip every line that is NOT of the wanted type
    iNotStart = np.nonzero(lineType!='START')[0]
    dfRecStart = pd.read_csv(filename,skiprows=iNotStart,header=None,sep=whitespace,usecols=[1],encoding="latin-1")
    dfRecStart.columns = ['tStart']

    iNotEnd = np.nonzero(lineType!='END')[0]
    dfRecEnd = pd.read_csv(filename,skiprows=iNotEnd,header=None,sep=whitespace,usecols=[1,5,6],encoding="latin-1")
    dfRecEnd.columns = ['tEnd','xRes','yRes']

    if len(dfRecStart) != len(dfRecEnd):
        # concat(axis=1) pads the shorter frame with NaN, so make it visible
        print('WARNING: %d START lines but %d END lines; unmatched rows will contain NaN.'%(len(dfRecStart),len(dfRecEnd)))

    # combine trial info
    dfRec = pd.concat([dfRecStart,dfRecEnd],axis=1)
    nRec = dfRec.shape[0]
    print('%d readings (recording periods) found.'%nRec)
    return dfRec


In [4]:
##
# Create GSSP using dg data
def build_gssp(dg, sizeX, sizeY):
    """Build the GSSP matrix for a sequence of 2-D points.

    For every pair of points (i, j) a signed, normalised difference is
    computed and clipped to [-1, 1]:

    * i <= j (diagonal and above): (y_j - y_i) / sizeY
    * i >  j (below the diagonal): -(x_j - x_i) / sizeX

    Non-negative values are stored in channel 0 as trunc(256 * value),
    negative values in channel 1 as trunc(256 * |value|). Channel 2 stays 0.

    Parameters
    ----------
    dg : np.ndarray of shape (n, >=2)
        Column 0 holds x coordinates, column 1 holds y coordinates.
    sizeX, sizeY : number
        Normalisation factors for the x and y differences.

    Returns
    -------
    np.ndarray of shape (n, n, 3), dtype float64

    Notes
    -----
    Vectorised replacement for the former pure-Python double loop; values are
    identical for finite input. The loop version raised ValueError on NaN
    coordinates (via int()), whereas this version propagates NaN instead.
    """
    pts = np.asarray(dg, dtype=float)
    size = pts.shape[0]
    gssp = np.zeros((size,size,3))

    x = pts[:, 0]
    y = pts[:, 1]
    # element [i, j] holds (value_j - value_i), as in the original loop
    dx = np.clip((x[np.newaxis, :] - x[:, np.newaxis]) / sizeX, -1, 1)
    dy = np.clip((y[np.newaxis, :] - y[:, np.newaxis]) / sizeY, -1, 1)

    idx = np.arange(size)
    below_diag = idx[:, np.newaxis] > idx[np.newaxis, :]  # i > j
    # below the diagonal the (clipped) x difference is used with flipped sign
    dd = np.where(below_diag, -dx, dy)

    scaled = np.trunc(256 * dd)  # int() in the loop version truncates toward zero
    non_negative = dd >= 0
    # "+ 0.0" turns a possible -0.0 into +0.0, matching the integer result
    # the loop version stored
    gssp[:, :, 0] = np.where(non_negative, scaled, 0.0) + 0.0
    gssp[:, :, 1] = np.where(non_negative, 0.0, -scaled)
    return gssp

In [5]:
##
# Generate GSSPs for all readings in dfRec
def generate_gssp(part,results,dfRec,filename,nLines):
    """Build and save a GSSP image for every qualifying reading in `dfRec`.

    Row `index` of `dfRec` is matched with row `index` of `results`.
    A reading is skipped when

    * `results` has no row at that position (an ERR line is printed),
    * its 'condition' does not contain 'experiment',
    * its 'paragraphid' equals -1,
    * its START/END timestamp cannot be located in `lineTS` (ERR printed),
    * fewer than 150 samples remain after keeping every 100th sample.

    Each remaining reading is written to
    gssp/{part}_{index}_{speechid}-{paragraphid}_{N|Q}.jpg, where the last
    letter is 'N' when 'question' starts with 'NO' and 'Q' otherwise.

    Parameters
    ----------
    part : str
        Participant label, used in the output file name.
    results : pd.DataFrame
        Needs columns 'speechid', 'paragraphid', 'question', 'condition'.
    dfRec : pd.DataFrame
        Needs columns 'tStart' and 'tEnd' (see find_starts_and_ends).
    filename : str
        Path of the whitespace-separated text file holding the samples.
    nLines : int
        Number of lines in `filename`.

    Notes
    -----
    Reads the module-level arrays `lineType` and `lineTS` (as produced by
    find_types for the same file); they are NOT parameters and must be
    defined before this function is called.
    """
    import os  # local import: only needed to make sure the output folder exists

    downsampleStep = 100   # keep every 100th sample
    minSamples = 150       # per the original comments: < 150 kept samples == < 15 sec
    sizeX = 1920
    sizeY = 1080

    # cv2.imwrite does not create missing folders
    os.makedirs('gssp', exist_ok=True)

    # loop-invariant helpers
    lineIdx = np.arange(nLines)
    isSampleLine = lineType == 'SAMPLE'

    for index, row in dfRec.iterrows():
        if len(results)<=index:
            print(f'ERR index={index} exceeded len(results)={len(results)}')
            continue
        res = results.iloc[index]
        speechid = res['speechid']
        paragraphid = res['paragraphid']
        question = res['question']
        # skip irrelevant readings
        if 'experiment' not in res['condition']:
            continue
        if paragraphid==-1:
            continue

        tStart = row['tStart']
        tEnd = row['tEnd']

        # locate the START / END lines of this reading via their timestamps
        iStart = np.where(lineTS==tStart)[0]
        iEnd = np.where(lineTS==tEnd)[0]
        if iStart.size==0 or iEnd.size==0:
            # e.g. NaN timestamps coming from unmatched START/END markers
            print(f'ERR index={index} START/END timestamp not found (tStart={tStart}, tEnd={tEnd})')
            continue

        # keep only SAMPLE lines between the START and END line of this reading
        keep = isSampleLine & (lineIdx>=iStart[0]) & (lineIdx<=iEnd[0])
        if not keep.any():
            # nothing to read; read_csv would fail on an empty selection
            continue
        iNotSample = np.nonzero(~keep)[0]
        # raw string: '\s' in a normal string literal is an invalid escape sequence
        dfSamples = pd.read_csv(filename,skiprows=iNotSample,header=None,sep=r'\s+',encoding="latin-1")
        xx = dfSamples[1].to_numpy()
        yy = dfSamples[2].to_numpy()

        # Drop samples whose x OR y is the placeholder '.'. One shared mask
        # keeps x and y aligned (they used to be filtered independently).
        valid = np.ones(xx.shape[0],dtype=bool)
        if xx.dtype==object:
            valid &= np.asarray(xx!='.',dtype=bool)
        if yy.dtype==object:
            valid &= np.asarray(yy!='.',dtype=bool)
        xx = np.array(xx[valid],dtype=float)[::downsampleStep]
        yy = np.array(yy[valid],dtype=float)[::downsampleStep]

        # skip readings that are too short
        if xx.shape[0]<minSamples:
            continue
        dg = np.column_stack((xx,yy))

        q = 'N' if question.startswith('NO') else 'Q'

        name = f'{part}_{index}_{speechid}-{paragraphid}_{q}'
        print(name," of size",dg.shape[0])
        gssp = build_gssp(dg, sizeX, sizeY)
        if not cv2.imwrite(f"gssp/{name}.jpg",gssp):
            print(f'ERR could not write gssp/{name}.jpg')


In [6]:
## These subjects were used in the original paper!
subjects_to_use = [2, 3, 4, 5, 6, 7, 8, 9,10,11,12,15,16,18,19,20,21,22,
                   23,24,25,26,27,28,29,30,31,33,34,35,36,37,38,39,40,41]

##
# Build GSSPs for all subjects
for p in subjects_to_use:
    part = "P"+str(p).zfill(2)
    filename=f'asc/{part}.asc'
    print("==",part,"==================")
    # `with` closes the file even if reading fails.
    # readlines() breaks lines only at \n, \r and \r\n. str.splitlines() would
    # also break at characters such as U+0085 (byte 0x85 in latin-1), which
    # could shift the line indices later passed to pandas as `skiprows`.
    with open(filename, 'r', encoding="latin-1") as f:
        fileTxt0 = f.readlines()
    # No filtering of "empty" entries: every line keeps its line ending, and
    # removing lines would break the index <-> file-line correspondence.
    fileTxt0 = np.array(fileTxt0) # convert to np array for simpler indexing
    nLines = len(fileTxt0)
    results = pd.read_csv(f'results/{part}_results.txt', delimiter="\t")
    # lineType and lineTS must stay module-level names: generate_gssp reads them as globals
    lineType, lineTS = find_types(fileTxt0)
    dfRec = find_starts_and_ends(part,filename,lineType)
    generate_gssp(part,results,dfRec,filename,nLines)
    print("DONE")

Sorting lines...
Done! Took 6.529621 seconds.
Parsing recording markers...
126 readings (recording periods) found.
P02_10_7905-0_N  of size 260
P02_11_7905-1_Q  of size 308
P02_13_7905-3_N  of size 157
P02_14_7905-4_N  of size 188
P02_24_7905-14_N  of size 257
P02_28_7905-18_N  of size 172
P02_32_7905-22_N  of size 196
P02_33_7905-23_N  of size 230
P02_35_7905-25_Q  of size 171
P02_37_7905-27_Q  of size 277
P02_38_7905-28_Q  of size 206
P02_39_7905-29_Q  of size 307
P02_40_7905-30_N  of size 153
P02_43_18561-0_N  of size 163
P02_57_18561-14_N  of size 205
P02_65_18473-2_N  of size 225
P02_67_18473-4_Q  of size 158
P02_72_18473-9_N  of size 197
P02_74_18473-11_N  of size 163
P02_75_18473-12_N  of size 185
P02_78_18473-15_N  of size 168
P02_81_11171-0_N  of size 159
P02_82_11171-1_N  of size 211
P02_88_11171-7_N  of size 163
P02_93_11171-12_N  of size 162
P02_96_18670-1_N  of size 190
P02_97_18670-2_Q  of size 167
P02_98_18670-3_N  of size 180
P02_99_18670-4_Q  of size 362
P02_100_18670-

  dfSamples = pd.read_csv(filename,skiprows=iNotSample,header=None,sep='\s+',encoding="latin-1" ) #,usecols=range(0,len(cols)))


P33_57_7905-27_Q  of size 1369
P33_58_7905-28_Q  of size 548
P33_59_7905-29_Q  of size 877
P33_60_7905-30_N  of size 318
DONE
Sorting lines...
Done! Took 7.151457 seconds.
Parsing recording markers...
60 readings (recording periods) found.
P34_10_1125-0_N  of size 662
P34_11_1125-1_Q  of size 344
P34_12_1125-2_N  of size 479
P34_13_1125-3_N  of size 213
P34_14_1125-4_Q  of size 670
P34_15_1125-5_N  of size 464
P34_16_1125-6_N  of size 313
P34_17_1125-7_N  of size 548
P34_18_1125-8_N  of size 401
P34_19_1125-9_Q  of size 539
P34_20_1125-10_N  of size 254
P34_21_1125-11_N  of size 614
P34_22_1125-12_Q  of size 418
P34_23_1125-13_N  of size 543
P34_24_1125-14_N  of size 458
P34_26_1125-16_N  of size 286
P34_27_1125-17_N  of size 243
P34_28_1125-18_N  of size 378
P34_29_1125-19_N  of size 258
P34_30_1125-20_N  of size 346
P34_33_1317-0_N  of size 279
P34_34_1317-1_N  of size 495
P34_35_1317-2_N  of size 272
P34_36_1317-3_N  of size 239
P34_37_1317-4_N  of size 587
P34_38_1317-5_N  of size 