In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from merging_data import *
import glob
import tarfile

In [2]:
# Load the processed fixation/saccade table of the Potsdam Search Corpus (tab-separated).
corpus_df = pd.read_csv("../incomplete_corpus_data/PotsdamSearchCorpus.dat", sep="\t")
# Load the raw samples of participant 1 for testing. The file has no header row, so the
# column names are supplied here. The path uses "/" throughout: the previous "asc\SFC1_2.dat"
# relied on the invalid escape sequence "\S" and only resolved as a directory separator on Windows.
raw_data = pd.read_csv("../separate_participant_data/asc/SFC1_2.dat", sep="\t",
                       header=None, names=['time', 'x', 'y', 'pupil'])

In [3]:
# Keep only the corpus rows of participant 1.
# TODO: generalize so that the raw data and the corpus rows are loaded for all participants.
# The explicit .copy() makes corpus_p1 an independent frame: add_sacc_val_id_correct (defined
# below) writes into the corpus frame it receives via .loc, which on a bare boolean-mask slice
# of corpus_df is a chained assignment.
corpus_p1 = corpus_df[corpus_df.subject == 1].copy()

In [4]:
# Boolean mask: True for the corpus rows whose saccade number is missing (NaN).
inds = corpus_p1.sacno.isnull()

In [5]:
# Number of participant-1 rows that are saccade 26 of image 50.
corpus_p1[(corpus_p1.imageno == 50) & (corpus_p1.sacno == 26)].shape[0]

0

In [6]:
# Row mask: True when the summed fixation and saccade invalid flags reach at least 1.
invalid_sacfix = corpus_p1.fixinvalid.add(corpus_p1.sacinvalid).ge(1)

In [7]:
# Show the rows of image 50 that are flagged by the invalid mask.
corpus_p1.loc[corpus_p1.imageno.eq(50) & invalid_sacfix]

Unnamed: 0,subject,colorimages,trialno,trialstart,trialend,imagestart,imageend,imageno,filtertype,filterregion,...,fixposy,fixinvalid,sacno,saconset,sacoffset,sacdur,velpeak,amplitude,sacinvalid,glissade_handled
16,1,1,1,4235551,4245171,4239225,4245052,50,1,2,...,22.35,1,17.0,5445.0,5456.0,12.0,710.28,2.14,1,0
17,1,1,1,4235551,4245171,4239225,4245052,50,1,2,...,22.39,1,18.0,5467.0,5493.0,27.0,18058.26,56.09,1,0
18,1,1,1,4235551,4245171,4239225,4245052,50,1,2,...,-0.04,1,19.0,5588.0,5657.0,70.0,17012.22,53.72,1,0
19,1,1,1,4235551,4245171,4239225,4245052,50,1,2,...,21.1,1,20.0,5747.0,5813.0,67.0,507.92,13.34,0,0
20,1,1,1,4235551,4245171,4239225,4245052,50,1,2,...,15.61,1,,,,,,,1,0


# Draft approach: for every image, set "invalid" = 1 on the raw samples that fall inside the
# time window of any fixation or saccade belonging to a row selected by invalid_sacfix.
# NOTE(review): `filtered_raw` and `imagestart` are not defined in this snippet or in any
# cell visible above it -- presumably it cannot run on its own; confirm before reusing.
for image in np.unique(corpus_p1.imageno):
    # Rows of this image selected by the combined mask (fixinvalid + sacinvalid >= 1).
    corpus_invalid = corpus_p1[invalid_sacfix & (corpus_p1.imageno==image)]
    # Distinct non-NaN fixation / saccade numbers found in those rows.
    invalid_fix_no = np.unique(corpus_invalid.fixno[corpus_invalid.fixno.notnull()])
    invalid_sac_no = np.unique(corpus_invalid.sacno[corpus_invalid.sacno.notnull()])
    
    for infix in invalid_fix_no:
        # These stay pandas Series (no int() conversion, unlike the saccade loop below).
        in_fixonset = corpus_invalid[corpus_invalid.fixno == infix].fixonset
        in_fixoffset = corpus_invalid[corpus_invalid.fixno == infix].fixoffset
        
        # Flag the samples between fixation onset and offset (both bounds inclusive).
        filtered_raw.loc[(filtered_raw.time >= imagestart + in_fixonset) &
                         (filtered_raw.time <= imagestart + in_fixoffset), "invalid"] = 1
                
        
    for insac in invalid_sac_no:
        in_saconset = corpus_invalid[corpus_invalid.sacno == insac].saconset
        in_sacoffset = corpus_invalid[corpus_invalid.sacno == insac].sacoffset
        
        # Flag the samples between saccade onset and offset (both bounds inclusive). The row
        # was selected by the combined mask, so the saccade is flagged even when only the
        # fixation of that row is invalid.
        filtered_raw.loc[(filtered_raw.time >= imagestart + int(in_saconset)) &
                         (filtered_raw.time <= imagestart + int(in_sacoffset)), "invalid"] = 1
                

# By doing:

__invalid_sacfix = ((corpus_data.fixinvalid + corpus_data.sacinvalid) >= 1).astype(int)__

# my assignment is wrong when the fixation is invalid and the saccade is valid, or the other way around.


In [8]:
def add_sacc_val_id_correct(filtered_raw, corpus_data):
    """
    Label every raw sample as fixation/saccade, valid/invalid, and attach an event id.

    Time stamps of the raw samples are compared with the event windows in corpus_data
    (corpus times are offsets from the first raw sample of each image). Three columns are
    added to filtered_raw if they are not already present:

    * 'is_saccade': 1 by default, set to 0 inside every fixation window.
    * 'invalid': 0 by default; inside a saccade window it takes that saccade's sacinvalid
      flag, inside a fixation window that fixation's fixinvalid flag.
    * 'identifier': string made of 3 digits subject number + 3 digits trial number
      + 2 digits fixation number (or saccade number for rows without a fixation number).
      A fixation id extends up to the onset of the next fixation, i.e. it also covers the
      saccade that follows the fixation.

    :param filtered_raw: pandas data frame with at least the columns 'time' and 'imageno'.
        It is modified in place. 'is_saccade' and 'invalid' are inserted at position 7
        (or at the end when the frame has fewer columns), 'identifier' at position 0.
    :param corpus_data: pandas data frame filtered for participant number #n, with the columns
        subject, trialno, imageno, fixno, fixonset, fixoffset, fixdur, fixinvalid, sacno,
        saconset, sacoffset, sacdur and sacinvalid. It is not modified.

    :return: filtered_raw (the same object) with the three additional columns.

    For every image the number of labelled samples is compared with the durations in the
    corpus; a mismatch is reported with print() and does not stop processing.
    """

    # Kept from the original implementation: silence pandas' chained-assignment warning,
    # since filtered_raw may itself be a slice of another frame. The option may be absent in
    # some pandas versions (assumption -- confirm), in which case there is nothing to silence.
    try:
        pd.options.mode.chained_assignment = None  # default='warn'
    except (AttributeError, KeyError):
        pass

    # Work on a private copy so that the caller's corpus frame is left untouched.
    corpus_data = corpus_data.copy()
    # Rows without a saccade cannot carry an invalid saccade.
    corpus_data.loc[corpus_data.sacno.isnull(), "sacinvalid"] = 0

    # Boolean row mask: fixation and/or saccade of the row is invalid.
    invalid_sacfix = (corpus_data.fixinvalid + corpus_data.sacinvalid) >= 1

    # Add the output columns only when missing, so that calling the function again on an
    # already labelled frame keeps working without hiding unrelated errors.
    n_samples = len(filtered_raw)
    new_columns = (
        (0, "identifier", np.full(n_samples, "", dtype=object)),
        (7, "is_saccade", np.ones(n_samples, dtype=int)),
        (7, "invalid", np.zeros(n_samples, dtype=int)),
    )
    for position, name, values in new_columns:
        if name not in filtered_raw.columns:
            filtered_raw.insert(min(position, filtered_raw.shape[1]), name, values)

    subject_tag = str(np.unique(corpus_data.subject)[0]).zfill(3)

    for image in np.unique(filtered_raw.imageno):
        imagestart = filtered_raw[filtered_raw.imageno == image].time.iloc[0]
        corpus_image = corpus_data[corpus_data.imageno == image]

        # First we just try to get the invalid saccades
        corpus_invalid = corpus_data[invalid_sacfix & (corpus_data.imageno == image)]
        for insac in np.unique(corpus_invalid.sacno.dropna()):
            sac_rows = corpus_invalid[corpus_invalid.sacno == insac]
            in_saconset = int(sac_rows.saconset.iloc[0])
            in_sacoffset = int(sac_rows.sacoffset.iloc[0])
            in_window = ((filtered_raw.time >= imagestart + in_saconset) &
                         (filtered_raw.time <= imagestart + in_sacoffset))
            filtered_raw.loc[in_window, "invalid"] = int(sac_rows.sacinvalid.iloc[0])

        # Second we get invalid fixations + introduce the ids
        id_prefix = subject_tag + str(corpus_image.trialno.iloc[0]).zfill(3)
        fix_nbs = corpus_image.fixno
        last_position = len(fix_nbs) - 1

        for count, fix in enumerate(fix_nbs):

            if math.isnan(fix):
                # Rows without a fixation number get the id of the saccade stored in the
                # same row and that saccade's validity, for the duration of the saccade.
                # As before, the first such row of the image is the one that is used.
                nan_rows = corpus_image[corpus_image.fixno.isnull()]
                saconset = nan_rows.saconset.iloc[0]
                sacoffset = nan_rows.sacoffset.iloc[0]
                # corpus times start at 1, raw sample offsets at 0
                if saconset == 1:
                    saconset = 0
                ident = id_prefix + str(int(nan_rows.sacno.iloc[0])).zfill(2)
                in_window = ((filtered_raw.time >= imagestart + saconset) &
                             (filtered_raw.time <= imagestart + sacoffset))
                filtered_raw.loc[in_window, "identifier"] = ident
                filtered_raw.loc[in_window, "invalid"] = int(nan_rows.sacinvalid.iloc[0])
                continue

            # Regular fixation row: read everything from corpus_data (the parameter), not
            # from a notebook-level frame.
            fix_rows = corpus_image[corpus_image.fixno == fix]
            invalidity = int(fix_rows.fixinvalid.iloc[0])
            ident = id_prefix + str(int(fix)).zfill(2)
            fixonset = int(fix_rows.fixonset.iloc[0])
            # necessary because in the corpus data their time starts at 1.
            if fixonset == 1:
                fixonset = 0
            fixoffset = int(fix_rows.fixoffset.iloc[0])

            in_fixation = ((filtered_raw.time >= imagestart + fixonset) &
                           (filtered_raw.time <= imagestart + fixoffset))
            filtered_raw.loc[in_fixation, "is_saccade"] = 0
            filtered_raw.loc[in_fixation, "invalid"] = invalidity

            # This handles assigning ids
            next_fix = fix_nbs.iloc[count + 1] if count < last_position else float("nan")
            if math.isnan(next_fix):
                # Last row of the image, or the next row has no fixation number: the id
                # covers the fixation itself only.
                filtered_raw.loc[in_fixation, "identifier"] = ident
            else:
                next_fixonset = int(
                    corpus_image[corpus_image.fixno == next_fix].fixonset.iloc[0])
                until_next = ((filtered_raw.time >= imagestart + fixonset) &
                              (filtered_raw.time < imagestart + next_fixonset))
                filtered_raw.loc[until_next, "identifier"] = ident

        # Consistency check (explicit comparison, so it also runs under `python -O`):
        # labelled sample counts must match the corpus durations; the +1 accounts for the
        # corpus clock starting at 1.
        image_samples = filtered_raw[filtered_raw.imageno == image]
        found_saccade = np.sum(image_samples.is_saccade != 0)
        found_fixation = np.sum(image_samples.is_saccade == 0)
        expected_saccade = np.sum(corpus_image[corpus_image.sacno.notnull()].sacdur)
        expected_fixation = np.sum(corpus_image.fixdur) + 1
        if expected_saccade != found_saccade or expected_fixation != found_fixation:
            print("image", image)
            print(expected_fixation, found_fixation)

    filtered_raw['identifier'] = filtered_raw['identifier'].astype(str)

    return filtered_raw

In [9]:
# get_valid_times is not defined in this notebook; presumably it comes from the star-import
# of merging_data (confirm). It is called with the raw samples and the participant-1 corpus
# rows and returns a pair, unpacked here as (raw_valid_times, intervals).
raw_valid_times, intervals = get_valid_times(raw_data, corpus_p1)

In [10]:
# linear_transf_pix_dva is not defined in this notebook (presumably from merging_data).
# Judging by its name it converts pixel coordinates to degrees of visual angle -- TODO confirm.
data_dva = linear_transf_pix_dva(raw_valid_times)

In [11]:
# associate_trial_info is not defined in this notebook (presumably from merging_data).
# The frame it returns is displayed in the next cell with per-sample trial columns
# (imageno, filtertype, filterregion, targetpresent, expectedlocation).
data_trial_info, trial_durations = associate_trial_info(data_dva, intervals, corpus_p1)

In [12]:
# Display the per-sample frame (pandas shows a truncated preview for long frames).
data_trial_info

Unnamed: 0,time,x,y,pupil,imageno,filtertype,filterregion,targetpresent,expectedlocation
43873,4239225,18.9329,13.7973,6802.0,50,1,2,0,0
43874,4239226,18.9329,13.7973,6802.0,50,1,2,0,0
43875,4239227,18.9366,13.7751,6804.0,50,1,2,0,0
43876,4239228,18.9329,13.7751,6804.0,50,1,2,0,0
43877,4239229,18.9329,13.7751,6804.0,50,1,2,0,0
...,...,...,...,...,...,...,...,...,...
1115751,6234829,12.9759,16.9349,8635.0,62,2,1,1,0
1115752,6234830,12.9722,16.9349,8635.0,62,2,1,1,0
1115753,6234831,12.9722,16.9349,8637.0,62,2,1,1,0
1115754,6234832,12.9759,16.8794,8638.0,62,2,1,1,0


In [13]:
# Label every sample with 'identifier', 'is_saccade' and 'invalid'. The function inserts the
# columns into the frame it is given and returns that same frame, so `final` and
# `data_trial_info` refer to the same object after this call.
final = add_sacc_val_id_correct(data_trial_info, corpus_p1)

In [14]:
# Last 50 labelled samples of image 50.
final[final.imageno == 50].tail(50)

Unnamed: 0,identifier,time,x,y,pupil,imageno,filtertype,invalid,is_saccade,filterregion,targetpresent,expectedlocation
49651,100120,4245003,29.6888,18.5222,10360.0,50,1,0,1,2,0,0
49652,100120,4245004,29.452,18.1781,10417.0,50,1,0,1,2,0,0
49653,100120,4245005,29.082,18.0264,10511.0,50,1,0,1,2,0,0
49654,100120,4245006,28.712,17.2198,10557.0,50,1,0,1,2,0,0
49655,100120,4245007,28.4826,17.2198,10557.0,50,1,0,1,2,0,0
49656,100120,4245008,28.083,17.2235,10581.0,50,1,0,1,2,0,0
49657,100120,4245009,27.8314,16.6241,10606.0,50,1,0,1,2,0,0
49658,100120,4245010,27.5095,16.6278,10606.0,50,1,0,1,2,0,0
49659,100120,4245011,27.2394,16.2171,10582.0,50,1,0,1,2,0,0
49660,100120,4245012,26.936,16.2356,10582.0,50,1,0,1,2,0,0
