# Logistic regression

## Data Prep

In [1]:
import pandas as pd
import rdata
import numpy as np
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

### Get data

In [2]:
path_l2_trimmed = r"../data/version 1.1/primary data/eye tracking data/joint_data_l2_trimmed.rda"
l2_trimmed_parsed = rdata.parser.parse_file(path_l2_trimmed)
l2_trimmed_converted = rdata.conversion.convert(l2_trimmed_parsed)
df = l2_trimmed_converted['joint.data']

In [3]:
df

Unnamed: 0,subid,trialid,trialnum,itemid,cond,sentnum,ianum,ia,blink,skip,nrun,reread,nfix,refix,reg.in,reg.out,dur,firstrun.skip,firstrun.nfix,firstrun.refix,firstrun.reg.in,firstrun.reg.out,firstrun.dur,firstrun.gopast,firstrun.gopast.sel,firstfix.sac.in,firstfix.sac.out,firstfix.launch,firstfix.land,firstfix.cland,firstfix.dur,singlefix,singlefix.sac.in,singlefix.sac.out,singlefix.launch,singlefix.land,singlefix.cland,singlefix.dur,lang,trial,uniform_id
1,DU_04,1,1.0,1,1.0,6.0,141.0,he,0.0,0.0,2.0,1.0,2.0,0.0,1.0,0.0,464.0,1.0,1.0,0.0,1.0,0.0,278.0,0.0,0.0,-3,3,-5.0,2.0,0.5,278.0,1.0,-3,3,-5.0,2.0,0.5,278.0,du,,du_4
2,DU_04,1,1.0,1,1.0,6.0,142.0,gave,0.0,0.0,2.0,1.0,2.0,0.0,0.0,1.0,480.0,0.0,1.0,0.0,0.0,1.0,164.0,944.0,480.0,11,-3,9.0,2.0,-0.5,164.0,1.0,11,-3,9.0,2.0,-0.5,164.0,du,,du_4
3,DU_04,1,1.0,1,1.0,6.0,143.0,up,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,155.0,0.0,1.0,0.0,0.0,0.0,155.0,155.0,155.0,6,6,6.0,0.0,-1.5,155.0,1.0,6,6,6.0,0.0,-1.5,155.0,du,,du_4
4,DU_04,1,1.0,1,1.0,6.0,144.0,painting,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,323.0,0.0,1.0,0.0,0.0,0.0,323.0,323.0,323.0,6,10,3.0,3.0,-1.5,323.0,1.0,6,10,3.0,3.0,-1.5,323.0,du,,du_4
5,DU_04,1,1.0,1,1.0,6.0,145.0,and,0.0,1.0,,,,,,,,1.0,,,,,,,,-2147483648,-2147483648,,,,,0.0,-2147483648,-2147483648,,,,,du,,du_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669949,tr_52,12,12.0,12,1.0,5.0,144.0,their,0.0,0.0,2.0,1.0,2.0,0.0,0.0,0.0,354.0,1.0,1.0,0.0,0.0,0.0,197.0,0.0,0.0,9,6,6.0,3.0,0.0,197.0,1.0,9,6,6.0,3.0,0.0,197.0,tr,,tr_52
669950,tr_52,12,12.0,12,1.0,5.0,145.0,connectivity,0.0,0.0,3.0,1.0,4.0,1.0,1.0,1.0,872.0,1.0,1.0,0.0,0.0,0.0,222.0,0.0,0.0,6,13,3.0,3.0,-3.5,222.0,1.0,6,13,3.0,3.0,-3.5,222.0,tr,,tr_52
669951,tr_52,12,12.0,12,1.0,5.0,146.0,in,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,220.0,1.0,1.0,0.0,0.0,0.0,220.0,0.0,0.0,9,5,9.0,0.0,-1.5,220.0,1.0,9,5,9.0,0.0,-1.5,220.0,tr,,tr_52
669952,tr_52,12,12.0,12,1.0,5.0,147.0,personal,0.0,0.0,2.0,1.0,4.0,1.0,0.0,1.0,661.0,1.0,1.0,0.0,0.0,1.0,233.0,0.0,0.0,13,-11,13.0,0.0,-4.5,233.0,1.0,13,-11,13.0,0.0,-4.5,233.0,tr,,tr_52


In [4]:
# total sentence reading time
total_sentence_readtime = df.groupby(['subid', 'trialid', 'sentnum'])['dur'].sum().reset_index()
total_sentence_readtime = total_sentence_readtime.rename(columns = {'dur':'total_dur'})
df = df.merge(total_sentence_readtime, on = ['subid', 'trialid', 'sentnum'], how = 'left')

### Bigrams

In [5]:
pairs1 = np.arange(0, len(df))
pairs1 = [val for val in pairs1 for _ in (0, 1)][:len(df)]

pairs1[-1] = np.nan # bc not even, last num wo pair
pairs2 = [np.nan] + pairs1[:-1]

In [6]:
# add to df:
df['bigram_index1'] = pairs1
df['bigram_index2'] = pairs2

In [7]:
bigrams1 = df.groupby('bigram_index1').agg({'ia' : lambda x: ' '.join(x),
                                'firstfix.dur' : 'sum',
                                'firstrun.gopast' : 'sum',
                                'dur' : 'sum',
                                'subid' : 'first',
                                'total_dur' : 'mean',
                                'lang': 'first'}).reset_index()

bigrams2 = df.groupby('bigram_index2').agg({'ia' : lambda x: ' '.join(x),
                                'firstfix.dur' : 'sum',
                                'firstrun.gopast' : 'sum',
                                'dur' : 'sum',
                                'subid' : 'first',
                                'total_dur' : 'mean',
                                'lang': 'first'}).reset_index()                                 

In [8]:
bigrams1  = bigrams1[~bigrams1['ia'].str.contains('.*[.].*[^.]')] # remove 19014 bigrams that are an end of sentence
bigrams2  = bigrams2[~bigrams2['ia'].str.contains('.*[.].*[^.]')] 

In [9]:
bigrams = pd.concat([bigrams1, bigrams2]) 
bigrams = bigrams[['subid', 'firstfix.dur', 'firstrun.gopast', 'dur', 'total_dur', 'lang']] # 'ia'

In [10]:
print(f'Number of bigrams: {len(bigrams)}')

Number of bigrams: 632225


### Trigrams

In [11]:
triplets1 = np.arange(0, len(df))
triplets1 = [val for val in triplets1 for _ in (0, 1, 2)] [:len(df)]

In [12]:
# remove last 2:
triplets1[-2] = np.nan
triplets1[-1] = np.nan

triplets2 = [np.nan] + triplets1[:-1] 
triplets3 = [np.nan, np.nan] + triplets1[:-2] 

In [13]:
# add to df:
df['trigram_index1'] = triplets1
df['trigram_index2'] = triplets2
df['trigram_index3'] = triplets3

In [14]:
trigrams1 = df.groupby('trigram_index1').agg({'ia' : lambda x: ' '.join(x),
                                'firstfix.dur' : 'sum',
                                'firstrun.gopast' : 'sum',
                                'dur' : 'sum',
                                'subid' : 'first',
                                'total_dur' : 'mean',
                                'lang': 'first'}).reset_index()

trigrams2 = df.groupby('trigram_index2').agg({'ia' : lambda x: ' '.join(x),
                                'firstfix.dur' : 'sum',
                                'firstrun.gopast' : 'sum',
                                'dur' : 'sum',
                                'subid' : 'first',
                                'total_dur' : 'mean',
                                'lang': 'first'}).reset_index()    
trigrams3 = df.groupby('trigram_index3').agg({'ia' : lambda x: ' '.join(x),
                                'firstfix.dur' : 'sum',
                                'firstrun.gopast' : 'sum',
                                'dur' : 'sum',
                                'subid' : 'first',
                                'total_dur' : 'mean',
                                'lang': 'first'}).reset_index()    

In [15]:
trigrams1  = trigrams1[~trigrams1['ia'].str.contains('.*[.].*[^.]')]
trigrams2  = trigrams2[~trigrams2['ia'].str.contains('.*[.].*[^.]')] 
trigrams3  = trigrams3[~trigrams3['ia'].str.contains('.*[.].*[^.]')] 

In [16]:
trigrams = pd.concat([trigrams1, trigrams2, trigrams3]) 
trigrams = trigrams[['subid', 'firstfix.dur', 'firstrun.gopast', 'dur', 'total_dur', 'lang']] # 'ia'

In [17]:
print(f'Number of trigrams: {len(trigrams)}')

Number of trigrams: 595922


### Subset

In [18]:
subset_features = ['subid', 'firstrun.gopast', 'firstfix.dur', 'dur', 'total_dur', 'lang'] # 'firstrun.gopast.sel'
df_subset = df[subset_features]

In [19]:
# concat with bigrams and trigrams
df_subset = pd.concat([df_subset, bigrams, trigrams])

In [20]:
df_subset

Unnamed: 0,subid,firstrun.gopast,firstfix.dur,dur,total_dur,lang
0,DU_04,0.0,278.0,464.0,5486.0,du
1,DU_04,944.0,164.0,480.0,5486.0,du
2,DU_04,155.0,155.0,155.0,5486.0,du
3,DU_04,323.0,323.0,323.0,5486.0,du
4,DU_04,,,,5486.0,du
...,...,...,...,...,...,...
223312,tr_52,0.0,560.0,1598.0,26247.0,tr
223313,tr_52,0.0,432.0,1810.0,26247.0,tr
223314,tr_52,0.0,766.0,2109.0,26247.0,tr
223315,tr_52,0.0,652.0,1459.0,26247.0,tr


In [21]:
# test and remove NaN
df_subset.isna().sum()

subid                   0
firstrun.gopast    164910
firstfix.dur       164910
dur                153643
total_dur               0
lang                    0
dtype: int64

In [22]:
df_subset = df_subset[~df_subset['firstfix.dur'].isna()]
df_subset.isna().sum()

subid              0
firstrun.gopast    0
firstfix.dur       0
dur                0
total_dur          0
lang               0
dtype: int64

In [23]:
# speed normalization
df_subset['FF_norm'] = df_subset['firstrun.gopast'] / df_subset['total_dur'] 
df_subset['FP_norm'] = df_subset['firstfix.dur'] / df_subset['total_dur']
df_subset['TF_norm'] = df_subset['dur'] / df_subset['total_dur']

In [24]:
# hopefully no need after data quality checks
df_subset = df_subset[~df_subset['FF_norm'].isna()]

In [25]:
# add word length?

Add language:

In [27]:
#from functions import map_language
#language_lookup = map_language(df_subset)
languages = df_subset['lang'].unique()
languages.sort()
len_languages = len(languages)
language_lookup = dict(zip(languages, range(len_languages)))
df_subset["lang_code"] = df_subset['lang'].replace(language_lookup).values

In [35]:
import json

with open("../data/outputs/language_lookup.json", "w") as outfile:
    json.dump(language_lookup, outfile)

In [30]:
print(f'Number unigrams: {len(df)}')
print(f'Number bigrams: {len(bigrams)}')
print(f'Number trigrams: {len(trigrams)}')
print(f'Number of samples after removing { len(df) + len(bigrams) + len(trigrams) - len(df_subset)} NaNs: {len(df_subset)}')
print('Languages:')
df_subset['lang'].value_counts()

Number unigrams: 669953
Number bigrams: 632225
Number trigrams: 595922
Number of samples after removing 165857 NaNs: 1732243
Languages:


ee    200255
fi    190271
en    174577
it    161516
ge    151280
gr    140373
ru    138264
he    130689
du    126104
sp    125887
no    125046
tr     67981
Name: lang, dtype: int64

In [31]:
df_subset

Unnamed: 0,subid,firstrun.gopast,firstfix.dur,dur,total_dur,lang,FF_norm,FP_norm,TF_norm,lang_code
0,DU_04,0.0,278.0,464.0,5486.0,du,0.000000,0.050674,0.084579,0
1,DU_04,944.0,164.0,480.0,5486.0,du,0.172074,0.029894,0.087495,0
2,DU_04,155.0,155.0,155.0,5486.0,du,0.028254,0.028254,0.028254,0
3,DU_04,323.0,323.0,323.0,5486.0,du,0.058877,0.058877,0.058877,0
5,DU_04,462.0,265.0,462.0,5486.0,du,0.084214,0.048305,0.084214,0
...,...,...,...,...,...,...,...,...,...,...
223312,tr_52,0.0,560.0,1598.0,26247.0,tr,0.000000,0.021336,0.060883,11
223313,tr_52,0.0,432.0,1810.0,26247.0,tr,0.000000,0.016459,0.068960,11
223314,tr_52,0.0,766.0,2109.0,26247.0,tr,0.000000,0.029184,0.080352,11
223315,tr_52,0.0,652.0,1459.0,26247.0,tr,0.000000,0.024841,0.055587,11


write

In [33]:
df_subset.to_csv(r"../data/data_extended_v1_1/data_v1_1.parquet.gzip", index = False) # csv produces a too large file