In [2]:

import pandas as pd
import numpy as np
import random

# Threshold parameters
FREQ_MEAN = 5
FREQ_SD = 2
SURP_MEAN = 3
SURP_SD = 8
LENGTH_MEAN = 30

# Generate sample data
data = []
for i in range(8000):
    row = {
        'subject_id': i+1,
        'unique_paragraph_id': random.randint(1, 10),
        'reread': random.randint(0, 1),
        'Wordfreq_Frequency': np.random.normal(FREQ_MEAN, FREQ_SD),
        'gpt2_Surprisal': np.random.normal(SURP_MEAN, SURP_SD),
        'Length': np.random.poisson(LENGTH_MEAN),
        'prev_Wordfreq_Frequency': np.random.normal(FREQ_MEAN, FREQ_SD),
        'prev_gpt2_Surprisal': np.random.normal(SURP_MEAN, SURP_SD),
        'prev_Length': np.random.poisson(LENGTH_MEAN),\
        'has_preview': random.randint(0, 1),\
        'IA_FIRST_FIX_PROGRESSIVE': np.random.choice([0, 1], p=[0.15, 0.85]),\
        'practice': np.random.choice([0, 1], p=[0.92, 0.08]),\
        'not_num_or_punc': np.random.choice([False, True], p=[0.04, 0.96]),\
        'normalized_ID': np.random.choice([0, 0.5, 1], p=[0.03, 0.94, 0.03]),\
        'is_in_aspan': np.random.choice([False, True], p=[0.7, 0.3]),\
        'article_ind': random.randint(1, 12),
    }
    data.append(row)

# Convert data to a pandas DataFrame
df = pd.DataFrame(data)
num_cols = ['Wordfreq_Frequency','gpt2_Surprisal', 'Length', 'prev_Wordfreq_Frequency', 'prev_gpt2_Surprisal', 'prev_Length']
df[num_cols] = df[num_cols].applymap(lambda x: max(1, x))
df['IA_FIRST_FIXATION_DURATION'] =(
        df['subject_id']%10
        - df['unique_paragraph_id']%10
        - 8*df['reread'] 
        + 100/df['Wordfreq_Frequency']
        + 50/df['prev_Wordfreq_Frequency']
        + df['Length']/2
        + df['prev_Length']/4
        + 3*df['gpt2_Surprisal']
        + 1*df['prev_gpt2_Surprisal']
        + 0.5*df['has_preview']
    )
df['IA_FIRST_RUN_DWELL_TIME'] = df['IA_FIRST_FIXATION_DURATION'] * 2
df['IA_REGRESSION_PATH_DURATION'] = df['IA_FIRST_FIXATION_DURATION'] * 1.5
df['IA_DWELL_TIME'] = df['IA_FIRST_FIXATION_DURATION'] * 3
df['has_preview'] = df['has_preview'].replace({0: 'Gathering', 1: 'Hunting'})

random_numbers = np.random.rand(len(df))
columns_to_replace = ['prev_Wordfreq_Frequency', 'prev_gpt2_Surprisal', 'prev_Length']
for column in columns_to_replace:
    df.loc[random_numbers < 0.1, column] = np.nan

for s_col in [
    "meta-llama-Llama-2-7b-hf-Surprisal",
    "gpt2-Surprisal",
    "gpt2-medium-Surprisal",
    "gpt2-large-Surprisal",
    "gpt2-xl-Surprisal",
    "EleutherAI-gpt-neo-125M-Surprisal",
    "EleutherAI-gpt-neo-1.3B-Surprisal",
    "EleutherAI-gpt-neo-2.7B-Surprisal",
    "EleutherAI-gpt-j-6B-Surprisal",
    "facebook-opt-350m-Surprisal",
    "facebook-opt-1.3b-Surprisal",
    "facebook-opt-2.7b-Surprisal",
    "facebook-opt-6.7b-Surprisal",
    "EleutherAI-pythia-70m-Surprisal",
    "EleutherAI-pythia-160m-Surprisal",
    "EleutherAI-pythia-410m-Surprisal",
    "EleutherAI-pythia-1b-Surprisal",
    "EleutherAI-pythia-1.4b-Surprisal",
    "EleutherAI-pythia-2.8b-Surprisal",
    "EleutherAI-pythia-6.9b-Surprisal",
    "EleutherAI-pythia-12b-Surprisal"
]:
    epsilon = random.uniform(0.001, 0.05)  # Small random epsilon between 0.0001 and 0.001
    df[s_col] = df["gpt2_Surprisal"] + epsilon
    df[f"prev_{s_col}"] = df[s_col].shift(1)


# Write data to CSV file
df.to_csv('generated_data.csv', index=False)

  df[num_cols] = df[num_cols].applymap(lambda x: max(1, x))


In [2]:
import pandas as pd

df = pd.read_csv("/src/GAM/tests/generated_data.csv")

In [3]:
df

Unnamed: 0,subject_id,unique_paragraph_id,reread,Wordfreq_Frequency,gpt2_Surprisal,Length,prev_Wordfreq_Frequency,prev_gpt2_Surprisal,prev_Length,has_preview,...,EleutherAI-pythia-1b-Surprisal,prev_EleutherAI-pythia-1b-Surprisal,EleutherAI-pythia-1.4b-Surprisal,prev_EleutherAI-pythia-1.4b-Surprisal,EleutherAI-pythia-2.8b-Surprisal,prev_EleutherAI-pythia-2.8b-Surprisal,EleutherAI-pythia-6.9b-Surprisal,prev_EleutherAI-pythia-6.9b-Surprisal,EleutherAI-pythia-12b-Surprisal,prev_EleutherAI-pythia-12b-Surprisal
0,1,3,0,3.859441,1.000000,27,4.656098,1.676257,31.0,Gathering,...,1.033707,,1.012872,,1.044860,,1.044236,,1.001283,
1,2,5,1,4.315444,9.338984,29,,,,Gathering,...,9.372691,1.033707,9.351856,1.012872,9.383844,1.044860,9.383221,1.044236,9.340267,1.001283
2,3,2,0,5.148340,6.533253,33,1.978352,1.000000,27.0,Gathering,...,6.566960,9.372691,6.546125,9.351856,6.578113,9.383844,6.577489,9.383221,6.534536,9.340267
3,4,5,0,4.159779,1.000000,32,6.810877,18.200951,29.0,Hunting,...,1.033707,6.566960,1.012872,6.546125,1.044860,6.578113,1.044236,6.577489,1.001283,6.534536
4,5,9,1,5.892303,2.840666,30,6.482940,12.263366,31.0,Hunting,...,2.874373,1.033707,2.853538,1.012872,2.885526,1.044860,2.884902,1.044236,2.841949,1.001283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,8,0,4.946084,11.844690,27,4.836165,13.681689,50.0,Hunting,...,11.878397,21.255150,11.857562,21.234315,11.889549,21.266302,11.888926,21.265679,11.845973,21.222725
1996,1997,4,1,4.711239,1.000000,29,3.970123,7.024013,30.0,Hunting,...,1.033707,11.878397,1.012872,11.857562,1.044860,11.889549,1.044236,11.888926,1.001283,11.845973
1997,1998,7,0,4.552740,1.000000,23,5.320858,1.000000,40.0,Gathering,...,1.033707,1.033707,1.012872,1.012872,1.044860,1.044860,1.044236,1.044236,1.001283,1.001283
1998,1999,2,1,1.000000,11.852806,22,5.454564,1.000000,25.0,Hunting,...,11.886513,1.033707,11.865678,1.012872,11.897666,1.044860,11.897042,1.044236,11.854089,1.001283


In [7]:
df.columns

Index(['subject_id', 'unique_paragraph_id', 'reread', 'Wordfreq_Frequency',
       'gpt2_Surprisal', 'Length', 'prev_Wordfreq_Frequency',
       'prev_gpt2_Surprisal', 'prev_Length', 'has_preview',
       'IA_FIRST_FIX_PROGRESSIVE', 'practice', 'not_num_or_punc',
       'normalized_ID', 'is_in_aspan', 'IA_FIRST_FIXATION_DURATION',
       'IA_FIRST_RUN_DWELL_TIME', 'IA_DWELL_TIME'],
      dtype='object')

In [79]:
63*4

252

In [82]:
len(df[(df["has_preview"]=="Hunting") & (df["reread"]==1)])

262

In [81]:
df.dtypes

subject_id                   int64
unique_paragraph_id          int64
reread                       int64
Wordfreq_Frequency         float64
gpt2_Surprisal             float64
Length                       int64
prev_Wordfreq_Frequency    float64
prev_gpt2_Surprisal        float64
prev_Length                  int64
has_preview                 object
FF                         float64
dtype: object