In [10]:
import random
import json
import pandas as pd
from itertools import groupby

def create_prompt(df, data, word_field="word"):
    random.seed(42)
    sents = []
    for sent_id, sent in groupby(df[[word_field, "time", "sent_id", "article"]].iterrows(), key=lambda x: x[1][2]):
        sent = list(sent)
        sents.append([("".join(tok[1][0].split()).replace("▁", " ").strip(), tok[1][1]) for tok in sent])

    for i in range(3):
        print(i)
        j = 0
        with open(f"../data/{data}/prompts/prompt_{i}.txt", "w") as f:
            while j < 3:
                selected_sent = random.choice(sents)
                sent = " ".join([tok[0] for tok in selected_sent])
                tok_time_id = [(tok[0], tok[1], i) for i, tok in enumerate(selected_sent)]
                if min([time for tok, time, id in tok_time_id]) == 0:
                    continue
                f.write(f'Suppose humans read the following sentence: "{sent}" \nList the tokens and their IDs in order of their reading cost (high to low) during sentence processing.\n')
                f.write("Token ID:\n")
                for tok, time, id in tok_time_id:
                    f.write(f"{id}: {tok}, ")
                f.write("\n")
                sorted_sent = sorted(tok_time_id, key=lambda x: x[1], reverse=True)
                f.write("Answer:\n" + " ".join(f"{id}: {tok}," for tok, time, id in sorted_sent) + "\n\n")
                j += 1
    return sents

In [11]:
def create_surprisal_prompt(df, data, model, word_field="word"):
    random.seed(42)
    target_file = f"../results/{data}/{model}/surprisal.json"
    article2surprisals = json.load(open(target_file))
    all_sups = [sup for _, sents_sups in sorted(article2surprisals.items(), key=lambda x: int(x[0])) for sent_sups in sents_sups for sup in sent_sups]
    assert len(df) == len(all_sups)
    df["surprisal"] = all_sups

    sents = []
    for sent_id, sent in groupby(df[[word_field, "surprisal", "sent_id", "article"]].iterrows(), key=lambda x: x[1][2]):
        sent = list(sent)
        sents.append([("".join(tok[1][0].split()).replace("▁", " ").strip(), tok[1][1]) for tok in sent])

    for i in range(3):
        print(i)
        j = 0
        with open(f"../data/{data}/prompts/prompt_surprisal_{model}_{i}.txt", "w") as f:
            while j < 3:
                selected_sent = random.choice(sents)
                sent = " ".join([tok[0] for tok in selected_sent])
                tok_time_id = [(tok[0], tok[1], i) for i, tok in enumerate(selected_sent)]
                if min([time for tok, time, id in tok_time_id]) == 0:
                    continue
                f.write(f'Suppose you read the following sentence: "{sent}" \nList the tokens and their IDs in order of their probability in context (low to high).\n')
                f.write("Token ID:\n")
                for tok, time, id in tok_time_id:
                    f.write(f"{id}: {tok}, ")
                f.write("\n")
                sorted_sent = sorted(tok_time_id, key=lambda x: x[1], reverse=True)
                f.write("Answer:\n" + " ".join(f"{id}: {tok}," for tok, time, id in sorted_sent) + "\n\n")
                j += 1
    return sents

## DC

In [12]:
import pandas as pd

df = pd.read_csv("../data/DC/all.txt.annotation.filtered.averaged_rt.csv")

In [13]:
sents = create_prompt(df, "DC", word_field="surface")
models = ["Llama-2-7b-chat-hf", "Llama-2-13b-chat-hf", "Llama-2-70b-chat-hf", "gpt-3.5-turbo-instruct", "falcon-7b-instruct", "falcon-40b-instruct", "text-davinci-002", "text-davinci-003"]
for model in models:
    sents = create_surprisal_prompt(df, "DC", model, word_field="surface")

0
1
2


## NS

In [14]:
df = pd.read_csv("../data/NS/all.txt.annotation.filtered.averaged_rt.csv")

In [15]:
sents = create_prompt(df, "NS", word_field="word")
models = ["Llama-2-7b-chat-hf", "Llama-2-13b-chat-hf", "Llama-2-70b-chat-hf", "gpt-3.5-turbo-instruct", "falcon-7b-instruct", "falcon-40b-instruct", "text-davinci-002", "text-davinci-003"]
models = ["Mistral-7B-Instruct-v0.1"]
for model in models:
    sents = create_surprisal_prompt(df, "NS", model, word_field="word")

0
1
2


## Example

In [18]:
target_sent = random.choice(sents)
sent = " ".join([tok[0] for tok in target_sent])
print(f'Suppose humans read the following sentence: "{sent}" \nList the tokens and their IDs in order of their reading cost (high to low) during sentence processing.')
tok_time_id = [(tok[0], tok[1], i) for i, tok in enumerate(target_sent)]
print("Token ID:")
for tok, time, id in tok_time_id:
    print(f"{id}:{tok},", end = " ")
print()
print("Answer:")
sorted_sent = sorted(tok_time_id, key=lambda x: x[1], reverse=True)
print("Answer:\n" + " ".join(f"{id}: {tok}," for tok, time, id in sorted_sent) + "\n")

Suppose humans read the following sentence: "The 'tulip breaking' virus, which many botanists have studied, spreads only through buds, not seeds, and so cultivating the most appealing varieties takes years." 
List the tokens and their IDs in order of their reading cost (high to low) during sentence processing.
Token ID:
0:The, 1:'tulip, 2:breaking', 3:virus,, 4:which, 5:many, 6:botanists, 7:have, 8:studied,, 9:spreads, 10:only, 11:through, 12:buds,, 13:not, 14:seeds,, 15:and, 16:so, 17:cultivating, 18:the, 19:most, 20:appealing, 21:varieties, 22:takes, 23:years., 
Answer:
Answer:
13: not, 8: studied,, 12: buds,, 2: breaking', 22: takes, 17: cultivating, 3: virus,, 20: appealing, 21: varieties, 0: The, 14: seeds,, 15: and, 16: so, 1: 'tulip, 4: which, 11: through, 6: botanists, 5: many, 19: most, 10: only, 9: spreads, 7: have, 18: the, 23: years.,

