## NS

In [None]:
import json
import glob
import os
import math
import pandas as pd

from itertools import groupby
from collections import defaultdict
from statistics import mean
from scipy.stats import pearsonr, spearmanr

# Row order for the per-model result tables: the summary frames are
# reindexed with this list before being printed.
model_order = [
    "Llama-2-7b-chat-hf",
    "Llama-2-13b-chat-hf",
    "Llama-2-70b-chat-hf",
    "falcon-7b-instruct",
    "falcon-40b-instruct",
    "text-davinci-002",
    "text-davinci-003",
]

In [None]:
# Word-level annotation table for the NS data set.
# NOTE(review): "time" is presumably the averaged reading time (per the
# file name) -- confirm against the CSV.
df = pd.read_csv("../data/NS/all.txt.annotation.filtered.averaged_rt.csv")

# article id (as str) -> list of sentences, where every sentence maps
# "<position in sentence>: <word>" to that row's "time" value.
article2sents = defaultdict(list)

# Columns are addressed by label.  Integer keys on a string-labelled Series
# (the former ``x[1][3]`` style) rely on a positional fallback that pandas
# has deprecated.
# itertools.groupby only merges *consecutive* rows, so the table is expected
# to be stored contiguously per article and per sentence -- TODO confirm.
rows = df[["word", "time", "sent_id", "article"]].iterrows()
for article_id, article_rows in groupby(rows, key=lambda item: item[1]["article"]):
    for _, sent_rows in groupby(article_rows, key=lambda item: item[1]["sent_id"]):
        sent = {
            f"{position}: {row['word']}": row["time"]
            for position, (_, row) in enumerate(sent_rows)
        }
        article2sents[str(article_id)].append(sent)

In [None]:
def ranking_correlation(pred, gold, ignore_missing=False):
    """Score one model response against the gold values of one sentence.

    The fourth blank-line-separated paragraph of ``pred`` is parsed: its
    fourth line is read as the comma-separated list of offered token ids and
    its sixth line as the model's ordering of them.  Answers that are not in
    the offered list are dropped.

    Args:
        pred: Full response text for one sentence.
        gold: Mapping from token id (``"<position>: <word>"``) to a number.
        ignore_missing: When true, a ranked token that is absent from
            ``gold`` makes the response unusable instead of raising
            ``KeyError``.

    Returns:
        Spearman's rho between the answer positions and the negated gold
        values (listing tokens from highest to lowest gold value gives +1),
        or ``None`` when the response is malformed, fewer than two answers
        are valid, or the correlation is undefined (NaN).

    Raises:
        KeyError: A ranked token is missing from ``gold`` and
            ``ignore_missing`` is false.
    """
    paragraphs = pred.split("\n\n")
    if len(paragraphs) < 4:
        return None
    lines = paragraphs[3].split("\n")
    if len(lines) < 6:
        return None
    token_ids = set(lines[3].rstrip(",").split(", "))
    ranked = [tok for tok in lines[5].split(", ") if tok in token_ids]
    if len(ranked) <= 1:
        return None
    try:
        y = [-gold[tok] for tok in ranked]
    except KeyError:
        if ignore_missing:
            return None
        raise
    x = list(range(len(y)))
    correl = spearmanr(x, y)[0]
    return None if math.isnan(correl) else correl


# Plain prompt result files only: "direct" and "surprisal" variants are
# excluded by substring match on the whole path.
files = glob.glob('../results/NS/*/prompt_estimation/prompt_*.json', recursive=True)
files = [file for file in files if "direct" not in file and "surprisal" not in file]

for file in files:
    print(file)
    with open(file) as f:
        results = json.load(f)
    correls = []
    for article_id, preds in results.items():
        # .get() keeps unknown article ids from being inserted into the
        # shared defaultdict as empty entries.
        golds = article2sents.get(article_id, [])
        # Only the first five responses of each article are scored.
        for pred, gold in zip(preds[:5], golds):
            correl = ranking_correlation(pred, gold)
            if correl is not None:
                correls.append(correl)
    if correls:
        with open(file + ".correl", "w") as f:
            f.write(str(mean(correls)))
    else:
        # statistics.mean raises on an empty list; report and skip instead.
        print(f"no usable responses in {file}; not writing {file}.correl")


In [None]:
# Gather the per-prompt mean correlations written next to the plain prompt
# result files ("direct" and "surprisal" variants are excluded by substring).
files = [
    path
    for path in glob.glob('../results/NS/*/prompt_estimation/prompt_*.json.correl', recursive=True)
    if "direct" not in path and "surprisal" not in path
]

# One row per .correl file: model directory name, prompt file stem, value.
df_results1 = pd.DataFrame(columns=["model", "prompt", "correl"])
for path in files:
    with open(path) as handle:
        print(path)
        path_parts = path.split("/")
        model_name = path_parts[-3]
        prompt_name = path_parts[-1].split(".")[0]
        value = float(handle.read().strip())
        df_results1.loc[len(df_results1)] = [model_name, prompt_name, value]

In [None]:
# Per-model "mean $\pm$ std" of the prompt correlations, both rounded to two
# decimals and rendered as text.
per_model = df_results1.groupby("model")[["correl"]]
mean_text = per_model.mean()["correl"].round(2).apply(str)
std_text = per_model.std()["correl"].round(2).apply(str)

df_result_ns = pd.DataFrame(columns=["model", "result"])
df_result_ns["result"] = mean_text
df_result_ns["result"] = df_result_ns["result"] + " $\\pm$ " + std_text

# Rows follow model_order; every backslash is stripped from the generated
# LaTeX before it is printed.
latex_table = df_result_ns.reindex(model_order)["result"].to_latex()
print(latex_table.replace("\\", ""))

In [None]:
def ranking_correlation(pred, gold, ignore_missing=False):
    """Score one model response against the gold values of one sentence.

    The fourth blank-line-separated paragraph of ``pred`` is parsed: its
    fourth line is read as the comma-separated list of offered token ids and
    its sixth line as the model's ordering of them.  Answers that are not in
    the offered list are dropped.

    Args:
        pred: Full response text for one sentence.
        gold: Mapping from token id (``"<position>: <word>"``) to a number.
        ignore_missing: When true, a ranked token that is absent from
            ``gold`` makes the response unusable instead of raising
            ``KeyError``.

    Returns:
        Spearman's rho between the answer positions and the negated gold
        values (listing tokens from highest to lowest gold value gives +1),
        or ``None`` when the response is malformed, fewer than two answers
        are valid, or the correlation is undefined (NaN).

    Raises:
        KeyError: A ranked token is missing from ``gold`` and
            ``ignore_missing`` is false.
    """
    paragraphs = pred.split("\n\n")
    if len(paragraphs) < 4:
        return None
    lines = paragraphs[3].split("\n")
    if len(lines) < 6:
        return None
    token_ids = set(lines[3].rstrip(",").split(", "))
    ranked = [tok for tok in lines[5].split(", ") if tok in token_ids]
    if len(ranked) <= 1:
        return None
    try:
        y = [-gold[tok] for tok in ranked]
    except KeyError:
        if ignore_missing:
            return None
        raise
    x = list(range(len(y)))
    correl = spearmanr(x, y)[0]
    return None if math.isnan(correl) else correl


def response_correlations(results, article2gold, ignore_missing=False):
    """Collect ranking correlations for the first five responses per article.

    Args:
        results: Mapping from article id to a list of response texts.
        article2gold: Mapping from article id to a list of per-sentence gold
            dicts, paired positionally with the responses.
        ignore_missing: Forwarded to ``ranking_correlation``.

    Returns:
        List of the defined correlations; unusable responses are left out.
    """
    correls = []
    for article_id, preds in results.items():
        # .get() avoids inserting empty entries when a defaultdict is passed.
        golds = article2gold.get(article_id, [])
        for pred, gold in zip(preds[:5], golds):
            correl = ranking_correlation(pred, gold, ignore_missing)
            if correl is not None:
                correls.append(correl)
    return correls


files = glob.glob('../results/NS/*/prompt_estimation/prompt_surprisal*.json', recursive=True)
files = [file for file in files if "direct" not in file]

for file in files:
    article2sups = defaultdict(list)
    model = file.split("/")[3]
    target_file = f"../results/NS/{model}/surprisal.json"
    with open(target_file) as f:
        article2surprisals = json.load(f)
    # Flatten article -> sentence -> token values, articles in numeric id order.
    all_sups = [
        sup
        for _, sents_sups in sorted(article2surprisals.items(), key=lambda item: int(item[0]))
        for sent_sups in sents_sups
        for sup in sent_sups
    ]
    # pandas raises if the flattened list and the table differ in length.
    df["surprisal"] = all_sups

    # Same sentence layout as article2sents, holding the surprisal column
    # instead of "time".  Columns are read by label rather than by the
    # deprecated positional integer keys.
    rows = df[["word", "time", "surprisal", "sent_id", "article"]].iterrows()
    for article_id, article_rows in groupby(rows, key=lambda item: item[1]["article"]):
        for _, sent_rows in groupby(article_rows, key=lambda item: item[1]["sent_id"]):
            sent = {
                f"{position}: {row['word']}": row["surprisal"]
                for position, (_, row) in enumerate(sent_rows)
            }
            article2sups[str(article_id)].append(sent)

    # Per-sentence correlation between "time" and the model's surprisal,
    # over the first five sentences of every article.
    time_correls = []
    for article_id, sents in article2sents.items():
        assert len(sents) == len(article2sups[article_id])
        for sent, sups in zip(sents[:5], article2sups[article_id]):
            assert len(sent) == len(sups)
            times = list(sent.values())
            sent_sups = [sups[tok] for tok in sent]
            correl = spearmanr(times, sent_sups)[0]
            if not math.isnan(correl):
                time_correls.append(correl)

    with open(file) as f:
        results = json.load(f)
    surprisal_correls = response_correlations(results, article2sups)
    rt_correls = response_correlations(results, article2sents)

    # Everything is computed before the output file is opened, so a failure
    # or an empty metric no longer leaves a truncated .correl file behind.
    if not (time_correls and surprisal_correls and rt_correls):
        print(f"no usable correlations for {file}; not writing {file}.correl")
        continue

    with open(file + ".correl", "w") as f:
        f.write("surpisal v.s. time: " + str(mean(time_correls)) + "\n")
        f.write("response v.s. surprisal: " + str(mean(surprisal_correls)) + "\n")
        f.write("response v.s. time: " + str(mean(rt_correls)) + "\n")

In [None]:
def parse_surprisal_correl(lines):
    """Read the three correlation values from the lines of a .correl file.

    Blank lines are ignored, so the result does not depend on how the lines
    were joined.  Each kept line must look like ``"<label>: <number>"``.

    Args:
        lines: Iterable of text lines.

    Returns:
        Tuple of three floats, in file order, taken from the first three
        non-blank lines; ``None`` when fewer than three are present.
    """
    entries = [line for line in lines if line.strip()]
    if len(entries) < 3:
        return None
    return tuple(float(entry.split(": ")[1]) for entry in entries[:3])


files = glob.glob('../results/NS/*/prompt_estimation/prompt_surprisal*.json.correl', recursive=True)
df_results = pd.DataFrame(columns=["model", "prompt", "metalinguistic", "surprisal", "metacognition"])
for file in files:
    with open(file) as f:
        print(file)
        model = file.split("/")[-3]
        prompt = file.split("/")[-1].split(".")[0]
        parsed = parse_surprisal_correl(f)
    if parsed is None:
        # Incomplete file: skip it, as the DC reader does.
        print(f"skipping incomplete file {file}")
        continue
    # File order: surprisal v.s. time, response v.s. surprisal,
    # response v.s. time.
    correl_surprisal, correl_self_surprisal, correl_prompt_surprisal = parsed
    df_results.loc[len(df_results)] = [model, prompt, correl_prompt_surprisal, correl_surprisal, correl_self_surprisal]


In [None]:
# Per-model "mean $\pm$ std" text (two decimals) for each of the three
# correlation columns.
metric_columns = ["metalinguistic", "surprisal", "metacognition"]
df_result_ns = pd.DataFrame(columns=["model", *metric_columns])
for metric in metric_columns:
    per_model = df_results.groupby("model")[[metric]]
    df_result_ns[metric] = per_model.mean()[metric].round(2).apply(str)
    df_result_ns[metric] = df_result_ns[metric] + " $\\pm$ " + per_model.std()[metric].round(2).apply(str)

# Outer join on "model" only: every surprisal-prompt row is paired with every
# df_results1 row of the same model.
# NOTE(review): df_results is rebound here, so re-running this cell merges
# the already merged frame again -- confirm that is acceptable.
df_results = pd.merge(df_results, df_results1, on="model", how="outer")

# One table per metric, rows in model_order, all backslashes stripped.
for metric in metric_columns:
    print(df_result_ns.reindex(model_order)[metric].to_latex().replace("\\", ""))

In [None]:
from scipy import stats
# Two-sided Mann-Whitney U test: both prompt-based correlation columns pooled
# into one sample, compared against the "surprisal" column.
prompt_sample = df_results["metalinguistic"].values.tolist() + df_results["correl"].values.tolist()
surprisal_sample = df_results["surprisal"].values
print(stats.mannwhitneyu(prompt_sample, y=surprisal_sample, alternative="two-sided"))

## DC

In [None]:
import pandas as pd
from itertools import groupby
from collections import defaultdict
from statistics import mean
from scipy.stats import spearmanr

def surface_text(surface):
    """Normalise a surface form: drop all whitespace, turn "▁" into spaces, trim."""
    return "".join(surface.split()).replace("▁", " ").strip()


# Word-level annotation table for the DC data set.
# NOTE(review): "time" is presumably the averaged reading time (per the
# file name) -- confirm against the CSV.
df = pd.read_csv("../data/DC/all.txt.annotation.filtered.averaged_rt.csv")

# article id (as str) -> list of sentences, where every sentence maps
# "<position in sentence>: <normalised surface>" to that row's "time" value.
article2sents = defaultdict(list)

# Columns are addressed by label.  Integer keys on a string-labelled Series
# (the former ``x[1][3]`` style) rely on a positional fallback that pandas
# has deprecated.
# itertools.groupby only merges *consecutive* rows, so the table is expected
# to be stored contiguously per article and per sentence -- TODO confirm.
rows = df[["surface", "time", "sent_id", "article"]].iterrows()
for article_id, article_rows in groupby(rows, key=lambda item: item[1]["article"]):
    for _, sent_rows in groupby(article_rows, key=lambda item: item[1]["sent_id"]):
        sent = {
            f"{position}: {surface_text(row['surface'])}": row["time"]
            for position, (_, row) in enumerate(sent_rows)
        }
        article2sents[str(article_id)].append(sent)

In [None]:
import json
import glob
import os
import math

def ranking_correlation(pred, gold, ignore_missing=False):
    """Score one model response against the gold values of one sentence.

    The fourth blank-line-separated paragraph of ``pred`` is parsed: its
    fourth line is read as the comma-separated list of offered token ids and
    its sixth line as the model's ordering of them.  Answers that are not in
    the offered list are dropped.

    Args:
        pred: Full response text for one sentence.
        gold: Mapping from token id (``"<position>: <word>"``) to a number.
        ignore_missing: When true, a ranked token that is absent from
            ``gold`` makes the response unusable instead of raising
            ``KeyError``.

    Returns:
        Spearman's rho between the answer positions and the negated gold
        values (listing tokens from highest to lowest gold value gives +1),
        or ``None`` when the response is malformed, fewer than two answers
        are valid, or the correlation is undefined (NaN).

    Raises:
        KeyError: A ranked token is missing from ``gold`` and
            ``ignore_missing`` is false.
    """
    paragraphs = pred.split("\n\n")
    if len(paragraphs) < 4:
        return None
    lines = paragraphs[3].split("\n")
    if len(lines) < 6:
        return None
    token_ids = set(lines[3].rstrip(",").split(", "))
    ranked = [tok for tok in lines[5].split(", ") if tok in token_ids]
    if len(ranked) <= 1:
        return None
    try:
        y = [-gold[tok] for tok in ranked]
    except KeyError:
        if ignore_missing:
            return None
        raise
    x = list(range(len(y)))
    correl = spearmanr(x, y)[0]
    return None if math.isnan(correl) else correl


# Plain prompt result files only: "direct" and "surprisal" variants are
# excluded by substring match on the whole path.
files = glob.glob('../results/DC/*/prompt_estimation/prompt_*.json', recursive=True)
files = [file for file in files if "direct" not in file and "surprisal" not in file]

for file in files:
    print(file)
    with open(file) as f:
        results = json.load(f)
    correls = []
    for article_id, preds in results.items():
        # .get() keeps unknown article ids from being inserted into the
        # shared defaultdict as empty entries.
        golds = article2sents.get(article_id, [])
        # Only the first five responses of each article are scored.
        for pred, gold in zip(preds[:5], golds):
            # Responses naming a token that has no gold entry are skipped
            # (previously hidden behind a bare ``except``).
            correl = ranking_correlation(pred, gold, ignore_missing=True)
            if correl is not None:
                correls.append(correl)
    if correls:
        with open(file + ".correl", "w") as f:
            f.write(str(mean(correls)))


In [None]:
# Gather the per-prompt mean correlations written next to the plain prompt
# result files ("direct" and "surprisal" variants are excluded by substring).
files = [
    path
    for path in glob.glob('../results/DC/*/prompt_estimation/prompt_*.json.correl', recursive=True)
    if "direct" not in path and "surprisal" not in path
]

# One row per .correl file: model directory name, the part of the file stem
# after its last underscore, and the stored value.
df_results_1 = pd.DataFrame(columns=["model", "prompt", "correl"])
for path in files:
    with open(path) as handle:
        print(path)
        path_parts = path.split("/")
        model_name = path_parts[-3]
        stem = path_parts[-1].split(".")[0]
        prompt_name = stem.split("_")[-1]
        value = float(handle.read().strip())
        df_results_1.loc[len(df_results_1)] = [model_name, prompt_name, value]

In [None]:
# Per-model "mean $\pm$ std" of the prompt correlations, both rounded to two
# decimals and rendered as text.
per_model = df_results_1.groupby("model")[["correl"]]
mean_text = per_model.mean()["correl"].round(2).apply(str)
std_text = per_model.std()["correl"].round(2).apply(str)

df_result_dc = pd.DataFrame(columns=["model", "result"])
df_result_dc["result"] = mean_text
df_result_dc["result"] = df_result_dc["result"] + " $\\pm$ " + std_text

# Print only the value cells of the table rows: backslashes are stripped and
# the leading (index) cell of every "&"-separated row is dropped.
latex_table = df_result_dc.reindex(model_order)["result"].to_latex().replace("\\", "")
for table_line in latex_table.split("\n"):
    if "&" not in table_line:
        continue
    cells = table_line.split("&")
    print(" & ".join(cells[1:]))

In [None]:
def surface_text(surface):
    """Normalise a surface form: drop all whitespace, turn "▁" into spaces, trim."""
    return "".join(surface.split()).replace("▁", " ").strip()


def ranking_correlation(pred, gold, ignore_missing=False):
    """Score one model response against the gold values of one sentence.

    The fourth blank-line-separated paragraph of ``pred`` is parsed: its
    fourth line is read as the comma-separated list of offered token ids and
    its sixth line as the model's ordering of them.  Answers that are not in
    the offered list are dropped.

    Args:
        pred: Full response text for one sentence.
        gold: Mapping from token id (``"<position>: <word>"``) to a number.
        ignore_missing: When true, a ranked token that is absent from
            ``gold`` makes the response unusable instead of raising
            ``KeyError``.

    Returns:
        Spearman's rho between the answer positions and the negated gold
        values (listing tokens from highest to lowest gold value gives +1),
        or ``None`` when the response is malformed, fewer than two answers
        are valid, or the correlation is undefined (NaN).

    Raises:
        KeyError: A ranked token is missing from ``gold`` and
            ``ignore_missing`` is false.
    """
    paragraphs = pred.split("\n\n")
    if len(paragraphs) < 4:
        return None
    lines = paragraphs[3].split("\n")
    if len(lines) < 6:
        return None
    token_ids = set(lines[3].rstrip(",").split(", "))
    ranked = [tok for tok in lines[5].split(", ") if tok in token_ids]
    if len(ranked) <= 1:
        return None
    try:
        y = [-gold[tok] for tok in ranked]
    except KeyError:
        if ignore_missing:
            return None
        raise
    x = list(range(len(y)))
    correl = spearmanr(x, y)[0]
    return None if math.isnan(correl) else correl


def response_correlations(results, article2gold, ignore_missing=False):
    """Collect ranking correlations for the first five responses per article.

    Args:
        results: Mapping from article id to a list of response texts.
        article2gold: Mapping from article id to a list of per-sentence gold
            dicts, paired positionally with the responses.
        ignore_missing: Forwarded to ``ranking_correlation``.

    Returns:
        List of the defined correlations; unusable responses are left out.
    """
    correls = []
    for article_id, preds in results.items():
        # .get() avoids inserting empty entries when a defaultdict is passed.
        golds = article2gold.get(article_id, [])
        for pred, gold in zip(preds[:5], golds):
            correl = ranking_correlation(pred, gold, ignore_missing)
            if correl is not None:
                correls.append(correl)
    return correls


files = glob.glob('../results/DC/*/prompt_estimation/prompt_surprisal*.json', recursive=True)
files = [file for file in files if "direct" not in file]

for file in files:
    article2sups = defaultdict(list)
    model = file.split("/")[3]
    target_file = f"../results/DC/{model}/surprisal.json"
    with open(target_file) as f:
        article2surprisals = json.load(f)
    # Flatten article -> sentence -> token values, articles in numeric id order.
    all_sups = [
        sup
        for _, sents_sups in sorted(article2surprisals.items(), key=lambda item: int(item[0]))
        for sent_sups in sents_sups
        for sup in sent_sups
    ]
    # pandas raises if the flattened list and the table differ in length.
    df["surprisal"] = all_sups

    # Same sentence layout as article2sents, holding the surprisal column
    # instead of "time".  Columns are read by label rather than by the
    # deprecated positional integer keys.
    rows = df[["surface", "time", "surprisal", "sent_id", "article"]].iterrows()
    for article_id, article_rows in groupby(rows, key=lambda item: item[1]["article"]):
        for _, sent_rows in groupby(article_rows, key=lambda item: item[1]["sent_id"]):
            sent = {
                f"{position}: {surface_text(row['surface'])}": row["surprisal"]
                for position, (_, row) in enumerate(sent_rows)
            }
            article2sups[str(article_id)].append(sent)

    # Per-sentence correlation between "time" and the model's surprisal,
    # over the first five sentences of every article.
    time_correls = []
    for article_id, sents in article2sents.items():
        assert len(sents) == len(article2sups[article_id])
        for sent, sups in zip(sents[:5], article2sups[article_id]):
            times = list(sent.values())
            sent_sups = [sups[tok] for tok in sent]
            correl = spearmanr(times, sent_sups)[0]
            if not math.isnan(correl):
                time_correls.append(correl)

    with open(file) as f:
        results = json.load(f)
    # Both response metrics share one parser, so short responses are skipped
    # in the "response v.s. time" pass as well (it used to index past the end
    # of the line list), and only missing gold tokens are tolerated.
    surprisal_correls = response_correlations(results, article2sups, ignore_missing=True)
    rt_correls = response_correlations(results, article2sents, ignore_missing=True)

    # A line is written only for a metric that has at least one value; the
    # mean is never taken over an empty list.
    with open(file + ".correl", "w") as f:
        if time_correls:
            f.write("surpisal v.s. time: " + str(mean(time_correls)) + "\n")
        if surprisal_correls:
            f.write("response v.s. surprisal: " + str(mean(surprisal_correls)) + "\n")
        if rt_correls:
            f.write("response v.s. time: " + str(mean(rt_correls)) + "\n")

In [None]:
# Load the three correlations stored in every surprisal-prompt .correl file.
files = glob.glob('../results/DC/*/prompt_estimation/prompt_surprisal*.json.correl', recursive=True)
df_results = pd.DataFrame(columns=["model", "prompt", "metalinguistic", "surprisal", "metacognition"])
for path in files:
    with open(path) as handle:
        print(path)
        path_parts = path.split("/")
        model = path_parts[-3]
        prompt = path_parts[-1].split(".")[0].split("_")[-1]
        data = [line for line in handle if line.strip()]
        # Files with fewer than three non-blank lines are ignored.
        if len(data) < 3:
            continue
        # The first three lines feed, in order, the "surprisal",
        # "metacognition" and "metalinguistic" columns.
        first, second, third = (float(line.split(": ")[1]) for line in data[:3])
        df_results.loc[len(df_results)] = [model, prompt, third, first, second]


In [None]:
# Per-model "mean $\pm$ std" text (two decimals) for each of the three
# correlation columns.
metric_columns = ["metalinguistic", "surprisal", "metacognition"]
df_result_dc = pd.DataFrame(columns=["model", *metric_columns])
for metric in metric_columns:
    per_model = df_results.groupby("model")[[metric]]
    df_result_dc[metric] = per_model.mean()[metric].round(2).apply(str)
    df_result_dc[metric] = df_result_dc[metric] + " $\\pm$ " + per_model.std()[metric].round(2).apply(str)

# Default merge: joins on the columns the two frames have in common.
# NOTE(review): df_results is rebound here, so re-running this cell merges
# the already merged frame again -- confirm that is acceptable.
df_results = pd.merge(df_results, df_results_1)

# For each metric, print only the value cells of the table rows: backslashes
# are stripped and the leading (index) cell of every row is dropped.
for metric in metric_columns:
    latex_table = df_result_dc.reindex(model_order)[metric].to_latex().replace("\\", "")
    for table_line in latex_table.split("\n"):
        if "&" not in table_line:
            continue
        print(" & ".join(table_line.split("&")[1:]))

In [None]:
from scipy import stats
# Two-sided Mann-Whitney U test: both prompt-based correlation columns pooled
# into one sample, compared against the "surprisal" column.
prompt_sample = df_results["metalinguistic"].values.tolist() + df_results["correl"].values.tolist()
surprisal_sample = df_results["surprisal"].values
print(stats.mannwhitneyu(prompt_sample, y=surprisal_sample, alternative="two-sided"))