# WP4.1 Systematic Review of Pre-trained LLMs

In [1]:
import pandas as pd
import unicodedata
import re
import os

## Load Files

In [2]:
# Read the Zotero export of the gold paper list, keeping only the title and
# item type of complete, de-duplicated rows.
gold_df = (
    pd.read_csv("data/LLM - Survey Proxies.csv")[["Title", "Item Type"]]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Only an item type of exactly "preprint" is tagged as a preprint; every other
# item type is tagged "non-preprint".
gold_df["preprint_flag"] = (gold_df["Item Type"] == "preprint").map(
    {True: "preprint", False: "non-preprint"}
)

print(f"Gold list size: {len(gold_df)}")
gold_df.head(3)

Gold list size: 21


Unnamed: 0,Title,Item Type,preprint_flag
0,"Out of One, Many: Using Language Models to Sim...",journalArticle,non-preprint
1,Examining the Feasibility of Large Language Mo...,preprint,preprint
2,Donald Trumps in the Virtual Polls: Simulating...,preprint,preprint


## Measure Recall

In [3]:
def normalize_title(s: str) -> str:
    """Return a canonical form of a paper title for exact-match comparison.

    The input is coerced to ``str``, NFKC-normalized and lowercased. Every
    character that is neither a word character (letters, digits, underscore)
    nor whitespace is replaced by a space, then runs of whitespace are
    collapsed to a single space and the ends are trimmed.
    """
    lowered = unicodedata.normalize("NFKC", str(s)).lower()
    # Punctuation becomes a space (not ""), so "AI–Human" yields two tokens.
    depunctuated = re.sub(r"[^\w\s]", " ", lowered)
    return re.sub(r"\s+", " ", depunctuated).strip()

In [4]:
# Build lookup from normalized -> original for the gold set
gold_df["norm"] = gold_df["Title"].map(normalize_title)

# If several gold rows normalize to the same string, keep the FIRST row as the
# representative. A plain dict(zip(...)) over all rows would silently keep the
# last one instead, so de-duplicate on the normalized key before building the
# lookups.
_gold_first_per_norm = gold_df.drop_duplicates(subset="norm", keep="first")
gold_norm_to_orig = dict(zip(_gold_first_per_norm["norm"], _gold_first_per_norm["Title"]))
gold_norm_to_preprint_flag = dict(
    zip(_gold_first_per_norm["norm"], _gold_first_per_norm["preprint_flag"])
)
gold_norm_set = set(gold_df["norm"])

# Separate sets for preprint and non-preprint
gold_preprint_set = set(gold_df[gold_df["preprint_flag"] == "preprint"]["norm"])
gold_non_preprint_set = set(gold_df[gold_df["preprint_flag"] == "non-preprint"]["norm"])

# Accumulators filled by the per-file loop
results = []
missing_rows = []  # for a detailed table of which gold titles are missing per file

In [5]:
# Process each of the 10 Excel files and measure how many gold titles each
# export contains (overall, and split by preprint / non-preprint).

# Reset the accumulators in this cell so that re-running it does not append a
# second copy of every row to lists created in an earlier cell.
results = []
missing_rows = []  # for a detailed table of which gold titles are missing per file

for i in range(1, 11):
    fname = f"data/savedrecs ({i}).xls"
    # Read the first sheet (the pandas default). All columns are loaded, but
    # only "Article Title" is used below.
    try:
        xdf = pd.read_excel(fname, engine="xlrd")
    except Exception as e:
        # Best effort: an unreadable file is recorded as a zero-recall row
        # carrying the error message, and the loop moves on to the next file.
        results.append({
            "file": fname,
            "error": str(e),
            "matches_count": 0,
            "recall": 0.0,
            "recall_non_preprint": 0.0,
            "recall_preprint": 0.0,
        })
        continue

    assert "Article Title" in xdf.columns, f"Expected 'Article Title' in {fname}"

    titles = (
        xdf["Article Title"]
        .dropna()
        .drop_duplicates()
        .astype(str)
        .tolist()
    )
    norm_titles = {normalize_title(t) for t in titles}

    # Exact match on normalized titles
    matched_norm = gold_norm_set & norm_titles
    missing_norm = gold_norm_set - norm_titles

    # Calculate matches by type
    matched_preprint = gold_preprint_set & norm_titles
    matched_non_preprint = gold_non_preprint_set & norm_titles

    # Each recall falls back to 0.0 when its gold set is empty (avoids 0/0)
    matches_count = len(matched_norm)
    recall = matches_count / len(gold_norm_set) if gold_norm_set else 0.0
    recall_preprint = len(matched_preprint) / len(gold_preprint_set) if gold_preprint_set else 0.0
    recall_non_preprint = len(matched_non_preprint) / len(gold_non_preprint_set) if gold_non_preprint_set else 0.0

    # Save per file summary
    results.append({
        "file": fname,
        "matches_count": matches_count,
        "total_gold": len(gold_norm_set),
        "recall": recall,
        "recall_non_preprint": recall_non_preprint,
        "recall_preprint": recall_preprint,
        "matched_titles": [gold_norm_to_orig[n] for n in sorted(matched_norm)],
        "missing_titles": [gold_norm_to_orig[n] for n in sorted(missing_norm)],
    })

    # Populate long-form "missing" table
    for n in sorted(missing_norm):
        missing_rows.append({
            "file": fname,
            "missing_gold_title": gold_norm_to_orig[n]
        })

# Best recall first; ties are ordered by file name (string order)
df_WoS_recall = pd.DataFrame(results).sort_values(["recall", "file"], ascending=[False, True]).reset_index(drop=True)


In [6]:
# Show the per-file recall table (sorted by recall, highest first).
df_WoS_recall

Unnamed: 0,file,matches_count,total_gold,recall,recall_non_preprint,recall_preprint,matched_titles,missing_titles
0,data/savedrecs (2).xls,5,21,0.238095,0.454545,0.0,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
1,data/savedrecs (5).xls,5,21,0.238095,0.454545,0.0,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
2,data/savedrecs (1).xls,4,21,0.190476,0.363636,0.0,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
3,data/savedrecs (10).xls,4,21,0.190476,0.363636,0.0,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
4,data/savedrecs (4).xls,4,21,0.190476,0.363636,0.0,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
5,data/savedrecs (8).xls,4,21,0.190476,0.363636,0.0,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
6,data/savedrecs (7).xls,3,21,0.142857,0.272727,0.0,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
7,data/savedrecs (9).xls,3,21,0.142857,0.272727,0.0,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
8,data/savedrecs (3).xls,2,21,0.095238,0.181818,0.0,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
9,data/savedrecs (6).xls,1,21,0.047619,0.090909,0.0,[Synthetic Replacements for Human Survey Data?...,[Addressing Systematic Non-response Bias with ...


## Measure Recall of 'NEAR/x'

In [None]:
def normalize_title(s: str) -> str:
    """Normalize a title so that equal titles compare equal as strings.

    Note: this re-definition shadows the `normalize_title` defined earlier in
    this notebook and behaves the same way.

    Steps: coerce to ``str`` and apply NFKC normalization, lowercase, replace
    each character that is not a word character or whitespace with a space,
    then collapse whitespace runs and trim both ends.
    """
    title = str(s)
    title = unicodedata.normalize("NFKC", title).lower()
    # Keep letters, digits, underscore and whitespace; blank out the rest.
    title = re.sub(r"[^\w\s]", " ", title)
    title = re.sub(r"\s+", " ", title)
    return title.strip()

# Build lookup from normalized -> original for the gold set
gold_df["norm"] = gold_df["Title"].map(normalize_title)
# If several gold rows normalize to the same string, keep the FIRST row as the
# representative. A plain dict(zip(...)) over all rows would silently keep the
# last one instead, so de-duplicate on the normalized key first.
_gold_first_per_norm = gold_df.drop_duplicates(subset="norm", keep="first")
gold_norm_to_orig = dict(zip(_gold_first_per_norm["norm"], _gold_first_per_norm["Title"]))
gold_norm_set = set(gold_df["norm"])

results = []
missing_rows = []  # for a detailed table of which gold titles are missing per file

for i in range(1, 11):
    fname = f"data/NEARx_tuning/savedrecs{i}.xls"
    # Read the first sheet (the pandas default). All columns are loaded, but
    # only "Article Title" is used below.
    try:
        xdf = pd.read_excel(fname, engine="xlrd")
    except Exception as e:
        # Best effort: an unreadable file is recorded as a zero-recall row
        # carrying the error message, and the loop moves on to the next file.
        results.append({
            "file": fname,
            "error": str(e),
            "matches_count": 0,
            "recall": 0.0,
        })
        continue

    assert "Article Title" in xdf.columns, f"Expected 'Article Title' in {fname}"

    titles = (
        xdf["Article Title"]
        .dropna()
        .drop_duplicates()
        .astype(str)
        .tolist()
    )
    norm_titles = {normalize_title(t) for t in titles}

    # Exact match on normalized titles
    matched_norm = gold_norm_set & norm_titles
    missing_norm = gold_norm_set - norm_titles

    # Recall falls back to 0.0 when the gold set is empty (avoids 0/0)
    matches_count = len(matched_norm)
    recall = matches_count / len(gold_norm_set) if gold_norm_set else 0.0

    # Save per file summary
    results.append({
        "file": fname,
        "matches_count": matches_count,
        "total_gold": len(gold_norm_set),
        "recall": recall,
        "matched_titles": [gold_norm_to_orig[n] for n in sorted(matched_norm)],
        "missing_titles": [gold_norm_to_orig[n] for n in sorted(missing_norm)],
    })

    # Populate long-form "missing" table
    for n in sorted(missing_norm):
        missing_rows.append({
            "file": fname,
            "missing_gold_title": gold_norm_to_orig[n]
        })

# Best recall first; ties are ordered by file name (string order)
summary_df = pd.DataFrame(results).sort_values(["recall", "file"], ascending=[False, True]).reset_index(drop=True)

In [5]:
# Show the per-file recall table for the NEARx_tuning exports (sorted by recall, highest first).
summary_df

Unnamed: 0,file,matches_count,total_gold,recall,matched_titles,missing_titles
0,savedrecs1.xls,2,13,0.153846,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
1,savedrecs10.xls,2,13,0.153846,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
2,savedrecs2.xls,2,13,0.153846,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
3,savedrecs3.xls,2,13,0.153846,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
4,savedrecs4.xls,2,13,0.153846,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
5,savedrecs5.xls,2,13,0.153846,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
6,savedrecs6.xls,2,13,0.153846,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
7,savedrecs7.xls,2,13,0.153846,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
8,savedrecs8.xls,2,13,0.153846,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...
9,savedrecs9.xls,2,13,0.153846,[AI–Human Hybrids for Marketing Research: Leve...,[Addressing Systematic Non-response Bias with ...


In [6]:

print("Per-file recall summary:")
display(summary_df[["file", "matches_count", "total_gold", "recall"]])

# Report the actual size of the gold set instead of a hard-coded count, which
# goes stale whenever the gold list changes.
print(f"\nWhich of the {len(gold_norm_set)} are missing per file:")
# Explicit columns keep the frame well-formed even when nothing is missing.
missing_df = pd.DataFrame(missing_rows, columns=["file", "missing_gold_title"])
# Actually show the long-form table announced by the heading above.
display(missing_df)



Per-file recall summary:


Unnamed: 0,file,matches_count,total_gold,recall
0,savedrecs1.xls,2,13,0.153846
1,savedrecs10.xls,2,13,0.153846
2,savedrecs2.xls,2,13,0.153846
3,savedrecs3.xls,2,13,0.153846
4,savedrecs4.xls,2,13,0.153846
5,savedrecs5.xls,2,13,0.153846
6,savedrecs6.xls,2,13,0.153846
7,savedrecs7.xls,2,13,0.153846
8,savedrecs8.xls,2,13,0.153846
9,savedrecs9.xls,2,13,0.153846



Which of the 13 are missing per file:
