In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

In [None]:
# Global seaborn styling applied to every figure drawn below.
sns.set(font="Liberation Sans", font_scale=1.5, style="ticks")

# Model identifier; it is used in the model-output file names, the printed
# statistics and the rendered figure names.
MODEL = "bert-base-uncased"

# Candidate scalemates; one model-output CSV is read per entry.
ALTERNATIVES = [
    "each", "every", "few", "half",
    "much", "many", "most", "all",
]

# Shared plot cosmetics.
SURPRISAL_COLOR = "steelblue"
HUMAN_MEASURE_LABEL = "Human SI strength rating"

def render(file_name):
    """Save the current matplotlib figure into ``./figures`` and report it.

    Parameters
    ----------
    file_name : str
        Name of the output file (including extension) inside ``./figures``.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    out_path = Path("./figures") / file_name
    # savefig does not create missing directories, so a fresh checkout without
    # a figures/ folder would otherwise fail here.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path, dpi=300, bbox_inches="tight")
    print(f"Rendered figure to {file_name}")

# Read data

In [None]:
# Human judgement data: a tab-separated table.
human_path = "./data/some-all/some_database.tsv"
human = pd.read_csv(human_path, sep="\t")
human.head()

In [None]:
# Count distinct worker ids in the human data.
n_participants = human["workerid"].nunique()
print("Number of participants:", n_participants)

In [None]:
def load_model_results(scale):
    """Load and stack the model outputs for every alternative in ALTERNATIVES.

    One CSV per alternative is read from
    ``./data/{scale}/model_output/{MODEL}_{strong}.csv``.

    Parameters
    ----------
    scale : str
        Name of the data sub-directory to read from (e.g. ``"some-all"``).

    Returns
    -------
    pandas.DataFrame
        All per-alternative tables concatenated, with added columns ``model``,
        ``strong``, ``predicted_strong`` (whether ``predicted_token`` equals
        the alternative) and ``prob_at_strong``. ``Item_ID`` is renamed to
        ``item_number``. The row index is a fresh 0..n-1 range.
    """
    frames = []
    for strong in ALTERNATIVES:
        df = pd.read_csv(f"./data/{scale}/model_output/{MODEL}_{strong}.csv")
        df["model"] = MODEL
        df["strong"] = strong
        df["predicted_strong"] = (df["predicted_token"] == strong)
        frames.append(df)

    # ignore_index: every CSV starts its own index at 0, so a plain concat
    # would yield duplicated labels and make label-based access ambiguous.
    df = pd.concat(frames, ignore_index=True).rename(columns={"Item_ID": "item_number"})
    # Invert surprisal back to a probability.
    # NOTE(review): assumes surprisal is a natural-log quantity - confirm.
    df["prob_at_strong"] = np.exp(-df["surprisal_at_strong"])
    return df

# Load the model outputs stored under the "some-all" directory and preview them.
model_df = load_model_results(scale="some-all")
model_df.head()

In [None]:
# Row count per (model, alternative) pair.
model_df.groupby(by=["model", "strong"]).size()

## Combine model and human data

In [None]:
# Which variables from the human experiment do we want to carry over?
relevant_num_vars = ["Rating", "StrengthSome", "SentenceLength"]
relevant_cat_vars = ["Partitive", "Mention", "Subjecthood", "Modification"]

# Collapse the human data to one row per item once, instead of doing a label
# lookup for every model row: numeric variables are averaged over all rows of
# an item, categorical variables take the value of the item's first row.
item_means = human.groupby("Item")[relevant_num_vars].mean()
item_first = human.drop_duplicates(subset="Item").set_index("Item")[relevant_cat_vars]

# Add these variables to the model results. Items that do not occur in the
# human data end up as missing values.
for v in relevant_num_vars:
    model_df[v] = model_df["Item"].map(item_means[v]).to_numpy()
for v in relevant_cat_vars:
    model_df[v] = model_df["Item"].map(item_first[v]).to_numpy()

In [None]:
# Sanity check: the per-(model, alternative) row counts should not have grown,
# i.e. items are not being double-counted.
model_df.groupby(by=["model", "strong"]).size()

In [None]:
# Only care about surprisal for <some, all>.
# Take an explicit copy: a column is added to this frame further down, and
# writing into a boolean-mask slice triggers SettingWithCopyWarning.
df_all = model_df[model_df.strong=="all"].copy()

# Figure 2a: String-based surprisal

In [None]:
# Columns to relate: model surprisal (x) against the human rating (y).
x = "surprisal_at_strong"
y = "Rating"

# Rows with a missing value in any column are excluded; the same subset feeds
# both the plot and the correlation tests.
complete_rows = df_all.dropna()

# Scatter plus line of best fit.
ax = sns.regplot(
    x=x, y=y, data=complete_rows,
    marker="o", color=SURPRISAL_COLOR,
    scatter_kws={"alpha": 0.1},
)

# Rank correlation first, then linear correlation; the Pearson values are the
# ones left in r / p and therefore the ones shown in the title.
r, p = stats.spearmanr(complete_rows[x], complete_rows[y])
print(MODEL, f"Spearman r={r:.3f}, p={p:.3e}")
r, p = stats.pearsonr(complete_rows[x], complete_rows[y])
print(MODEL, f"Pearson r={r:.3f}, p={p:.3e}")

# Add stats to title, capping very small p-values.
p_str = "p < 0.0001" if p < 0.0001 else f"p={p:.3e}"
ax.set_title(f"$r={r:.3f}, {p_str}$", color="dimgrey", size=16)

# Axis labels, rating range and figure size.
ax.set_xlabel('Surprisal of "all"')
ax.set_ylabel(HUMAN_MEASURE_LABEL)
ax.set_ylim(0.5, 7.5)
plt.gcf().set_size_inches(4, 3.5)

# Write the figure to disk, then display it.
render(f"{MODEL}_within-scale_surprisal.pdf")
plt.show()

In [None]:
######## NOTE: This plot is not included in the paper, but the stats are reported in Footnote 4.

# Columns to relate: model probability (x) against the human rating (y).
x = "prob_at_strong"
y = "Rating"

# Rows with a missing value in any column are excluded; the same subset feeds
# both the plot and the correlation tests.
complete_rows = df_all.dropna()

# Scatter plus line of best fit.
ax = sns.regplot(
    x=x, y=y, data=complete_rows,
    marker="o", color=SURPRISAL_COLOR,
    scatter_kws={"alpha": 0.1},
)

# Rank correlation first, then linear correlation; the Pearson values are the
# ones left in r / p and therefore the ones shown in the title.
r, p = stats.spearmanr(complete_rows[x], complete_rows[y])
print(MODEL, f"Spearman r={r:.3f}, p={p:.3e}")
r, p = stats.pearsonr(complete_rows[x], complete_rows[y])
print(MODEL, f"Pearson r={r:.3f}, p={p:.3e}")

# Add stats to title, capping very small p-values.
p_str = "p < 0.0001" if p < 0.0001 else f"p={p:.3e}"
ax.set_title(f"$r={r:.3f}, {p_str}$", color="dimgrey", size=16)

# Axis labels and figure size.
ax.set_xlabel('Probability of "all"')
ax.set_ylabel(HUMAN_MEASURE_LABEL)
plt.gcf().set_size_inches(4, 3.5)

# Show plot (this one is not written to disk).
plt.show()

# Figure 2b: Weighted average surprisal

In [None]:
# Read word vectors: each line is a token followed by its space-separated
# vector components.
glove_file = "glove.6B.300d.txt"
glove_vectors = {}
# Pin the encoding: without it open() falls back to the platform locale, which
# can fail or mis-decode non-ASCII tokens.
# NOTE(review): assumes the vector file is UTF-8 encoded - confirm.
with open(glove_file, 'r', encoding="utf-8") as f:
    for line in f:
        vals = line.rstrip().split(' ')
        # Stored as a (1, dim) row so it can be passed to cosine_similarity directly.
        glove_vectors[vals[0]] = np.array([float(tok) for tok in vals[1:]]).reshape(1, -1)
print("Successfully loaded GloVe vectors")

In [None]:
# Helper function for getting cosine similarity between word vectors
def get_sim(w1, w2, vectors):
    """Return the cosine similarity between the vectors of two words.

    Parameters
    ----------
    w1, w2 : hashable
        Keys to look up in ``vectors``.
    vectors : mapping
        Maps a word to a 2-D array-like accepted by ``cosine_similarity``.

    Returns
    -------
    float or None
        The similarity, or ``None`` when either word is not in ``vectors``.
    """
    try:
        v1, v2 = vectors[w1], vectors[w2]
    except KeyError:
        # Out-of-vocabulary word: report "no similarity available". Any other
        # failure (interrupts, malformed vectors) is left to propagate instead
        # of being silently turned into a missing value.
        return None
    return cosine_similarity(v1, v2)[0][0]

# Helper function for getting similarity for top k scalemates and strong scalemate
def get_sim_scalemates(dists, vectors, topk=None):
    """Attach each scalemate's cosine similarity to "all".

    Parameters
    ----------
    dists : pandas.DataFrame
        Must contain a ``scalemate`` column, and a ``rank`` column when
        ``topk`` is given.
    vectors : mapping
        Word vectors passed on to ``get_sim``.
    topk : int or None
        When given, only rows with ``rank <= topk`` are kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``cosine_sim_strong`` column (missing where
        ``get_sim`` returns ``None``). The input frame is left untouched.
    """
    if topk is not None:
        dists = dists[dists["rank"]<=topk]
    # Work on a copy so the new column is added neither to the caller's frame
    # nor to a filtered slice (which raises SettingWithCopyWarning).
    dists = dists.copy()
    dists["cosine_sim_strong"] = dists.apply(
        lambda row: get_sim(row.scalemate, "all", vectors),
        axis=1
    )
    return dists

# Get weighted average surprisal over full alternative set
def get_weighted_avg_surprisal(dists, vectors, **kwargs):
    """Compute a similarity-weighted average surprisal for every item.

    For each item, the probabilities of its scalemates are averaged with
    weights ``cosine_sim_strong + 1`` and the negative log of that average is
    reported.

    Parameters
    ----------
    dists : pandas.DataFrame
        Needs ``item``, ``scalemate`` and ``prob`` columns.
    vectors : mapping
        Word vectors used for the similarity scores.
    **kwargs
        Forwarded to ``get_sim_scalemates`` (e.g. ``topk``).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        First frame: ``item`` and ``weighted_avg_surprisal`` for every item
        with usable weights. Second frame: ``item`` and the list of
        similarities (``sims``) that were available for it.
    """
    print("Getting similarity scores")
    dists = get_sim_scalemates(dists, vectors, **kwargs)
    print("Computing weighted average surprisals")
    data = []
    sims_data = []
    for item in dists["item"].unique():
        d = dists[dists["item"]==item]
        # Only scalemates with a known similarity take part in the average.
        known = d[~d.cosine_sim_strong.isna()]
        probs = known.prob
        sims = known.cosine_sim_strong
        sims_data.append(dict(item=item, sims=sims.tolist()))
        weights = sims + 1 # translate from [-1, 1] to [0, 2] - avoid weirdness of negative weights
        # Guard on the weights actually used: np.average cannot normalise when
        # they are empty or sum to zero. Similarities that merely cancel out
        # still give valid, non-zero weights and must not be skipped.
        if len(weights) > 0 and weights.sum() != 0:
            wavg_surp = -np.log(np.average(probs, weights=weights))
            data.append(dict(
                item=item,
                weighted_avg_surprisal=wavg_surp
            ))
    return pd.DataFrame(data), pd.DataFrame(sims_data)

In [None]:
# Long-format table with one row per (item, scalemate), built with column
# operations instead of a Python-level iterrows loop.
mdist = model_df.set_index(["Item", "strong"]).sort_index()
dists = (
    mdist[["surprisal_at_strong", "prob_at_strong"]]
    .reset_index()
    .rename(columns={
        "Item": "item",
        "strong": "scalemate",
        "surprisal_at_strong": "surprisal",
        "prob_at_strong": "prob",
    })
)
# Flag the rows that belong to the strong scalemate "all".
dists["is_strong_scalemate"] = dists["scalemate"] == "all"
dists = dists[
    ["scalemate", "item", "surprisal", "prob", "is_strong_scalemate"]
].sort_values(by=["item", "scalemate"])
dists.head(10)

In [None]:
def get_wavg(*args, **kwargs):
    """Wrap ``get_weighted_avg_surprisal`` and return the result keyed by item.

    All arguments are forwarded unchanged. Returns a tuple of (Series of
    ``weighted_avg_surprisal`` indexed by ``item``, similarities frame).
    """
    wavg_frame, sims_frame = get_weighted_avg_surprisal(*args, **kwargs)
    wavg_by_item = wavg_frame.set_index("item")["weighted_avg_surprisal"]
    return wavg_by_item, sims_frame

topk = None  # None = use every scalemate rather than only the top-ranked ones
embs = glove_vectors

wavg, sims = get_wavg(dists, embs, topk=topk)
# Look up every item's weighted average in a single vectorised pass; items
# without a value become NaN. assign() returns a new frame, so nothing is
# written row by row into a filtered slice.
df_all = df_all.assign(weighted_avg_surprisal=df_all["Item"].map(wavg))

In [None]:
# Columns to relate: weighted average surprisal (x) against the human rating (y).
x = "weighted_avg_surprisal"
y = "Rating"

# Rows with a missing value in any column are excluded; the same subset feeds
# both the plot and the correlation test.
complete_rows = df_all.dropna()

# Scatter plus line of best fit.
ax = sns.regplot(
    x=x, y=y, data=complete_rows,
    marker="o", color=SURPRISAL_COLOR,
    scatter_kws={"alpha": 0.1},
)
r, p = stats.pearsonr(complete_rows[x], complete_rows[y])
print(MODEL, f"Pearson r={r:.3f}, p={p:.3e}")

# Add stats to title, capping very small p-values.
p_str = "p < 0.0001" if p < 0.0001 else f"p={p:.3e}"
ax.set_title(f"$r={r:.3f}, {p_str}$", color="dimgrey", size=16)

# Axis labels, rating range and figure size.
ax.set_xlabel("Weighted average surprisal")
ax.set_ylabel(HUMAN_MEASURE_LABEL)
ax.set_ylim(0.5, 7.5)
plt.gcf().set_size_inches(4, 3.5)

# Write the figure to disk, then display it.
render(f"{MODEL}_within-scale_weighted-avg-surprisal.pdf")
plt.show()

# Table 2: Multivariate analyses

In [None]:
def get_sig_code(p):
    """Translate a p-value into a significance marker.

    Returns "***" below 0.001, "**" below 0.01, "*" below 0.05, "." below 0.1
    and an empty string otherwise.
    """
    # Ordered from strictest to loosest cutoff; the first match wins.
    markers = ((0.001, "***"), (0.01, "**"), (0.05, "*"), (0.1, "."))
    for cutoff, code in markers:
        if p < cutoff:
            return code
    return ""
    
def get_sig_code_num(p):
    """Describe a p-value by the smallest reporting threshold it falls under.

    Returns a string such as "< 0.001" for the first cutoff that ``p`` is
    below; when ``p`` is not below 0.1 the value itself is returned unchanged.
    """
    # Ordered from strictest to loosest cutoff; the first match wins.
    labels = (
        (0.0001, "< 0.0001"),
        (0.001, "< 0.001"),
        (0.01, "< 0.01"),
        (0.05, "< 0.05"),
        (0.1, "< 0.1"),
    )
    for cutoff, label in labels:
        if p < cutoff:
            return label
    return p

def fit_reg_models(df):
    """Fit the OLS model of Rating on the centred predictors and tabulate it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Rating`` and every centred predictor named in the
        formula below.

    Returns
    -------
    pandas.DataFrame
        One row per predictor (the intercept is excluded) with columns
        ``variable``, ``coeff``, ``pval``, ``sig`` (p < 0.05),
        ``sig_code_num`` and ``sig_code``.
    """
    formula = (
        "Rating ~ cPartitive + cStrengthSome + cRedMention + cSubjecthood + "
        "cModification + cSentenceLengthLog + csurprisal_at_strong + "
        "cweighted_avg_surprisal"
    )
    m = smf.ols(formula, data=df).fit()
    # print(m.summary())

    # Coefficients and p-values share the same index (the term names).
    results = (
        pd.DataFrame({"coeff": m.params, "pval": m.pvalues})
        .rename_axis("variable")
        .reset_index()
    )
    # Explicit copy: columns are added below, and writing into a filtered
    # slice raises SettingWithCopyWarning.
    results = results[results.variable != "Intercept"].copy()
    results["sig"] = results["pval"] < 0.05
    results["sig_code_num"] = results["pval"].apply(get_sig_code_num)
    results["sig_code"] = results["pval"].apply(get_sig_code)
    return results

# Recode the categorical predictors as numbers and add the log sentence length.
# assign() returns a new frame, so df_all itself is left untouched.
s = df_all.assign(
    Partitive=df_all["Partitive"].map({"yes": 2, "no": 1}),
    # "med" and "old" share a code, which reduces Mention to two levels.
    RedMention=df_all["Mention"].map({"new": 1, "med": 2, "old": 2}),
    Modification=df_all["Modification"].map({"modified": 1, "unmodified": 2}),
    Subjecthood=df_all["Subjecthood"].map({"subject": 2, "other": 1}),
    SentenceLengthLog=np.log(df_all["SentenceLength"]),
)

# Center everything: each predictor gets a "c"-prefixed, mean-subtracted twin.
predictors = [
    "Partitive", "StrengthSome", "RedMention", "Subjecthood", "Modification",
    "SentenceLengthLog", "surprisal_at_strong", "weighted_avg_surprisal",
]
s = s.assign(**{"c" + name: s[name] - s[name].mean() for name in predictors})

stat_df = fit_reg_models(s)
stat_df