# SRE Corpus Result Section

## Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ot
import pandas as pd
import json
from sklearn.manifold import MDS
import matplotlib

matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import cosine
import numpy as np

from scipy.spatial.distance import cdist
from scipy.cluster.hierarchy import dendrogram, linkage

import matplotlib.patheffects as path_effects

## Organizing Data From Experiment

In [None]:
# Read the three batches of human dense-rating trials and stack them.
human_prior_rating_trials = pd.read_csv(
    "../data/qof-ratings/human_ratings/1215DenseRatingCheck.csv"
)
remnant_rating_trials = pd.read_csv(
    "../data/qof-ratings/human_ratings/1215DenseRatingRemnant.csv"
)
remnant_rating_trials_more = pd.read_csv(
    "../data/qof-ratings/human_ratings/1231DenseRatingRemnant.csv"
)
human_rating_trials = pd.concat(
    [human_prior_rating_trials, remnant_rating_trials, remnant_rating_trials_more]
).reset_index()

# Every non-missing "answer" cell is decoded from JSON and expanded into columns;
# the nested "answer" field is expanded once more into its own columns.
human_rating_responses = (
    human_rating_trials["answer"].dropna().apply(json.loads).apply(pd.Series)
)
human_ratings = human_rating_responses["answer"].apply(pd.Series)

# Join the per-response metadata with the expanded ratings, then average every
# remaining column per sentence (missing entries are skipped).
human_ratings_full = (
    human_rating_responses
    .drop(columns=["tones", "answer"])
    .merge(human_ratings, left_index=True, right_index=True)
    .groupby("sentence")
    .agg(lambda x: np.mean(x.dropna().astype(float)))
)

# Put the columns in sorted order.
human_ratings_full = human_ratings_full.reindex(sorted(human_ratings_full.columns), axis=1)
human_ratings_full

In [None]:
# GPT ratings are stored pipe-separated, one row per rating trial.
gpt_rating_trials = pd.read_csv("../data/qof-ratings/gpt_ratings.csv", sep="|")

# Sentence x tone table of "current_rating" (pivot_table's default aggregation).
gpt_ratings_full = gpt_rating_trials.pivot_table(
    index="sentence",
    columns="tone",
    values="current_rating",
)

# The human table is looked up with backticks where the GPT sentences have
# apostrophes, so build the converted labels once, use them to put the human
# rows in the GPT row order, and relabel the GPT rows with them.
backticked_sentences = gpt_ratings_full.index.str.replace("'", "`")
human_ratings_full = human_ratings_full.loc[backticked_sentences, :]
gpt_ratings_full.index = backticked_sentences
gpt_ratings_full

In [None]:
assert not (human_ratings_full.values==0).any()

In [None]:
assert not (gpt_ratings_full.values==0).any()

In [None]:
assert (gpt_ratings_full.index == human_ratings_full.index).all()

In [None]:
assert (gpt_ratings_full.columns == human_ratings_full.columns).all()

## Correlation Matrix Plotting

In [None]:
# Hierarchically cluster the tones (columns of the human table) and keep the
# dendrogram leaf order as the display order for the tones.
tone_linkage = linkage(human_ratings_full.values.T)
human_sorting_label = dendrogram(tone_linkage)["leaves"]
# The GPT matrices reuse the human ordering rather than their own clustering.
gpt_sorting_label = human_sorting_label

In [None]:
# Tone-by-tone correlation of the human ratings, shown in dendrogram order.
fig, ax = plt.subplots(figsize=(25, 20))
human_tone_corr = human_ratings_full.corr()
sns.heatmap(
    human_tone_corr.iloc[human_sorting_label, human_sorting_label],
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
    ax=ax,
)
plt.title("Human Conversation Tone Correlation Matrix", fontsize=30)

# Enlarge the tick labels on both axes and on the colour bar.
ax.set_xticklabels(ax.get_xticklabels(), fontsize=18)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=18)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=18)
plt.show()

In [None]:
# Tone-by-tone correlation of the GPT ratings, shown in the shared tone order.
fig, ax = plt.subplots(figsize=(25, 20))
gpt_tone_corr = gpt_ratings_full.corr()
sns.heatmap(
    gpt_tone_corr.iloc[gpt_sorting_label, gpt_sorting_label],
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
    ax=ax,
)
plt.title("GPT Conversation Tone Correlation Matrix", fontsize=30)

# Enlarge the tick labels on both axes and on the colour bar.
ax.set_xticklabels(ax.get_xticklabels(), fontsize=18)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=18)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=18)

plt.show()

In [None]:
# Copies of both rating tables whose tone columns carry a "Human: " / "GPT: "
# prefix, so the two sets stay distinguishable once they are combined.
human_ratings_full_copy = human_ratings_full.copy().set_axis(
    [f"Human: {x}" for x in human_ratings_full.columns], axis=1
)

gpt_ratings_full_copy = gpt_ratings_full.copy().set_axis(
    [f"GPT: {x}" for x in gpt_ratings_full.columns], axis=1
)
# Sentence labels use backticks in place of apostrophes.
gpt_ratings_full_copy.index = gpt_ratings_full_copy.index.str.replace("'", "`")

In [None]:
fig, ax = plt.subplots(figsize=(50, 40))

# Stack the human tones (first) and the GPT tones (second), both in the shared
# tone order, and correlate every tone with every other tone across sentences.
ordered_human_tones = human_ratings_full_copy.T.iloc[human_sorting_label]
ordered_gpt_tones = gpt_ratings_full_copy.T.iloc[gpt_sorting_label]
cross_corr_matrix = pd.concat([ordered_human_tones, ordered_gpt_tones]).T.corr()

# Plot only the first-40-rows x remaining-columns quadrant (human rows against
# GPT columns, given the stacking order above).
sns.heatmap(
    cross_corr_matrix.iloc[:40, 40:]
)
for tick_kind in ("major", "minor"):
    ax.tick_params(axis='both', which=tick_kind, labelsize=30)
cbar = ax.collections[0].colorbar
# Colour-bar tick labels use size 30 as well.
cbar.ax.tick_params(labelsize=30)

In [None]:
# Heatmap of the complete combined (human + GPT) tone correlation matrix.
fig, ax = plt.subplots(figsize=(50, 40))
sns.heatmap(
    cross_corr_matrix
)
for tick_kind in ("major", "minor"):
    ax.tick_params(axis='both', which=tick_kind, labelsize=22)
cbar = ax.collections[0].colorbar
# Colour-bar tick labels use size 20.
cbar.ax.tick_params(labelsize=20)


## Cross Correlation MDS Analyses

### Obtaining MDS Solutions

In [None]:
# Embed every row of the combined correlation matrix with MDS (fixed seed).
cross_corr_matrix_MDS = MDS(random_state=42).fit_transform(cross_corr_matrix.values)

# Row labels look like "<source>: <tone>"; split them into the two parts.
indices = cross_corr_matrix.index.str.split(": ")

# Coordinates plus the source / tone of each embedded point.
MDS_with_source_tones = pd.DataFrame(cross_corr_matrix_MDS).rename(
    columns={0: "MDS_x", 1: "MDS_y"}
)
MDS_with_source_tones["source"] = [elem[0] for elem in indices]
MDS_with_source_tones["tone"] = [elem[1] for elem in indices]

# One row per tone present on both sides: human columns receive the "_x"
# suffix (left frame) and GPT columns the "_y" suffix (right frame).
human_points = MDS_with_source_tones.query("source=='Human'")
gpt_points = MDS_with_source_tones.query("source=='GPT'")
same_tone_indices = human_points.merge(gpt_points, left_on="tone", right_on="tone")

In [None]:
# Treat each row of the correlation matrix as a coordinate vector: move the
# labels into column 0, number all columns, then split the label into
# "source" and "tone" columns and drop the original label column.
cross_corr_matrix_as_coords = cross_corr_matrix.reset_index()
cross_corr_matrix_as_coords.columns = range(cross_corr_matrix_as_coords.shape[1])
label_parts = cross_corr_matrix_as_coords[0].str.split(": ")
cross_corr_matrix_as_coords["source"] = label_parts.str[0]
cross_corr_matrix_as_coords["tone"] = label_parts.str[1]
cross_corr_matrix_as_coords = cross_corr_matrix_as_coords.drop(columns=0)

### Obtaining Feature Ratings and Arrowmark

In [None]:
from sklearn.linear_model import LinearRegression
from matplotlib.cm import rainbow
import json

In [None]:
# GPT feature ratings: mean "current_rating" per (feature, tone).
# The column is selected *before* aggregating so that any non-numeric columns
# in the file cannot make the group mean fail (pandas >= 2.0 no longer drops
# them silently); the resulting table is unchanged.
gpt_tone_features = pd.read_csv("../data/tone-feature-ratings/GPT-tones-features.csv", sep="|")
gpt_tone_features = (
    gpt_tone_features
    .groupby(["feature", "tone"])["current_rating"]
    .mean()
    .reset_index()
)

# Human feature ratings: every non-missing "answer" cell is decoded from JSON
# and the decoded records become the rows of the table.
human_tone_features = pd.DataFrame(
    list(
        pd.read_csv("../data/tone-feature-ratings/human-tones-features.csv")["answer"]
        .dropna()
        .apply(json.loads)
    )
).dropna()
# The decoded "answer" is a mapping; its first value is the integer rating.
human_tone_features["answer"] = human_tone_features["answer"].apply(
    lambda x: int(list(x.values())[0])
)
# Average the numeric columns per (feature, tones); numeric_only=True keeps the
# pre-2.0 pandas behaviour of ignoring non-numeric columns instead of raising.
human_tone_features = (
    human_tone_features
    .groupby(["feature", "tones"])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
def get_MDS_biplot_dir(biplot_direction_side, feature_df):
    """Fit one biplot direction per feature on one side of the MDS solution.

    Reads the module-level ``same_tone_indices`` and ``MDS_with_source_tones``.

    Args:
        biplot_direction_side: value of the ``source`` column to keep
            (the callers pass "Human" or "GPT").
        feature_df: table with "feature", "tones" and "answer" columns.

    Returns:
        ``(feature_coeff_dict, feature_raw_dict)``: per feature, the linear
        regression coefficients of the standardized ratings on
        (MDS_x, MDS_y), and the raw ratings ordered like the MDS points.
    """
    shared_tones = same_tone_indices["tone"]

    # Restrict the ratings and the MDS points to tones present on both sides.
    feature_df = feature_df[feature_df["tones"].isin(shared_tones)]
    side_points = MDS_with_source_tones.query(f"source=='{biplot_direction_side}'")
    side_points = side_points[side_points["tone"].isin(shared_tones)]
    tone_order = side_points["tone"]
    coords = side_points[["MDS_x", "MDS_y"]].values

    feature_coeff_dict, feature_raw_dict = {}, {}
    for feature in feature_df["feature"].unique():
        # Ratings of this feature, aligned row-for-row with the MDS points.
        ratings = feature_df\
            .query(f"feature=='{feature}'")\
            .set_index("tones")\
            .loc[tone_order]["answer"]
        # Standardize so the coefficients are comparable across features.
        standardized = (ratings - np.mean(ratings)) / np.std(ratings)
        fitted = LinearRegression().fit(coords, standardized)
        feature_coeff_dict[feature] = fitted.coef_
        feature_raw_dict[feature] = ratings
    return feature_coeff_dict, feature_raw_dict

In [None]:
# The GPT feature table uses "current_rating"/"tone"; the helper expects
# "answer"/"tones".
gpt_features_for_biplot = gpt_tone_features.rename(
    columns={
        "current_rating": "answer",
        "tone": "tones"
    }
)
gpt_biplot_dir, gpt_feature_dir = get_MDS_biplot_dir("GPT", gpt_features_for_biplot)

# Keep only these four GPT features, in this order.
kept_gpt_features = ["arousal", "Informational", "positive in valence", "Relational"]
gpt_biplot_dir = {k: gpt_biplot_dir[k] for k in kept_gpt_features}
gpt_feature_dir = {k: gpt_feature_dir[k] for k in kept_gpt_features}

# Human directions are fitted on the human side, with all human features kept.
human_biplot_dir, human_feature_dir = get_MDS_biplot_dir(
    "Human",
    human_tone_features
)

### Obtaining Explained Variance for Arrowmarks

In [None]:
def get_one_feature_vec(feature, ccm_MDS, biplot_direction_side, feature_df, target_ccm_MDS=None):
    """Return the biplot direction of one feature for a given MDS solution.

    Reads the module-level ``cross_corr_matrix`` (for the row labels) and
    ``same_tone_indices`` (for the tones present on both sides).

    Args:
        feature: name looked up in the "feature" column of ``feature_df``.
        ccm_MDS: array of MDS coordinates, one row per row of
            ``cross_corr_matrix``.
        biplot_direction_side: value of the ``source`` label to keep
            (the callers pass "Human" or "GPT").
        feature_df: table with "feature", "tones" and "answer" columns.
        target_ccm_MDS: accepted for signature parity with
            ``get_one_feature_variance``; not used here.

    Returns:
        Linear regression coefficients of the standardized feature ratings
        on (MDS_x, MDS_y).
    """
    # Attach "<source>: <tone>" label parts to the supplied coordinates.
    label_parts = cross_corr_matrix.index.str.split(": ")
    labelled = pd.DataFrame(ccm_MDS).assign(
        source=[elem[0] for elem in label_parts],
        tone=[elem[1] for elem in label_parts],
    )
    labelled = labelled.rename(columns={0: "MDS_x", 1: "MDS_y"})

    # Restrict both the ratings and the points to tones present on both sides.
    shared_tones = same_tone_indices["tone"]
    feature_df = feature_df[feature_df["tones"].isin(shared_tones)]
    side_points = labelled.query(f"source=='{biplot_direction_side}'")
    side_points = side_points[side_points["tone"].isin(shared_tones)]
    tone_order = side_points["tone"]

    # Ratings of the requested feature, aligned with the MDS points.
    ratings = feature_df\
        .query(f"feature=='{feature}'")\
        .set_index("tones")\
        .loc[tone_order]["answer"]

    standardized = (ratings - np.mean(ratings)) / np.std(ratings)
    fitted = LinearRegression().fit(side_points[["MDS_x", "MDS_y"]].values, standardized)
    return fitted.coef_

In [None]:
def get_one_feature_variance(feature, ccm_MDS, biplot_direction_side, feature_df, target_ccm_MDS=None):
    """Share of the MDS variance that lies along one feature's direction.

    The feature direction is fitted on ``ccm_MDS`` with
    ``get_one_feature_vec`` and normalized to unit length ``u``; the result is
    ``u^T C u / trace(C)`` where ``C`` is the covariance of the reference
    coordinates.

    Args:
        feature, ccm_MDS, biplot_direction_side, feature_df: forwarded to
            ``get_one_feature_vec``.
        target_ccm_MDS: coordinates whose covariance is used; defaults to
            ``ccm_MDS`` when ``None``.

    Returns:
        A 1x1 array holding the variance fraction.
    """
    direction = get_one_feature_vec(
        feature, ccm_MDS, biplot_direction_side, feature_df, target_ccm_MDS=None
    )
    unit_direction = direction / np.linalg.norm(direction)

    reference_coords = ccm_MDS if target_ccm_MDS is None else target_ccm_MDS
    mds_cov = np.cov(reference_coords.T)

    as_row = unit_direction.reshape((1, 2))
    as_col = unit_direction.reshape((2, 1))
    return as_row @ mds_cov @ as_col / np.trace(mds_cov)

In [None]:
# Explained-variance share of each kept GPT feature direction in the seed-42
# MDS solution (GPT side, whole MDS space as reference).
gpt_exp_vars = {}
gpt_features_renamed = gpt_tone_features.rename(
    columns={
        "current_rating": "answer",
        "tone": "tones"
    }
)

for gpt_features in gpt_biplot_dir:
    variance_share = get_one_feature_variance(
        gpt_features, cross_corr_matrix_MDS, "GPT", gpt_features_renamed
    )
    # The helper returns a 1x1 array; unwrap the scalar.
    gpt_exp_vars[gpt_features] = variance_share[0][0]

In [None]:
# Explained-variance share of each human feature direction in the seed-42 MDS
# solution.
#
# Fix: the human feature ratings were previously fitted against the "GPT"
# points (a copy of the GPT cell above). `human_biplot_dir` itself and the
# later human explained-variance loop both fit on the "Human" side, so the same
# side is used here. The column rename copied from the GPT cell is dropped:
# `human_tone_features` is passed without it everywhere else.
# NOTE(review): the later human loop also restricts the reference space to the
# first 40 MDS rows; this cell keeps the whole space, like the GPT cell.
human_exp_vars = {}

for human_features in human_biplot_dir:
    human_exp_vars[human_features] = get_one_feature_variance(
        human_features, cross_corr_matrix_MDS, "Human", human_tone_features
    )[0][0]

### Cross Correlation MDS-Biplot with Features

In [None]:
# Labelled MDS biplot: human tones in red, GPT tones in blue, a connector
# between the two embeddings of every shared tone, and one arrow per feature.
fig, ax = plt.subplots(figsize=(20, 20))


def projection(pt1, pt2):
    """Return the scalar coefficient of pt1 projected onto pt2."""
    return np.dot(pt1, pt2) / np.dot(pt2, pt2)


# Fix: the per-point cosine scores that used to be computed here were never
# used, and they looked up an "aroused" key that `gpt_biplot_dir` (restricted
# to "arousal", "Informational", "positive in valence", "Relational") does not
# contain, so the cell raised KeyError on the first GPT point.
for pt, word in zip(cross_corr_matrix_MDS, cross_corr_matrix.index):
    # Row labels carry a "Human: " or "GPT: " prefix.
    color = "red" if "Human" in word else "blue"
    plt.scatter(x=pt[0], y=pt[1], c=color)
    ax.annotate(word, [pt[0] - 0.1, pt[1] + 0.05], ha="center")

# Connect the human ("_x" columns) and GPT ("_y" columns) points of each tone.
# Columns are addressed by name instead of by position.
tone_pairs = zip(
    same_tone_indices["MDS_x_x"],
    same_tone_indices["MDS_y_x"],
    same_tone_indices["MDS_x_y"],
    same_tone_indices["MDS_y_y"],
)
for human_x, human_y, gpt_x, gpt_y in tone_pairs:
    plt.plot(
        [gpt_x, human_x],
        [gpt_y, human_y],
        path_effects=[path_effects.SimpleLineShadow(offset=(0, 0), shadow_color='black', alpha=0.3, linewidth=12)]
    )

# Feature arrows; directions are scaled by 4 for display.
for feature in gpt_biplot_dir:
    coord_x = gpt_biplot_dir[feature][0] * 4
    coord_y = gpt_biplot_dir[feature][1] * 4
    plt.arrow(0, 0, coord_x, coord_y, head_width=0.02, color="blue", width=0.01)
    ax.annotate(
        feature, [coord_x, coord_y], [coord_x * 1.1 + int(coord_x > 0) * 0.15, coord_y * 1.1], color="blue", ha="center"
    )

for feature in human_biplot_dir:
    coord_x = human_biplot_dir[feature][0] * 4
    coord_y = human_biplot_dir[feature][1] * 4
    plt.arrow(0, 0, coord_x, coord_y, head_width=0.02, color="red", width=0.01)
    ax.annotate(
        feature, [coord_x, coord_y], [coord_x * 1.1 + 0.05, coord_y * 1.1 - 0.03], color="red", ha="center"
    )

ax.axis("off")

In [None]:
# Same MDS biplot as the labelled version, drawn without any text annotations.
fig, ax = plt.subplots(figsize=(20, 20))


def projection(pt1, pt2):
    """Return the scalar coefficient of pt1 projected onto pt2."""
    return np.dot(pt1, pt2) / np.dot(pt2, pt2)


# Fix: the per-point cosine scores that used to be computed here were never
# used, and they looked up an "aroused" key that `gpt_biplot_dir` (restricted
# to "arousal", "Informational", "positive in valence", "Relational") does not
# contain, so the cell raised KeyError on the first GPT point.
for pt, word in zip(cross_corr_matrix_MDS, cross_corr_matrix.index):
    # Row labels carry a "Human: " or "GPT: " prefix.
    color = "red" if "Human" in word else "blue"
    plt.scatter(x=pt[0], y=pt[1], c=color)

# Connect the human ("_x" columns) and GPT ("_y" columns) points of each tone.
# Columns are addressed by name instead of by position.
tone_pairs = zip(
    same_tone_indices["MDS_x_x"],
    same_tone_indices["MDS_y_x"],
    same_tone_indices["MDS_x_y"],
    same_tone_indices["MDS_y_y"],
)
for human_x, human_y, gpt_x, gpt_y in tone_pairs:
    plt.plot(
        [gpt_x, human_x],
        [gpt_y, human_y],
        path_effects=[path_effects.SimpleLineShadow(offset=(0, 0), shadow_color='black', alpha=0.3, linewidth=12)]
    )

# Feature arrows (unlabelled); directions are scaled by 4 for display.
for feature in gpt_biplot_dir:
    coord_x = gpt_biplot_dir[feature][0] * 4
    coord_y = gpt_biplot_dir[feature][1] * 4
    plt.arrow(0, 0, coord_x, coord_y, head_width=0.02, color="blue", width=0.01)

for feature in human_biplot_dir:
    coord_x = human_biplot_dir[feature][0] * 4
    coord_y = human_biplot_dir[feature][1] * 4
    plt.arrow(0, 0, coord_x, coord_y, head_width=0.02, color="red", width=0.01)

ax.axis("off")

### Bootstrap Analyses for Feature Vector

In [None]:
# Re-run MDS under 5000 different seeds and record, per GPT feature, the
# explained-variance share of its direction in each solution (GPT side, whole
# MDS space as reference).
gpt_feature_shared_explained_var = {}
gpt_features_renamed = gpt_tone_features.rename(
    columns={
        "current_rating": "answer",
        "tone": "tones"
    }
)
for MDS_seed in range(5000):
    cur_ccm_mds = MDS(random_state=MDS_seed).fit_transform(cross_corr_matrix.values)
    for gpt_features in gpt_biplot_dir:
        variance_share = get_one_feature_variance(
            gpt_features, cur_ccm_mds, "GPT", gpt_features_renamed
        )[0][0]
        gpt_feature_shared_explained_var.setdefault(gpt_features, []).append(variance_share)

In [None]:
# Per GPT feature: (name, mean, std, (2.5th percentile, 97.5th percentile)).
for gpt_features, variance_shares in gpt_feature_shared_explained_var.items():
    percentile_interval = (
        np.percentile(variance_shares, 2.5),
        np.percentile(variance_shares, 97.5)
    )
    print(
        (
            gpt_features,
            np.mean(variance_shares),
            np.std(variance_shares),
            percentile_interval
        )
    )

In [None]:
# Re-run MDS under 5000 different seeds and record, per human feature, the
# explained-variance share of its direction, using only the first 40 MDS rows
# (the human rows, given how cross_corr_matrix is stacked) as reference.
human_feature_human_space_explained_var = {}
for MDS_seed in range(5000):
    cur_ccm_mds = MDS(random_state=MDS_seed).fit_transform(cross_corr_matrix.values)
    human_rows_only = cur_ccm_mds[:40]
    for human_features in human_biplot_dir:
        variance_share = get_one_feature_variance(
            human_features, cur_ccm_mds, "Human", human_tone_features, human_rows_only
        )[0][0]
        human_feature_human_space_explained_var.setdefault(human_features, []).append(variance_share)

In [None]:
# Per human feature: (name, mean, std, (2.5th percentile, 97.5th percentile)).
for human_features, variance_shares in human_feature_human_space_explained_var.items():
    percentile_interval = (
        np.percentile(variance_shares, 2.5),
        np.percentile(variance_shares, 97.5)
    )
    print(
        (
            human_features,
            np.mean(variance_shares),
            np.std(variance_shares),
            percentile_interval
        )
    )

In [None]:
def get_angle_between_vecs(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    Despite the name, the value is the cosine (dot product of the two
    unit-normalized vectors), not the angle itself.
    """
    unit_v1 = v1 / np.linalg.norm(v1)
    unit_v2 = v2 / np.linalg.norm(v2)
    return np.dot(unit_v1, unit_v2)

# For 5000 MDS seeds, record the cosine between the human and the GPT
# direction of each paired feature.
# NOTE(review): features are paired purely by iteration order of
# human_biplot_dir and gpt_biplot_dir — confirm both dicts list corresponding
# features in the same order.
cosines_bootstrap_stats = {feature_name: [] for feature_name in human_biplot_dir}
gpt_features_renamed = gpt_tone_features.rename(
    columns={
        "current_rating": "answer",
        "tone": "tones"
    }
)
for MDS_seed in range(5000):
    cur_ccm_mds = MDS(random_state=MDS_seed).fit_transform(cross_corr_matrix.values)
    for human_features, gpt_features in zip(human_biplot_dir, gpt_biplot_dir):
        human_angle = get_one_feature_vec(
            human_features, cur_ccm_mds, "Human", human_tone_features, cur_ccm_mds[:40]
        )
        gpt_angle = get_one_feature_vec(
            gpt_features, cur_ccm_mds, "GPT", gpt_features_renamed
        )
        pair_cosine = get_angle_between_vecs(human_angle, gpt_angle)
        cosines_bootstrap_stats[human_features].append(pair_cosine)

In [None]:
# Mean and 2.5 / 97.5 percentiles of the recorded cosines, per human feature.
for human_features in human_biplot_dir:
    feature_cosines = cosines_bootstrap_stats[human_features]
    mean_cosine = np.mean(feature_cosines)
    lower_bound = np.percentile(feature_cosines, 2.5)
    upper_bound = np.percentile(feature_cosines, 97.5)
    print(
        f"""
            {human_features}
            mean: {mean_cosine}
            lower_ci: {lower_bound}
            higher_ci: {upper_bound}
        """
    )

## Correlation Matrix Splithalf Reliability

In [None]:
def get_halfsplit_correlation(df, n_tones=40):
    """Split-half reliability of a tone correlation matrix.

    The rows of ``df`` are shuffled and cut into two halves; each half yields
    a column-by-column correlation matrix, and the strict upper triangles of
    the two matrices are correlated with each other.

    Args:
        df: table with one column per tone and one row per item.
        n_tones: size of the leading square block of each correlation matrix
            whose upper triangle is compared. Defaults to 40, the value that
            was previously hard-coded; pass ``df.shape[1]`` to use every
            column.

    Returns:
        Pearson correlation between the two upper-triangle vectors (NaN if
        either half produces a NaN correlation entry).
    """
    df_shuffled = df.sample(frac=1.0)
    midpoint = len(df_shuffled) // 2
    upper_triangle = np.triu_indices(n_tones, 1)

    first_half_triu = df_shuffled.iloc[:midpoint].corr().values[upper_triangle]
    second_half_triu = df_shuffled.iloc[midpoint:].corr().values[upper_triangle]
    return np.corrcoef(first_half_triu, second_half_triu)[0, 1]

In [None]:
def get_cross_species_corr(df1, df2, n_tones=40):
    """Correlate the tone correlation structure of two bootstrap-resampled tables.

    Each table is resampled independently (rows drawn with replacement, same
    size as the input); the strict upper triangles of the two resulting
    column-by-column correlation matrices are then correlated.

    Args:
        df1, df2: tables with one column per tone.
        n_tones: size of the leading square block of each correlation matrix
            whose upper triangle is compared. Defaults to 40, the value that
            was previously hard-coded.

    Returns:
        Pearson correlation between the two upper-triangle vectors (NaN if
        either resample produces a NaN correlation entry).
    """
    resampled_a = df1.sample(frac=1.0, replace=True)
    resampled_b = df2.sample(frac=1.0, replace=True)
    upper_triangle = np.triu_indices(n_tones, 1)

    triu_a = resampled_a.corr().values[upper_triangle]
    triu_b = resampled_b.corr().values[upper_triangle]
    return np.corrcoef(triu_a, triu_b)[0, 1]

In [None]:
# Collect 5000 valid (non-NaN) human-vs-GPT correlation-structure correlations.
# Fix: the loop condition was `<= 5000`, which gathered 5001 samples, unlike
# the other 5000-sample loops in this section.
cross_species_halfsplit_corr = []
while len(cross_species_halfsplit_corr) < 5000:
    corr_value = get_cross_species_corr(human_ratings_full, gpt_ratings_full)
    # Resamples that yield an undefined correlation are redrawn.
    if not np.isnan(corr_value):
        cross_species_halfsplit_corr.append(corr_value)
plt.hist(cross_species_halfsplit_corr)
# Mean and 2.5 / 97.5 percentiles of the collected values.
np.mean(cross_species_halfsplit_corr), np.percentile(cross_species_halfsplit_corr, 2.5), np.percentile(cross_species_halfsplit_corr, 97.5)

In [None]:
# 5000 split-half correlations of the human tone correlation matrix.
human_halfsplit_corr = [
    get_halfsplit_correlation(human_ratings_full) for _ in range(5000)
]
plt.hist(human_halfsplit_corr)

In [None]:
# Up to 5000 split-half correlations of the GPT tone correlation matrix;
# splits whose correlation is undefined (NaN) are dropped, not redrawn.
gpt_halfsplit_corr = []
for _ in range(5000):
    split_corr = get_halfsplit_correlation(gpt_ratings_full)
    if not np.isnan(split_corr):
        gpt_halfsplit_corr.append(split_corr)
plt.hist(gpt_halfsplit_corr)

In [None]:
# Split-half correlations on the combined table (human tone columns followed
# by GPT tone columns); the table does not change between iterations, so it is
# built once.
# NOTE(review): if the helper in effect only reads the leading 40x40 block of
# each half's correlation matrix, only the human columns contribute here —
# confirm that this is intended.
combined_ratings = pd.concat(
    [
        human_ratings_full_copy.T,
        gpt_ratings_full_copy.T
    ]
).T
human_gpt_halfsplit_corr = [
    get_halfsplit_correlation(combined_ratings) for _ in range(5000)
]
plt.hist(human_gpt_halfsplit_corr)


## Cross Correlation Diagonals

In [None]:
from matplotlib.colors import LinearSegmentedColormap


# Heatmap of the first-40-rows x remaining-columns quadrant of the combined
# correlation matrix, with a custom green / yellow / red colour map.
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(cross_corr_matrix.iloc[:40, 40:],
    vmin=-1, vmax=1,
    cmap = LinearSegmentedColormap.from_list('my_gradient', (
    # Edit this gradient at https://eltos.github.io/gradient/#00876C-FFFAA8-D43D51
    (0.000, (0.000, 0.529, 0.424)),
    (0.500, (1.000, 0.980, 0.659)),
    (1.000, (0.831, 0.239, 0.318)))))
plt.title("Conversation Tones Cross-Correlation Matrix", fontsize=16)
# Strip the "<source>: " prefix from the tick labels. The label text is read
# through the public Text.get_text() accessor instead of the private `_text`.
ax.set_xticklabels([s.get_text().split(": ")[1] for s in ax.get_xticklabels()], fontsize=12)
ax.set_yticklabels([s.get_text().split(": ")[1] for s in ax.get_yticklabels()], fontsize=12)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=12)

# Long format of the opposite quadrant (rows from position 40 on, first 40
# columns): one row per (GPT tone, human tone) pair with its correlation.
cross_corr_quadrant = cross_corr_matrix.iloc[40:, :40].stack().reset_index()\
    .rename(
        columns={
            "level_0": "gpt_tones",
            "level_1": "human_tones",
            0: "correlation"
        }
    )

# Split "<source>: <tone>" labels; the source part is parked in a scratch
# column ("_1") that is dropped afterwards.
cross_corr_quadrant[["_1", "GPT_tone_name"]] = cross_corr_quadrant["gpt_tones"].str.split(": ", expand=True)
cross_corr_quadrant[["_1", "human_tone_name"]] = cross_corr_quadrant["human_tones"].str.split(": ", expand=True)
cross_corr_quadrant = cross_corr_quadrant.drop(columns=["_1"])
# Same-tone pairs (the diagonal of the quadrant), strongest correlation first.
cross_corr_consistencies = cross_corr_quadrant[
    cross_corr_quadrant["GPT_tone_name"] == cross_corr_quadrant["human_tone_name"]
].sort_values("correlation", ascending=False)

In [None]:
def make_human_halves():
    """Randomly split the individual human ratings into two halves.

    Works on the raw responses in the module-level ``human_rating_trials``:
    within every (sentence, tone) group half of the ratings are drawn for the
    first half and the rest form the second half.

    Returns:
        Two sentence x tone tables of mean ratings (first half, second half),
        with rows ordered like ``gpt_ratings_full`` (apostrophes replaced by
        backticks).
    """
    human_rating_responses = human_rating_trials["answer"].dropna().apply(json.loads).apply(pd.Series)
    human_ratings = human_rating_responses["answer"].apply(pd.Series)

    # Long format: one row per (sentence, tone, single rating), shuffled within
    # each (sentence, tone) group and re-indexed 0..n-1.
    merged_responses = pd.merge(
        human_rating_responses.drop(columns=["tones", "answer"]),
        human_ratings,
        left_index=True,
        right_index=True
    )
    testing_data = merged_responses\
        .melt("sentence").dropna()\
        .groupby(["sentence", "variable"])\
        .apply(lambda s: s.sample(frac=1))\
        .reset_index(drop=True)

    # Row labels of a random half of every group; level 2 of the grouped
    # result's index carries the original row labels.
    sampled_half = testing_data.groupby(["sentence", "variable"])\
        .apply(lambda s: s.sample(frac=1/2))
    first_half_ind = sampled_half.index.get_level_values(2)

    def _mean_table(long_df):
        # Sentence x tone table of mean ratings, rows in the GPT row order.
        table = long_df.pivot_table(
            index="sentence",
            columns="variable",
            values="value",
            aggfunc=lambda x: np.mean(x.dropna().astype(float))
        )
        return table.loc[gpt_ratings_full.index.str.replace("'", "`"), :]

    first_half = _mean_table(testing_data.loc[first_half_ind])
    second_half = _mean_table(testing_data.drop(first_half_ind))
    return first_half, second_half

def make_gpt_halves():
    """Randomly split the individual GPT ratings into two halves.

    Within every (sentence, tone) group of the module-level
    ``gpt_rating_trials`` half of the rows are drawn for the first half; the
    remaining rows form the second half.

    Returns:
        Two sentence x tone tables of "current_rating" (first half, second
        half), with apostrophes in the sentence labels replaced by backticks.
    """
    # Level 2 of the grouped result's index carries the original row labels.
    sampled_half = gpt_rating_trials.groupby(["sentence", "tone"])\
        .apply(lambda s: s.sample(frac=1/2))
    first_half_ind = sampled_half.index.get_level_values(2)

    def _rating_table(trials):
        table = trials.pivot_table(
            index="sentence",
            columns="tone",
            values="current_rating"
        )
        table.index = table.index.str.replace("'", "`")
        return table

    first_half = _rating_table(gpt_rating_trials.loc[first_half_ind])
    second_half = _rating_table(gpt_rating_trials.drop(first_half_ind))
    return first_half, second_half

def cross_corr_attenuate_bootstrap():
    """Per-tone split-half correlations for the human and the GPT ratings.

    Draws one random split of each rater group and, for every tone, correlates
    its ratings in the first half with its ratings in the second half.

    Returns:
        ``(human_diagonal, gpt_diagonal)``: two arrays with one split-half
        correlation per tone, in the shared tone order.
    """
    human_half_a, human_half_b = make_human_halves()
    gpt_half_a, gpt_half_b = make_gpt_halves()

    def _same_tone_correlations(half_a, half_b, tone_order):
        # np.corrcoef stacks the rows of both inputs; the [:40, 40:] block
        # holds first-half x second-half correlations and its diagonal pairs
        # every tone with itself.
        stacked_corr = np.corrcoef(
            half_a.T.iloc[tone_order],
            half_b.T.iloc[tone_order]
        )
        return np.diagonal(stacked_corr[:40, 40:])

    human_diagonal = _same_tone_correlations(human_half_a, human_half_b, human_sorting_label)
    gpt_diagonal = _same_tone_correlations(gpt_half_a, gpt_half_b, gpt_sorting_label)
    return human_diagonal, gpt_diagonal

In [None]:
def one_cc_bootstrap(random_state, max_retries=100):
    """Same-tone human-vs-GPT correlations on one 90% subsample of sentences.

    Both prefixed rating tables are subsampled with the same seed, the
    combined tone correlation matrix is recomputed, and the diagonal of its
    [:40, 40:] quadrant (each human tone against the GPT tone at the same
    position) is returned.

    Fix: when the quadrant contained NaN the function used to call itself with
    the *same* ``random_state``, which reproduces the same subsample and
    recurses until ``RecursionError``. Retries now use fresh seeds derived
    deterministically from ``random_state``.

    Args:
        random_state: seed of the first subsample.
        max_retries: number of re-draws allowed when a subsample yields NaN
            correlations.

    Returns:
        1-D array with one correlation per tone.

    Raises:
        ValueError: if no NaN-free subsample is found within ``max_retries``
            re-draws.
    """
    seed = random_state
    reseeder = None
    for _ in range(max_retries + 1):
        # The same seed is used for both tables, as in the original code.
        # NOTE(review): presumably this selects the same sentences from both
        # tables (they share their row index) — confirm.
        resampled_corr = pd.concat(
            [
                human_ratings_full_copy.sample(frac=0.9, random_state=seed).T.iloc[human_sorting_label],
                gpt_ratings_full_copy.sample(frac=0.9, random_state=seed).T.iloc[gpt_sorting_label]
            ]
        ).T.corr()
        cross_corr_quad = resampled_corr.iloc[:40, 40:]
        if not np.isnan(cross_corr_quad.values).any():
            return np.diagonal(cross_corr_quad.values)
        # Derive the next seed reproducibly from the caller's seed.
        if reseeder is None:
            reseeder = np.random.RandomState(random_state)
        seed = int(reseeder.randint(0, 2**31 - 1))
    raise ValueError(
        f"no NaN-free subsample found for random_state={random_state!r} "
        f"after {max_retries} retries"
    )

In [None]:
# 100 random splits; keep the per-tone split-half correlations of each rater
# group (a: human, b: GPT) whenever they contain no NaN.
diagonal_splithalf_a = []
diagonal_splithalf_b = []
for _ in range(100):
    human_split_diag, gpt_split_diag = cross_corr_attenuate_bootstrap()
    if not np.isnan(human_split_diag).any():
        diagonal_splithalf_a.append(human_split_diag)
    if not np.isnan(gpt_split_diag).any():
        diagonal_splithalf_b.append(gpt_split_diag)

In [None]:
# 5000 subsampled same-tone correlations (one row per subsample).
cc_diagonal_vals = np.vstack(
    [one_cc_bootstrap(random_num) for random_num in range(5000)]
)
# Mean split-half correlation per tone for each rater group.
diagonal_means_a = np.mean(np.vstack(diagonal_splithalf_a), axis=0)
diagonal_means_b = np.mean(np.vstack(diagonal_splithalf_b), axis=0)
# Mean cross correlation divided by the geometric mean of the two split-half
# correlations.
reliability_product = diagonal_means_a * diagonal_means_b
attenuated_corr = np.mean(cc_diagonal_vals, axis=0) / np.sqrt(reliability_product)

In [None]:
# Corrected correlations as a two-column (tone, value) table, tones in the
# shared tone order.
ordered_tone_names = human_ratings_full.columns[human_sorting_label]
attenuated_series = pd.Series(attenuated_corr, index=ordered_tone_names)
data_to_plot = attenuated_series.reset_index().rename(
    columns={"index": "tone", 0: "value"}
)

# Same layout for the same-tone cross correlations.
cross_corr_consistencies_to_plot = cross_corr_consistencies[
    ["GPT_tone_name", "correlation"]
].rename(columns={"GPT_tone_name": "tone", "correlation": "value"})

In [None]:
# One row per tone: subsample mean, corrected mean, 2.5 / 97.5 percentiles and
# the tone name, sorted by increasing mean.
is_same_tone = cross_corr_quadrant["GPT_tone_name"] == cross_corr_quadrant["human_tone_name"]
same_tone_names = cross_corr_quadrant[is_same_tone]["human_tone_name"]

bargraph_data = pd.DataFrame(
    data={
        "mean": np.mean(cc_diagonal_vals, axis=0),
        "attenuated_mean": attenuated_corr,
        "lower_ci": np.percentile(cc_diagonal_vals, 2.5, axis=0),
        "high_ci": np.percentile(cc_diagonal_vals, 97.5, axis=0),
        "tone_name": same_tone_names,
    }
)
bargraph_data = bargraph_data.sort_values("mean", ascending=True)

In [None]:
# Horizontal bar chart of the mean same-tone cross correlation per tone, with
# percentile error bars.
fig, ax = plt.subplots(figsize=(4, 15))
sns.barplot(
    data=bargraph_data,
    y="tone_name",
    x="mean",
    orientation="horizontal"
)

# Bar end (x) and vertical centre (y) of every bar, in drawing order.
x_coords = [p.get_width() for p in ax.patches]
y_coords = [p.get_y() + 0.5 * p.get_height() for p in ax.patches]

# Fix: the error-bar extents were taken from positional columns 0/1/2, i.e.
# (mean - attenuated_mean, lower_ci - mean). That is not the percentile
# interval, and its second component is negative. The extents are now computed
# from the named columns: distance from the mean down to "lower_ci" and up to
# "high_ci". Rows are walked directly instead of being re-queried by tone name.
for x, y, (_, tone_row) in zip(x_coords, y_coords, bargraph_data.iterrows()):
    lower_extent = tone_row["mean"] - tone_row["lower_ci"]
    upper_extent = tone_row["high_ci"] - tone_row["mean"]
    plt.errorbar(
        x, y,
        xerr=[[lower_extent], [upper_extent]],
        fmt="none",
        c="k",
        capsize=5
    )

plt.xticks(rotation=90)
plt.ylim(-0.6, 39.6)

plt.title("Conversation Tones Cross Correlation Matrix \nDiagonal Terms")
plt.ylabel("Conversation Tone Name")

plt.savefig('figures/3c-errorbar.pdf', dpi=300, bbox_inches='tight')

## Similarity Judgment Analyses

### Organizing Similarity Data

In [None]:
# Human similarity judgments: keep the rating and the two compared words, and
# drop rows where any of the three is missing.
human_sjt_raw = pd.read_csv("../data/similarity_judgments/human_sjts.csv")
human_sjt = human_sjt_raw[["answer", "word_1", "word_2"]].dropna()

In [None]:
def get_sim_matrix_human(original_df, corr_index):
    """Build a symmetric similarity matrix from pairwise human judgments.

    Ratings are averaged per (word_1, word_2) cell, rows and columns are
    reordered by ``corr_index``, missing cells become 0, and the table is
    added to its transpose so that a pair contributes regardless of which
    word came first. The diagonal, which the addition counts twice, is halved.

    The diagonal correction previously looped over a hard-coded ``range(40)``;
    it now covers whatever size ``corr_index`` selects (identical for 40).

    Args:
        original_df: table with "word_1", "word_2" and "answer" columns.
        corr_index: positional order of the words for both axes.

    Returns:
        Square numpy array of similarities.
    """
    sims = original_df.pivot_table(
        index="word_1",
        columns="word_2",
        values="answer"
    ).iloc[corr_index, corr_index].fillna(0)
    vals = sims.values + sims.T.values
    # Undo the double counting of the diagonal introduced by the addition.
    np.fill_diagonal(vals, np.diagonal(vals) / 2)
    return vals

# Human similarity matrix in the shared tone order, divided by 5
# (presumably rescaling a 5-point rating scale to [0, 1] — TODO confirm).
human_on_tone_sim = get_sim_matrix_human(human_sjt, human_sorting_label) / 5

In [None]:
def get_halfsplit_correlation(df, sim_triu, sorting_label):
    """Correlate a bootstrap-resampled tone correlation matrix with similarities.

    The rows of ``df`` are resampled with replacement until the reordered
    correlation matrix has no NaN in its strict upper triangle; that triangle
    is then correlated with ``sim_triu``.

    The triangle size was previously hard-coded to 40; it now follows
    ``len(sorting_label)`` (identical for the 40-tone calls in this file).

    NOTE: this definition replaces the earlier one-argument function of the
    same name from this point on.

    Args:
        df: table with one column per tone.
        sim_triu: strict upper triangle of a similarity matrix laid out in the
            same tone order.
        sorting_label: positional tone order applied to the correlation matrix.

    Returns:
        Pearson correlation between the two upper-triangle vectors.
    """
    upper_triangle = np.triu_indices(len(sorting_label), 1)
    while True:
        resampled_triu = (
            df.sample(frac=1.0, replace=True)
            .corr()
            .iloc[sorting_label, sorting_label]
            .values[upper_triangle]
        )
        # Redraw when a resample leaves some correlation undefined.
        if not np.isnan(resampled_triu).any():
            break
    return np.corrcoef(resampled_triu, sim_triu)[0, 1]

In [None]:
def get_sim_matrix_gpt(original_df, corr_index):
    """Build a symmetric similarity matrix from pairwise GPT judgments.

    Ratings are averaged per (tone_a, tone_b) cell, rows and columns are
    reordered by ``corr_index``, missing cells become 0, and the table is
    added to its transpose so that a pair contributes regardless of which
    tone came first. The diagonal, which the addition counts twice, is halved.

    The diagonal correction previously looped over a hard-coded ``range(40)``;
    it now covers whatever size ``corr_index`` selects (identical for 40).

    Args:
        original_df: table with "tone_a", "tone_b" and "current_rating" columns.
        corr_index: positional order of the tones for both axes.

    Returns:
        Square numpy array of similarities.
    """
    sims = original_df.pivot_table(
        index="tone_a",
        columns="tone_b",
        values="current_rating"
    ).iloc[corr_index, corr_index].fillna(0)
    vals = sims.values + sims.T.values
    # Undo the double counting of the diagonal introduced by the addition.
    np.fill_diagonal(vals, np.diagonal(vals) / 2)
    return vals
# GPT similarity judgments (pipe-separated) and their similarity matrix in the
# shared tone order.
gpt_sjt = pd.read_csv("../data/similarity_judgments/gpt_sjts.csv", sep="|")
gpt_on_tone_sim = get_sim_matrix_gpt(original_df=gpt_sjt, corr_index=gpt_sorting_label)

In [None]:
def get_one_bootstrap_sim(sjt_df, sorting_label, grouping_cols, mat_method):
    """Build one similarity matrix from a group-wise bootstrap resample.

    Args:
        sjt_df: table of similarity judgments.
        sorting_label: tone order handed on to ``mat_method``.
        grouping_cols: columns defining the groups; every group is resampled
            with replacement to its original size.
        mat_method: callable ``(resampled_df, sorting_label)`` that builds the
            matrix.

    Returns:
        Whatever ``mat_method`` returns for the resampled table.
    """
    def _resample_group(group):
        return group.sample(frac=1, replace=True)

    resampled = sjt_df.groupby(grouping_cols)\
        .apply(_resample_group)\
        .reset_index(drop=True)
    return mat_method(resampled, sorting_label)


### Plotting Similarity Matrices

In [None]:
# Heatmap of the GPT similarity-judgment matrix.
fig, ax = plt.subplots(figsize=(18, 15))
sns.heatmap(gpt_on_tone_sim,
    vmin=0, vmax=1,
    cmap="coolwarm")
# The matrix is a bare array, so label the ticks with the combined-matrix
# labels from position 40 onwards.
ax.set_xticklabels(cross_corr_matrix.index[40:])
ax.set_yticklabels(cross_corr_matrix.index[40:])

plt.xticks(rotation=90)
plt.yticks(rotation=0)

plt.title("GPT Conversation Tone Similarity Judgment Matrix", fontsize=16)
# Strip the "<source>: " prefix. The label text is read through the public
# Text.get_text() accessor instead of the private `_text` attribute.
ax.set_xticklabels([s.get_text().split(": ")[1] for s in ax.get_xticklabels()], fontsize=12)
ax.set_yticklabels([s.get_text().split(": ")[1] for s in ax.get_yticklabels()], fontsize=12)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=12)

plt.savefig('figures/3d-gpt.pdf', dpi=300, bbox_inches='tight')

In [None]:
# Heatmap of the human similarity-judgment matrix.
fig, ax = plt.subplots(figsize=(18, 15))
sns.heatmap(human_on_tone_sim,
    vmin=0, vmax=1,
    cmap="coolwarm")
# The matrix is a bare array, so label the ticks with the first 40
# combined-matrix labels.
ax.set_xticklabels(cross_corr_matrix.index[:40])
ax.set_yticklabels(cross_corr_matrix.index[:40])

plt.xticks(rotation=90)
plt.yticks(rotation=0)

plt.title("Human Conversation Tone Similarity Judgment Matrix", fontsize=16)
# Strip the "<source>: " prefix. The label text is read through the public
# Text.get_text() accessor instead of the private `_text` attribute.
ax.set_xticklabels([s.get_text().split(": ")[1] for s in ax.get_xticklabels()], fontsize=12)
ax.set_yticklabels([s.get_text().split(": ")[1] for s in ax.get_yticklabels()], fontsize=12)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=12)

plt.savefig('figures/3d-human.pdf', dpi=300, bbox_inches='tight')

### Reliability and Correlation Analyses

In [None]:
# 5000 resamples each: correlation between a group's rating-based tone
# correlation structure and the same group's similarity judgments.
human_sim_upper = human_on_tone_sim[np.triu_indices(40, 1)]
gpt_sim_upper = gpt_on_tone_sim[np.triu_indices(40, 1)]

human_boot = []
for _ in range(5000):
    human_boot.append(
        get_halfsplit_correlation(human_ratings_full, human_sim_upper, human_sorting_label)
    )

gpt_boot = []
for _ in range(5000):
    gpt_boot.append(
        get_halfsplit_correlation(gpt_ratings_full, gpt_sim_upper, gpt_sorting_label)
    )

In [None]:
# 5000 resamples each: correlation between one group's rating-based tone
# correlation structure and the *other* group's similarity judgments.
human_sim_upper = human_on_tone_sim[np.triu_indices(40, 1)]
gpt_sim_upper = gpt_on_tone_sim[np.triu_indices(40, 1)]

gpt_corr_human_sim_boot = []
for _ in range(5000):
    gpt_corr_human_sim_boot.append(
        get_halfsplit_correlation(gpt_ratings_full, human_sim_upper, human_sorting_label)
    )

human_corr_gpt_sim_boot = []
for _ in range(5000):
    human_corr_gpt_sim_boot.append(
        get_halfsplit_correlation(human_ratings_full, gpt_sim_upper, gpt_sorting_label)
    )

In [None]:
def get_halfsplit_human_sim_corr():
    """Split-half correlation of the human similarity matrix.

    Two judgments are drawn from every (word_1, word_2) group of the
    module-level ``human_sjt`` to form the first part; all remaining rows form
    the second part. A similarity matrix is built from each part and the
    strict upper triangles are correlated.

    Returns:
        Pearson correlation between the two upper-triangle vectors.
    """
    # Level 2 of the grouped result's index carries the original row labels.
    drawn = human_sjt.groupby(["word_1", "word_2"])\
        .apply(lambda s: s.sample(2))
    first_half_inds = drawn.index.get_level_values(2)

    first_half_sjt = human_sjt.loc[first_half_inds]
    second_half_sjt = human_sjt.drop(index=first_half_inds)

    upper_triangle = np.triu_indices(40, 1)
    first_half_triu = get_sim_matrix_human(first_half_sjt, human_sorting_label)[upper_triangle]
    second_half_triu = get_sim_matrix_human(second_half_sjt, human_sorting_label)[upper_triangle]
    return np.corrcoef(first_half_triu, second_half_triu)[0, 1]
    
def get_halfsplit_gpt_sim_corr():
    """Correlate similarity matrices built from two disjoint parts of ``gpt_sjt``.

    Two rows are sampled at random from every ``(tone_a, tone_b)`` group; those
    rows form the first part and all remaining rows form the second. Each part is
    converted with ``get_sim_matrix_gpt`` and the strict upper triangles of the
    two 40 x 40 results are correlated.

    Returns:
        The correlation coefficient (``np.corrcoef``) between the two
        upper-triangle vectors.
    """
    per_pair_sample = gpt_sjt.groupby(["tone_a", "tone_b"]).apply(
        lambda pair_rows: pair_rows.sample(2)
    )
    # Level 2 of the grouped index holds the labels used below to index gpt_sjt.
    sampled_rows = per_pair_sample.index.get_level_values(2)

    sim_from_sample = get_sim_matrix_gpt(
        gpt_sjt.loc[sampled_rows], gpt_sorting_label
    )
    sim_from_rest = get_sim_matrix_gpt(
        gpt_sjt.drop(index=sampled_rows), gpt_sorting_label
    )

    upper = np.triu_indices(40, 1)
    return np.corrcoef(sim_from_sample[upper], sim_from_rest[upper])[0, 1]

In [None]:
# Mean and 2.5th / 97.5th percentiles of the GPT split-half correlations.
# gpt_sim_halves is assigned in the following cell, which has to be run first.
(np.mean(gpt_sim_halves), *np.percentile(gpt_sim_halves, [2.5, 97.5]))

In [None]:
# The human counterpart of this bootstrap is currently disabled:
# human_sim_halves = [
#     get_halfsplit_human_sim_corr()
#     for _ in range(5000)
# ]

# 5000 independent split-half correlations of the GPT similarity judgments.
gpt_sim_halves = [get_halfsplit_gpt_sim_corr() for _ in range(5000)]

## CC Alignment Space Analysis

In [None]:
# Euclidean distance, per tone, between the two (MDS_x, MDS_y) positions stored in
# same_tone_indices under the "_x" and "_y" column suffixes.
left_xy = same_tone_indices[["MDS_x_x", "MDS_y_x"]].to_numpy()
right_xy = same_tone_indices[["MDS_x_y", "MDS_y_y"]].to_numpy()
intergroup_distances = np.linalg.norm(left_xy - right_xy, axis=1)

# Correlation of each tone, looked up by GPT tone name in the order of same_tone_indices.
tone_correlations = cross_corr_consistencies.set_index("GPT_tone_name").loc[
    same_tone_indices["tone"], "correlation"
]
intergroup_distances_df = pd.DataFrame(
    data={
        "tone": same_tone_indices["tone"],
        "distances": intergroup_distances,
        "-corr": -tone_correlations.to_numpy(),
    }
)

In [None]:
# Treat every row of the cross-correlation matrix as that tone's embedding vector,
# stored as a single array per row in the "coordinates" column.
root_cross_corr_embeddings = pd.DataFrame(
    index=cross_corr_matrix.index,
    data=[[np.array(coord)] for coord in cross_corr_matrix.values.tolist()]
).rename(columns={0: "coordinates"})

# Index labels have the form "<source>: <tone>".
root_cross_corr_embeddings["source"] = [
    label.split(": ")[0] for label in root_cross_corr_embeddings.index
]
root_cross_corr_embeddings["tone"] = [
    label.split(": ")[1] for label in root_cross_corr_embeddings.index
]

# Put the human and GPT vectors of the same tone side by side.
root_cross_corr_embeddings = pd.merge(
    root_cross_corr_embeddings.loc[
        root_cross_corr_embeddings["source"] == "Human"
    ].drop(columns="source"),
    root_cross_corr_embeddings.loc[
        root_cross_corr_embeddings["source"] == "GPT"
    ].drop(columns="source"),
    on="tone",
    suffixes=("_human", "_gpt"),
)

# Euclidean distance between the GPT and human vectors of each tone.
coordinate_gaps = (
    root_cross_corr_embeddings["coordinates_gpt"]
    - root_cross_corr_embeddings["coordinates_human"]
)
root_cross_corr_embeddings["distances"] = np.linalg.norm(
    np.vstack(coordinate_gaps), axis=1
)

In [None]:
def get_random_cc_matrix(random_state):
    """Embed a cross-correlation matrix computed from a 90% row subsample.

    Both ``human_ratings_full_copy`` and ``gpt_ratings_full_copy`` are subsampled
    with the same ``random_state``, their columns are ordered by
    ``human_sorting_label`` / ``gpt_sorting_label``, and the correlation matrix of
    the combined columns is embedded with ``MDS(random_state=42)``.

    Args:
        random_state: Seed passed to both ``DataFrame.sample`` calls.

    Returns:
        A DataFrame with the columns ``tone``, ``x_human``, ``y_human``,
        ``x_gpt``, ``y_gpt``: one row per tone present for both sources.
    """
    resampled_corr = pd.concat(
        [
            human_ratings_full_copy.sample(frac=0.9, random_state=random_state).T.iloc[human_sorting_label],
            gpt_ratings_full_copy.sample(frac=0.9, random_state=random_state).T.iloc[gpt_sorting_label]
        ]
    ).T.corr()
    if np.isnan(resampled_corr.values).any():
        # A NaN correlation cannot be embedded; retry with a shifted seed.
        # The original retry called ``one_cc_bootstrap``, a name that is not
        # defined in the visible notebook, so it now recurses on this function.
        return get_random_cc_matrix(random_state + 5000)

    embedded = MDS(random_state=42).fit_transform(resampled_corr.values)
    # Index labels have the form "<source>: <tone>".
    split_labels = resampled_corr.index.str.split(": ")
    embedded_tones = pd.DataFrame(embedded)\
        .assign(
            source=[parts[0] for parts in split_labels],
            tone=[parts[1] for parts in split_labels]
        )\
        .rename(columns={0: "x", 1: "y"})
    paired_tones = pd.merge(
        embedded_tones.query("source=='Human'"),
        embedded_tones.query("source=='GPT'"),
        left_on="tone", right_on="tone",
        suffixes=("_human", "_gpt")
    )
    # Same selection as positions [3, 0, 1, 4, 5] of the merged frame.
    return paired_tones[["tone", "x_human", "y_human", "x_gpt", "y_gpt"]]

In [None]:
# One Series of per-tone human-to-GPT distances for each of 5000 resampled embeddings.
distance_series = []
for seed in range(5000):
    cur_random_MDS = get_random_cc_matrix(seed)
    # Column positions: 0 = tone, 1/2 = human x/y, 3/4 = GPT x/y.
    x_gap = cur_random_MDS.iloc[:, 1] - cur_random_MDS.iloc[:, 3]
    y_gap = cur_random_MDS.iloc[:, 2] - cur_random_MDS.iloc[:, 4]
    distance = (x_gap ** 2 + y_gap ** 2) ** 0.5
    distance_series.append(distance)

In [None]:
# Rows are bootstrap draws, columns are tones. The tone names are taken from the
# first column of one more embedding (seed 0).
tone_columns = get_random_cc_matrix(0).iloc[:, 0]
distances_df = pd.DataFrame(distance_series)
distances_df.columns = tone_columns

In [None]:
# Per-tone mean and 2.5th / 97.5th percentiles across the bootstrap draws,
# ordered from the largest mean distance to the smallest.
ci_bounds = np.percentile(distances_df, [2.5, 97.5], axis=0)
dist_bargraph_data = pd.DataFrame(
    data={
        "mean": distances_df.mean(axis=0),
        "lower_ci": ci_bounds[0],
        "higher_ci": ci_bounds[1],
        "tone_name": distances_df.columns,
    }
)
dist_bargraph_data = dist_bargraph_data.sort_values("mean", ascending=False)

In [None]:
# Bar chart of the mean bootstrap distance per tone with percentile error bars,
# saved as figures/4b-distances.pdf.
fig, ax = plt.subplots(figsize=(15, 3))
sns.barplot(
    data=dist_bargraph_data,
    x="tone_name",
    y="mean"
)

x_coords = [p.get_x() + 0.5*p.get_width() for p in ax.patches]
y_coords = [p.get_height() for p in ax.patches]

# Bars and table rows are paired by position, as the original zip already did.
# The interval half-widths are computed once for the whole table instead of
# re-querying the frame by tone name for every bar.
bar_means = dist_bargraph_data["mean"].to_numpy()
err_below = bar_means - dist_bargraph_data["lower_ci"].to_numpy()
err_above = dist_bargraph_data["higher_ci"].to_numpy() - bar_means
for x, y, below, above in zip(x_coords, y_coords, err_below, err_above):
    plt.errorbar(
        x, y,
        yerr=[[below], [above]],
        fmt="none",
        c="k",
        capsize=5
    )

plt.xticks(rotation=90)
plt.xlim(-0.6, 39.6)

plt.title("Conversation Tones Cross Correlation Alignment, Literal Pair Distances")
# ax.set_xticklabels(ax.get_xticklabels(), fontsize=18)
# ax.set_yticklabels(ax.get_yticklabels(), fontsize=18)
# The tone names are on the x axis (x="tone_name"); this label was previously
# attached to the y axis, which shows the distances.
plt.xlabel("Conversation Tone Name")
plt.savefig('figures/4b-distances.pdf', dpi=300, bbox_inches='tight')

In [None]:
# Bar chart of the per-tone distances in root_cross_corr_embeddings, largest first.
fig, ax = plt.subplots(figsize=(13, 3))

tone_distances_desc = root_cross_corr_embeddings[["tone", "distances"]].sort_values(
    "distances", ascending=False
)
sns.barplot(data=tone_distances_desc, x="tone", y="distances", ax=ax)

ax.set_title("Euclidean Distance of Conversation Tones in Cross-Correlational Shared Space")
ax.set_xlabel("Conversation Tone")
ax.set_ylabel("Euclidean Distances")
plt.xticks(rotation=80)

# Trailing empty string keeps the cell from echoing the last matplotlib return value.
""

In [None]:
# Horizontal bar chart of the per-tone distances in intergroup_distances_df, largest first.
fig, ax = plt.subplots(figsize=(5, 10))

intergroup_desc = intergroup_distances_df[["tone", "distances"]].sort_values(
    "distances", ascending=False
)
# NOTE(review): `orientation` is not seaborn's own keyword (that is `orient`);
# presumably it is forwarded to matplotlib - confirm.
sns.barplot(
    data=intergroup_desc,
    y="tone",
    x="distances",
    orientation="horizontal",
    ax=ax
)

ax.set_title("Euclidean Distance of Conversation Tones \n in Cross-Correlational Shared Space")
ax.set_ylabel("Conversation Tone")
plt.xlabel("Euclidean Distances")

In [None]:
# One row per tone: the human and GPT embedding vectors as 2-D arrays.
# A single vstack already yields the 2-D array; the second one was a no-op.
human_coords = np.vstack(root_cross_corr_embeddings["coordinates_human"])
gpt_coords = np.vstack(root_cross_corr_embeddings["coordinates_gpt"])
human_containing_nn = NearestNeighbors().fit(human_coords)
gpt_containing_nn = NearestNeighbors().fit(gpt_coords)


def ind_to_words(x):
    """Return the tone names stored at the given positions of root_cross_corr_embeddings."""
    return [root_cross_corr_embeddings["tone"][ind] for ind in x]


# For every tone, the kneighbors() result of its vector queried against the other
# source's vectors. reshape(1, -1) replaces the hard-coded (1, 80), so the query no
# longer depends on the embedding width.
human_to_gpt_nn_map = {
    ind: gpt_containing_nn.kneighbors(human_coords[ind].reshape(1, -1))
    for ind in range(human_coords.shape[0])
}
gpt_to_human_nn_map = {
    ind: human_containing_nn.kneighbors(gpt_coords[ind].reshape(1, -1))
    for ind in range(gpt_coords.shape[0])
}

# human_to_gpt_nn_map = {
#     root_cross_corr_embeddings["tone"][ind]: ind_to_words(gpt_containing_nn.kneighbors(human_coords[ind].reshape((1,80)))[1])
#     for ind in range(human_coords.shape[0])
# }
# gpt_to_human_nn_map = {
#     root_cross_corr_embeddings["tone"][ind]: ind_to_words(human_containing_nn.kneighbors(gpt_coords[ind].reshape((1,80)))[1])
#     for ind in range(gpt_coords.shape[0])
# }

In [None]:
# Two bipartite diagrams side by side: columns at x = 2 -> 3 link each human tone
# to its first GPT neighbour, columns at x = 3.5 -> 4.5 link each GPT tone to its
# first human neighbour.
fig, ax = plt.subplots(figsize=(15, 10))
y_mult = 1.05
y_axes = y_mult * np.arange(40)
for column_x, dot_color in [(2, "red"), (3, "blue"), (3.5, "blue"), (4.5, "red")]:
    ax.scatter(column_x * np.ones(40), y_axes, color=dot_color)

tone_labels = root_cross_corr_embeddings["tone"]

for human_tone in human_to_gpt_nn_map:
    # kneighbors() result: element [1] holds the neighbour indices.
    human_matched_target = human_to_gpt_nn_map[human_tone][1][0][0]
    ax.plot([2, 3], [human_tone * y_mult, human_matched_target * y_mult])
    # The same tone name is written at the far left and the far right of the figure.
    ax.annotate(tone_labels[human_tone], (1.9, human_tone * y_mult), ha="right", va="center")
    ax.annotate(tone_labels[human_tone], (4.6, human_tone * y_mult), ha="left", va="center")

for gpt_tone in gpt_to_human_nn_map:
    gpt_matched_target = gpt_to_human_nn_map[gpt_tone][1][0][0]
    ax.plot([3.5, 4.5], [gpt_tone * y_mult, gpt_matched_target * y_mult])
    ax.annotate(tone_labels[gpt_tone], (3.25, gpt_tone * y_mult), ha="center", va="center")

plt.xlim((1.5, 5))
ax.axis("off")

In [None]:
def get_one_connection_bootstrap(n_iter=5000, n_tones=40):
    """Bootstrap how often each tone's nearest cross-source neighbour is each tone.

    For every seed ``0 .. n_iter - 1`` a cross-correlation matrix is rebuilt from
    a 90% row subsample of the human and GPT rating tables, every row of it is
    used as an embedding vector, and for each tone the single nearest neighbour
    among the other source's vectors is recorded.

    Args:
        n_iter: Number of resampling iterations.
        n_tones: Number of tone positions to track (keys ``0 .. n_tones - 1``).
            Defaults to 40, the value that used to be hard-coded.

    Returns:
        A tuple ``(human_to_gpt, gpt_to_human)``. Each maps a tone position to a
        dict ``{neighbour position: fraction of iterations}``.
    """
    human_to_gpt_hits = {ind: [] for ind in range(n_tones)}
    gpt_to_human_hits = {ind: [] for ind in range(n_tones)}
    for s in range(n_iter):
        test_cross_corr_matrix = pd.concat(
            [
                human_ratings_full_copy.sample(frac=0.9, random_state=s).T.iloc[human_sorting_label],
                gpt_ratings_full_copy.sample(frac=0.9, random_state=s).T.iloc[gpt_sorting_label]
            ]
        ).T.corr()

        # One array per row in the "coordinates" column; labels are "<source>: <tone>".
        embeddings = pd.DataFrame(
            index=test_cross_corr_matrix.index,
            data=[[np.array(coord)] for coord in test_cross_corr_matrix.values.tolist()]
        ).rename(columns={0: "coordinates"})
        embeddings["source"] = [label.split(": ")[0] for label in embeddings.index]
        embeddings["tone"] = [label.split(": ")[1] for label in embeddings.index]

        # Align the human and GPT vectors of the same tone on one row.
        paired = pd.merge(
            embeddings.query("source == 'Human'").drop(columns="source"),
            embeddings.query("source == 'GPT'").drop(columns="source"),
            left_on="tone",
            right_on="tone",
            suffixes=("_human", "_gpt")
        )
        # The per-tone distance column computed here previously was never read,
        # so it is no longer built.

        human_coords = np.vstack(paired["coordinates_human"])
        gpt_coords = np.vstack(paired["coordinates_gpt"])
        human_containing_nn = NearestNeighbors(n_neighbors=1).fit(human_coords)
        gpt_containing_nn = NearestNeighbors(n_neighbors=1).fit(gpt_coords)

        for ind in range(gpt_coords.shape[0]):
            # reshape(1, -1) instead of a hard-coded (1, 80) row shape.
            human_to_gpt_hits[ind].append(
                gpt_containing_nn.kneighbors(human_coords[ind].reshape(1, -1))[1][0][0]
            )
            gpt_to_human_hits[ind].append(
                human_containing_nn.kneighbors(gpt_coords[ind].reshape(1, -1))[1][0][0]
            )

    def _to_frequencies(hits):
        # Share of iterations in which each neighbour position was selected.
        return (pd.Series(hits).value_counts() / n_iter).to_dict()

    human_to_gpt_nn_map = {key: _to_frequencies(hits) for key, hits in human_to_gpt_hits.items()}
    gpt_to_human_nn_map = {key: _to_frequencies(hits) for key, hits in gpt_to_human_hits.items()}
    return human_to_gpt_nn_map, gpt_to_human_nn_map

In [None]:
# Run the nearest-neighbour bootstrap with 100 resamples. The result is the
# (human -> GPT, GPT -> human) pair of frequency maps returned by the function.
connection_data = get_one_connection_bootstrap(n_iter=100)

In [None]:
# Same bipartite layout as the single-run diagram, but every neighbour that occurred
# in the bootstrap is drawn, with line opacity equal to its frequency. Saved as
# ./figures/4c.pdf.
fig, ax = plt.subplots(figsize=(15, 10))
y_mult = 1.05
y_axes = y_mult * np.arange(40)
for column_x, dot_color in [(2, "red"), (3, "blue"), (3.5, "blue"), (4.5, "red")]:
    ax.scatter(column_x * np.ones(40), y_axes, color=dot_color)

tone_labels = root_cross_corr_embeddings["tone"]

for human_tone in human_to_gpt_nn_map:
    edge_set = connection_data[0][human_tone]
    for human_matched_target, edge_weight in edge_set.items():
        ax.plot(
            [2, 3],
            [human_tone * y_mult, human_matched_target * y_mult],
            color="black",
            alpha=edge_weight,
        )
    # The same tone name is written at the far left and the far right of the figure.
    ax.annotate(tone_labels[human_tone], (1.9, human_tone * y_mult), ha="right", va="center")
    ax.annotate(tone_labels[human_tone], (4.6, human_tone * y_mult), ha="left", va="center")

for gpt_tone in gpt_to_human_nn_map:
    edge_set = connection_data[1][gpt_tone]
    for gpt_matched_target, edge_weight in edge_set.items():
        ax.plot(
            [3.5, 4.5],
            [gpt_tone * y_mult, gpt_matched_target * y_mult],
            color="black",
            alpha=edge_weight,
        )
    ax.annotate(tone_labels[gpt_tone], (3.25, gpt_tone * y_mult), ha="center", va="center")

plt.xlim((1.5, 5))
ax.axis("off")
plt.savefig("./figures/4c.pdf", dpi=300)

## Alignment Benchmarking

In [None]:
import alignment
import external_ot
# importlib.reload(external_ot)

def simple_ot(human_ratings_full, gpt_ratings_full):
    """Align the GPT tone embedding to the human one via a Gromov-Wasserstein coupling.

    Pairwise cosine distances between the columns of each rating table are
    coupled with entropic Gromov-Wasserstein. Both tables are embedded with
    ``MDS()`` and the GPT embedding is rotated by the orthogonal matrix obtained
    from the SVD of the coupling-weighted cross-product of the two embeddings.

    Args:
        human_ratings_full: Human rating table; its columns are embedded.
        gpt_ratings_full: GPT rating table; its columns are embedded.

    Returns:
        ``(human_embedding, rotated_gpt_embedding, 0)``. The trailing ``0`` is a
        constant third element.
    """
    METRIC = "cosine"
    human_tone_vectors = human_ratings_full.to_numpy().T
    gpt_tone_vectors = gpt_ratings_full.to_numpy().T
    # Uniform weights over however many columns each table has (previously fixed at 40).
    p = ot.unif(human_tone_vectors.shape[0])
    q = ot.unif(gpt_tone_vectors.shape[0])
    # wp_human_ratings_full = get_resorted_table(human_ratings_full, same_tone_indices["human_index"])
    # wp_gpt_ratings_full = get_resorted_table(gpt_ratings_full, same_tone_indices["gpt_index"])
    wp_distances_human = cdist(human_tone_vectors, human_tone_vectors, metric=METRIC)
    wp_distances_GPT = cdist(gpt_tone_vectors, gpt_tone_vectors, metric=METRIC)

    # With log=True the solver returns (coupling, log); the log is not used.
    gw, _ = ot.gromov.entropic_gromov_wasserstein(
        wp_distances_human, wp_distances_GPT, p, q, 'square_loss', epsilon=0.001, log=True
    )
    ot_human_MDS = MDS().fit_transform(human_ratings_full.T)
    ot_gpt_MDS = MDS().fit_transform(gpt_ratings_full.T)

    # human^T @ (gw @ gpt), written in the original transposed form.
    BTA = ot_human_MDS.T @ (ot_gpt_MDS.T @ gw.T).T
    svd_UVT = np.linalg.svd(BTA)
    Q = svd_UVT[0] @ svd_UVT[2]
    aligned_gpt_MDS = ot_gpt_MDS @ Q.T
    return ot_human_MDS, aligned_gpt_MDS, 0

def simple_procrustes(human_ratings_full, gpt_ratings_full):
    """Rotate the GPT tone embedding onto the human one with an SVD-derived rotation.

    Both rating tables are embedded column-wise with ``MDS()``; the rotation is
    ``U @ Vt`` from the SVD of ``human_embedding.T @ gpt_embedding``.

    Returns:
        ``(human_embedding, rotated_gpt_embedding, 0)``. The trailing ``0`` is a
        constant third element.
    """
    human_embedding = MDS().fit_transform(human_ratings_full.T)
    gpt_embedding = MDS().fit_transform(gpt_ratings_full.T)
    left_vectors, _, right_vectors_t = np.linalg.svd(human_embedding.T @ gpt_embedding)
    rotation = left_vectors @ right_vectors_t
    return human_embedding, gpt_embedding @ rotation.T, 0

In [None]:
def get_nn_devices(embeddings_mat):
    """Fit nearest-neighbour searchers on the two halves of an embedding matrix.

    Args:
        embeddings_mat: Array whose first 40 rows are indexed under
            ``"human_contained"`` and whose remaining rows are indexed under
            ``"gpt_contained"``.

    Returns:
        ``{"human_contained": {k: searcher}, "gpt_contained": {k: searcher}}``
        with one fitted ``NearestNeighbors(n_neighbors=k)`` for each k in 1..5.
    """
    halves = {
        "human_contained": embeddings_mat[:40],
        "gpt_contained": embeddings_mat[40:],
    }
    return {
        side: {
            k: NearestNeighbors(n_neighbors=k).fit(points)
            for k in range(1, 6)
        }
        for side, points in halves.items()
    }

# Fit the k = 1..5 searchers on the rows of the cross-correlation matrix
# (first 40 rows vs. the remaining rows, as split inside get_nn_devices).
cross_corr_nn_devices = get_nn_devices(cross_corr_matrix.values)

In [None]:
# cross_corr_nn_cache[source][tone][k] -> names of the k nearest tones on the
# *other* source's side of the cross-correlation space.
cross_corr_nn_cache = {}

# Tone names of each half, in the row order of that half. Neighbours found in the
# GPT half were previously named with the human ordering (human_sorting_label).
# NOTE(review): assumes the GPT rows of cross_corr_matrix are ordered by
# gpt_sorting_label, as in get_random_cc_matrix - confirm. When both orderings
# coincide the result is unchanged.
neighbor_tone_names = {
    "human_contained": human_ratings_full.columns[human_sorting_label],
    "gpt_contained": gpt_ratings_full.columns[gpt_sorting_label],
}

for tone in cross_corr_matrix.index:
    tone_source, tone_name = tone.split(": ")
    # A human tone is looked up among the GPT rows and vice versa.
    ind_to_investigate = "gpt_contained" if tone_source == "Human" else "human_contained"
    if tone_source not in cross_corr_nn_cache:
        cross_corr_nn_cache[tone_source] = {}
    cross_corr_nn_cache[tone_source][tone_name] = {}
    for k in range(1, 6):
        tone_kneighbors = cross_corr_nn_devices[ind_to_investigate][k].kneighbors(
            cross_corr_matrix.loc[tone].values.reshape(1, -1)
        )
        cross_corr_nn_cache[tone_source][tone_name][k] = \
            neighbor_tone_names[ind_to_investigate][tone_kneighbors[1][0]]

In [None]:
def get_ot_nn_info(human_emb_source, gpt_emb_source, alignment_func, seed=0):
    """Align the two embeddings and cache each tone's cross-source nearest neighbours.

    Args:
        human_emb_source: Human rating table passed to ``alignment_func``; its
            columns name the human tones.
        gpt_emb_source: GPT rating table passed to ``alignment_func``; its
            columns name the GPT tones.
        alignment_func: Callable returning ``(human_embedding, gpt_embedding, _)``.
        seed: Value given to ``np.random.seed`` before aligning.

    Returns:
        A tuple ``(nn_cache, embedding_info)``. ``nn_cache[source][tone][k]``
        holds the names of the k nearest tones (k = 1..5) on the other source's
        side. ``embedding_info`` carries both embeddings and both tone-name lists.
    """
    np.random.seed(seed)
    human_MDS, gpt_MDS, _ = alignment_func(human_emb_source, gpt_emb_source)
    stoc_ot_senemb_nns = get_nn_devices(np.concatenate([human_MDS, gpt_MDS]))
    human_tones = human_emb_source.columns
    gpt_tones = gpt_emb_source.columns

    stoc_ot_senemb_nn_cache = {"Human": {}, "GPT": {}}

    # Human tones: query with the human point, search among the GPT points, and
    # name the hits with the GPT tone names.
    for tone_id, tone_name in enumerate(human_tones):
        stoc_ot_senemb_nn_cache["Human"][tone_name] = {}
        for k in range(1, 6):
            tone_kneighbors = stoc_ot_senemb_nns["gpt_contained"][k].kneighbors(
                human_MDS[tone_id].reshape(1, -1)
            )
            stoc_ot_senemb_nn_cache["Human"][tone_name][k] = gpt_tones[tone_kneighbors[1][0]]

    # GPT tones: query with the GPT point. This branch previously queried with
    # human_MDS[tone_id], i.e. it searched the human points with a human point.
    for tone_id, tone_name in enumerate(gpt_tones):
        stoc_ot_senemb_nn_cache["GPT"][tone_name] = {}
        for k in range(1, 6):
            tone_kneighbors = stoc_ot_senemb_nns["human_contained"][k].kneighbors(
                gpt_MDS[tone_id].reshape(1, -1)
            )
            stoc_ot_senemb_nn_cache["GPT"][tone_name][k] = human_tones[tone_kneighbors[1][0]]

    return stoc_ot_senemb_nn_cache, {
        "human_MDS": human_MDS,
        "gpt_MDS": gpt_MDS,
        "human_tones": human_emb_source.columns.to_list(),
        "gpt_tones": gpt_emb_source.columns.to_list()
    }

In [None]:
def lex_inc_nn_subcache(source_ratings, target_ratings, seed=0):
    """Collect the k = 1..5 induced entries per tone from ``alignment.induce_one_side``.

    Args:
        source_ratings: Forwarded as ``source_ratings``.
        target_ratings: Forwarded as ``target_ratings``.
        seed: Value given to ``np.random.seed`` before inducing.

    Returns:
        ``{tone_name: {k: entries}}`` keyed by the columns of the global
        ``human_ratings_full``. Slots that receive no entry stay an empty list.
    """
    np.random.seed(seed)
    # Arguments shared by all five induction calls; only n_induced_entries varies.
    induction_kwargs = dict(
        source_ratings=source_ratings,
        target_ratings=target_ratings,
        csls_neighborhood=5,
        translation_csls_neighborhood=5,
        direction="backward",
    )
    curr_subcache = {
        tone_name: {k: [] for k in range(1, 6)}
        for tone_name in human_ratings_full.columns
    }

    # k = 1: the result is read through its parallel "from" / "to" entries.
    single_entry = alignment.induce_one_side(n_induced_entries=1, **induction_kwargs)
    for from_tone, to_tone in zip(single_entry["from"], single_entry["to"]):
        curr_subcache[from_tone][1].append(to_tone)

    # k >= 2: the result is read as a mapping from tone to its induced entries.
    for k in range(2, 6):
        multi_entry = alignment.induce_one_side(n_induced_entries=k, **induction_kwargs)
        for from_tone, to_tone in multi_entry.items():
            curr_subcache[from_tone][k] = to_tone
    return curr_subcache

def get_knn_matchings(cache_1, cache_2):
    """Count, per k, how many neighbours in ``cache_1`` also appear in ``cache_2``.

    Both caches are nested as ``cache[source][tone][k] -> neighbour names``.
    Every ``(source, tone, k)`` entry of ``cache_1`` must also exist in ``cache_2``.

    Args:
        cache_1: Reference cache; its structure drives the iteration.
        cache_2: Cache compared against the reference.

    Returns:
        A tuple ``(matches, total_investigations)`` of dicts keyed by k.
        ``matches[k]`` is the number of ``cache_1`` neighbours found in the
        corresponding ``cache_2`` entry. ``total_investigations[k]`` grows by k
        for every tone inspected.
    """
    matches = {}
    total_investigations = {}
    for tone_source, tones in cache_1.items():
        for tone, neighbors_by_k in tones.items():
            for neighbor_k, found in neighbors_by_k.items():
                expected = cache_2[tone_source][tone][neighbor_k]
                total_investigations[neighbor_k] = (
                    total_investigations.get(neighbor_k, 0) + neighbor_k
                )
                # np.isin replaces the deprecated np.in1d.
                matches[neighbor_k] = (
                    matches.get(neighbor_k, 0) + np.isin(found, expected).sum()
                )
    return matches, total_investigations

### Gathering Alignment Seeds

In [None]:
def get_matching_data_on_NN(seed):
    """Compare three alignment methods against the cross-correlation neighbour cache.

    Args:
        seed: Seed forwarded to every alignment / induction call.

    Returns:
        A tuple ``(match_rates_df, embeddings)``. ``match_rates_df`` has one
        column per method and one row per k = 1..5, holding matches divided by
        total investigations. ``embeddings`` holds the embedding info of the two
        embedding-based methods.
    """
    stoc_ot_senemb_nn_cache, stoc_ot_senemb_mds = get_ot_nn_info(
        human_ratings_full, gpt_ratings_full, external_ot.master_ot, seed
    )
    simple_proc_senemb_nn_cache, simple_proc_senemb_mds = get_ot_nn_info(
        human_ratings_full, gpt_ratings_full, simple_procrustes, seed
    )
    lexicon_induction_sememb_nn_cache = {
        "Human": lex_inc_nn_subcache(human_ratings_full, gpt_ratings_full, seed),
        "GPT": lex_inc_nn_subcache(gpt_ratings_full, human_ratings_full, seed),
    }
    nn_caches = {
        "stoc_ot_senemb_nn_cache": stoc_ot_senemb_nn_cache,
        "simple_proc_senemb_nn_cache": simple_proc_senemb_nn_cache,
        "lexicon_induction_sememb_nn_cache": lexicon_induction_sememb_nn_cache,
    }

    match_rates = {}
    for cache_key, cache_dict in nn_caches.items():
        matched, investigated = get_knn_matchings(cross_corr_nn_cache, cache_dict)
        match_rates[cache_key] = [matched[k] / investigated[k] for k in range(1, 6)]

    embeddings = {
        "stoc_ot_senemb_mds": stoc_ot_senemb_mds,
        "simple_proc_senemb_mds": simple_proc_senemb_mds,
    }
    return pd.DataFrame(data=match_rates), embeddings

In [None]:
# For 100 seeds, run every alignment method and write each method's human + GPT
# embeddings to one CSV, with rows labelled "human: <tone>" / "gpt: <tone>".
for seed in range(100):
    current_seed_matchings, current_seed_data = get_matching_data_on_NN(seed)
    for source_name, aligned in current_seed_data.items():
        human_frame = pd.DataFrame(
            index=[f"human: {x}" for x in aligned["human_tones"]],
            data=aligned["human_MDS"],
        )
        gpt_frame = pd.DataFrame(
            index=[f"gpt: {x}" for x in aligned["gpt_tones"]],
            data=aligned["gpt_MDS"],
        )
        pd.concat([human_frame, gpt_frame]).to_csv(
            f"../data/alignment_benchmarking/normal/seed{seed}_{source_name}.csv"
        )
    # Writing the matching rates is currently disabled:
    # current_seed_matchings.to_csv(f"../data/alignment_benchmarking/normal/seed{seed}_macthing_rates.csv")
    del current_seed_matchings, current_seed_data

In [None]:
# Method names used to build the per-seed CSV paths read in the cells below.
# NOTE(review): the export loop in this notebook only writes CSVs for the first two
# names; the file for "lexicon_induction_sememb_nn_cache" presumably comes from
# elsewhere - confirm it exists before running the distance-correlation cell.
methods_of_align = ["stoc_ot_senemb_mds", "simple_proc_senemb_mds", "lexicon_induction_sememb_nn_cache"]

### Gather kNN matching rate

In [None]:
# One matching-rate table per seed, with the unnamed CSV index column dropped.
mix_df = [
    pd.read_csv(
        f"../data/alignment_benchmarking/normal/seed{s}_macthing_rates.csv"
    ).drop(columns="Unnamed: 0")
    for s in range(100)
]

### Gather correlation of dissimilarity and distance matrices

In [None]:
# Correlate the similarity judgments with negated pairwise distances between the
# embeddings returned by alignment.induce_one_side, until 5000 runs have succeeded.
target_human_sim = human_on_tone_sim[np.triu_indices(40, 1)]
target_gpt_sim = gpt_on_tone_sim[np.triu_indices(40, 1)]
human_data_bli = []
gpt_data_bli = []
# Tone order of the first 40 index entries ("<source>: <tone>").
tone_order = [s.split(": ")[1] for s in cross_corr_matrix.index[:40]]
# Tone order of the remaining index entries, used for the GPT distances so that
# they are ordered like the "gpt: ..." rows in the distance-correlation cell below.
# NOTE(review): assumes gpt_on_tone_sim follows this order - confirm. When both
# halves list the tones in the same order the result is unchanged.
gpt_tone_order = [s.split(": ")[1] for s in cross_corr_matrix.index[40:]]
upper_triangle = np.triu_indices(40, 1)

# NOTE(review): the loop has no attempt cap; it never ends if every run raises KeyError.
while len(gpt_data_bli) < 5000:
    try:
        src_embeddings, tgt_embeddings = alignment.induce_one_side(
            source_ratings=human_ratings_full,
            target_ratings=gpt_ratings_full,
            csls_neighborhood=5,
            translation_csls_neighborhood=5,
            direction="backward",
            return_embeddings=True
        ).values()
        human_points = pd.DataFrame(src_embeddings)[tone_order].T
        gpt_points = pd.DataFrame(tgt_embeddings)[gpt_tone_order].T
    except KeyError:
        # Discard the whole run. Both lookups happen before anything is appended,
        # so the two result lists always stay the same length.
        continue

    human_dist = cdist(human_points, human_points)
    gpt_dist = cdist(gpt_points, gpt_points)
    human_data_bli.append(
        np.corrcoef(target_human_sim, -human_dist[upper_triangle])[0, 1]
    )
    gpt_data_bli.append(
        np.corrcoef(target_gpt_sim, -gpt_dist[upper_triangle])[0, 1]
    )

In [None]:
# For every seed and method, correlate each source's similarity judgments with the
# negated pairwise distances between that source's saved 2-D embedding points.
distance_arr = {method_name: {"human": [], "gpt": []} for method_name in methods_of_align}
target_ind_order = cross_corr_matrix.index.str.lower()
upper_triangle = np.triu_indices(40, 1)
similarity_targets = {"human": target_human_sim, "gpt": target_gpt_sim}

for s in range(100):
    for method_name in methods_of_align:
        testing_data = pd.read_csv(f"../data/alignment_benchmarking/normal/seed{s}_{method_name}.csv")
        # Row labels have the form "<source>: <tone>".
        split_info = testing_data["Unnamed: 0"].str.split(": ")
        testing_data = testing_data.assign(
            tone_split_info=split_info,
            tone_source=split_info.str[0],
            tone_name=split_info.str[1],
        )
        # Reorder the rows to the lower-cased order of cross_corr_matrix's index.
        testing_data = testing_data.set_index("Unnamed: 0").loc[target_ind_order]

        for tone_source, similarity_target in similarity_targets.items():
            source_points = testing_data.loc[
                testing_data["tone_source"] == tone_source, ["0", "1"]
            ]
            source_dist = cdist(source_points, source_points)
            distance_arr[method_name][tone_source].append(
                np.corrcoef(similarity_target, -source_dist[upper_triangle])[0, 1]
            )
    

In [None]:
# Print, per method, the mean and the 2.5th / 97.5th percentiles of the human and
# GPT correlations collected in distance_arr.
for method_name in methods_of_align:
    curr_df = pd.DataFrame(data=distance_arr[method_name])
    pad = " " * 8  # every printed line is indented by eight spaces
    report_lines = ["", f"{pad}{method_name}", pad]
    for heading, column in (("HUMAN:", "human"), ("GPT:", "gpt")):
        values = curr_df[column]
        report_lines.extend(
            [
                f"{pad}{heading}",
                f"{pad}mean: {values.mean()}",
                f"{pad}low_ci: {np.percentile(values, 2.5)}",
                f"{pad}high_ci: {np.percentile(values, 97.5)}",
                pad,
            ]
        )
    print("\n".join(report_lines))