# SRE Results Section

## Imports

In [None]:
import numpy as np
import os
import json
import pandas as pd
import gensim.downloader
import random
from sklearn.manifold import MDS
from sklearn.neighbors import KernelDensity

from nltk.stem.porter import PorterStemmer
from scipy.spatial.distance import cosine, jensenshannon, euclidean
from scipy.stats import entropy, gaussian_kde

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as patches
import matplotlib.patches as mpatches
import matplotlib.patheffects as patheffects
import seaborn as sns

import matplotlib

# Font type 42 makes the PDF/PS backends embed TrueType fonts (instead of the
# default Type 3), so text in exported figures stays editable text.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

## Data Gathering

### Human Data Organization

In [None]:
# Raw trial and node exports for the three human batches ("fifty", "thirty",
# "twenty"); each batch ships one SRETrial.csv and one SRENode.csv.
human_trial_fifty_df, human_node_fifty_df = (
    pd.read_csv("../data/human_en_fifty/data/SRETrial.csv"),
    pd.read_csv("../data/human_en_fifty/data/SRENode.csv"),
)
human_trial_thirty_df, human_node_thirty_df = (
    pd.read_csv("../data/human_en_thirty/data/SRETrial.csv"),
    pd.read_csv("../data/human_en_thirty/data/SRENode.csv"),
)
human_trial_twenty_df, human_node_twenty_df = (
    pd.read_csv("../data/human_en_twenty/data/SRETrial.csv"),
    pd.read_csv("../data/human_en_twenty/data/SRENode.csv"),
)

In [None]:
def get_compact_trial_df(human_trial_df, human_node_df):
    """Reduce a raw trial export to the columns used by the analysis.

    Keeps trials whose ``failed`` flag is False, unpacks the JSON stored in
    ``previous_sample`` into ``provided_prompt`` (its ``obtained_response``)
    and ``node_mode`` (its ``current_mode``), drops rows with any missing
    value, and attaches the ``degree`` of each trial's origin node.

    Returns a frame with columns ``origin_id, network_id, node_mode,
    provided_prompt, obtained_response, time_taken, degree`` restricted to
    ``network_id > 2``. The merge is column-on-column, so the ``trial_id``
    index set along the way is not carried into the result.
    """
    raw_columns = ["id", "origin_id", "network_id", "previous_sample", "obtained_response", "time_taken"]
    trials = (
        human_trial_df.query("failed==False")[raw_columns]
        .rename(columns={"id": "trial_id"})
        .set_index("trial_id")
    )

    # previous_sample is a JSON string describing the node the trial started from.
    parsed_samples = trials["previous_sample"].map(json.loads)
    trials["provided_prompt"] = parsed_samples.map(lambda sample: sample["obtained_response"])
    trials["node_mode"] = parsed_samples.map(lambda sample: sample["current_mode"])

    final_order = ["origin_id", "network_id", "node_mode", "provided_prompt", "obtained_response", "time_taken"]
    trials = trials[final_order].dropna()

    node_degrees = human_node_df[["id", "degree"]]
    with_degree = trials.merge(node_degrees, left_on="origin_id", right_on="id").drop(columns="id")
    return with_degree.query("network_id > 2")

In [None]:
def get_map_between_old_new(old_compact, new_comapct, degree_incr):
    """Map network ids of a follow-up batch onto the ids of the batch it continues.

    A degree-0 row of ``new_comapct`` is matched to a row of ``old_compact``
    at degree ``degree_incr`` when the new row's ``provided_prompt`` equals the
    old row's ``obtained_response``. Matches are de-duplicated on
    (old network id, response text).

    Returns a dict ``{new network_id: old network_id}``.
    """
    new_chain_starts = new_comapct.query("degree==0")[["network_id", "provided_prompt"]]
    old_chain_ends = old_compact.query(f"degree=={degree_incr}")[["network_id", "obtained_response"]]

    # After the merge, network_id_x comes from the new batch and network_id_y from the old one.
    matched = new_chain_starts.merge(
        old_chain_ends,
        left_on="provided_prompt",
        right_on="obtained_response",
    )
    matched = matched.drop_duplicates(subset=["network_id_y", "obtained_response"])

    id_pairs = matched[["network_id_x", "network_id_y"]].values
    return {new_id: old_id for new_id, old_id in id_pairs}

In [None]:
# Stitch the three human batches into one table. The "thirty" batch is matched
# to the "fifty" batch at degree 49 and shifted by 50; the "twenty" batch is
# matched to the combined table at degree 79 and shifted by 80.
human_fifty_compact = get_compact_trial_df(human_trial_fifty_df, human_node_fifty_df)
human_thirty_compact = get_compact_trial_df(human_trial_thirty_df, human_node_thirty_df)
old_new_network_id_map = get_map_between_old_new(human_fifty_compact, human_thirty_compact, 49)
# Rewrite the follow-up batch's network ids to the ids of the chains they continue.
human_thirty_compact["network_id"] = human_thirty_compact["network_id"].replace(to_replace=old_new_network_id_map)
human_thirty_compact["degree"] = human_thirty_compact["degree"] + 50
human_eighty_compact = pd.concat([human_fifty_compact, human_thirty_compact])
human_twenty_compact = get_compact_trial_df(human_trial_twenty_df, human_node_twenty_df)
old_new_network_id_map = get_map_between_old_new(human_eighty_compact, human_twenty_compact, 79)
human_twenty_compact["network_id"] = human_twenty_compact["network_id"].replace(to_replace=old_new_network_id_map)
human_twenty_compact["degree"] = human_twenty_compact["degree"] + 80
human_trial_df_compact = pd.concat([human_eighty_compact, human_twenty_compact])
# NOTE(review): get_compact_trial_df already returns only network_id > 2, so
# human_practice_trials looks like it is always empty here - confirm intent.
human_practice_trials, human_chain_trials = human_trial_df_compact.query("network_id <= 2"), human_trial_df_compact.query("network_id > 2")
# Tones come from two places: the response of every 'c' trial, and the prompt
# shown to 's' trials at degree 0.
human_chain_trials_only_tones_c = human_chain_trials.query("node_mode=='c'").drop(columns="provided_prompt").rename(columns={"obtained_response": "withholding_tone"})
human_chain_trials_only_tones_s = human_chain_trials.query("node_mode=='s' and degree==0").drop(columns="obtained_response").rename(columns={"provided_prompt": "withholding_tone"})
# Under this interpretation, the tones attributed to an iteration are only the
# tones sampled at that iteration.
human_chain_trials_only_tones = pd.concat([human_chain_trials_only_tones_c, human_chain_trials_only_tones_s])
human_chain_trials_only_tones = human_chain_trials_only_tones.sort_values(["network_id", "degree"])
human_chain_trials_only_tones["withholding_tone"] = human_chain_trials_only_tones["withholding_tone"].str.lower()
# NOTE(review): value_counts().index lists every non-null tone, so the isin
# filter below keeps every row that has a tone; it presumably is a leftover
# hook for a frequency threshold - confirm.
human_wanted_words = human_chain_trials_only_tones["withholding_tone"].value_counts().index
human_chain_trials_only_tones = human_chain_trials_only_tones[human_chain_trials_only_tones["withholding_tone"].isin(human_wanted_words)]

In [None]:
# Tone (rows) x degree (columns) table of how many trials produced each tone;
# combinations that never occur are filled with 0.
human_tone_occurrence_table_whole = human_chain_trials_only_tones.pivot_table(
    values="origin_id",
    index="withholding_tone",
    columns="degree",
    aggfunc="count",
).fillna(0)
human_tone_occurrence_table_whole

### GPT Data Organization

In [None]:
# GPT chain export; the file is pipe-separated.
gpt_df = pd.read_csv("../data/totalGPTData.csv", sep="|", engine='python')
# Tone nodes only, with responses lower-cased. The lower-casing is done with
# .assign on the filtered frame so that no column is written onto a slice of
# gpt_df (which triggers pandas' SettingWithCopy warning) and gpt_df itself
# is left untouched.
gpt_tones_df = gpt_df.query("node_mode == 'c'").assign(
    node_response=lambda tones: tones["node_response"].str.lower()
)

In [None]:
# Tone (rows) x node_order (columns) occurrence counts for GPT; the former row
# index is exposed as the "index" column purely so there is something to count.
# Only the first 100 node_order columns are kept.
gpt_tone_occurrence_table_whole = (
    gpt_tones_df.reset_index()
    .pivot_table(
        values="index",
        index="node_response",
        columns="node_order",
        aggfunc="count",
    )
    .fillna(0)
    .iloc[:, :100]
)
gpt_tone_occurrence_table_whole

### Deciding Tones to work with

In [None]:
def get_true_count(tone_array, all_40_tones):
    """Return the share of each tone in ``all_40_tones``.

    Parameters
    ----------
    tone_array : pd.Series
        Counts (or weights) indexed by tone label.
    all_40_tones : list-like or pd.Index
        Tone labels to report, in the order they should appear.

    Returns
    -------
    pd.Series
        Named ``"count"``, indexed by ``all_40_tones``; each value is the
        tone's weight divided by the total of ``tone_array``, and 0 for tones
        absent from ``tone_array``.

    The previous implementation merged against a dummy frame and selected the
    ``"count"`` column, which raised ``KeyError`` whenever the input Series
    carried any other name. Reindexing gives the same values and ordering
    without depending on the input's name.
    """
    tone_shares = tone_array / np.sum(tone_array)
    # Wrapping in pd.Index keeps the result index from inheriting the name of
    # tone_array's index when a plain list is passed.
    return tone_shares.reindex(pd.Index(all_40_tones)).fillna(0).rename("count")

In [None]:
# Overall tone shares for each source, truncated to the 24 most frequent tones.
human_top_24_tones = human_chain_trials_only_tones["withholding_tone"].value_counts()
gpt_top_24_tones = gpt_tones_df["node_response"].value_counts()
human_top_24_tones = (human_top_24_tones / np.sum(human_top_24_tones)).iloc[:24]
gpt_top_24_tones = (gpt_top_24_tones / np.sum(gpt_top_24_tones)).iloc[:24]
# Union of both top-24 lists. sorted() gives the index a stable order; a bare
# set of strings can iterate differently from one kernel session to the next.
all_40_tones = pd.Index(sorted(set(gpt_top_24_tones.index).union(human_top_24_tones.index)))

human_total_td_data = []
gpt_total_td_data = []

# Bootstrap: resample each data set with replacement and recompute the tone
# shares. A dedicated seeded generator makes the intervals reproducible under
# Restart & Run All without touching NumPy's global random state.
bootstrap_rng = np.random.RandomState(42)
for _ in range(250):
    human_tone_dist_data = human_chain_trials_only_tones\
        .sample(frac=1, replace=True, random_state=bootstrap_rng)["withholding_tone"]\
        .value_counts()
    gpt_tone_dist_data = gpt_tones_df\
        .sample(frac=1, replace=True, random_state=bootstrap_rng)["node_response"]\
        .value_counts()
    human_tone_dist_data = get_true_count(
        human_tone_dist_data / human_tone_dist_data.sum(),
        all_40_tones
    )
    gpt_tone_dist_data = get_true_count(
        gpt_tone_dist_data / gpt_tone_dist_data.sum(),
        all_40_tones
    )

    human_total_td_data.append(human_tone_dist_data)
    gpt_total_td_data.append(gpt_tone_dist_data)

# Stack the observed shares with every bootstrap draw: one row per (tone, draw).
human_top_24_tones = pd.concat([human_top_24_tones] + human_total_td_data)
gpt_top_24_tones = pd.concat([gpt_top_24_tones] + gpt_total_td_data)

human_top_24_tones = pd.DataFrame(human_top_24_tones).reset_index()
human_top_24_tones.index = human_top_24_tones["index"].values
gpt_top_24_tones = pd.DataFrame(gpt_top_24_tones).reset_index()
gpt_top_24_tones.index = gpt_top_24_tones["index"].values

# Both frames repeat each tone label once per draw, so this index-on-index
# merge pairs every human draw of a tone with every GPT draw of the same tone.
top_tones_merged = human_top_24_tones.merge(
    gpt_top_24_tones,
    how="outer",
    left_index=True,
    right_index=True,
    suffixes=("_human", "_gpt")
).fillna(0)
top_tones_merged_for_ordering = top_tones_merged\
    .assign(total_popularity=top_tones_merged.loc[:,"count_human"] + top_tones_merged.loc[:,"count_gpt"])\
    .sort_values("total_popularity", ascending=False)

In [None]:
# Pearson correlation between the human and GPT tone shares, one value per
# bootstrap draw; both vectors are sorted by tone label so entries line up.
histogram_corrs = [
    np.corrcoef(human_draw.sort_index(), gpt_draw.sort_index())[0, 1]
    for human_draw, gpt_draw in zip(human_total_td_data, gpt_total_td_data)
]

In [None]:
# One column of shares per source, keyed by tone label.
organized_top_tones_merged = top_tones_merged[["index_human", "count_human", "count_gpt"]].set_axis(
    ["index", "Human Conversation Tones", "GPT Conversation Tones"], axis=1
)
# Long format -> per (tone, source): mean share and a 95% percentile interval.
# Named aggregation labels the output columns directly (no positional rename
# afterwards), and "mean" is passed as a string because handing np.mean to
# groupby agg is deprecated in recent pandas.
tone_overlay = (
    organized_top_tones_merged.melt("index")
    .groupby(["index", "variable"])["value"]
    .agg(
        mean="mean",
        lower_ci=lambda draws: np.percentile(draws, 2.5),
        higher_ci=lambda draws: np.percentile(draws, 97.5),
    )
    .reset_index()
    .sort_values("mean", ascending=False)
)

In [None]:
# Tone labels ordered by their mean share averaged over both sources
# (ascending); used as the bar order in the plots below. The groupby mean is
# called directly because passing np.mean to agg is deprecated in recent pandas.
histogram_order = tone_overlay.groupby("index")["mean"].mean().sort_values().index

### Plotting for Distribution of Conversation Tones

In [None]:
# Horizontal bar chart of the human tone shares with bootstrap 95% intervals.
fig, ax = plt.subplots(figsize=(8, 30))
target_df = tone_overlay.query("variable=='Human Conversation Tones'")
ax = sns.barplot(
    data=target_df,
    y="index",
    x="mean",
    order=histogram_order,
    color="orange"
)
plt.xticks(rotation=90)
ax.tick_params(axis='both', which='major', labelsize=16)
# The bars are horizontal: frequency runs along x and tone names along y
# (the two labels were previously swapped).
plt.xlabel("Frequency of Tones", fontsize=20)
plt.ylabel("Conversation Tones", fontsize=20)

# Bars are assumed to be drawn in `histogram_order`, so patch i belongs to tone i.
x_coords = [p.get_width() for p in ax.patches]
y_coords = [p.get_y() + 0.5 * p.get_height() for p in ax.patches]
for x, y, tone_name in zip(x_coords, y_coords, histogram_order):
    # Boolean mask instead of a string-built query, so tone labels containing
    # quotes cannot break the lookup.
    target_row = target_df[target_df["index"] == tone_name]
    plt.errorbar(
        x, y,
        xerr=(
            target_row["mean"] - target_row["lower_ci"],
            target_row["higher_ci"] - target_row["mean"],
        ),
        fmt="none",
        c="k",
        capsize=5,
    )

# Explicit limits; passing the lower bound first also puts the first entry of
# `histogram_order` at the bottom of the axis.
plt.ylim(-0.8, 40)
plt.xlim(0, 0.25)

In [None]:
# Horizontal bar chart of the GPT tone shares with bootstrap 95% intervals.
fig, ax = plt.subplots(figsize=(8, 30))
target_df = tone_overlay.query("variable=='GPT Conversation Tones'")
ax = sns.barplot(
    data=target_df,
    y="index",
    x="mean",
    order=histogram_order,
    color="blue"
)
plt.xticks(rotation=90)
ax.tick_params(axis='both', which='major', labelsize=16)
# The bars are horizontal: frequency runs along x and tone names along y
# (the two labels were previously swapped).
plt.xlabel("Frequency of Tones", fontsize=20)
plt.ylabel("Conversation Tones", fontsize=20)

# Bars are assumed to be drawn in `histogram_order`, so patch i belongs to tone i.
x_coords = [p.get_width() for p in ax.patches]
y_coords = [p.get_y() + 0.5 * p.get_height() for p in ax.patches]
for x, y, tone_name in zip(x_coords, y_coords, histogram_order):
    # Boolean mask instead of a string-built query, so tone labels containing
    # quotes cannot break the lookup.
    target_row = target_df[target_df["index"] == tone_name]
    plt.errorbar(
        x, y,
        xerr=(
            target_row["mean"] - target_row["lower_ci"],
            target_row["higher_ci"] - target_row["mean"],
        ),
        fmt="none",
        c="k",
        capsize=5
    )

# Passing the lower bound first also puts the first entry of `histogram_order`
# at the bottom of the axis.
plt.ylim(-0.8, 40)

In [None]:
# Grouped horizontal bar chart: human vs GPT tone shares with 95% intervals.
fig, ax = plt.subplots(figsize=(8, 30))
tone_sources = ["Human Conversation Tones", "GPT Conversation Tones"]
ax = sns.barplot(
    data=tone_overlay,
    y="index",
    x="mean",
    hue="variable",
    hue_order=tone_sources,
    order=histogram_order,
    palette=["C1", "C0"]
)
plt.xticks(rotation=75)
ax.tick_params(axis='both', which='major', labelsize=16)
# The bars are horizontal: frequency runs along x and tone names along y
# (the two labels were previously swapped).
plt.xlabel("Frequency of Tones", fontsize=20)
plt.ylabel("Conversation Tones", fontsize=20)
plt.setp(ax.get_legend().get_texts(), fontsize='18')
plt.setp(ax.get_legend().get_title(), fontsize='20')

# Patches are assumed to be ordered source-major: every human bar in
# `histogram_order`, then every GPT bar.
x_coords = [p.get_width() for p in ax.patches]
y_coords = [p.get_y() + 0.5 * p.get_height() for p in ax.patches]
bar_keys = [
    (tone_source, tone_name)
    for tone_source in tone_sources
    for tone_name in histogram_order
]
for x, y, (tone_source, tone_name) in zip(x_coords, y_coords, bar_keys):
    # Boolean mask instead of a string-built query, so tone labels containing
    # quotes cannot break the lookup.
    target_row = tone_overlay[
        (tone_overlay["index"] == tone_name) & (tone_overlay["variable"] == tone_source)
    ]
    plt.errorbar(
        x, y,
        xerr=(
            target_row["mean"] - target_row["lower_ci"],
            target_row["higher_ci"] - target_row["mean"],
        ),
        fmt="none",
        c="k",
        capsize=5
    )

# Passing the lower bound first also puts the first entry of `histogram_order`
# at the bottom of the axis.
plt.ylim(-0.8, 40)

ax.get_legend().set_title("Conversation Tone Source Coloring")

## Reliability of Human, GPT Sample Distributions

In [None]:
def get_halfsplit_tone_distribution_correlation(tone_df, tone_column_label, all_tones_list, network_id_col="network_id"):
    """Split-half reliability of a tone distribution.

    The distinct values of ``network_id_col`` are shuffled and cut into two
    halves (the second half gets the extra id when the count is odd). For each
    half, the occurrences of every tone in ``all_tones_list`` are counted in
    ``tone_column_label``; the Pearson correlation between the two count
    vectors is returned.

    Randomness comes from ``DataFrame.sample`` without a ``random_state`` and
    from ``np.random.shuffle``, so seed NumPy's global generator beforehand
    for reproducible output.
    """
    # The row order does not influence the counts; the shuffle is retained so
    # the random stream is consumed exactly as it always has been.
    shuffled_rows = tone_df.sample(frac=1).reset_index(drop=True)

    chain_ids = tone_df[network_id_col].unique()
    np.random.shuffle(chain_ids)
    midpoint = len(chain_ids) // 2

    def count_tones(chain_subset):
        # Occurrences of each tone among the rows belonging to the given chains.
        in_half = shuffled_rows[network_id_col].isin(chain_subset)
        tones_in_half = shuffled_rows[in_half][tone_column_label]
        return np.array(
            [np.sum(tones_in_half == tone) for tone in all_tones_list],
            dtype=float,
        )

    counts_half_a = count_tones(chain_ids[:midpoint])
    counts_half_b = count_tones(chain_ids[midpoint:])
    return np.corrcoef(counts_half_a, counts_half_b)[0, 1]

In [None]:
# 5000 random split-half correlations of the human tone distribution.
all_human_tones = human_chain_trials_only_tones["withholding_tone"].unique()
np.random.seed(42)
human_dist_corrs = np.array([
    get_halfsplit_tone_distribution_correlation(
        human_chain_trials_only_tones, "withholding_tone", all_human_tones
    )
    for _ in range(5000)
])
plt.hist(human_dist_corrs)

In [None]:
# 5000 random split-half correlations of the GPT tone distribution; chains are
# identified by "chain_id" in the GPT table.
all_gpt_tones = gpt_tones_df["node_response"].unique()
np.random.seed(42)
gpt_dist_corrs = np.array([
    get_halfsplit_tone_distribution_correlation(
        gpt_tones_df, "node_response", all_gpt_tones, "chain_id"
    )
    for _ in range(5000)
])
plt.hist(gpt_dist_corrs)

## Sample Space UMAP Embeddings

In [None]:
from sent2vec.vectorizer import Vectorizer
from umap import UMAP
import matplotlib.cm as cm

# Rows whose node_mode is 's' in each source (the sentence counterpart of the
# 'c' tone rows used above).
human_chain_trials_only_sentences = human_chain_trials.query("node_mode=='s'")
gpt_df_sentences = gpt_df.query("node_mode == 's'")

In [None]:
# Run this cell to get embeddings again. Otherwise, they are attached in the repository already.

# temp_human_chain_trials_semb = human_chain_trials_only_sentences
# temp_gpt_chain_trials_semb = gpt_df_sentences
# merged_sentence = pd.concat(
#     [
#         temp_gpt_chain_trials_semb[["node_order", "chain_id", "node_response"]]\
#             .rename(
#             columns={
#                 "chain_id": "network_id",
#                 "node_order": "degree",
#                 "node_response": "obtained_response"
#             }
#             ).assign(reponse_source=["gpt" for _ in range(temp_gpt_chain_trials_semb.shape[0])]),
#         temp_human_chain_trials_semb[["network_id", "degree", "obtained_response"]]\
#             .assign(reponse_source=["human" for _ in range(temp_human_chain_trials_semb.shape[0])])
#     ]
# )
# merging_vectorizer = Vectorizer()
# merging_vectorizer.run(merged_sentence["obtained_response"].tolist())
# merging_vectors = merging_vectorizer.vectors
# merged_sembs = merged_sentence.assign(sentence_embeddings=merging_vectors)
# merged_vectors_transformed = UMAP(random_state=42).fit_transform(merging_vectors)
# merged_sembs = merged_sembs.assign(sentence_embeddings_umap=merged_vectors_transformed.tolist())

In [None]:
# Reads the previously computed embeddings

# The UMAP coordinates are stored in the CSV as JSON text; decode them back
# into Python lists and make sure degree is an integer.
merged_sembs = pd.read_csv("../data/all_chains_semb_by_bert.csv").assign(
    sentence_embeddings_umap=lambda frame: frame["sentence_embeddings_umap"].map(json.loads),
    degree=lambda frame: frame["degree"].map(int),
)

### Investigate Sentence Space Entropy

In [None]:
def get_blocked_sentence_space(data_source, n_bins=50):
    """Discretise 2-D embeddings onto an ``n_bins`` x ``n_bins`` grid.

    Parameters
    ----------
    data_source : pd.DataFrame
        Needs columns ``reponse_source``, ``degree`` and
        ``sentence_embeddings_umap`` (a 2-element list per row).
    n_bins : int, default 50
        Number of equal-width bins per axis.

    Returns
    -------
    pd.DataFrame
        Columns ``reponse_source, degree, x, y, x_blocked, y_blocked`` on a
        fresh 0..n-1 index. Bin numbers are floats in ``0 .. n_bins - 1``.

    Changes from the previous version:
    * the source/degree columns are taken from ``data_source`` rather than
      from the global ``merged_sembs``, and rows are paired by position, so
      the function also works on subsets or frames with a non-default index;
    * the largest coordinate on each axis used to fall into bin ``n_bins``
      (one past the grid); it is now clipped into the last bin;
    * an axis on which all points coincide maps to bin 0 instead of NaN.
    """
    coords = pd.DataFrame(
        data=data_source["sentence_embeddings_umap"].to_list(),
        columns=["x", "y"],
    )
    # Both pieces carry a 0..n-1 index here, so concat pairs them row by row.
    testing_data = pd.concat(
        [data_source[["reponse_source", "degree"]].reset_index(drop=True), coords],
        axis=1,
    )

    def _to_block(values):
        # Equal-width bin number of each value along one axis.
        lowest = values.min()
        width = (values.max() - lowest) / n_bins
        if not width > 0:
            # Degenerate axis (all values equal, or no rows): single bin 0.
            return values - values
        return ((values - lowest) // width).clip(upper=n_bins - 1)

    testing_data["x_blocked"] = _to_block(testing_data["x"])
    testing_data["y_blocked"] = _to_block(testing_data["y"])
    return testing_data

def get_entropy_vec(entropy_df):
    """Turn blocked points into a flattened 50 x 50 occupancy distribution.

    Counts the rows of ``entropy_df`` per (``x_blocked``, ``y_blocked``) cell,
    lays the counts out on a grid whose rows and columns are labelled 0..49
    (cells without points count 0; labels outside 0..49 are dropped), flattens
    the grid row by row and divides by the total so the entries sum to 1.
    """
    grid_axis = np.arange(50)
    cell_counts = entropy_df.pivot_table(
        index="x_blocked",
        columns="y_blocked",
        aggfunc="size",
        fill_value=0,
    ).reindex(index=grid_axis, columns=grid_axis, fill_value=0)
    flat_counts = cell_counts.values.flatten()
    return flat_counts / flat_counts.sum()

In [None]:
merged_sembs_entropy = get_blocked_sentence_space(merged_sembs)

# The per-source subsets do not change between bootstrap draws, so they are
# filtered once instead of re-running the query in each of the 5000 iterations.
human_blocked = merged_sembs_entropy.query("reponse_source=='human'")
gpt_blocked = merged_sembs_entropy.query("reponse_source=='gpt'")

# Bootstrap distribution of the grid entropy: resample the points with
# replacement (seeded by the draw number) and recompute the entropy.
human_entropies = [
    entropy(get_entropy_vec(human_blocked.sample(frac=1.0, replace=True, random_state=s)))
    for s in range(5000)
]
gpt_entropies = [
    entropy(get_entropy_vec(gpt_blocked.sample(frac=1.0, replace=True, random_state=s)))
    for s in range(5000)
]

In [None]:
# Entropy of the occupied grid cells at each individual iteration, per source.
fig, ax = plt.subplots()
iteration_axis = np.arange(101)
for source_key, curve_label in [
    ("human", "Human entropy (indivudal iteration)"),
    ("gpt", "GPT entropy (indivudal iteration)"),
]:
    per_iteration_entropy = [
        entropy(
            get_entropy_vec(
                merged_sembs_entropy.query(f"reponse_source=='{source_key}' and degree=={deg}")
            )
        )
        for deg in iteration_axis
    ]
    ax.plot(iteration_axis, per_iteration_entropy, label=curve_label)
ax.legend()

In [None]:
# Euclidean distance between the human and GPT grid distributions: once using
# only the points of a single iteration, once using all points up to it.
fig, ax = plt.subplots(figsize=(10, 10))

single_iterations = np.arange(1, 100)
single_iteration_gap = [
    np.linalg.norm(
        get_entropy_vec(merged_sembs_entropy.query(f"reponse_source=='human' and degree=={deg}"))
        - get_entropy_vec(merged_sembs_entropy.query(f"reponse_source=='gpt' and degree=={deg}"))
    )
    for deg in single_iterations
]
ax.plot(
    single_iterations,
    single_iteration_gap,
    label="difference between human and GPT distribution (indivudal iteration)",
)

cumulative_iterations = np.arange(1, 101)
cumulative_gap = [
    np.linalg.norm(
        get_entropy_vec(merged_sembs_entropy.query(f"reponse_source=='human' and degree<={deg}"))
        - get_entropy_vec(merged_sembs_entropy.query(f"reponse_source=='gpt' and degree<={deg}"))
    )
    for deg in cumulative_iterations
]
ax.plot(
    cumulative_iterations,
    cumulative_gap,
    label="difference between human and GPT distribution (cumulative throughout iteration)",
)
ax.legend()

### Plotting of Sentence Joint Embedding Space

In [None]:
# Scatter of every sentence embedding (human in red, GPT in blue) with
# hand-placed circles marking regions of interest.
# Tag each row with its (degree, source) pair so rows can be selected per group.
merged_sembs["degree_response_source"] = pd.Series(zip(merged_sembs["degree"], merged_sembs["reponse_source"])).values
fig, ax = plt.subplots(figsize=(20, 20))
# One shade per degree (0 .. max degree) from the Reds / Blues colormaps.
human_colormap = cm.Reds(np.linspace(0, 1, 1 + max(merged_sembs.query("reponse_source=='human'")["degree"].unique())))
gpt_colormap = cm.Blues(np.linspace(0, 1, 1 + max(merged_sembs.query("reponse_source=='gpt'")["degree"].unique())))
colormap_maps = {
    "human": human_colormap, "gpt": gpt_colormap
}
# A single representative colour per source: the middle shade of its colormap.
# colormap_corrs is also read by the per-window plotting cell further down.
colormap_corrs = {
    "human": human_colormap[len(human_colormap) // 2], "gpt": gpt_colormap[len(gpt_colormap) // 2]
}
for elem in merged_sembs["degree_response_source"].unique():
    # elem is a (degree, source) tuple; stack that group's 2-D coordinates.
    embeddings_to_work_with = np.vstack(
        merged_sembs[merged_sembs["degree_response_source"]==elem]["sentence_embeddings_umap"].to_list()
    )
    plt.scatter(
        x=embeddings_to_work_with[:, 0],
        y=embeddings_to_work_with[:, 1],
        # One RGBA row per point, all equal to the source's representative colour.
        c = np.vstack([colormap_corrs[elem[1]] for _ in range(len(embeddings_to_work_with))]),
        alpha=0.03
        # c=np.vstack([colormap_maps[elem[1]][elem[0]] for _ in range(len(embeddings_to_work_with))])
    )

# Hard-coded circle centres and radii in UMAP coordinates, paired by position.
# Both lists are reused by later cells (per-window plots and word clouds).
circle_centroids = [
    (-7.5, 8.5),
    # (-1.5, 0),
    # (1, 0),
    (-1, 0),
    (1, -4.5),
    (-5, -2),
    (3.5, -0.5),
    (-3, -5),
    (10.5, -8),
    (-2, -12)
]
circle_radii = [
    3.5,
    # 1.3,
    # 1.3,
    2.5,
    2,
    1.5,
    1.75,
    2,
    3.5,
    2
    
]
for centroid, radius in zip(circle_centroids, circle_radii):
    ax.add_patch(
        plt.Circle(centroid, radius, fill=False)
    )
plt.xlim((-24, 24))
plt.ylim((-24, 24))

# Legend built from two proxy patches, since the scatter calls carry no labels.
plt.legend(
    [
        mpatches.Circle((0.0, 0.0), 0.05, facecolor=colormap_corrs["human"]),
        mpatches.Circle((0.0, 0.0), 0.05, facecolor=colormap_corrs["gpt"])
    ],
    ["Human Sentence Embeddings", "GPT Sentence Embeddings"],
    prop={'size': 18}
)

In [None]:

# One figure per window of iterations; window i spans generations_to_look_at[i]
# through generations_to_look_at[i+1], both inclusive.
generations_to_look_at = [
    0, 1, 3, 5, 15, 30, 40, 50, 60, 70, 80, 90, 100
]
# The (degree, source) pairs present in the data, collected once as a set
# instead of rebuilding the unique list for every candidate pair.
available_groups = set(merged_sembs["degree_response_source"].unique())
for i in range(len(generations_to_look_at) - 1):
    fig, ax = plt.subplots(figsize=(20, 20))
    window_start, window_end = generations_to_look_at[i], generations_to_look_at[i+1]
    for elem in [(j, source) for j in range(window_start, window_end + 1) for source in ["gpt", "human"]]:
        if elem not in available_groups:
            # Report (degree, source) pairs that have no data and skip them.
            print(elem)
            continue
        embeddings_to_work_with = np.vstack(
            merged_sembs[merged_sembs["degree_response_source"]==elem]["sentence_embeddings_umap"].to_list()
        )
        ax.scatter(
            x=embeddings_to_work_with[:, 0],
            y=embeddings_to_work_with[:, 1],
            # One RGBA row per point, all equal to the source's colour.
            c = np.vstack([colormap_corrs[elem[1]] for _ in range(len(embeddings_to_work_with))]),
            alpha=0.5
        )
    # Set once per figure rather than once per plotted group.
    ax.set_title(f"Distribution at Iteration {window_end}", fontsize=35)
    for centroid, radius in zip(circle_centroids, circle_radii):
        ax.add_patch(
            plt.Circle(centroid, radius, fill=False)
        )
    ax.set_xlim(xmin=-24, xmax=24)
    ax.set_ylim(ymin=-24, ymax=24)

### Similar Analysis on Tone Embedding Space instead

In [None]:
# Put the GPT and human tone rows into one table with a shared schema
# (degree, network_id, obtained_response, reponse_source), then embed every
# tone string with the sentence vectorizer.
temp_human_chain_trials_temb = human_chain_trials_only_tones
temp_gpt_chain_trials_temb = gpt_tones_df
merged_tones = pd.concat(
    [
        temp_gpt_chain_trials_temb[["node_order", "chain_id", "node_response"]]
        .rename(
            columns={
                "chain_id": "network_id",
                "node_order": "degree",
                "node_response": "obtained_response",
            }
        )
        .assign(reponse_source="gpt"),
        temp_human_chain_trials_temb[["network_id", "degree", "withholding_tone"]]
        .rename(columns={"withholding_tone": "obtained_response"})
        .assign(reponse_source="human"),
    ]
)
merging_vectorizer_temb = Vectorizer()
merging_vectorizer_temb.run(merged_tones["obtained_response"].tolist())

In [None]:
# Attach the tone embeddings and their 2-D UMAP projection (fixed random_state)
# to the merged tone table, row by row.
merging_vectors_temb = merging_vectorizer_temb.vectors
merged_tembs = merged_tones.assign(sentence_embeddings=merging_vectors_temb)
merged_vectors_transformed = UMAP(random_state=42).fit_transform(merging_vectors_temb)
merged_tembs = merged_tembs.assign(sentence_embeddings_umap=merged_vectors_transformed.tolist())
# (degree, source) tuple per row, used to select groups in the plots below.
merged_tembs["degree_response_source"] = pd.Series(zip(merged_tembs["degree"], merged_tembs["reponse_source"])).values

In [None]:
# Scatter of every tone embedding, human in red and GPT in blue.
fig, ax = plt.subplots(figsize=(20, 20))
human_degrees = merged_tembs.query("reponse_source=='human'")["degree"].unique()
gpt_degrees = merged_tembs.query("reponse_source=='gpt'")["degree"].unique()
# One shade per degree (0 .. max degree) for each source.
human_colormap = cm.Reds(np.linspace(0, 1, 1 + max(human_degrees)))
gpt_colormap = cm.Blues(np.linspace(0, 1, 1 + max(gpt_degrees)))
colormap_maps = {"human": human_colormap, "gpt": gpt_colormap}
# The middle shade of each colormap serves as the single colour of its source.
colormap_corrs = {
    source: shades[len(shades) // 2] for source, shades in colormap_maps.items()
}
for elem in merged_tembs["degree_response_source"].unique():
    group_rows = merged_tembs[merged_tembs["degree_response_source"]==elem]
    embeddings_to_work_with = np.vstack(group_rows["sentence_embeddings_umap"].to_list())
    # One RGBA row per point, all equal to the source's colour.
    point_colors = np.vstack([colormap_corrs[elem[1]]] * len(embeddings_to_work_with))
    plt.scatter(
        x=embeddings_to_work_with[:, 0],
        y=embeddings_to_work_with[:, 1],
        c=point_colors,
        alpha=0.03,
    )

In [None]:

# One figure per window of iterations; window i spans generations_to_look_at[i]
# through generations_to_look_at[i+1], both inclusive.
generations_to_look_at = [
    0, 1, 3, 5, 15, 30, 40, 50, 60, 70, 80, 90, 100
]
# The (degree, source) pairs present in the data, collected once as a set
# instead of rebuilding the unique list for every candidate pair.
available_groups = set(merged_tembs["degree_response_source"].unique())
for i in range(len(generations_to_look_at) - 1):
    fig, ax = plt.subplots(figsize=(20, 20))
    window_start, window_end = generations_to_look_at[i], generations_to_look_at[i+1]
    for elem in [(j, source) for j in range(window_start, window_end + 1) for source in ["gpt", "human"]]:
        if elem not in available_groups:
            # Report (degree, source) pairs that have no data and skip them.
            print(elem)
            continue
        embeddings_to_work_with = np.vstack(
            merged_tembs[merged_tembs["degree_response_source"]==elem]["sentence_embeddings_umap"].to_list()
        )
        ax.scatter(
            x=embeddings_to_work_with[:, 0],
            y=embeddings_to_work_with[:, 1],
            # One RGBA row per point, all equal to the source's colour.
            c = np.vstack([colormap_corrs[elem[1]] for _ in range(len(embeddings_to_work_with))]),
            alpha=0.7
        )
    # Set once per figure rather than once per plotted group.
    ax.set_title(f"Distribution at Iteration {window_end}", fontsize=30)
    ax.set_xlim(xmin=-34, xmax=34)
    ax.set_ylim(ymin=-34, ymax=34)

### Incorporation of Wordclouds into Sentence Embedding Space

In [None]:
import wordcloud
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

Customizing wordcloud settings before use:

In [None]:
# extended_stopwords is the wordcloud package's own STOPWORDS set (same object,
# not a copy), so the additions below change that shared set in place.
# NOTE(review): WordCloud objects created without an explicit stopwords argument
# presumably pick these additions up as well - keep that in mind before
# replacing this with a copy.
extended_stopwords = wordcloud.STOPWORDS
other_stopwords = [
    "really", "m", "much", "will", "s", "", "oh", "john", "david"
]
extended_stopwords.update(other_stopwords)

In [None]:
def get_all_words_counter(target_partition, response_source):
    """Score the words of a partition with TF-IDF, scaled so the top word is 1.

    When ``response_source`` is ``"human"`` or ``"gpt"`` only rows from that
    source are used; any other value uses every row. The vectorizer is fitted
    on the individual ``obtained_response_parsed`` strings and then applied to
    all of them joined into one document. Scores are divided by the largest
    score, and words found in ``extended_stopwords`` are left out.

    Returns a ``{word: score}`` dict, or ``None`` when no rows remain.
    """
    if response_source in ["human", "gpt"]:
        target_partition = target_partition.query(f"reponse_source=='{response_source}'")
    documents = target_partition["obtained_response_parsed"]

    if len(documents) == 0:
        return None

    tfidf_model = TfidfVectorizer().fit(documents)
    pooled_scores = tfidf_model.transform([" ".join(documents)]).toarray()[0]
    pooled_scores = pooled_scores / pooled_scores.max()

    return {
        word: score
        for word, score in zip(tfidf_model.get_feature_names_out(), pooled_scores)
        if word not in extended_stopwords
    }

def get_wordcloud_color(
    human_freqs, gpt_freqs,
):
    """Build a word-cloud colour function mixing red (human) and blue (GPT).

    ``human_freqs`` and ``gpt_freqs`` are ``{word: score}`` dicts, or ``None``.
    The returned callable takes a word (plus the keyword arguments a word-cloud
    colour function receives, which are ignored) and returns
    ``"rgb(R, 0, B)"`` where each channel is ``int(40 + score * 200)``; a word
    missing from a dict, or a dict that is ``None``, counts as score 0.
    """
    def channel_value(word, freqs):
        # Score of the word in this source (0 when unknown), mapped to a channel.
        share = 0 if freqs is None or word not in freqs else freqs[word]
        return int(40 + share * 200)

    def subfunction(word, font_size=None, position=None, orientation=None, random_state=None, font_path=None):
        red = channel_value(word, human_freqs)
        blue = channel_value(word, gpt_freqs)
        return f"rgb({red}, 0, {blue})"

    return subfunction

Obtaining the wordclouds at custom centroids and radii:

In [None]:
def get_wordcloud_at_cluster(target_centroid, epsilon):
    """Draw a word cloud for the sentences embedded near a point.

    Selects the rows of the global ``merged_sembs`` whose 2-D
    ``sentence_embeddings_umap`` lies within Euclidean distance ``epsilon`` of
    ``target_centroid``, scores their words with ``get_all_words_counter`` and
    renders a cloud in which word size follows the cubed overall score and the
    colour mixes red (human score) with blue (GPT score).

    Parameters
    ----------
    target_centroid : sequence of two floats
        Centre of the neighbourhood, in the embedding's coordinates.
    epsilon : float
        Radius of the neighbourhood.

    Returns
    -------
    WordCloud or None
        The generated cloud, which is also shown in a new figure; ``None``
        when the neighbourhood has no sentences or no usable words.
    """
    distances = merged_sembs["sentence_embeddings_umap"].apply(
        lambda x: np.linalg.norm(np.array(x) - target_centroid, ord=2)
    )
    merged_sembs_radius = merged_sembs.assign(radius_from_centroid=distances)
    # .copy(): a column is added below, which must not be written onto a slice.
    target_partition = merged_sembs_radius[merged_sembs_radius["radius_from_centroid"] <= epsilon].copy()

    if len(target_partition) == 0:
        return None

    # The first two patterns are regular expressions and need regex=True
    # (pandas >= 2.0 treats patterns literally by default, which left the
    # punctuation in place); the last two are plain characters.
    target_partition["obtained_response_parsed"] = target_partition["obtained_response"]\
            .str.replace("[,.!?:;]", " ", regex=True)\
            .str.replace(r"\s+", " ", regex=True)\
            .str.replace("`", "'", regex=False)\
            .str.replace('"', "", regex=False)\
            .str.lower()
    # Join with a space so the last word of one response cannot fuse with the
    # first word of the next.
    comment_words = " ".join(target_partition["obtained_response_parsed"].to_list())

    generated_wordcloud = WordCloud(
        background_color ='white',
        min_font_size = 10,
        collocations = False,
        max_words=80,
        prefer_horizontal=1
    )
    # Words that survive the word cloud's own tokenisation and stop-word removal.
    filtered_words = generated_wordcloud.process_text(comment_words)
    if not filtered_words:
        # Nothing left to score or draw.
        return None

    def sample_and_regen(sentence, filtered_words):
        # Rebuild the sentence from its surviving words only.
        return " ".join([word for word in sentence.split(" ") if word in filtered_words])

    target_partition["obtained_response_parsed"] = target_partition["obtained_response_parsed"]\
        .apply(lambda x: sample_and_regen(x, filtered_words))

    human_freqs = get_all_words_counter(target_partition, "human")
    gpt_freqs = get_all_words_counter(target_partition, "gpt")
    all_freqs = get_all_words_counter(target_partition, "all")
    if not all_freqs:
        # generate_from_frequencies cannot work with an empty frequency table.
        return None

    generated_wordcloud.color_func = get_wordcloud_color(human_freqs, gpt_freqs)

    # Cubing the scores exaggerates the size gap between strong and weak words.
    generated_wordcloud.generate_from_frequencies(
        {word: val ** 3 for word, val in all_freqs.items()}
    )

    # Plot the WordCloud image.
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(generated_wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)

    plt.show()

    return generated_wordcloud

In [None]:
# One word cloud per hand-picked circle from the joint embedding plot.
for cluster_centroid, cluster_radius in zip(circle_centroids, circle_radii):
    print(cluster_centroid, cluster_radius)
    get_wordcloud_at_cluster(cluster_centroid, cluster_radius)

In [None]:
# 12 x 12 grid of word clouds tiling the square [-24, 24] x [-24, 24]: each
# panel covers a circle of radius half a cell, centred on its grid cell.
fig_dims = 12
fig, ax = plt.subplots(fig_dims, fig_dims, figsize=(100, 100))
fig_radius = 48 // fig_dims / 2
# Cell centres: x grows with the column index, y shrinks with the row index
# (row 0 is the top of the figure).
get_centroid_x = lambda x: (-24 + fig_radius) + x * (48 // fig_dims)
get_centroid_y = lambda x: -get_centroid_x(x)
for row in range(fig_dims):
    for col in range(fig_dims):
        panel = ax[row, col]
        centroid = (get_centroid_x(col), get_centroid_y(row))
        generated_wordcloud = get_wordcloud_at_cluster(centroid, fig_radius)
        if generated_wordcloud is not None:
            panel.imshow(generated_wordcloud)
        panel.axis('off')

In [None]:
# 16 x 16 grid of word clouds tiling the square [-24, 24] x [-24, 24]: each
# panel covers a circle of radius half a cell, centred on its grid cell.
fig_dims = 16
fig, ax = plt.subplots(fig_dims, fig_dims, figsize=(100, 100))
fig_radius = 48 // fig_dims / 2
# Cell centres: x grows with the column index, y shrinks with the row index
# (row 0 is the top of the figure).
get_centroid_x = lambda x: (-24 + fig_radius) + x * (48 // fig_dims)
get_centroid_y = lambda x: -get_centroid_x(x)
for row in range(fig_dims):
    for col in range(fig_dims):
        panel = ax[row, col]
        centroid = (get_centroid_x(col), get_centroid_y(row))
        generated_wordcloud = get_wordcloud_at_cluster(centroid, fig_radius)
        if generated_wordcloud is not None:
            panel.imshow(generated_wordcloud)
        panel.axis('off')

In [None]:
# 24 x 24 grid of word clouds tiling the square [-24, 24] x [-24, 24]: each
# panel covers a circle of radius half a cell, centred on its grid cell.
fig_dims = 24
fig, ax = plt.subplots(fig_dims, fig_dims, figsize=(100, 100))
fig_radius = 48 // fig_dims / 2
# Cell centres: x grows with the column index, y shrinks with the row index
# (row 0 is the top of the figure).
get_centroid_x = lambda x: (-24 + fig_radius) + x * (48 // fig_dims)
get_centroid_y = lambda x: -get_centroid_x(x)
for row in range(fig_dims):
    for col in range(fig_dims):
        panel = ax[row, col]
        centroid = (get_centroid_x(col), get_centroid_y(row))
        generated_wordcloud = get_wordcloud_at_cluster(centroid, fig_radius)
        if generated_wordcloud is not None:
            panel.imshow(generated_wordcloud)
        panel.axis('off')

## Experiment on Sample Independence

### Data Processing

In [None]:
def get_human_sample_at_chain(chain_id, human_trial_df, human_tones_arr):
    """Count a chain's trials per (degree, tone) and return them as a table.

    Parameters
    ----------
    chain_id
        Value matched against the ``network_id`` column.
    human_trial_df
        Trial table; the columns ``network_id``, ``withholding_tone`` and
        ``degree`` are read.
    human_tones_arr
        One-dimensional collection of tone values to join against.

    Returns
    -------
    pandas.DataFrame
        Rows are ``degree`` values, columns are ``tone`` values, cells are
        trial counts with missing combinations set to 0.

    Notes
    -----
    The join is an outer merge, so each tone of ``human_tones_arr`` that the
    chain never produced adds a row whose trial fields are missing.
    ``fillna(0)`` turns the missing ``degree`` into 0, hence every such tone is
    counted once under degree 0.
    NOTE(review): confirm that this extra count at degree 0 is intended.
    """
    chain_trials = human_trial_df.query(f"network_id=={chain_id}")
    tone_lookup = pd.DataFrame(human_tones_arr).rename(columns={0: "tone"})

    joined = chain_trials.merge(
        tone_lookup,
        left_on="withholding_tone",
        right_on="tone",
        how="outer",
    )
    # reset_index() supplies the "index" column that the pivot counts.
    joined = joined.reset_index().fillna(0)

    counts = joined.pivot_table(
        index="degree",
        columns="tone",
        values="index",
        aggfunc="count",
    )
    return counts.fillna(0)

In [None]:
# One degree-by-tone count table per human chain, keyed by network id.
# The id range 3..92 is hard-coded here.
human_chain_trial_samples = dict(
    (
        chain_id,
        get_human_sample_at_chain(chain_id, human_chain_trials_only_tones, all_human_tones),
    )
    for chain_id in range(3, 93)
)

In [None]:
def get_table_at_chain_gpt(chain_trial_df, all_tones_arr, chain_id):
    """Count a GPT chain's responses per (node_order, tone) and return a table.

    Parameters
    ----------
    chain_trial_df
        Trial table; the columns ``chain_id``, ``node_response`` and
        ``node_order`` are read.
    all_tones_arr
        One-dimensional collection of tone values to join against.
    chain_id
        Value matched against the ``chain_id`` column.

    Returns
    -------
    pandas.DataFrame
        Rows are ``node_order`` values, columns are ``tone`` values, cells are
        response counts with missing combinations set to 0.

    Notes
    -----
    The join is an outer merge, so each tone of ``all_tones_arr`` that the
    chain never produced adds a row whose trial fields are missing.
    ``fillna(0)`` turns the missing ``node_order`` into 0, hence every such
    tone is counted once under node_order 0.
    NOTE(review): confirm that this extra count at node_order 0 is intended.
    """
    chain_rows = chain_trial_df.query(f"chain_id=={chain_id}").reset_index()
    tone_lookup = pd.DataFrame(all_tones_arr).rename(columns={0: "tone"})

    joined = chain_rows.merge(
        tone_lookup,
        left_on="node_response",
        right_on="tone",
        how="outer",
    )
    # After both reset_index() calls an "index" column exists for the pivot to count.
    joined = joined.reset_index().fillna(0)

    counts = joined.pivot_table(
        index="node_order",
        columns="tone",
        values="index",
        aggfunc="count",
    )
    return counts.fillna(0)

In [None]:
# One node_order-by-tone count table per GPT chain, keyed by chain id.
# The id range 0..90 is hard-coded here.
gpt_chain_trial_samples = dict(
    (chain_id, get_table_at_chain_gpt(gpt_tones_df, all_gpt_tones, chain_id))
    for chain_id in range(91)
)

In [None]:
def blockify_df(df_to_block, time_block_width_fn):
    """Sum the rows of ``df_to_block`` within blocks derived from its index.

    ``time_block_width_fn`` receives the frame's index and returns one block
    label per row. Rows sharing a label are added together. The result is
    indexed by those labels under the name ``group_allocation``; the input
    frame is left unmodified.
    """
    labelled = df_to_block.copy()
    labelled["group_allocation"] = time_block_width_fn(df_to_block.index)
    return labelled.groupby("group_allocation").sum()

In [None]:
# Block every per-chain table. The block function is the identity, so rows are
# grouped by their own index value.
human_chain_trial_samples_blocked = dict(
    (chain_id, blockify_df(chain_table, lambda index: index))
    for chain_id, chain_table in human_chain_trial_samples.items()
)
gpt_chain_trial_samples_blocked = dict(
    (chain_id, blockify_df(chain_table, lambda index: index))
    for chain_id, chain_table in gpt_chain_trial_samples.items()
)

### Correlation Measurement: $C_{j,t}, C_{j,t+1}$

In [None]:
# Pearson correlation between tone-count rows of the same human chain.
# Row i is paired with every row j >= i + 2; the last row of each table is
# never used (both ranges stop at len - 1).
all_distances_within_chain = []
for n in range(3, 93):
    test_matrix = human_chain_trial_samples_blocked[n].values
    last_row = len(test_matrix) - 1
    for i in range(last_row):
        for j in range(i + 2, last_row):
            all_distances_within_chain.append(np.corrcoef(test_matrix[i], test_matrix[j])[0, 1])

# np.corrcoef yields NaN when one of the rows has zero variance. Drop those
# values (as the GPT version of this cell does) so that the histogram and the
# summary statistics stay well-defined.
all_distances_within_chain = np.array(all_distances_within_chain)
all_distances_within_chain = all_distances_within_chain[~np.isnan(all_distances_within_chain)]

plt.hist(all_distances_within_chain, bins=60)
np.percentile(all_distances_within_chain, 2.5), np.percentile(all_distances_within_chain, 97.5), np.mean(all_distances_within_chain)

In [None]:
# Pearson correlation between tone-count rows of the same GPT chain.
# Chain 81 is left out. Row pairs are (first, second) with second >= first + 2,
# and the last row of each table is never used.
collected_correlations = []
for chain_id in range(0, 91):
    if chain_id != 81:
        chain_rows = gpt_chain_trial_samples_blocked[chain_id].values
        upper_bound = len(chain_rows) - 1
        collected_correlations.extend(
            np.corrcoef(chain_rows[first], chain_rows[second])[0, 1]
            for first in range(upper_bound)
            for second in range(first + 2, upper_bound)
        )

# Undefined correlations (NaN) are removed before plotting and summarising.
collected_correlations = np.array(collected_correlations)
all_distances_within_chain = collected_correlations[~np.isnan(collected_correlations)]
plt.hist(all_distances_within_chain, bins=60)
np.percentile(all_distances_within_chain, 2.5), np.percentile(all_distances_within_chain, 97.5), np.mean(all_distances_within_chain)

In [None]:
# Summary statistics of within-chain correlations for the GPT chains, recorded
# after each start row i = 0..99 has been processed.
all_distances_within_chain = []
all_distances_within_chain_across_i = {
    "means": [],
    "pr2.5": [],
    "pr97.5": [],
    "median": []
}

# Fetch each chain's count matrix once instead of on every pass; chain 81 is skipped.
gpt_matrices_by_chain = {
    n: gpt_chain_trial_samples_blocked[n].values
    for n in range(0, 91)
    if n != 81
}

for i in range(100):
    for test_matrix in gpt_matrices_by_chain.values():
        # Rows are read by position below, so the guard must be positional too.
        # A label-membership test on the index can let an out-of-range
        # position through when the labels are not exactly 0..len-1.
        if i >= len(test_matrix):
            continue
        for j in range(i + 2, len(test_matrix) - 1):
            corr_val = np.corrcoef(test_matrix[i], test_matrix[j])[0, 1]
            if not np.isnan(corr_val):
                all_distances_within_chain.append(corr_val)

    # NOTE(review): all_distances_within_chain is never cleared between values
    # of i, so each entry summarises every correlation collected so far
    # (a running statistic), not only those starting at row i. Confirm intent.
    if all_distances_within_chain:
        summary = {
            "means": np.mean(all_distances_within_chain),
            "pr2.5": np.percentile(all_distances_within_chain, 2.5),
            "pr97.5": np.percentile(all_distances_within_chain, 97.5),
            "median": np.median(all_distances_within_chain),
        }
    else:
        # Nothing collected yet: record NaN rather than calling the reducers
        # on an empty sample.
        summary = dict.fromkeys(all_distances_within_chain_across_i, np.nan)
    for stat_name, stat_value in summary.items():
        all_distances_within_chain_across_i[stat_name].append(stat_value)

In [None]:
# Plot the four recorded statistics against their position in the lists.
# The plotting order is fixed so that each series keeps its colour.
step_axis = np.arange(len(all_distances_within_chain_across_i["means"]))
for series_name in ("means", "median", "pr2.5", "pr97.5"):
    plt.plot(
        step_axis,
        all_distances_within_chain_across_i[series_name],
        label=series_name,
    )
plt.legend()

### Correlation Measurement: $C_{i,t}, C_{j,t}$

In [None]:
# Pearson correlation between different human chains at the same row position
# (rows 0..19). Chains whose table has no row i are skipped for that i.

# Fetch each chain's count matrix once instead of inside the triple loop.
human_matrices_by_chain = {
    chain_id: human_chain_trial_samples_blocked[chain_id].values
    for chain_id in range(3, 93)
}

all_distances_within_block = []
for i in range(20):
    # Row i of every chain that is long enough, in ascending chain-id order.
    rows_at_i = [matrix[i] for matrix in human_matrices_by_chain.values() if len(matrix) > i]
    # Each unordered pair of chains is visited once (lower id first).
    for position, row_a in enumerate(rows_at_i):
        for row_b in rows_at_i[position + 1:]:
            all_distances_within_block.append(np.corrcoef(row_a, row_b)[0, 1])

# np.corrcoef yields NaN when one of the rows has zero variance. Drop those
# values (as the GPT version of this cell does) so that the histogram and the
# summary statistics stay well-defined.
all_distances_within_block = np.array(all_distances_within_block)
all_distances_within_block = all_distances_within_block[~np.isnan(all_distances_within_block)]

plt.hist(all_distances_within_block, bins=20)
(np.percentile(all_distances_within_block, 2.5), np.percentile(all_distances_within_block, 97.5), np.mean(all_distances_within_block))

In [None]:
# Pearson correlation between different GPT chains at the same row position
# (rows 0..19). Chains whose table has no row i are skipped for that i.

# Use every chain id for which a table was built (0..90) except chain 81, the
# same chain set as the within-chain GPT analysis. Bounding the pair loops at
# 89/90 would silently leave chain 90 out of the comparison.
# NOTE(review): this adds chain 90 to the reported statistics; revert the
# bound if its omission was deliberate.
gpt_matrices_by_chain = {
    chain_id: gpt_chain_trial_samples_blocked[chain_id].values
    for chain_id in range(0, 91)
    if chain_id != 81
}

all_distances_within_block = []
for i in range(20):
    # Row i of every chain that is long enough, in ascending chain-id order.
    rows_at_i = [matrix[i] for matrix in gpt_matrices_by_chain.values() if len(matrix) > i]
    # Each unordered pair of chains is visited once (lower id first).
    for position, row_a in enumerate(rows_at_i):
        for row_b in rows_at_i[position + 1:]:
            all_distances_within_block.append(np.corrcoef(row_a, row_b)[0, 1])

# Undefined correlations (NaN, produced for zero-variance rows) are removed
# before plotting and summarising.
all_distances_within_block = np.array(all_distances_within_block)
all_distances_within_block = all_distances_within_block[~np.isnan(all_distances_within_block)]

plt.hist(all_distances_within_block, bins=20)
(np.percentile(all_distances_within_block, 2.5), np.percentile(all_distances_within_block, 97.5), np.mean(all_distances_within_block))