In [None]:
import pandas as pd
from analysis import *
import matplotlib.pyplot as plt
import numpy as np
import math

In [None]:
# Load the annotation tables used by the cells below.
# NOTE(review): `data` is not defined in this notebook; presumably it is
# brought in by `from analysis import *` -- confirm.
surge_annotations = data.surge_evaluation.annotation_dataframe()
# Loaded alongside the main table; not referenced by the cells visible here.
surge_annotations_comparative = data.surge_evaluation.comparative_annotation_dataframe()

# Bare last expression so the notebook renders the table.
surge_annotations

In [None]:
from itertools import combinations
from scipy.stats import ttest_ind

def t_tests(df: pd.DataFrame):
    """
    Run an independent two-sample t-test between every pair of bots.

    :param df: (bot, data point) x 1 -> score; the first index level is read
        as the bot identifier
    :return: p values of test on each bot pair (pd.Series), keyed by
        (bot_a, bot_b). Bot names are sorted first, so each key is written
        in sorted order and the pairs themselves come out in a fixed order.
    """
    # Iterating a bare set yields an arbitrary order, so the same pair could
    # come out as (a, b) for one group and (b, a) for another and the
    # per-group results would not line up. Sorting makes the keys canonical.
    bots = sorted(set(df.index.get_level_values(0)))
    result = {}
    for ba, bb in combinations(bots, 2):
        a = df.xs(ba).to_numpy().squeeze()
        b = df.xs(bb).to_numpy().squeeze()
        _, p = ttest_ind(a, b)  # only the p-value is kept
        result[(ba, bb)] = p
    # Tuple keys become the (bot_a, bot_b) index of the returned Series.
    return pd.Series(result)

# Apply t_tests to every (category, label) group of the frame returned by
# get_singly_annotated, then display the resulting p-values.
pvalues0 = (
    get_singly_annotated(surge_annotations)
    .groupby([sym.category, sym.label])
    .apply(t_tests)
)
pvalues0

In [None]:
# Visual encoding of the p-value scatter plot:
#   marker shape => bot pair
#   marker color => evaluation procedure (category)
#   x-axis       => label

# Build the plot
# NOTE: assigning to rcParams changes the default figure size for every
# figure created afterwards in this kernel, not only the one below.
plt.rcParams["figure.figsize"] = (10,10)

fig, ax = plt.subplots()

def plot_by_category(ax, df, category, color, xaxis_start, symbols):
    """
    Scatter the p-values of one category onto ``ax``, one x position per label.

    :param ax: axes to draw on (only ``ax.scatter`` is called)
    :param df: table with a "category" column, a "label" column and, from the
        third column on, one p-value column per bot pair
    :param category: value of the "category" column selecting the rows to plot
    :param color: marker edge color used for every point of this category
    :param xaxis_start: x position assigned to the first selected label
    :param symbols: marker per bot pair, keyed by ``tuple(sorted(pair))``
    :return: (list of labels plotted, x position just past the last label)
    :raises KeyError: if a bot-pair column has no entry in ``symbols``
    """
    extracted = df[df["category"] == category]

    # Everything from the third column on is treated as a bot-pair column.
    botpairs_pvalues = extracted.iloc[:, 2:]
    labels = extracted['label'].tolist()
    xaxis_end = xaxis_start + len(labels)
    # Every bot pair shares the same x positions, so build them once.
    xs = np.arange(xaxis_start, xaxis_end)

    for col in botpairs_pvalues:
        ax.scatter(xs,
                   botpairs_pvalues[col],
                   # Sorted so the lookup ignores the order the pair is written in.
                   marker=symbols[tuple(sorted(col))],
                   edgecolors=color,
                   facecolors='none')

    return labels, xaxis_end

# One color per evaluation category.
likert_turn_color = "blue"
likert_dialogue_color = "red"
comparative_color = "green"
behavior_color = "orange"

# Marker per bot pair. Keys are sorted tuples so a lookup does not depend on
# the order in which the two bots of a pair are written.
symbols = {
    tuple(sorted(('emora', 'blender2_3B'))): 'o',
    tuple(sorted(('emora', 'bart_fid_rag_bcb'))): 'v',
    tuple(sorted(('emora', 'rerank_blender'))): 's',
    tuple(sorted(('bart_fid_rag_bcb', 'blender2_3B'))): 'P',
    tuple(sorted(('rerank_blender', 'blender2_3B'))): '*',
    tuple(sorted(('rerank_blender', 'bart_fid_rag_bcb'))): 'D'
}

# Plot the categories left to right; each call returns the x position at
# which the next category starts.
pvalues = pvalues0.reset_index()
ltl, likert_dialogue_start = plot_by_category(ax, pvalues, "likert turn", likert_turn_color, 0, symbols)
ldl, comparative_start = plot_by_category(ax, pvalues, "likert dialogue", likert_dialogue_color, likert_dialogue_start, symbols)
cl, behavior_start = plot_by_category(ax, pvalues, "comparative", comparative_color, comparative_start, symbols)
bl, misc_start = plot_by_category(ax, pvalues, "behavior", behavior_color, behavior_start, symbols)

# (end position, color) per category, in plotting order. This is a list of
# pairs rather than a dict keyed by end position: an empty category ends where
# the previous one ends, and duplicate dict keys would collapse the two
# entries and give the earlier category the wrong color.
category_range = [
    (likert_dialogue_start, likert_turn_color),
    (comparative_start, likert_dialogue_color),
    (behavior_start, comparative_color),
    (misc_start, behavior_color),
]
xaxis_colors = {}
prev_idx = 0
for idx, color in category_range:
    for i in range(prev_idx, idx):
        xaxis_colors[i] = color
    prev_idx = idx

ax.set_xlabel("Label")
ax.set_ylabel("P-value")
# One tick per plotted label. misc_start is the position just past the last
# plotted label; len(pvalues) would also count rows of categories that are
# not plotted, leaving ticks without a label or a color.
xpos = np.arange(misc_start)
ax.set_yscale('log')
# NOTE(review): the lower limit 1e-67 is hard-coded; points below it fall
# outside the visible range -- confirm it still fits the data.
ax.set_ylim(math.pow(10, -67), 1)
ax.set_xticks(xpos)
ax.set_xticklabels(ltl+ldl+cl+bl, rotation=90)
# Color each tick label like the markers of its category.
for tickloc, ticklabel in zip(ax.get_xticks(), ax.get_xticklabels()):
    ticklabel.set_color(xaxis_colors[tickloc])
ax.set_title('Evaluation Sensitivity to Bot Differences')
ax.yaxis.grid(True)

# Lay out and show the figure
plt.tight_layout()
plt.show()

In [None]:
# ^ Plot above: the y-axis is log-scaled (0.001, 0.01, 0.1, ...).