In [476]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import random

from analysis import *
import evaluation_data_definitions as edd

import nltk

In [None]:
# Display the comparative annotations frame.
# NOTE(review): `surge_annotations_comparative` is not defined in any visible cell —
# presumably it comes from `from analysis import *` or leftover kernel state; confirm
# this cell survives Restart Kernel -> Run All.
surge_annotations_comparative

In [None]:
def to_file(f):
    """
    Decorator adding optional CSV caching to a function that returns a pandas object.

    The wrapped function accepts two extra keyword-only arguments:

    :param load: CSV path; when truthy, ``f`` is NOT called and the file is
        returned via ``pd.read_csv`` (an index saved by ``reload`` therefore
        comes back as ordinary columns)
    :param reload: CSV path; when truthy, ``f`` is called and its result is
        written there with ``to_csv`` before being returned
    :return: the wrapping function
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(f)  # keep f's __name__ / __doc__ visible on the wrapper
    def fn_to_file(*args, load=None, reload=None, **kwargs):
        if load:
            return pd.read_csv(load)
        result = f(*args, **kwargs)
        if reload:
            result.to_csv(reload)
        return result
    return fn_to_file

In [None]:
def prettify(df, float_prec=None, col_types=None, sort_by=None, to_csv=None, index=True, header=True):
    """
    Format a results frame for display/export without modifying the caller's frame.

    :param df: dataframe to format
    :param float_prec: number of decimals to round to (also used as the CSV float format); None leaves values untouched
    :param col_types: mapping of column name -> dtype to cast to
    :param sort_by: column name (or list of names) to sort by, in descending order
    :param to_csv: path to write the formatted frame to; nothing is written when falsy
    :param index: whether to write the index to the CSV
    :param header: whether to write the header row to the CSV
    :return: the formatted copy of ``df``
    """
    # Work on a copy: the casts and the sort below used to modify the caller's frame in place.
    df = df.copy()
    if col_types:
        for col, dtype in col_types.items():
            df[col] = df[col].astype(dtype)
    if sort_by:
        df = df.sort_values(sort_by, ascending=False)
    if float_prec is not None:
        df = df.round(float_prec)
    if to_csv:
        # Without a precision there is no float format to apply ("%.Nonef" is not a valid format).
        float_format = None if float_prec is None else f"%.{float_prec}f"
        df.to_csv(to_csv, float_format=float_format, header=header, index=index)
    return df

In [None]:
@to_file
def across_evaluations(annotations, evaluation_fn):
    """
    Apply one evaluation function to several inputs and stack the results.

    :param annotations: iterable of inputs (annotation dataframes) handed one by one to evaluation_fn
    :param evaluation_fn: callable taking one such input and returning a results dataframe
    :return: the concatenated results; the outermost index level is named 'round'
             and holds the integer position of the input each row came from
    """
    per_round = list(map(evaluation_fn, annotations))
    stacked = pd.concat(per_round, keys=range(len(per_round)))
    stacked.index = stacked.index.set_names('round', level=0)
    return stacked

# 3 Behavior Evaluation Procedure

### Behavior Examples

In [None]:
def get_example(
        evaluation,
        category,
        label,
        mark,
        bot=None,
        context=0,
        seed=123,
        annotations: pd.DataFrame = None
):
    """
    Render one randomly sampled annotated item as printable text.

    :param evaluation: object exposing ``dialogues`` and ``annotation_dataframe()``
    :param category: annotation category to select (index level 1 of the annotations)
    :param label: annotation label to select (index level 2 of the annotations)
    :param mark: annotation value (column 0) the sampled item must carry
    :param bot: when given, restrict the sample to this bot
    :param context: number of preceding turns to print before a sampled turn
    :param seed: random_state used for sampling
    :param annotations: annotations to sample from; defaults to ``evaluation.annotation_dataframe()``
    :return: the formatted example, or a 'No samples ...' message when nothing matches
    """
    if annotations is None:
        annotations = evaluation.annotation_dataframe()
    candidates = annotations.xs((category, label), level=(1, 2)).reset_index()
    candidates = candidates[candidates[0] == mark]
    if bot:
        candidates = candidates[candidates[sym.bot] == bot]
    try:
        chosen = candidates.sample(1, random_state=seed)
    except ValueError:
        # sampling raises ValueError when there is nothing to sample from
        return f'No samples for {category} {label} {mark} {bot}\n'
    item_id = chosen[sym.item].item()
    label_line = f'Label: {label} = {mark}\n'

    if not isinstance(item_id, tuple):
        # Dialogue-level item: print the whole conversation, one utterance per line.
        dialogue = evaluation.dialogues[item_id]
        lines = [f'{dialogue.bot}~~~']
        for pair in dialogue.turns:
            lines.append(pair.user_turn)
            lines.append(pair.bot_turn)
        lines.append(label_line)
        return '\n'.join(lines)

    # Turn-level item: (dialogue id, turn index); print the turn plus `context` turns before it.
    dialogue_id, turn_index = item_id
    first = max(0, turn_index - context)
    window = evaluation.dialogues[dialogue_id].turns[first:turn_index + 1]
    labelled_turn = window[-1]
    pieces = [f'{bot}~~~\n'] if bot else []
    for pair in window[:-1]:
        pieces.append(f'User:  {pair.user_turn}\n')
        pieces.append(f'Sys:   {pair.bot_turn}\n')
    pieces.append(f'User:  {labelled_turn.user_turn}\n')
    pieces.append(f'Sys:   {labelled_turn.bot_turn}\n')
    pieces.append(label_line)
    return ''.join(pieces)


In [None]:
# Print one example turn marked 1 for each behavior label of the Surge evaluation.
# Seeds are paired with labels positionally; zip() stops at the shorter of the two.
# NOTE(review): `behavior`, `category`, `data` and `surge_annotations` are not defined
# in the visible cells — presumably provided by `from analysis import *`; confirm.
seeds = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
for s, b in zip(seeds, behavior):
    print(get_example(
        data.surge_evaluation,
        category.behavior, b, context=0, mark=1,
        seed=s,
        annotations=surge_annotations
    ))

### Behavior Annotation Pilot Agreements

In [None]:
def agreement_dataframe(annotations, load=None, reload=None, ci=True):
    """
    Compute inter-annotator agreement per (category, label).

    :param annotations: annotation dataframe; only its first two columns are used,
                        and only rows where both are present
    :param load: CSV path; when truthy it is read and returned instead of computing
    :param reload: CSV path; when truthy the computed agreements are written there
    :param ci: forwarded to the agreement functions as their ``ci`` argument
    :return: dataframe with the Krippendorff's-alpha results next to the Fleiss'-kappa results
    """
    if load:
        return pd.read_csv(load)
    first_two = annotations.iloc[:, :2]
    paired = first_two.dropna().astype(int)
    by_label = paired.groupby(level=[sym.category, sym.label])
    # kappa is computed before alpha, as before; alpha still comes first in the output
    kappas = by_label.apply(fleiss_kappa, ci=ci)
    alphas = by_label.apply(krippendorfs_alpha, ci=ci)
    agreements = pd.concat([alphas, kappas], axis=1)
    if reload:
        agreements.to_csv(reload)
    return agreements

In [None]:
def agreement_summaries(evaluations, load=None, reload=None):
    """
    Macro-average the per-label agreement statistics of several evaluations.

    :param evaluations: iterable of evaluations exposing ``annotation_dataframe()``
    :param load: CSV path; when truthy it is read and returned instead of computing
    :param reload: CSV path; when truthy the computed summary is written there
    :return: dataframe with one row per evaluation and the columns
             Krippendorff's alpha, Fleiss' kappa and n
    """
    if load:
        return pd.read_csv(load)
    summaries = []
    for evaluation in evaluations:
        annotations = evaluation.annotation_dataframe()
        agreement = agreement_dataframe(annotations, ci=False)
        # mean over labels, ignoring labels for which a statistic is missing
        summaries.append(agreement.dropna().mean())
    sum_df = pd.concat(summaries, axis=1).transpose()
    # Plain assignment instead of set_axis(..., inplace=True): the `inplace`
    # keyword of DataFrame.set_axis was removed in pandas 2.0.
    sum_df.columns = [stat.kripp_alpha, 'x', stat.fleiss_kappa, stat.n]
    # 'x' is the second of the four averaged columns; it is not reported.
    sum_df = sum_df.drop(columns='x')
    if reload:
        # previously a no-op placeholder; persist like the other cached functions
        sum_df.to_csv(reload)
    return sum_df

In [None]:
# TODO: include ALL pilot annotations in the agreement calculation (not just the
# doubly-annotated items, which is all agreement_dataframe currently looks at).
agreement_summaries(data.annotation_pilots)

### Behavior Annotation Pilot Screening

In [None]:
@to_file
def screening_rates_by_label(evaluation: edd.OnboardingEvaluation):
    """
    Summarize onboarding (screening) attempts per task.

    :param evaluation: onboarding evaluation exposing ``dialogues`` (each with
                       ``attempts``) and ``work_units`` keyed by work unit id
    :return: dataframe indexed by task with the columns
             attempted (distinct workers with an attempt), passed (distinct
             workers with a passing attempt), mistakes (mean mistakes per
             attempt), accuracy (mean attempt performance, or None if any
             attempt has no performance) and n (number of attempts)
    """
    perfs = {}
    workers_passed = {}
    workers_attempted = {}
    # Only values the summary needs are read per attempt; the unused `round`
    # (parsed from the dialogue id, shadowing the builtin) and `labels` locals are gone.
    for dialogue in evaluation.dialogues.values():
        for attempt in dialogue.attempts:
            work_unit = evaluation.work_units[attempt.work_unit_id]
            task = work_unit.task
            worker = work_unit.worker_id
            perfs.setdefault(task, []).append((len(attempt.mistakes), attempt.performance))
            workers_attempted.setdefault(task, set()).add(worker)
            if attempt.passed:
                workers_passed.setdefault(task, set()).add(worker)
    screening = {}
    for task, ls in perfs.items():
        mistakes, accuracies = zip(*ls)
        n = len(mistakes)
        avg_m = sum(mistakes) / n
        # accuracy is only reported when every attempt has a performance value
        avg_a = (
            sum(accuracies) / len(accuracies)
            if all(a is not None for a in accuracies) else None
        )
        screening[task] = {
            'attempted': len(workers_attempted.get(task, ())),
            'passed': len(workers_passed.get(task, ())),
            'mistakes': avg_m, 'accuracy': avg_a, 'n': n
        }
    return pd.DataFrame(screening.values(), screening)

In [None]:
# Screening rates for the annotation-pilot onboarding rounds at positions 2 and 3.
# `reload` recomputes the table and writes it to the CSV.
across_evaluations(
    data.annotation_pilots_onboarding[2:4],
    screening_rates_by_label,
    reload='results/annotation_pilot_screening.csv'
)

# 4 Model Selection

### Bot Pilot Summary Statistics

In [None]:
@to_file
def interactor_summary_stats(evaluation: edd.Evaluation):
    """
    Summary statistics of the conversations collected in one evaluation.

    :param evaluation: evaluation exposing ``dialogues`` (each with ``turns``
                       holding a ``user_turn``) and ``work_units`` (each with a ``worker_id``)
    :return: single-column dataframe indexed by statistic name: number of dialogues,
             mean turns per dialogue (two per turn pair), mean user turn length
             in NLTK word tokens, and number of distinct interactors
    """
    dialogues = list(evaluation.dialogues.values())
    num_dialogues = len(evaluation.dialogues)
    turn_pairs = sum(len(d.turns) for d in dialogues)
    # every turn pair contributes a user turn and a bot turn
    mean_turns = 2 * turn_pairs / num_dialogues

    user_tokens = 0
    for d in dialogues:
        for t in d.turns:
            user_tokens += len(nltk.word_tokenize(t.user_turn))
    user_turn_len = user_tokens / turn_pairs

    interactors = set()
    for unit in evaluation.work_units.values():
        interactors.add(unit.worker_id)

    summary = {
        'dialogues': num_dialogues,
        'mean turns': mean_turns,
        'user turn length': user_turn_len,
        'interactors': len(interactors),
    }
    return pd.DataFrame(summary.values(), summary)

In [None]:
# Because `load` is given, the cached CSV is returned and nothing is recomputed.
across_evaluations(
    data.bot_pilots, interactor_summary_stats,
    load='results/bot_pilot_summary.csv'
)

### Bot Pilot Likert Quality

In [None]:
@to_file
def evaluate_interactive_likert(annotations):
    """
    Mean dialogue-level Likert rating (with confidence interval) per bot and label.

    :param annotations: annotation dataframe indexed by (at least) category, bot and label
    :return: result of ``mean_and_ci`` applied to each (bot, label) group of the
             likert-dialogue annotations
    """
    dialogue_likert = annotations.xs(category.likert_dialogue, level=sym.category)
    per_bot_label = dialogue_likert.groupby(level=[sym.bot, sym.label])
    return per_bot_label.apply(mean_and_ci)

# Recompute the per-bot Likert means of the first bot pilot (written to CSV via
# `reload`), keep only the quality label, then round/sort and export it.
qdf = evaluate_interactive_likert(
    data.bot_pilots[0].annotation_dataframe(),
    reload='results/bot_pilot_interactive_likert.csv'
).xs(scale.quality, level=sym.label)
qdf = prettify(qdf, float_prec=3, col_types={"n": "int"}, sort_by="mean", to_csv="results/paper/bot_pilot_interactive_likert_quality.csv")
qdf

### Bot Pilot Comparative Quality

In [None]:
def get_singly_annotated(df: pd.DataFrame, seed=None):
    """
    Reduce an annotation frame to exactly one integer annotation per item.

    Only the first two annotation columns are considered. Items with a single
    annotation keep it; for items annotated twice, one of the two annotations
    is picked at random.

    :param df: annotation dataframe whose columns are labelled 0, 1, ...
    :param seed: seed for the random pick; None gives a different pick on every call
    :return: single-column integer dataframe, singly-annotated items first,
             followed by the doubly-annotated ones
    """
    if len(df.columns) == 1:
        return df.astype(int)
    # A private generator yields the same draws as seeding the module-level one,
    # but never touches global `random` state — so nothing has to be restored,
    # even if an exception is raised part-way through.
    rng = random.Random(seed)
    df = df.iloc[:, :2]
    mask = df[1].isna()
    singly_annotated = df.iloc[:, 0][mask]
    doubly_annotated = df[~mask]
    # one coin flip per doubly-annotated row: 0 -> first annotation, 1 -> second
    selection = [rng.randint(0, 1) for _ in range(len(doubly_annotated))]
    indices = list(range(len(doubly_annotated)))
    select_annotated = doubly_annotated.values[indices, selection]
    select_annotated = pd.DataFrame(select_annotated, index=doubly_annotated.index)
    annotations = pd.concat((singly_annotated, select_annotated))
    return annotations.astype(int)

In [None]:
@to_file
def evaluate_comparisons(annotations):
    """
    Win / tie / lose proportions of comparative annotations.

    :param annotations: comparative annotation dataframe indexed by (at least)
                        bot, compared bot and label, with values -1 (lose), 0 (tie), 1 (win)
    :return: dataframe indexed by (bot, compared bot, label); each bot's rows
             against all opponents pooled (compared bot = 'others') come first,
             followed by the rows per individual opponent
    """
    single_annotated = get_singly_annotated(annotations)
    outcomes = {-1: 'lose', 0: 'tie', 1: 'win'}

    def outcome_proportions(group_levels):
        # one block of prop_and_ci columns per outcome, placed side by side
        blocks = []
        for value, outcome in outcomes.items():
            indicator = (single_annotated == value).astype(int)
            props = indicator.groupby(level=group_levels).apply(prop_and_ci)
            props.rename(columns={stat.proportion: outcome}, inplace=True)
            blocks.append(props)
        return pd.concat(blocks, axis=1)

    levels = [sym.bot, sym.bot_cmp, sym.label]
    result = outcome_proportions(levels)
    result_vs_all = outcome_proportions([sym.bot, sym.label])

    # give the pooled rows the same three index levels as the pairwise rows
    result_vs_all = result_vs_all.assign(**{sym.bot_cmp: 'others'})
    result_vs_all = result_vs_all.set_index(sym.bot_cmp, append=True)
    result_vs_all = result_vs_all.reset_index().set_index(levels)
    return pd.concat((result_vs_all, result))

# Comparative annotations of the first bot pilot.
cmp_annot = data.bot_pilots[0].comparative_annotation_dataframe()

# `reload` recomputes the proportions and writes them to the CSV. The random
# pick among double annotations is unseeded here, so reruns may differ.
cmp = evaluate_comparisons(
    cmp_annot,
    reload='results/bot_pilot_interactive_comparative.csv'
)
cmp

### Bot Pilot Conversation Examples

In [None]:
# Print, per bot, one conversation from the first bot pilot whose dialogue-level
# quality rating is 1. zip() pairs seeds with bots and stops at the shorter one.
seeds = [1, 1, 1, 1, 1, 1, 1]
for s, b in zip(seeds, bot):
    example = get_example(
        data.bot_pilots[0],
        category.likert_dialogue, label=scale.quality, bot=b, context=0, mark=1,
        seed=s
    )
    print(example)

# 5 Conversation Collection

In [None]:
# Placeholder cell: it holds only a string literal, no timing is computed here.
"""
Time results to collect conversations
"""

### Conversation Data Summary Statistics

In [None]:
# Because `load` is given, the cached CSV is returned and the statistics are not recomputed.
df = interactor_summary_stats(
    data.dialogue_collection,
    load='results/conversation_summary_stats.csv'
)
# Round to 3 decimals and export without index or header row.
df = prettify(df, float_prec=3, to_csv="results/paper/conversation_data_summary.csv", index=False, header=False)
df

# 6 Evaluation

In [None]:
# Placeholder cell: it holds only a string literal, no timing is computed here.
"""
Timing results for training and collection (per task)
"""

### Worker Group Completed Work

In [None]:
# Completed annotation work of the Surge worker group.
data.surge_evaluation.annotation_counts()

In [None]:
# Completed annotation work of the student worker group.
data.student_evaluation.annotation_counts()

In [None]:
# Completed annotation work of the MTurk worker group.
data.mturk_evaluation.annotation_counts()

### Worker Group Screening

In [None]:
# Screening rates for five onboarding evaluations; the 'round' index level of the
# result is the position (0-4) in the list below. `reload` recomputes and writes the CSV.
screening = across_evaluations(
    [data.annotation_pilots_onboarding[-2], data.student_onboarding, data.mturk_onboarding, data.annotation_pilots_onboarding[-1], data.surge_onboarding],
    screening_rates_by_label,
    reload='results/evaluation_screening.csv'
)


### Agreements

In [None]:
# Because `load` is given, the cached CSV is returned and `surge_annotations` is not used
# (the name must still exist for the call to evaluate).
agreements = agreement_dataframe(
    surge_annotations, load='results/surge_agreements.csv'
)
# Sorted descending by category, then alpha. "n.1" is presumably the second "n"
# column as renamed by read_csv — TODO confirm against the CSV.
agreements = prettify(agreements, float_prec=3, sort_by=["category", "Krippendorff's alpha"], col_types={"n": int, "n.1": int}, to_csv='results/paper/surge_agreements.csv', index=False)
agreements

In [None]:
# Build the plot
# Note: rcParams is global, so this figure size also applies to later figures
# until it is set again.
plt.rcParams["figure.figsize"] = (10,5)

fig, ax = plt.subplots()

def plot_by_category(ax, df, category, color, xaxis_start):
    """
    Draw the Krippendorff's alpha of one category's labels as points with CI error bars.

    :param ax: axes to draw on (its ``errorbar`` method is used)
    :param df: agreements frame with the columns "category",
               "Krippendorff's alpha", "CI low" and "CI high"
    :param category: value of the "category" column to plot
    :param color: colour of the points and error bars
    :param xaxis_start: x position of the first plotted label
    :return: the x position following the last plotted label
    """
    alpha_col = "Krippendorff's alpha"
    rows = df.loc[df["category"] == category]
    alpha = rows[alpha_col]
    # errorbar expects distances from the point, not absolute bounds
    err_below = alpha - rows["CI low"]
    err_above = rows["CI high"] - alpha
    xaxis_end = xaxis_start + len(rows)
    positions = np.arange(xaxis_start, xaxis_end)
    ax.errorbar(
        positions,
        alpha,
        yerr=[err_below, err_above],
        fmt='o',
        elinewidth=1,
        color=color,
    )
    return xaxis_end

likert_turn_color = "blue"
likert_dialogue_color = "red"
comparative_color = "green"
behavior_color = "orange"

# Each call plots one category and returns the x position where the next one starts.
likert_dialogue_start = plot_by_category(ax, agreements, "likert turn", likert_turn_color, 0)
comparative_start = plot_by_category(ax, agreements, "likert dialogue", likert_dialogue_color, likert_dialogue_start)
behavior_start = plot_by_category(ax, agreements, "comparative", comparative_color, comparative_start)
misc_start = plot_by_category(ax, agreements, "behavior", behavior_color, behavior_start)

# (end position, colour) per category, in plotting order. Pairs rather than a dict
# keyed by end position: two categories share an end position when one of them
# is empty, and a dict would then assign the wrong colour to the earlier range.
category_range = [
    (likert_dialogue_start, likert_turn_color),
    (comparative_start, likert_dialogue_color),
    (behavior_start, comparative_color),
    (misc_start, behavior_color),
]
# Colour of every x position, taken from the category plotted there.
xaxis_colors = {}
prev_idx = 0
for idx, color in category_range:
    for i in range(prev_idx, idx):
        xaxis_colors[i] = color
    prev_idx = idx

ax.set_ylabel("Krippendorff's alpha")
xpos = np.arange(len(agreements))
ax.set_xlabel("Evaluation Label")
ax.set_xticks(xpos)
# NOTE(review): tick labels follow the row order of `agreements`, while points are
# placed category by category — this lines up only if the frame is ordered
# likert turn, likert dialogue, comparative, behavior; confirm the sort above.
ax.set_xticklabels(agreements["label"], rotation=90)
for tickloc, ticklabel in zip(ax.get_xticks(), ax.get_xticklabels()):
    # rows of a category that was not plotted keep a neutral colour instead of raising KeyError
    ticklabel.set_color(xaxis_colors.get(int(tickloc), "black"))
ax.set_title('Interannotator Agreement')
ax.yaxis.grid(True)

# Save the figure and show
plt.tight_layout()
# plt.savefig('bar_plot_with_error_bars.png')
plt.show()

In [None]:
# Agreements of the student, MTurk and Surge evaluations (rounds 0, 1, 2 in that order).
# Because `load` is given, the cached CSV is returned and nothing is recomputed — but the
# list of annotation dataframes below is still built eagerly before the call.
all_agreements = across_evaluations(
    [
        e.annotation_dataframe() for e in
        (data.student_evaluation, data.mturk_evaluation, data.surge_evaluation)
    ],
    agreement_dataframe,
    load='results/evaluation_agreements.csv'
)
all_agreements = prettify(all_agreements, float_prec=3, sort_by=["round", "category", "Krippendorff's alpha"], col_types={"n": int, "n.1": int}, to_csv='results/paper/all_agreements.csv', index=False)
all_agreements

# 7 Comprehensive Analysis

### Likert Dialogue

In [None]:
# NOTE(review): `bots` is not referenced by any visible cell, and its entries differ
# from the display names in `bot_transformer` below — confirm whether it is still needed.
bots = ['Blender2', 'Emora', 'BartFidRAG', 'RerankBlender']
# Bar colour per bot id.
# https://blog.finxter.com/how-to-plot-matplotlibs-color-palette-and-choose-your-plot-color/
graphing_bot_colors = {
    'blender2_3B': 'purple',
    'bart_fid_rag_bcb': 'royalblue',
    'emora': 'turquoise',
    'rerank_blender': 'green'
}
# Bot id -> display name used in plot legends.
bot_transformer = {
    'blender2_3B': 'Blender2',
    'emora': 'Emora',
    'rerank_blender': 'Blender-Decode',
    'bart_fid_rag_bcb': 'BART-FiDRAG'
}
# Likert dimension label -> two-letter abbreviation used as x tick label.
dimensions_transformer = {
    'consistent': 'CO',
    'emotional': 'EU',
    'engaging': 'EN',
    'grammatical': 'GR',
    'informative': 'IN',
    'proactive': 'PR',
    'quality': 'OQ',
    'relevant': 'RE'
}

In [None]:
def grouped_barplot(df, title, ylabel, xlabel, ylim, value_col='mean', rot=45, fig_size=(10,5)):
    """
    Bar plot with one group per label and one bar (with CI error bar) per bot.

    :param df: frame with the columns 'label', 'bot', "CI low", "CI high" and
               ``value_col``; it is left unmodified
    :param title: plot title
    :param ylabel: y axis label
    :param xlabel: x axis label
    :param ylim: (low, high) limits of the y axis
    :param value_col: column holding the bar heights
    :param rot: rotation of the x tick labels
    :param fig_size: figure size; set through the global rcParams
    """
    plt.rcParams["figure.figsize"] = fig_size

    # Error-bar lengths are derived on a new frame: the caller's frame (often a
    # filtered slice of a results table) no longer gets 'lower'/'upper' columns
    # added to it, which also avoids SettingWithCopyWarning.
    df = df.assign(
        lower=df[value_col] - df["CI low"],
        upper=df["CI high"] - df[value_col],
    )
    cilow = df.pivot(index='label', columns='bot', values='lower')
    cihigh = df.pivot(index='label', columns='bot', values='upper')

    # one [lower, upper] pair of arrays per bot, in column order
    err = [[cilow[col].values, cihigh[col].values] for col in cilow]

    df0 = df.pivot(index='label', columns='bot', values=value_col)
    ax = df0.plot(
        kind='bar',
        ylim=ylim,
        title=title,
        rot=rot,
        yerr=err,
        color=[graphing_bot_colors[bot] for bot in df0.columns]
    )
    ax.legend(
        [bot_transformer[bot] for bot in df0.columns],
        ncol=2
    )
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    # Likert dimensions are abbreviated first; anything else is looked up among the
    # behaviors (behaviors_transformer is only needed, and only resolved, in that case).
    ax.set_xticklabels([dimensions_transformer[d] if d in dimensions_transformer else behaviors_transformer[d] for d in df0.index])

In [None]:
def evaluate_likert_ratings(annotations, category, load=None, reload=None):
    """
    Mean Likert rating (with confidence interval) per bot and label for one category.

    :param annotations: annotation dataframe; reduced to one annotation per item first
    :param category: annotation category to evaluate (note: shadows the module-level ``category``)
    :param load: CSV path; when truthy it is read and returned instead of computing
    :param reload: CSV path; when truthy the computed means are written there
    :return: result of ``mean_and_ci`` applied to each (bot, label) group
    """
    if load:
        return pd.read_csv(load)
    one_per_item = get_singly_annotated(annotations)
    in_category = one_per_item.xs(category, level=sym.category)
    means = in_category.groupby(level=[sym.bot, sym.label]).apply(mean_and_ci)
    if reload:
        means.to_csv(reload)
    return means

In [None]:
# Because `load` is given, the cached CSV is returned and the ratings are not recomputed.
surge_likert_dialogue_ratings = evaluate_likert_ratings(
    surge_annotations, category.likert_dialogue,
    load='results/surge_likert_dialogue_ratings.csv'
)
# Sorted descending by bot, then mean; rounded to 3 decimals and exported.
sldr = prettify(surge_likert_dialogue_ratings, float_prec=3, col_types={"n": int}, sort_by=["bot", "mean"], to_csv="results/paper/surge_likert_dialogue_ratings.csv", index=False)
sldr

In [None]:
# The y axis is restricted to 2.5-4.55, so bars do not start at zero.
grouped_barplot(sldr, title="Average Dialogue Likert Rating", ylabel="Likert Rating", xlabel='Label', ylim=(2.5,4.55), rot=0, fig_size=(10,3))

### Likert Turn

In [None]:
# Because `load` is given, the cached CSV is returned and the ratings are not recomputed.
surge_likert_turn_ratings = evaluate_likert_ratings(
    surge_annotations, category.likert_turn,
    load='results/surge_likert_turn_ratings.csv'
)
# Sorted descending by bot, then mean; rounded to 3 decimals and exported.
sltr = prettify(surge_likert_turn_ratings, float_prec=3, col_types={"n": int}, sort_by=["bot", "mean"], to_csv="results/paper/surge_likert_turn_ratings.csv", index=False)
sltr

In [None]:
# The y axis is restricted to 2.5-5.0, so bars do not start at zero.
grouped_barplot(sltr, title="Average Turn Likert Rating", ylabel="Likert Rating", xlabel='Label', ylim=(2.5,5.0), rot=0)

### Comparative

In [None]:
# `reload` recomputes the proportions and overwrites the CSV. The random pick among
# double annotations inside evaluate_comparisons is unseeded, so reruns may differ.
comparison_df = evaluate_comparisons(
    surge_annotations_comparative,
    reload='results/surge_comparisons.csv'
)
comparison_df

In [None]:
# Keep only each bot's pooled "vs. others" rows and the three outcome proportions.
is_vs_others = comparison_df.index.get_level_values('bot comp') == 'others'
# .copy() so the column assignments below write to an independent frame instead of
# a slice of comparison_df (avoids SettingWithCopyWarning).
botvothers = comparison_df.loc[is_vs_others, ['win', 'tie', 'lose']].copy()
# NOTE(review): positional columns 9 and 10 are presumably the CI bounds of the
# 'win' proportion (the last of the three outcome blocks) — verify against
# comparison_df.columns, where the CI column names repeat once per outcome.
botvothers['CI low'] = comparison_df.iloc[:, 9]
botvothers['CI high'] = comparison_df.iloc[:, 10]
# every remaining row has 'bot comp' == 'others', so the level carries no information
botvothers = botvothers.droplevel('bot comp')
toplot = botvothers.reorder_levels(['label', 'bot']).sort_index()
toplot

In [None]:
from matplotlib.text import Text

def plot_comparative(df0, title, value_col, fig_size):
    """
    Grouped, layered bar chart of win / tie / lose proportions per label and bot.

    Three bar layers are drawn on the same axes: win+tie+lose (alpha 0.2),
    win+tie (alpha 0.4) and win (opaque, with error bars), each coloured per bot.

    Side effects: sets the global figure size, adds 'lower' and 'upper' columns
    to the frame passed in, and shows the figure.

    :param df0: frame indexed by (label, bot) with the columns 'win', 'tie',
                'lose', "CI low", "CI high"
    :param title: plot title
    :param value_col: column the CI bounds belong to (used for the error-bar lengths)
    :param fig_size: figure size, set through the global rcParams
    :return: the frame unstacked on its last index level
    """
    # https://stackoverflow.com/questions/59922701/pandas-how-can-i-group-a-stacked-bar-chart
    plt.rcParams["figure.figsize"] = fig_size

    # Error-bar lengths (distance from the value to each CI bound).
    # Note: these assignments modify the caller's frame.
    df0['lower'] = df0[value_col] - df0["CI low"]
    df0['upper'] = df0["CI high"] - df0[value_col]

    errLow = df0[['lower']].reset_index(['bot', 'label']).pivot(index='label', columns='bot', values='lower')
    errHi = df0[['upper']].reset_index(['bot', 'label']).pivot(index='label', columns='bot', values='upper')

    # 4 x 2 x 8 (bots x low, hi x labels)
    err = []
    for col in errLow:
        err.append([errLow[col].values, errHi[col].values])

    # Move the last index level into the columns: columns become (value name, bot) pairs.
    df0 = df0.unstack(level=-1)
    fig, ax = plt.subplots()

    # Distinct second-level column values (the bots), in order of first appearance.
    groups = []
    for i in df0.columns:
        if i[1] not in groups:
            groups.append(i[1])

    # Layers are drawn tallest first so the later, shorter and more opaque bars stay visible.
    (df0['win']+df0['tie']+df0['lose']).plot(kind='bar', color=[graphing_bot_colors[i] for i in groups], alpha=0.2, rot=0, ax=ax)
    (df0['win']+df0['tie']).plot(kind='bar', color=[graphing_bot_colors[i] for i in groups], alpha=0.4, rot=0, ax=ax)
    df0['win'].plot(kind='bar', color=[graphing_bot_colors[i] for i in groups], rot=0, ax=ax, yerr=err)

    h, l = ax.get_legend_handles_labels()
    # Group the legend handles by bot. zip() receives the original h and l lists
    # before the loop rebinds those names to single elements.
    markers = {}
    for h, l, (wtl, bot) in zip(h, l, df0.columns):
        markers.setdefault(bot, []).append((h,l))
    # Invisible line handles used as empty legend cells; each list repeats ONE handle four times.
    wtl_dummies = [plt.plot([],marker="", ls="")[0]]*4
    bot_dummies = [plt.plot([],marker="", ls="")[0]]*4
    # `handles` is the same list object as wtl_dummies, so the appends below extend it.
    handles = wtl_dummies
    labels = ["", "Lose:", "Tie:", "Win:"]
    # One legend column per bot: the bot name on a dummy handle, then its bar handles without text.
    for i, (bot, symbols) in enumerate(markers.items()):
        handles.append(bot_dummies[i])
        labels.append(bot_transformer[bot])
        handles.extend([s[0] for s in symbols])
        labels.extend(["" for s in symbols])
    leg = plt.legend(handles, labels, ncol=5, loc='upper right', bbox_to_anchor=(0.67, -0.35), labelspacing=0.25)
    # NOTE(review): `_legend_handle_box` is a private matplotlib attribute; this
    # width tweaking may break with a different matplotlib version.
    for i, vpack in enumerate(leg._legend_handle_box.get_children()):
        if i == 0: # row titles
            for hpack in vpack.get_children():
                hpack.get_children()[0].set_width(0)
        else:
            for j, hpack in enumerate(vpack.get_children()):
                if j > 0: # bot win/tie/lose markers
                    hpack.get_children()[0].get_children()[0].set_width(50)
                else: # column titles
                    hpack.get_children()[0].set_width(0)
    ax.set_title(title)
    ax.set_ylabel('Proportion')
    ax.set_xlabel('Label')
    ax.set_xticklabels([dimensions_transformer[d] for d in df0.index])

    plt.tight_layout()
    plt.show()
    return df0

In [None]:
# Note: plot_comparative adds 'lower'/'upper' columns to `toplot`, and `df` is
# rebound here to the unstacked frame it returns.
df = plot_comparative(toplot, 'Comparative Evaluation Results', 'win', (15,5))

### Behaviors

In [None]:
def evaluate_behavior_rates(annotations, load=None, reload=None):
    """
    Proportion of turns showing each behavior (with confidence interval) per bot and label.

    :param annotations: annotation dataframe; reduced to one annotation per item first
    :param load: CSV path; when truthy it is read and returned instead of computing
    :param reload: CSV path; when truthy the computed rates are written there
    :return: result of ``prop_and_ci`` applied to each (bot, label) group of the
             behavior annotations
    """
    if load:
        return pd.read_csv(load)
    one_per_item = get_singly_annotated(annotations)
    behaviors = one_per_item.xs(category.behavior, level=sym.category)
    rates = behaviors.groupby(level=[sym.bot, sym.label]).apply(prop_and_ci)
    if reload:
        rates.to_csv(reload)
    return rates

In [None]:
# Because `load` is given, the cached CSV is returned and the rates are not recomputed.
surge_behavior_rates = evaluate_behavior_rates(
    surge_annotations,
    load='results/surge_behavior_rates.csv'
)
# Sorted descending by bot, then proportion; rounded to 3 decimals and exported.
sbr = prettify(surge_behavior_rates,  float_prec=3, col_types={"n": int}, sort_by=["bot", "proportion"], to_csv="results/paper/surge_behavior_rates.csv", index=False)
sbr

In [None]:
# Behavior label -> abbreviation used as x tick label. grouped_barplot reads this
# global for any label that is not a Likert dimension, so it must be defined
# before the behavior plots below are drawn.
behaviors_transformer = {
    'correct fact': 'CF',
    'empathetic': 'EM',
    'follow up': 'FU',
    'life info': 'LI',
    'preference info': 'PI',
    'uninterpretable': 'UI',
    'antisocial': 'AS',
    'commonsense contradiction': 'CC',
    'ignore': 'IG',
    'incorrect fact': '~CF',
    'irrelevant': 'IR',
    'lack of empathy': '~EM',
    'partner contradiction': 'PC',
    'redundant': 'RD',
    'self contradiction': 'SC',
    'topic switch': 'TS'
}

# Behaviors where a higher rate is better.
to_maximize = {'correct fact', 'empathetic', 'follow up', 'life info', 'preference info'}
maximize = sbr[sbr['label'].isin(to_maximize)]
grouped_barplot(maximize, title="Rates of Desirable Behaviors", ylabel="Behavior", xlabel='Label', ylim=(0,0.7), value_col="proportion", rot=0, fig_size=(20,5))

# Behaviors where a lower rate is better.
to_minimize = {'uninterpretable', 'antisocial', 'commonsense contradiction', 'ignore', 'incorrect fact', 'irrelevant', 'lack of empathy', 'partner contradiction', 'redundant', 'self contradiction', 'topic switch'}
minimize = sbr[sbr['label'].isin(to_minimize)]
grouped_barplot(minimize, title="Rates of Undesirable Behaviors", ylabel="Behavior", xlabel='Label', ylim=(0,0.35), value_col="proportion", rot=0, fig_size=(20,5))

# 8 Evaluation Metric Assessment

### Metric Sensitivity

In [518]:
from itertools import combinations
from scipy.stats import ttest_ind

def t_tests(df: pd.DataFrame):
    """
    Independent two-sample t-test between every pair of bots.

    :param df: (bot, data point) x 1 -> score
    :return: p values of test on each bot pair (pd.Series), keyed by
             (first bot, second bot) with bots ordered by first appearance in df
    """
    # Unique bots in order of first appearance. A set was used here before; its
    # iteration order for strings varies between interpreter runs, so the order of
    # the pairs (and which bot comes first within a pair) was not reproducible.
    bots = list(dict.fromkeys(df.index.get_level_values(0)))
    result = {}
    for ba, bb in combinations(bots, 2):
        a = df.xs(ba).to_numpy().squeeze()
        b = df.xs(bb).to_numpy().squeeze()
        _, p = ttest_ind(a, b)
        result[(ba, bb)] = p
    # tuple keys become a two-level index
    result_series = pd.Series(result.values(), result)
    return result_series

# Pairwise bot t-tests on the turn-level Likert ratings, one row of p values per label.
# The pick among double annotations is unseeded here (seed defaults to None),
# so the p values may change slightly between runs.
get_singly_annotated(surge_annotations).xs(
    category.likert_turn,
    level=sym.category
).groupby(
    sym.label
).apply(
    t_tests
)

Unnamed: 0_level_0,blender2_3B,blender2_3B,blender2_3B,emora,emora,rerank_blender
Unnamed: 0_level_1,emora,rerank_blender,bart_fid_rag_bcb,rerank_blender,bart_fid_rag_bcb,bart_fid_rag_bcb
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
consistent,0.3202258,0.397445,0.001641548,0.06150188,3.082214e-05,0.01828072
emotional,1.45859e-12,0.4873241,8.042664e-10,3.0898e-10,0.5902241,6.072199e-08
engaging,0.03388186,5.737756e-05,1.548385e-13,0.0445063,4.034663e-08,0.000571606
grammatical,1.259973e-05,8.340986e-83,9.079602e-30,5.853912e-61,1.514652e-15,1.987924e-16
informative,2.358728e-10,7.32129e-15,2.114359e-20,1.607407e-40,9.844346e-49,0.1410483
proactive,1.599599e-05,0.08555421,8.080369e-51,0.01403651,2.4659459999999997e-78,7.902666e-58
quality,4.717359e-08,4.606636e-27,2.810324e-22,1.950697e-07,9.602646e-06,0.5985842
relevant,8.285739e-17,7.896203e-23,2.426902e-09,0.1561723,0.03532057,0.0004657267


### Predictive Validity

In [519]:
from statsmodels.miscmodels.ordinal_model import OrderedModel

def regressions(df, quality_column_name=None):
    """
    Fit an ordinal logit model predicting the quality column from all other columns.

    :param df: dialogue x (*features, quality) -> value
    :param quality_column_name: name of the target column; defaults to the last column
    :return: pd.Series holding the fitted coefficient of every feature followed by
             McFadden's pseudo-R-squared
    """
    target = quality_column_name if quality_column_name else df.columns[-1]
    outcome = df[target]
    features = [col for col in df.columns if col != target]
    fitted = OrderedModel(outcome, df[features], distr='logit').fit()
    result = {}
    for feature in features:
        result[feature] = fitted.params[feature]
    result[stat.mcfad_r2] = fitted.prsquared
    return pd.Series(result.values(), result)

def dialogue_metrics(ev):
    """
    Build per-dialogue metric scores joined with per-dialogue quality ratings.

    :param ev: evaluation exposing ``annotation_dataframe()``
    :return: two dataframes with the same (category, label, dialogue) rows and a
             'score' column: the first additionally carries the dialogue-level
             Likert quality, the second the dialogue's mean turn-level Likert
             quality, both in a 'quality' column
    """
    df: pd.DataFrame = ev.annotation_dataframe()
    # fixed seed so the pick among double annotations is reproducible
    df = get_singly_annotated(df, seed=123)
    reindexed = df.reset_index()
    items = reindexed[sym.item]
    # An item is either a dialogue id or a tuple whose first element is the dialogue id.
    dialogues = [e[0] if isinstance(e, tuple) else e for e in items]
    reindexed['dialogue'] = dialogues
    # verify_integrity raises if the new index contains duplicate entries
    reindexed.set_index(
        [sym.bot, sym.category, sym.label, 'dialogue', sym.item],
        inplace=True, verify_integrity=True
    )
    # Dialogue-level Likert: already one score per (label, dialogue).
    ld = reindexed.xs(category.likert_dialogue, level=sym.category)
    ld = ld.droplevel(sym.bot).droplevel(sym.item)
    ld.columns = ['score']
    ldq = ld.xs(scale.quality, level=sym.label)
    ldq.columns = ['quality']

    # Turn-level Likert: averaged over the turns of each dialogue.
    lt = reindexed.xs(category.likert_turn, level=sym.category)
    lt = lt.groupby([sym.label, 'dialogue']).mean()
    lt.columns = ['score']
    ltq = lt.xs(scale.quality, level=sym.label)
    ltq.columns = ['quality']

    # Behaviors: averaged over the turns of each dialogue.
    be = reindexed.xs(category.behavior, level=sym.category)
    be = be.groupby([sym.label, 'dialogue']).mean()
    be.columns = ['score']

    # Stack the three metric families under a leading category level.
    ds = pd.concat(
        [lt, be, ld],
        keys=[category.likert_turn, category.behavior, category.likert_dialogue],
        names=[sym.category, sym.label, 'dialogue']
    )
    # Attach each dialogue's quality rating to every metric row of that dialogue.
    likert_dialogue_quality_features = ds.join(ldq, on='dialogue')
    likert_turn_quality_features = ds.join(ltq, on='dialogue')
    return likert_dialogue_quality_features, likert_turn_quality_features

@to_file
def dialogue_quality_regressions(ev):
    """
    Regress dialogue-level Likert quality on each metric separately.

    :param ev: evaluation passed on to ``dialogue_metrics``
    :return: one ``regressions`` result per (category, label) metric; only the
             dialogue-level quality features are used, the turn-level ones are ignored
    """
    dialogue_quality_features, _ = dialogue_metrics(ev)
    per_metric = dialogue_quality_features.groupby([sym.category, sym.label])
    return per_metric.apply(regressions)

# The string literal below is empty and has no effect.
"""

"""

# Because `load` is given, the cached CSV is returned and no model is fitted.
dialogue_quality_regressions(
    data.surge_evaluation,
    load='results/dialogue_quality_regressions.csv'
)

Unnamed: 0,category,label,score,McFadden's pseudo-R-squared
0,likert turn,consistent,0.437186,0.008301
1,likert turn,emotional,0.459415,0.010492
2,likert turn,engaging,0.364656,0.009
3,likert turn,grammatical,-0.038475,5.4e-05
4,likert turn,informative,-0.024559,3.3e-05
5,likert turn,proactive,0.418151,0.011262
6,likert turn,quality,0.451505,0.009052
7,likert turn,relevant,0.411045,0.010554
8,behavior,antisocial,6.022125,0.00119
9,behavior,commonsense contradiction,-3.765228,0.024278


### Agreement Between Static and Interactive Evaluators