In [1]:
import pandas as pd

import random

from analysis import *
import evaluation_data_definitions as edd

import nltk

In [2]:
# Build the two annotation frames of the Surge evaluation that later cells reuse.
# NOTE(review): `data` is not defined in this notebook; presumably it comes from
# `from analysis import *` -- confirm.
surge_annotations = data.surge_evaluation.annotation_dataframe()
surge_annotations_comparative = data.surge_evaluation.comparative_annotation_dataframe()

# Last expression of the cell: shows the annotation frame as the cell output.
surge_annotations

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1,2
bot,category,label,item,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
rerank_blender,likert dialogue,informative,"(1042,38)_rerank_blender",2,4.0,
rerank_blender,likert dialogue,grammatical,"(1042,38)_rerank_blender",3,4.0,
rerank_blender,likert dialogue,emotional,"(1042,38)_rerank_blender",4,4.0,
rerank_blender,likert dialogue,relevant,"(1042,38)_rerank_blender",4,3.0,
rerank_blender,likert dialogue,quality,"(1042,38)_rerank_blender",2,3.0,
rerank_blender,...,...,...,...,...,...
rerank_blender,behavior,incorrect fact,"((441,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,antisocial,"((441,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,redundant,"((441,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,self contradiction,"((441,26)_rerank_blender, 14)",0,,


In [3]:
def to_file(f):
    """
    Decorator adding optional CSV caching to a function that returns a dataframe.

    The wrapped function accepts two extra keyword-only arguments:

    :param load: if truthy, f is not called and pd.read_csv(load) is returned
    :param reload: if truthy, f is called and its result is written with
                   result.to_csv(reload) before being returned
    :return: wrapper around f that keeps f's name and docstring

    Note: a loaded frame is read with the pandas defaults, so an index written
    by to_csv comes back as ordinary columns, not as the index.
    """
    # Imported locally so the decorator does not rely on a notebook-level import.
    from functools import wraps

    @wraps(f)
    def fn_to_file(*args, load=None, reload=None, **kwargs):
        if load:
            # Cached path: the wrapped function is skipped entirely.
            return pd.read_csv(load)
        result = f(*args, **kwargs)
        if reload:
            result.to_csv(reload)
        return result
    return fn_to_file

In [4]:
@to_file
def across_evaluations(annotations, evaluation_fn):
    """
    Run one evaluation function over several inputs and stack the results.

    :param annotations: iterable of inputs, each passed on its own to evaluation_fn
    :param evaluation_fn: function (input is one element of annotations, output is results df)
    :return: all results concatenated, with an outermost index level named
             'round' holding the integer position of the input
    """
    per_round = []
    for annotation in annotations:
        per_round.append(evaluation_fn(annotation))
    round_ids = range(len(per_round))
    stacked = pd.concat(per_round, keys=round_ids)
    # Name the level created by `keys` so it can be selected as 'round'.
    stacked.index.set_names('round', level=0, inplace=True)
    return stacked

# 3 Behavior Evaluation Procedure

### Behavior Examples

In [5]:
def get_example(
        evaluation,
        category,
        label,
        mark,
        bot=None,
        context=0,
        seed=123,
        annotations: pd.DataFrame = None
):
    """
    Pick one annotated item at random and render it as printable text.

    :param evaluation: object providing annotation_dataframe() and dialogues
    :param category: annotation category to select (index level 1)
    :param label: annotation label to select (index level 2)
    :param mark: value the first annotation column (0) must equal
    :param bot: if truthy, only items of this bot are considered
    :param context: number of preceding turns to print for turn-level items
    :param seed: random_state used when sampling the example
    :param annotations: annotation frame to use; built from evaluation when None
    :return: formatted example string, or a 'No samples ...' message when
             sampling raises ValueError
    """
    if annotations is None:
        annotations = evaluation.annotation_dataframe()
    candidates = annotations.xs((category, label), level=(1, 2)).reset_index()
    candidates = candidates[candidates[0] == mark]
    if bot:
        candidates = candidates[candidates[sym.bot] == bot]
    try:
        chosen = candidates.sample(1, random_state=seed)
    except ValueError:
        return f'No samples for {category} {label} {mark} {bot}\n'
    item_id = chosen[sym.item].item()

    if not isinstance(item_id, tuple):
        # Non-tuple id: the item is a whole dialogue, printed utterance by utterance.
        dialogue = evaluation.dialogues[item_id]
        utterances = []
        for turn_pair in dialogue.turns:
            utterances.append(turn_pair.user_turn)
            utterances.append(turn_pair.bot_turn)
        return '\n'.join([f'{dialogue.bot}~~~', *utterances, f'Label: {label} = {mark}\n'])

    # Tuple id: (dialogue id, turn index); print the turn plus `context` turns before it.
    dialogue_id, turn_index = item_id
    first_shown = max(0, turn_index - context)
    window = evaluation.dialogues[dialogue_id].turns[first_shown:turn_index + 1]
    header = f'{bot}~~~\n' if bot else ''
    context_lines = []
    for earlier in window[:-1]:
        context_lines.append(
            f'User:  {earlier.user_turn}\n'
            f'Sys:   {earlier.bot_turn}\n'
        )
    labelled = window[-1]
    labelled_lines = (
        f'User:  {labelled.user_turn}\n'
        f'Sys:   {labelled.bot_turn}\n'
        f'Label: {label} = {mark}\n'
    )
    return header + ''.join(context_lines) + labelled_lines


In [6]:
# Print one positively marked (mark=1) turn example per behavior label, without
# extra context turns. zip() stops at the shorter of `seeds` and `behavior`.
# NOTE(review): `behavior`, `category` and `data` are not defined in this
# notebook; presumably they come from `from analysis import *` -- confirm.
seeds = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
for s, b in zip(seeds, behavior):
    print(get_example(
        data.surge_evaluation,
        category.behavior, b, context=0, mark=1,
        seed=s,
        annotations=surge_annotations
    ))

User:  Yeah, I guess a lot of them are pretty sad people too just looking for someway to feel something 
Sys:   Yeah, that's what I think too. I think it's because they don't have anything going on in their lives.
Label: antisocial = 1

User:  I don't have children. I'm still in my teens.
Sys:   I bet it is challenging. I bet they are fun though so that is nice. Do you play any sports?
Label: commonsense contradiction = 1

User:  At the moment, I only have a dog
Sys:   What kind of dog do you have? My dog is a labrador retriever.
Label: partner contradiction = 1

User:  Thank you! I hope I get to go to the bahamas soon 
Sys:   I hope so too! I would love to visit there someday! The bahamas has such a beautiful tropical climate, and the climate is moderated by the gulf stream, which helps keep the climate so warm and humid!
Label: self contradiction = 1

User:  I see.
Sys:   That's awesome! Do you have a favorite song from joywave's second album? I love "content".
Label: ignore = 1

Use

### Behavior Annotation Pilot Agreements

In [7]:
def agreement_dataframe(annotations, load=None, reload=None, ci=True):
    """
    Compute inter-annotator agreement for every (category, label) group.

    :param annotations: annotation frame; only its first two columns are used
                        and rows missing either of them are dropped
    :param load: if truthy, return pd.read_csv(load) instead of computing
    :param reload: if truthy, write the computed frame to this path as CSV
    :param ci: forwarded to krippendorfs_alpha and fleiss_kappa
    :return: frame with the alpha results followed by the kappa results
    """
    if load:
        return pd.read_csv(load)
    first_two_annotators = annotations.iloc[:, :2]
    complete_pairs = first_two_annotators.dropna().astype(int)
    by_label = complete_pairs.groupby(level=[sym.category, sym.label])
    # Kappa is computed before alpha, although alpha comes first in the output.
    kappa_table = by_label.apply(fleiss_kappa, ci=ci)
    alpha_table = by_label.apply(krippendorfs_alpha, ci=ci)
    agreements = pd.concat([alpha_table, kappa_table], axis=1)
    if reload:
        agreements.to_csv(reload)
    return agreements

In [8]:
def agreement_summaries(evaluations, load=None, reload=None):
    """
    Summarise annotator agreement for each evaluation as one row of macro averages.

    :param evaluations: iterable of objects providing annotation_dataframe()
    :param load: if truthy, return pd.read_csv(load) instead of computing
    :param reload: if truthy, write the summary frame to this path as CSV
    :return: frame with one row per evaluation and the columns
             stat.kripp_alpha, stat.fleiss_kappa and stat.n
    """
    if load:
        return pd.read_csv(load)
    summaries = []
    for evaluation in evaluations:
        annotations = evaluation.annotation_dataframe()
        agreement = agreement_dataframe(annotations, ci=False)
        # Macro average over labels; labels with missing agreement values are skipped.
        summaries.append(agreement.dropna().mean())
    sum_df = pd.concat(summaries, axis=1).transpose()
    # Plain column assignment replaces set_axis(..., inplace=True), whose
    # `inplace` argument no longer exists in pandas 2.x.
    # The second column is labelled 'x' only so that it can be dropped.
    # NOTE(review): presumably it duplicates the count kept as stat.n -- confirm.
    sum_df.columns = [stat.kripp_alpha, 'x', stat.fleiss_kappa, stat.n]
    sum_df = sum_df.drop('x', axis=1)
    if reload:
        # Previously a no-op placeholder; the summary is now actually saved.
        sum_df.to_csv(reload)
    return sum_df

In [9]:
# TODO: include ALL pilot annotations in the agreement calculation (not just
# the doubly annotated items).
# Agreement summary with one row per annotation pilot.
agreement_summaries(data.annotation_pilots)

Unnamed: 0,Krippendorff's alpha,Fleiss' kappa,n
0,0.112585,0.105706,65.0
1,0.377984,0.356535,15.0
2,0.182412,0.154556,15.5
3,0.261712,0.172157,120.486486
4,0.351674,0.29475,41.222222


### Behavior Annotation Pilot Screening

In [10]:
@to_file
def screening_rates_by_label(evaluation: edd.OnboardingEvaluation):
    """
    Aggregate onboarding attempts per task.

    :param evaluation: onboarding evaluation providing dialogues (each with
                       attempts) and work_units keyed by work unit id
    :return: dataframe indexed by task with the columns
             attempted (number of distinct worker ids),
             passed (see note below),
             mistakes (mean number of mistakes per attempt),
             accuracy (mean attempt performance, or None if any is None),
             n (number of attempts)
    """
    perfs = {}
    # NOTE(review): nothing in this function ever adds to workers_passed, so the
    # 'passed' column is always 0. The pass criterion is not visible here;
    # populate this mapping once it is known.
    workers_passed = {}
    workers_attempted = {}
    for dialogue in evaluation.dialogues.values():
        for attempt in dialogue.attempts:
            work_unit = evaluation.work_units[attempt.work_unit_id]
            task = work_unit.task
            perfs.setdefault(task, []).append(
                (len(attempt.mistakes), attempt.performance)
            )
            workers_attempted.setdefault(task, set()).add(work_unit.worker_id)
    screening = {}
    for task, ls in perfs.items():
        mistakes, accuracies = zip(*ls)
        n = len(mistakes)
        # A single missing performance value makes the task's mean undefined.
        avg_a = (
            sum(accuracies) / len(accuracies)
            if all(a is not None for a in accuracies) else None
        )
        screening[task] = {
            'attempted': len(workers_attempted.get(task, ())),
            'passed': len(workers_passed.get(task, ())),
            'mistakes': sum(mistakes) / n,
            'accuracy': avg_a,
            'n': n,
        }
    return pd.DataFrame(screening.values(), screening)

In [11]:
# Screening statistics for the onboarding pilots at positions 2 and 3, stacked
# by round; `reload` recomputes them and writes the result to the CSV below.
across_evaluations(
    data.annotation_pilots_onboarding[2:4],
    screening_rates_by_label,
    reload='results/annotation_pilot_screening.csv'
)

Unnamed: 0_level_0,Unnamed: 1_level_0,attempted,passed,mistakes,accuracy,n
round,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,interpretability,4,0,0.25,0.979167,4
0,commonsense,4,0,2.0,0.856456,8
0,consistency,4,0,5.5,0.647395,10
0,transitions,4,0,6.0,0.660621,11
0,knowledge,4,0,2.666667,0.775214,12
0,sociality,4,0,0.4,0.96,5
1,interpretability,5,0,1.0,0.91342,7
1,commonsense,7,0,3.1,0.777473,10
1,consistency,4,0,5.090909,0.672116,11
1,personal_information,8,0,3.5,0.766667,16


# 4 Model Selection

### Bot Pilot Summary Statistics

In [12]:
@to_file
def interactor_summary_stats(evaluation: edd.Evaluation):
    """
    Summarise the size of a dialogue collection.

    :param evaluation: evaluation providing dialogues (each with turns) and work_units
    :return: single-column dataframe indexed by statistic name:
             'dialogues', 'mean turns', 'user turn length', 'interactors'
    """
    dialogues = list(evaluation.dialogues.values())
    num_dialogues = len(evaluation.dialogues)
    turn_pair_counts = [len(d.turns) for d in dialogues]

    # Each element of d.turns carries a user turn and a bot turn, hence the factor 2.
    mean_turns = sum(2 * count for count in turn_pair_counts) / num_dialogues

    user_token_total = 0
    for d in dialogues:
        for t in d.turns:
            user_token_total += len(nltk.word_tokenize(t.user_turn))
    user_turn_len = user_token_total / sum(turn_pair_counts)

    interactor_ids = set()
    for unit in evaluation.work_units.values():
        interactor_ids.add(unit.worker_id)
    num_interactors = len(interactor_ids)

    summary = {
        'dialogues': num_dialogues,
        'mean turns': mean_turns,
        'user turn length': user_turn_len,
        'interactors': num_interactors,
    }
    return pd.DataFrame(summary.values(), summary)

In [13]:
# `load` makes the to_file wrapper return the cached CSV, so the statistics are
# not recomputed from data.bot_pilots here.
across_evaluations(
    data.bot_pilots, interactor_summary_stats,
    load='results/bot_pilot_summary.csv'
)

Unnamed: 0,round,Unnamed: 1,0
0,0,dialogues,36.0
1,0,mean turns,31.388889
2,0,user turn length,8.219469
3,0,interactors,12.0
4,1,dialogues,184.0
5,1,mean turns,31.076087
6,1,user turn length,11.59986
7,1,interactors,33.0


### Bot Pilots Likert Quality

In [14]:
@to_file
def evaluate_interactive_likert(annotations):
    """
    Mean dialogue-level Likert rating per bot and label.

    :param annotations: annotation dataframe with category, bot and label index levels
    :return: result of mean_and_ci applied to each (bot, label) group of the
             category.likert_dialogue annotations
    """
    return (
        annotations
        .xs(category.likert_dialogue, level=sym.category)
        .groupby(level=[sym.bot, sym.label])
        .apply(mean_and_ci)
    )

# Compute the Likert means for the first bot pilot, save them to CSV, and
# display only the rows for the quality label.
evaluate_interactive_likert(
    data.bot_pilots[0].annotation_dataframe(),
    reload='results/bot_pilot_interactive_likert.csv'
).xs(scale.quality, level=sym.label)

Unnamed: 0_level_0,mean,CI low,CI high,n
bot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bart_fid_rag_bcb,3.4,2.7089,4.0911,10.0
blender2_3B,3.4,1.734133,5.065867,5.0
cem,1.083333,0.899918,1.266749,12.0
dukenet,1.888889,1.175595,2.602183,9.0
emora,3.5,2.727326,4.272674,10.0
rerank_blender,3.8,3.235719,4.364281,10.0
rerank_blender2,3.3,2.343215,4.256785,10.0


### Bot Pilot Comparative Quality

In [15]:
def get_singly_annotated(df: pd.DataFrame, seed=None):
    """
    Reduce annotations to a single integer annotation per item.

    Only the first two annotation columns are considered. Items whose column 1
    is missing keep their column 0 value; for items with both annotations one
    of the two is picked at random.

    :param df: annotation dataframe whose annotation columns are labelled 0, 1, ...
    :param seed: seed for the random pick; None gives a different pick each call
    :return: integer annotations, singly annotated items first, followed by the
             picks for doubly annotated items; a one-column frame is returned
             as-is, cast to int
    """
    if len(df.columns) == 1:
        return df.astype(int)
    # A private generator yields the same draws as seeding the module-level
    # one, but never touches global random state -- so nothing is left
    # reseeded if an error occurs part-way through.
    rng = random.Random(seed)
    df = df.iloc[:, :2]
    mask = df[1].isna()
    singly_annotated = df.iloc[:, 0][mask]
    doubly_annotated = df[~mask]
    row_count = len(doubly_annotated)
    selection = [rng.randint(0, 1) for _ in range(row_count)]
    indices = list(range(row_count))
    # Pairwise fancy indexing: row i contributes the value in column selection[i].
    select_annotated = doubly_annotated.values[indices, selection]
    select_annotated = pd.DataFrame(select_annotated, index=doubly_annotated.index)
    annotations = pd.concat((singly_annotated, select_annotated))
    return annotations.astype(int)

In [16]:
@to_file
def evaluate_comparisons(annotations):
    """
    Proportion of lost, tied and won comparisons per bot and label.

    Proportions are computed for every (bot, compared bot, label) group and
    again for every (bot, label) group; the latter rows get the compared-bot
    value 'others' and are placed first in the result.

    :param annotations: comparative annotation dataframe with values -1, 0, 1
    :return: dataframe indexed by (bot, compared bot, label) whose columns are
             the prop_and_ci outputs for 'lose', 'tie' and 'win'
    """
    outcomes = {-1: 'lose', 0: 'tie', 1: 'win'}
    # NOTE: no seed is passed, so the annotator kept for doubly annotated
    # items can differ between runs.
    single_annotated = get_singly_annotated(annotations)

    def outcome_proportions(group_levels):
        # One block of prop_and_ci columns per outcome, side by side.
        blocks = []
        for value, name in outcomes.items():
            is_outcome = (single_annotated == value).astype(int)
            props = is_outcome.groupby(level=group_levels).apply(prop_and_ci)
            props = props.rename(columns={stat.proportion: name})
            blocks.append(props)
        return pd.concat(blocks, axis=1)

    index_levels = [sym.bot, sym.bot_cmp, sym.label]
    result = outcome_proportions([sym.bot, sym.bot_cmp, sym.label])
    result_vs_all = outcome_proportions([sym.bot, sym.label])
    # Give the pooled rows the same three index levels as the pairwise rows.
    result_vs_all = (
        result_vs_all
        .assign(**{sym.bot_cmp: 'others'})
        .set_index(sym.bot_cmp, append=True)
        .reset_index()
        .set_index(index_levels)
    )
    return pd.concat((result_vs_all, result))

# Comparative annotations of the first bot pilot.
cmp_annot = data.bot_pilots[0].comparative_annotation_dataframe()

# `load` returns the cached CSV; cmp_annot is not evaluated again here.
cmp = evaluate_comparisons(
    cmp_annot,
    load='results/bot_pilot_interactive_comparative.csv'
)

### Bot Pilot Conversation Examples

In [17]:
# Print, for each bot, one dialogue from the first bot pilot whose
# dialogue-level quality rating equals 1. zip() stops at the shorter of
# `seeds` and `bot`.
# NOTE(review): `bot` is not defined in this notebook; presumably it comes
# from `from analysis import *` -- confirm.
seeds = [1, 1, 1, 1, 1, 1, 1]
for s, b in zip(seeds, bot):
    example = get_example(
        data.bot_pilots[0],
        category.likert_dialogue, label=scale.quality, bot=b, context=0, mark=1,
        seed=s
    )
    print(example)

blender2_3B~~~
Hi!
Hello, how are you? 
I'm good, how are you?
I'm doing well, thank you for asking. How's the weather in your neck of the woods?
It's not as cold as it was last week, which is nice. What's it like where you are?
I'm in the midwest, so it's pretty cold right now. I'm hoping it warms up soon.
Me too
Do you have any plans for the weekend? Or are you just going to stay home and watch tv?
I plan on visiting my parents and just chilling at home.
That sounds relaxing. I wish I could do that, but I have to work this weekend.
Dang. What do you do for work?
I work as a cashier at a grocery store. It pays the bills, so I can't complain.
Ah, sounds like chore tho
It's not too bad. I get to listen to music while I work, which is nice.
Oh that does sound nice. I listen to music when I work on homework.
What kind of music do you listen to? I'm a big fan of classical music myself.
I listen to pop punk pretty frequently, but anime music has made its way into my rotation.
That's cool. I

# 5 Conversation Collection

In [18]:
# Placeholder: this cell only evaluates a string literal; no timing results
# are computed here.
"""
Time results to collect conversations
"""

'\nTime results to collect conversations\n'

### Conversation Data Summary Statistics

In [19]:
# `load` returns the cached CSV instead of recomputing the statistics from
# data.dialogue_collection.
interactor_summary_stats(
    data.dialogue_collection,
    load='results/conversation_summary_stats.csv'
)

Unnamed: 0.1,Unnamed: 0,0
0,dialogues,588.0
1,mean turns,30.595238
2,user turn length,11.353752
3,interactors,46.0


# 6 Evaluation

In [20]:
# Placeholder: this cell only evaluates a string literal; no timing results
# are computed here.
"""
Timing results for training and collection (per task)
"""

'\nTiming results for training and collection (per task)\n'

### Worker Group Completed Work

In [21]:
# Annotation counts for the Surge evaluation (shown as the cell output).
data.surge_evaluation.annotation_counts()

Unnamed: 0,Unnamed: 1,dialogues annotated,double annotated
likert dialogue,informative,400,108
likert dialogue,grammatical,400,108
likert dialogue,emotional,400,108
likert dialogue,relevant,400,108
likert dialogue,quality,400,108
likert dialogue,consistent,400,108
likert dialogue,proactive,400,108
likert dialogue,engaging,400,108
comparative,proactive,404,108
comparative,grammatical,404,108


### Worker Group Screening

In [22]:
# Screening statistics for the student, MTurk and Surge onboarding rounds
# (rounds 0, 1, 2 in that order); `load` returns the cached CSV.
across_evaluations(
    [data.student_onboarding, data.mturk_onboarding, data.surge_onboarding],
    screening_rates_by_label,
    load='results/evaluation_screening.csv'
)

Unnamed: 0,round,Unnamed: 1,attempted,passed,mistakes,accuracy,n
0,0,sociality,3,0,0.222222,,9
1,0,empathy,5,0,2.733333,,15
2,0,interpretability,7,0,1.3,,20
3,0,personal_information,5,0,3.555556,,18
4,0,consistency,1,0,3.0,,3
5,0,commonsense,2,0,2.8,,5
6,0,knowledge,1,0,3.666667,,3
7,1,commonsense,6,0,2.888889,,18
8,1,personal_information,11,0,7.393939,,33
9,1,transitions,24,0,8.597222,,72


### Agreements

In [23]:
# Agreement table for the Surge annotations; `load` returns the cached CSV.
agreements = agreement_dataframe(
    surge_annotations, load='results/surge_agreements.csv'
)
agreements

Unnamed: 0,category,label,Krippendorff's alpha,CI low,CI high,n,Fleiss' kappa,CI low.1,CI high.1,n.1
0,behavior,antisocial,0.553231,0.220313,0.784186,1634.0,0.553094,0.197545,0.81695,1634.0
1,behavior,commonsense contradiction,0.441968,0.391135,0.495004,1634.0,0.441797,0.389663,0.493002,1634.0
2,behavior,correct fact,0.617729,0.570461,0.661129,1634.0,0.617612,0.569462,0.667708,1634.0
3,behavior,empathetic,0.513505,0.467688,0.552542,1634.0,0.513356,0.470534,0.55643,1634.0
4,behavior,follow up,0.49247,0.452094,0.537156,1634.0,0.492315,0.448033,0.533,1634.0
5,behavior,ignore,0.571211,0.504229,0.636967,1634.0,0.57108,0.50485,0.638363,1634.0
6,behavior,incorrect fact,0.651525,0.581439,0.724609,1634.0,0.651418,0.573555,0.717872,1634.0
7,behavior,irrelevant,0.45612,0.400736,0.518956,1634.0,0.455954,0.400372,0.511914,1634.0
8,behavior,lack of empathy,0.435495,0.373795,0.49667,1634.0,0.435323,0.372182,0.499162,1634.0
9,behavior,life info,0.631394,0.586041,0.673584,1634.0,0.631281,0.585425,0.670557,1634.0


In [24]:
# Agreement tables for the student, MTurk and Surge evaluations (rounds 0, 1,
# 2 in that order). The annotation frames in the list are still built before
# the call, even though `load` then returns the cached CSV.
across_evaluations(
    [
        e.annotation_dataframe() for e in
        (data.student_evaluation, data.mturk_evaluation, data.surge_evaluation)
    ],
    agreement_dataframe,
    load='results/evaluation_agreements.csv'
)

Unnamed: 0,round,category,label,Krippendorff's alpha,CI low,CI high,n,Fleiss' kappa,CI low.1,CI high.1,n.1
0,0,behavior,antisocial,,,,233.0,,,,233.0
1,0,behavior,uninterpretable,0.322148,,,152.0,0.319911,,,152.0
2,0,comparative,consistent,0.680556,0.148148,1.000000,12.0,0.666667,,,12.0
3,0,comparative,emotional,0.141676,-0.362506,0.680556,12.0,0.127273,-0.317073,0.690323,12.0
4,0,comparative,engaging,0.067222,-0.480825,0.805241,12.0,0.238095,-0.142857,0.707317,12.0
...,...,...,...,...,...,...,...,...,...,...,...
87,2,likert turn,grammatical,0.418861,0.375635,0.465538,1634.0,0.296119,0.263644,0.333836,1634.0
88,2,likert turn,informative,0.278183,0.231128,0.324850,1634.0,0.091958,0.064174,0.119510,1634.0
89,2,likert turn,proactive,0.278859,0.232748,0.323215,1634.0,0.098088,0.069933,0.125158,1634.0
90,2,likert turn,quality,0.311672,0.266655,0.354655,1634.0,0.091631,0.062086,0.120679,1634.0


# 7 Comprehensive Analysis

### Likert Dialogue

In [25]:
def evaluate_likert_ratings(annotations, category, load=None, reload=None):
    """
    Mean Likert rating per bot and label for one annotation category.

    :param annotations: annotation dataframe (reduced to one annotation per item)
    :param category: value of the category index level to select
    :param load: if truthy, return pd.read_csv(load) instead of computing
    :param reload: if truthy, write the computed frame to this path as CSV
    :return: result of mean_and_ci applied to each (bot, label) group
    """
    if load:
        return pd.read_csv(load)
    # NOTE: no seed is passed, so the annotator kept for doubly annotated
    # items can differ between runs.
    per_bot_label = (
        get_singly_annotated(annotations)
        .xs(category, level=sym.category)
        .groupby(level=[sym.bot, sym.label])
    )
    means = per_bot_label.apply(mean_and_ci)
    if reload:
        means.to_csv(reload)
    return means

In [26]:
# Dialogue-level Likert ratings of the Surge evaluation; `load` returns the
# cached CSV.
surge_likert_dialogue_ratings = evaluate_likert_ratings(
    surge_annotations, category.likert_dialogue,
    load='results/surge_likert_dialogue_ratings.csv'
)
surge_likert_dialogue_ratings

Unnamed: 0,bot,label,mean,CI low,CI high,n
0,bart_fid_rag_bcb,consistent,3.0,2.72081,3.27919,100.0
1,bart_fid_rag_bcb,engaging,3.29,3.056933,3.523067,100.0
2,bart_fid_rag_bcb,grammatical,3.75,3.570798,3.929202,100.0
3,bart_fid_rag_bcb,informative,3.79,3.596048,3.983952,100.0
4,bart_fid_rag_bcb,proactive,2.75,2.527263,2.972737,100.0
5,bart_fid_rag_bcb,quality,2.93,2.734089,3.125911,100.0
6,bart_fid_rag_bcb,relevant,3.55,3.316796,3.783204,100.0
7,blender2_3B,consistent,3.46,3.196311,3.723689,100.0
8,blender2_3B,engaging,3.94,3.757619,4.122381,100.0
9,blender2_3B,grammatical,4.24,4.077695,4.402305,100.0


### Likert Turn

In [27]:
# Turn-level Likert ratings of the Surge evaluation; `load` returns the
# cached CSV.
surge_likert_turn_ratings = evaluate_likert_ratings(
    surge_annotations, category.likert_turn,
    load='results/surge_likert_turn_ratings.csv'
)
surge_likert_turn_ratings

Unnamed: 0,bot,label,mean,CI low,CI high,n
0,bart_fid_rag_bcb,consistent,3.944444,3.876275,4.012614,1512.0
1,bart_fid_rag_bcb,engaging,3.619048,3.551061,3.687035,1512.0
2,bart_fid_rag_bcb,grammatical,4.310847,4.263485,4.358208,1512.0
3,bart_fid_rag_bcb,informative,3.80754,3.75513,3.85995,1512.0
4,bart_fid_rag_bcb,proactive,2.938492,2.878133,2.998851,1512.0
5,bart_fid_rag_bcb,quality,3.339947,3.273733,3.406161,1512.0
6,bart_fid_rag_bcb,relevant,3.896164,3.827143,3.965185,1512.0
7,blender2_3B,consistent,4.100394,4.037904,4.162883,1524.0
8,blender2_3B,engaging,3.948819,3.891935,4.005703,1524.0
9,blender2_3B,grammatical,4.650919,4.614724,4.687113,1524.0


### Comparative

In [28]:
# Win/tie/lose proportions for the Surge comparative annotations; `load`
# returns the cached CSV.
comparison_df = evaluate_comparisons(
    surge_annotations_comparative,
    load='results/surge_comparisons.csv'
)
comparison_df

Unnamed: 0,bot,bot comp,label,lose,CI low,CI high,n,tie,CI low.1,CI high.1,n.1,win,CI low.2,CI high.2,n.2
0,bart_fid_rag_bcb,others,consistent,0.000000,3.469447e-18,0.036641,101.0,0.613861,0.516402,0.702977,101.0,0.386139,0.297023,0.483598,101.0
1,bart_fid_rag_bcb,others,emotional,0.059406,2.750813e-02,0.123591,101.0,0.594059,0.496550,0.684676,101.0,0.346535,0.260895,0.443420,101.0
2,bart_fid_rag_bcb,others,engaging,0.009901,1.749911e-03,0.053967,101.0,0.643564,0.546475,0.730133,101.0,0.346535,0.260895,0.443420,101.0
3,bart_fid_rag_bcb,others,grammatical,0.118812,6.928719e-02,0.196271,101.0,0.455446,0.361736,0.552420,101.0,0.425743,0.333777,0.523150,101.0
4,bart_fid_rag_bcb,others,informative,0.000000,3.469447e-18,0.036641,101.0,0.455446,0.361736,0.552420,101.0,0.544554,0.447580,0.638264,101.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,rerank_blender,emora,grammatical,0.121212,4.816161e-02,0.273255,33.0,0.363636,0.221872,0.533838,33.0,0.515152,0.352184,0.674960,33.0
124,rerank_blender,emora,informative,0.000000,0.000000e+00,0.104270,33.0,0.303030,0.173755,0.473381,33.0,0.696970,0.526619,0.826245,33.0
125,rerank_blender,emora,proactive,0.000000,0.000000e+00,0.104270,33.0,0.424242,0.272356,0.591927,33.0,0.575758,0.408073,0.727644,33.0
126,rerank_blender,emora,quality,0.000000,0.000000e+00,0.104270,33.0,0.484848,0.325040,0.647816,33.0,0.515152,0.352184,0.674960,33.0


### Behaviors

In [29]:
def evaluate_behavior_rates(annotations, load=None, reload=None):
    """
    Proportion of positively labelled items per bot and behavior label.

    :param annotations: annotation dataframe (reduced to one annotation per item)
    :param load: if truthy, return pd.read_csv(load) instead of computing
    :param reload: if truthy, write the computed frame to this path as CSV
    :return: result of prop_and_ci applied to each (bot, label) group of the
             category.behavior annotations
    """
    if load:
        return pd.read_csv(load)
    # NOTE: no seed is passed, so the annotator kept for doubly annotated
    # items can differ between runs.
    per_bot_label = (
        get_singly_annotated(annotations)
        .xs(category.behavior, level=sym.category)
        .groupby(level=[sym.bot, sym.label])
    )
    means = per_bot_label.apply(prop_and_ci)
    if reload:
        means.to_csv(reload)
    return means

In [30]:
# Behavior rates of the Surge evaluation; `load` returns the cached CSV.
surge_behavior_rates = evaluate_behavior_rates(
    surge_annotations,
    load='results/surge_behavior_rates.csv'
)
surge_behavior_rates

Unnamed: 0,bot,label,proportion,CI low,CI high,n
0,bart_fid_rag_bcb,antisocial,0.000661,0.000117,0.003737,1512.0
1,bart_fid_rag_bcb,commonsense contradiction,0.216270,0.196251,0.237727,1512.0
2,bart_fid_rag_bcb,correct fact,0.357143,0.333381,0.381629,1512.0
3,bart_fid_rag_bcb,empathetic,0.335979,0.312613,0.360176,1512.0
4,bart_fid_rag_bcb,follow up,0.638889,0.614354,0.662719,1512.0
...,...,...,...,...,...,...
59,rerank_blender,preference info,0.305333,0.282549,0.329113,1500.0
60,rerank_blender,redundant,0.037333,0.028861,0.048169,1500.0
61,rerank_blender,self contradiction,0.046667,0.037102,0.058548,1500.0
62,rerank_blender,topic switch,0.256000,0.234557,0.278689,1500.0


# 8 Evaluation Metric Assessment

### Metric Sensitivity

In [31]:
from itertools import combinations
from scipy.stats import ttest_ind

def t_tests(df: pd.DataFrame):
    """
    Run an independent two-sample t-test for every pair of bots.

    :param df: (bot, data point) x 1 -> score
    :return: p values of test on each bot pair (pd.Series), keyed by
             (bot a, bot b) with bots taken in order of first appearance
    """
    # unique() keeps first-appearance order. The earlier set() made the pair
    # order, and which bot came first within a pair, depend on hash order,
    # which differs between interpreter runs.
    bots = df.index.get_level_values(0).unique()
    result = {}
    for ba, bb in combinations(bots, 2):
        a = df.xs(ba).to_numpy().squeeze()
        b = df.xs(bb).to_numpy().squeeze()
        # Only the p value is reported; the test statistic is discarded.
        _, p = ttest_ind(a, b)
        result[(ba, bb)] = p
    return pd.Series(result.values(), result)

# Pairwise bot t-tests on the turn-level Likert ratings, one row per label.
# NOTE: get_singly_annotated is called without a seed, so the annotator kept
# for doubly annotated items -- and hence the p values -- can differ between runs.
get_singly_annotated(surge_annotations).xs(
    category.likert_turn,
    level=sym.category
).groupby(
    sym.label
).apply(
    t_tests
)

Unnamed: 0_level_0,blender2_3B,blender2_3B,blender2_3B,emora,emora,rerank_blender
Unnamed: 0_level_1,emora,rerank_blender,bart_fid_rag_bcb,rerank_blender,bart_fid_rag_bcb,bart_fid_rag_bcb
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
consistent,0.2821067,0.1263486,0.0001964507,0.007869605,1.301981e-06,0.02195203
emotional,7.838584e-13,0.1476471,2.066242e-08,1.924251e-08,0.2638675,2.908063e-05
engaging,0.06754221,0.0006162791,2.777692e-13,0.08888624,1.188359e-08,9.01594e-05
grammatical,2.160406e-05,7.011280000000001e-82,1.204052e-29,1.1749800000000001e-60,8.825884e-16,4.841986e-16
informative,1.243611e-09,5.632669e-16,8.063111e-22,3.3036020000000002e-40,1.8613279999999998e-48,0.1355242
proactive,4.620033e-06,0.04503122,5.689623e-51,0.0152566,6.086272e-80,1.772949e-59
quality,5.739296e-10,5.3847489999999995e-30,2.781181e-23,3.778628e-07,7.6119e-05,0.3737528
relevant,4.342593e-16,1.6909860000000002e-23,1.509481e-09,0.07063739,0.06835695,0.0003378969


### Predictive Validity

In [40]:
from statsmodels.miscmodels.ordinal_model import OrderedModel

def regressions(df, quality_column_name=None):
    """
    Fit an ordinal logit model predicting the quality column from all other columns.

    :param df: dialogue x (*features, quality) -> value
    :param quality_column_name: outcome column; the last column of df when falsy
    :return: pd.Series holding the fitted coefficient of each feature plus the
             model's pseudo-R-squared under the key stat.mcfad_r2
    """
    target = quality_column_name or df.columns[-1]
    predictors = [column for column in df.columns if column != target]
    fitted = OrderedModel(df[target], df[predictors], distr='logit').fit()
    summary = {name: fitted.params[name] for name in predictors}
    summary[stat.mcfad_r2] = fitted.prsquared
    return pd.Series(summary.values(), summary)

def dialogue_metrics(ev):
    """
    Build per-dialogue metric scores joined with a per-dialogue quality score.

    :param ev: evaluation providing annotation_dataframe()
    :return: pair of dataframes (likert_dialogue_quality_features,
             likert_turn_quality_features); both hold the same stacked 'score'
             rows, joined with the dialogue-level and the mean turn-level
             quality rating respectively
    """
    df: pd.DataFrame = ev.annotation_dataframe()
    # Fixed seed so the annotator kept for doubly annotated items is reproducible.
    df = get_singly_annotated(df, seed=123)
    reindexed = df.reset_index()
    items = reindexed[sym.item]
    # Tuple item ids are reduced to their first element; other ids are used as-is.
    dialogues = [e[0] if isinstance(e, tuple) else e for e in items]
    reindexed['dialogue'] = dialogues
    # verify_integrity makes set_index raise if the new index has duplicates.
    reindexed.set_index(
        [sym.bot, sym.category, sym.label, 'dialogue', sym.item],
        inplace=True, verify_integrity=True
    )
    # Dialogue-level Likert ratings, indexed by (label, dialogue).
    ld = reindexed.xs(category.likert_dialogue, level=sym.category)
    ld = ld.droplevel(sym.bot).droplevel(sym.item)
    ld.columns = ['score']
    # Dialogue-level quality rating per dialogue.
    ldq = ld.xs(scale.quality, level=sym.label)
    ldq.columns = ['quality']

    # Turn-level Likert ratings averaged over the turns of each dialogue.
    lt = reindexed.xs(category.likert_turn, level=sym.category)
    lt = lt.groupby([sym.label, 'dialogue']).mean()
    lt.columns = ['score']
    # Mean turn-level quality rating per dialogue.
    ltq = lt.xs(scale.quality, level=sym.label)
    ltq.columns = ['quality']

    # Behavior annotations averaged over the turns of each dialogue.
    be = reindexed.xs(category.behavior, level=sym.category)
    be = be.groupby([sym.label, 'dialogue']).mean()
    be.columns = ['score']

    # Stack the three metric families under a leading category level.
    ds = pd.concat(
        [lt, be, ld],
        keys=[category.likert_turn, category.behavior, category.likert_dialogue],
        names=[sym.category, sym.label, 'dialogue']
    )
    # Attach a 'quality' column to every score row, matched on the dialogue level.
    likert_dialogue_quality_features = ds.join(ldq, on='dialogue')
    likert_turn_quality_features = ds.join(ltq, on='dialogue')
    return likert_dialogue_quality_features, likert_turn_quality_features

@to_file
def dialogue_quality_regressions(ev):
    """
    Regress dialogue-level quality on each metric separately.

    :param ev: evaluation passed on to dialogue_metrics
    :return: result of regressions applied to every (category, label) group of
             the features joined with the dialogue-level quality rating
    """
    # Only the first frame is used; the turn-level quality features are ignored here.
    likert_dialogue_features, _ = dialogue_metrics(ev)
    by_metric = likert_dialogue_features.groupby([sym.category, sym.label])
    return by_metric.apply(regressions)

# Empty placeholder string left in the cell; it has no effect on the result.
"""

"""

# Fit the regressions on the Surge evaluation and write the result to CSV.
dialogue_quality_regressions(
    data.surge_evaluation,
    reload='results/dialogue_quality_regressions.csv'
)

Optimization terminated successfully.
         Current function value: 1.317115
         Iterations: 211
         Function evaluations: 341
Optimization terminated successfully.
         Current function value: 1.314205
         Iterations: 206
         Function evaluations: 345
Optimization terminated successfully.
         Current function value: 1.316186
         Iterations: 201
         Function evaluations: 331
Optimization terminated successfully.
         Current function value: 1.328068
         Iterations: 130
         Function evaluations: 220
Optimization terminated successfully.
         Current function value: 1.328096
         Iterations: 161
         Function evaluations: 264
Optimization terminated successfully.
         Current function value: 1.313182
         Iterations: 209
         Function evaluations: 337
Optimization terminated successfully.
         Current function value: 1.316118
         Iterations: 213
         Function evaluations: 348
Optimization termina



Unnamed: 0_level_0,Unnamed: 1_level_0,score,McFadden's pseudo-R-squared
category,label,Unnamed: 2_level_1,Unnamed: 3_level_1
likert turn,consistent,0.437186,0.008301
likert turn,emotional,0.459415,0.010492
likert turn,engaging,0.364656,0.009
likert turn,grammatical,-0.038475,5.4e-05
likert turn,informative,-0.024559,3.3e-05
likert turn,proactive,0.418151,0.011262
likert turn,quality,0.451505,0.009052
likert turn,relevant,0.411045,0.010554
behavior,antisocial,6.022125,0.00119
behavior,commonsense contradiction,-3.765228,0.024278


### Agreement Between Static and Interactive Evaluators