In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from analysis import *

In [None]:
# Build the two annotation frames used by the rest of the notebook: the regular
# annotation frame and the comparative one.
# NOTE(review): `data` is presumably exported by `from analysis import *` -- confirm.
surge_annotations = data.surge_evaluation.annotation_dataframe()
surge_annotations_comparative = data.surge_evaluation.comparative_annotation_dataframe()

# Last expression of the cell: render the annotation frame as the cell output.
surge_annotations

# 7 Comprehensive Analysis

### Likert Dialogue

In [None]:
# Display names of the evaluated bots.
bots = ['Blender2', 'Emora', 'BartFidRAG', 'RerankBlender']

# Bar colour for each bot id. Named-colour reference:
# https://blog.finxter.com/how-to-plot-matplotlibs-color-palette-and-choose-your-plot-color/
graphing_bot_colors = dict(
    blender2_3B='purple',
    bart_fid_rag_bcb='royalblue',
    emora='turquoise',
    rerank_blender='green',
)

# Legend text for each bot id.
bot_transformer = dict(
    blender2_3B='Blender2',
    emora='Emora',
    rerank_blender='Blender-Decode',
    bart_fid_rag_bcb='BART-FiDRAG',
)

# Two-letter abbreviation for each rating dimension.
dimensions_abbrev = dict(
    consistent='CO',
    emotional='EU',
    engaging='EN',
    grammatical='GR',
    informative='IN',
    proactive='PR',
    quality='OQ',
    relevant='RE',
)

# Every dimension mapped to itself (same keys, same order as the abbreviations).
dimensions_identity = {dimension: dimension for dimension in dimensions_abbrev}

# The mapping the plotting helpers use for tick labels; point it at
# `dimensions_abbrev` instead to get abbreviated labels.
dimensions_transformer = dimensions_identity

In [None]:
# Font sizes shared by every figure in the notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 16, 20, 24

plt.rc('font', size=SMALL_SIZE)                              # default text size
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)  # axes title / x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)                        # x tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)                        # y tick labels
plt.rc('legend', fontsize=12)                                # legend entries
plt.rc('figure', titlesize=BIGGER_SIZE)                      # figure title

def grouped_barplot(df0, title, ylabel, xlabel, ylim, value_col='mean', rot=45, fig_size=(10,5)):
    """Draw a grouped bar chart: one group per label, one bar per bot, with CI error bars.

    Parameters
    ----------
    df0 : pandas.DataFrame
        After ``reset_index()`` it must expose the columns 'label', 'bot',
        ``value_col``, "CI low" and "CI high". The frame itself is not modified.
    title, ylabel, xlabel : str
        Figure title and axis labels.
    ylim : tuple
        y-axis limits, forwarded to ``DataFrame.plot``.
    value_col : str
        Column holding the bar heights; the error bars are measured from it.
    rot : int
        Rotation of the x tick labels.
    fig_size : tuple
        Figure size. It is written to ``plt.rcParams``, so it also becomes the
        default size for figures created afterwards.

    Returns
    -------
    None
    """
    df = df0.reset_index()
    plt.rcParams["figure.figsize"] = fig_size

    # Error bars are distances from the bar height, not absolute CI bounds.
    df['lower'] = df[value_col] - df["CI low"]
    df['upper'] = df["CI high"] - df[value_col]

    def by_label_and_bot(column):
        # Rows are labels, columns are bots.
        return df.pivot(index='label', columns='bot', values=column)

    lower = by_label_and_bot('lower')
    upper = by_label_and_bot('upper')
    values = by_label_and_bot(value_col)

    # One [lower, upper] pair per bot, in the same column order as `values`.
    err = [[lower[bot].values, upper[bot].values] for bot in lower]

    ax = values.plot(
        kind='bar',
        ylim=ylim,
        title=title,
        rot=rot,
        yerr=err,
        color=[graphing_bot_colors[bot] for bot in values.columns]
    )
    ax.legend(
        [bot_transformer[bot] for bot in values.columns],
        ncol=2
    )
    ax.set_ylabel(ylabel, labelpad=20)
    ax.set_xlabel(xlabel, labelpad=20)

    # `behaviors_transformer` is created in a later cell, so it may not exist yet
    # when this helper is used for the Likert plots; look it up lazily.
    behavior_names = globals().get('behaviors_transformer', {})

    def tick_label(label):
        if label in dimensions_transformer:
            return dimensions_transformer[label]
        # Labels unknown to both maps are shown verbatim instead of raising.
        return behavior_names.get(label, label)

    ax.set_xticklabels([tick_label(label) for label in values.index])

In [None]:
def evaluate_likert_ratings(annotations, category, load=None, reload=None):
    """Summarise Likert annotations of one category per (bot, label) group.

    Parameters
    ----------
    annotations
        Annotation frame handed to ``get_singly_annotated``.
    category
        Value selected from the ``sym.category`` index level.
    load : path-like, optional
        When truthy, nothing is computed: the CSV at this path is read with
        ``pd.read_csv`` and returned as-is.
    reload : path-like, optional
        When truthy, the freshly computed result is also written to this path
        with ``to_csv``.

    Returns
    -------
    The result of applying ``mean_and_ci`` to each (bot, label) group, or the
    frame read from ``load``.
    """
    if load:
        return pd.read_csv(load)

    means = (
        get_singly_annotated(annotations)
        .xs(category, level=sym.category)
        .groupby(level=[sym.bot, sym.label])
        .apply(mean_and_ci)
    )
    if reload:
        means.to_csv(reload)
    return means

In [None]:
# Dialogue-level Likert summary; the raw result is also saved to the `reload` path.
surge_likert_dialogue_ratings = evaluate_likert_ratings(
    surge_annotations,
    category.likert_dialogue,
    reload='results/surge_likert_dialogue_ratings',
)

# Formatted copy for display and for the paper's results folder.
sldr = prettify(
    surge_likert_dialogue_ratings,
    float_prec=3,
    col_types={"n": int},
    sort_by=["bot", "mean"],
    to_csv="results/paper/surge_likert_dialogue_ratings",
    index=False,
)
sldr

In [None]:
grouped_barplot(
    sldr,
    title="Average Dialogue Likert Rating",
    ylabel="Likert Rating",
    xlabel='Label',
    ylim=(2.5, 4.55),
    rot=45,
    fig_size=(10, 5),
)


### Likert Turn


In [None]:
# Turn-level Likert summary; the raw result is also saved to the `reload` path.
surge_likert_turn_ratings = evaluate_likert_ratings(
    surge_annotations,
    category.likert_turn,
    reload='results/surge_likert_turn_ratings',
)

# Formatted copy for display and for the paper's results folder.
sltr = prettify(
    surge_likert_turn_ratings,
    float_prec=3,
    col_types={"n": int},
    sort_by=["bot", "mean"],
    to_csv="results/paper/surge_likert_turn_ratings",
    index=False,
)
sltr

In [None]:
grouped_barplot(
    sltr,
    title="Average Turn Likert Rating",
    ylabel="Likert Rating",
    xlabel='Label',
    ylim=(2.5, 5.0),
    rot=45,
)

### Comparative

In [None]:
# Evaluate the comparative annotations; `reload` is the path passed through to
# `evaluate_comparisons` for saving.
comparison_df = evaluate_comparisons(surge_annotations_comparative, reload='results/surge_comparisons')
comparison_df

In [None]:
# Keep only the rows where each bot is compared against all the others
# ('bot comp' == 'others'). `.loc[...]` plus `.copy()` yields an independent frame,
# so the column assignments below do not write into a slice of `comparison_df`
# (which is what triggers pandas' SettingWithCopyWarning).
botvothers = comparison_df.loc[
    comparison_df.index.get_level_values('bot comp') == 'others',
    ['win', 'tie', 'lose'],
].copy()

# Printed so the positional column picks below can be checked against the names.
print(comparison_df.columns)
# NOTE(review): the CI bounds are selected by position (columns 9 and 10) --
# confirm these are the intended columns in the printed list.
botvothers['CI low'] = comparison_df.iloc[:, 9]
botvothers['CI high'] = comparison_df.iloc[:, 10]

# 'bot comp' is constant after the filter, so drop that index level.
botvothers = botvothers.droplevel('bot comp')
toplot = botvothers.reorder_levels(['label', 'bot']).sort_index()
toplot

In [None]:
from matplotlib.text import Text

def plot_comparative(df0, title, value_col, fig_size):
    """Plot win/tie/lose proportions per label and bot as overlaid grouped bars.

    Three bar layers are drawn on the same axes, back to front:
    ``win + tie + lose`` (alpha 0.2), ``win + tie`` (alpha 0.4) and ``win``
    (opaque, with error bars), so each bar reads as a stacked win/tie/lose bar.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Indexed by ('label', 'bot') with columns 'win', 'tie', 'lose',
        "CI low" and "CI high". The caller's frame is left untouched.
    title : str
        Axes title.
    value_col : str
        Column the error-bar lengths are measured from (distance to "CI low"
        and "CI high"). The error bars are attached to the 'win' layer.
    fig_size : tuple
        Figure size; written to ``plt.rcParams`` so it also affects later figures.

    Returns
    -------
    pandas.DataFrame
        The working frame unstacked on its last index level (bots become the
        second column level), including the 'lower'/'upper' helper columns.
    """
    # https://stackoverflow.com/questions/59922701/pandas-how-can-i-group-a-stacked-bar-chart
    plt.rcParams["figure.figsize"] = fig_size

    # Work on a copy: the helper columns below must not leak into the caller's frame.
    df0 = df0.copy()
    df0['lower'] = df0[value_col] - df0["CI low"]
    df0['upper'] = df0["CI high"] - df0[value_col]

    errLow = df0[['lower']].reset_index(['bot', 'label']).pivot(index='label', columns='bot', values='lower')
    errHi = df0[['upper']].reset_index(['bot', 'label']).pivot(index='label', columns='bot', values='upper')

    # bots x (low, hi) x labels
    err = [[errLow[bot].values, errHi[bot].values] for bot in errLow]

    df0 = df0.unstack(level=-1)
    fig, ax = plt.subplots()

    # Bots in first-seen order of the second column level.
    groups = []
    for column in df0.columns:
        if column[1] not in groups:
            groups.append(column[1])
    bot_colors = [graphing_bot_colors[bot] for bot in groups]

    (df0['win']+df0['tie']+df0['lose']).plot(kind='bar', color=bot_colors, alpha=0.2, rot=0, ax=ax)
    (df0['win']+df0['tie']).plot(kind='bar', color=bot_colors, alpha=0.4, rot=0, ax=ax)
    df0['win'].plot(kind='bar', color=bot_colors, rot=0, ax=ax, yerr=err)

    # Group the legend handles by bot. zip() stops at the shorter input, so only
    # the leading columns of df0 are paired with the drawn handles.
    all_handles, all_labels = ax.get_legend_handles_labels()
    markers = {}
    for handle, label, (_, bot) in zip(all_handles, all_labels, df0.columns):
        markers.setdefault(bot, []).append((handle, label))

    # Invisible artists act as text-only legend cells (row and column titles).
    wtl_dummies = [plt.plot([],marker="", ls="")[0]]*4
    bot_dummy = plt.plot([],marker="", ls="")[0]
    handles = wtl_dummies
    labels = ["", "Lose:", "Tie:", "Win:"]
    for bot, symbols in markers.items():
        handles.append(bot_dummy)
        labels.append(bot_transformer[bot])
        handles.extend([s[0] for s in symbols])
        labels.extend(["" for s in symbols])

    # One legend column for the row titles plus one per bot.
    leg = plt.legend(handles, labels, ncol=len(markers) + 1, loc='lower right', bbox_to_anchor=(0.67, -1.2), labelspacing=0.25)

    # NOTE(review): relies on matplotlib's private legend layout boxes
    # (`_legend_handle_box`); re-check after matplotlib upgrades.
    for i, vpack in enumerate(leg._legend_handle_box.get_children()):
        if i == 0: # row titles
            for hpack in vpack.get_children():
                hpack.get_children()[0].set_width(0)
        else:
            for j, hpack in enumerate(vpack.get_children()):
                if j > 0: # bot win/tie/lose markers
                    hpack.get_children()[0].get_children()[0].set_width(50)
                else: # column titles
                    hpack.get_children()[0].set_width(0)
    ax.set_title(title)
    ax.set_ylabel('Proportion', labelpad=20)
    ax.set_xlabel('Label', labelpad=20)
    ax.set_xticklabels([dimensions_transformer[d] for d in df0.index])

    plt.tight_layout()
    plt.show()
    return df0

In [None]:
df = plot_comparative(
    toplot,
    'Comparative Evaluation Results',
    'win',
    (20, 40),
)

### Behaviors

In [None]:
def evaluate_behavior_rates(annotations, load=None, reload=None):
    """Summarise behaviour annotations per (bot, label) group.

    Parameters
    ----------
    annotations
        Annotation frame handed to ``get_singly_annotated``.
    load : path-like, optional
        When truthy, nothing is computed: the CSV at this path is read with
        ``pd.read_csv`` and returned as-is.
    reload : path-like, optional
        When truthy, the freshly computed result is also written to this path
        with ``to_csv``.

    Returns
    -------
    The result of applying ``prop_and_ci`` to each (bot, label) group of the
    ``category.behavior`` annotations, or the frame read from ``load``.
    """
    if load:
        return pd.read_csv(load)

    means = (
        get_singly_annotated(annotations)
        .xs(category.behavior, level=sym.category)
        .groupby(level=[sym.bot, sym.label])
        .apply(prop_and_ci)
    )
    if reload:
        means.to_csv(reload)
    return means

In [None]:
# Behaviour rates per bot and label; the raw result is also saved to the `reload` path.
surge_behavior_rates = evaluate_behavior_rates(surge_annotations, reload='results/surge_behavior_rates')

# Formatted copy for display and for the paper's results folder.
sbr = prettify(
    surge_behavior_rates,
    float_prec=3,
    col_types={"n": int},
    sort_by=["bot", "proportion"],
    to_csv="results/paper/surge_behavior_rates",
    index=False,
)
sbr

In [None]:
# Short code for each behaviour label.
behaviors_abbrev = {
    'correct fact': 'CF',
    'empathetic': 'EM',
    'follow up': 'FU',
    'life info': 'LI',
    'preference info': 'PI',
    'uninterpretable': 'UI',
    'antisocial': 'AS',
    'commonsense contradiction': 'CC',
    'ignore': 'IG',
    'incorrect fact': '~CF',
    'irrelevant': 'IR',
    'lack of empathy': '~EM',
    'partner contradiction': 'PC',
    'redundant': 'RD',
    'self contradiction': 'SC',
    'topic switch': 'TS',
}

# Full-text display names: every label maps to itself, except the three
# contradiction labels, which get a shorter "contra ..." wording.
behaviors_identity = {behavior: behavior for behavior in behaviors_abbrev}
behaviors_identity.update({
    'commonsense contradiction': 'contra common',
    'partner contradiction': 'contra partner',
    'self contradiction': 'contra self',
})

# The mapping the plotting helper uses for behaviour tick labels.
behaviors_transformer = behaviors_identity

# Expose 'label' as a column for the filters below. Guarded so the cell can be
# re-run: an unconditional reset_index() would add an extra 'index' / 'level_0'
# column on every run and eventually raise inside grouped_barplot.
if 'label' not in sbr.columns:
    sbr = sbr.reset_index()

# Behaviours where a higher rate is better.
to_maximize = {'correct fact', 'empathetic', 'follow up', 'life info', 'preference info'}
maximize = sbr[sbr['label'].isin(to_maximize)]
grouped_barplot(
    maximize,
    title="Rates of Desirable Behaviors",
    ylabel="Rate",
    xlabel='Behavior',
    ylim=(0, 0.7),
    value_col="proportion",
    rot=0,
    fig_size=(20, 5),
)

# Behaviours where a lower rate is better.
to_minimize = {'uninterpretable', 'antisocial', 'commonsense contradiction', 'ignore', 'incorrect fact', 'irrelevant', 'lack of empathy', 'partner contradiction', 'redundant', 'self contradiction', 'topic switch'}
minimize = sbr[sbr['label'].isin(to_minimize)]
grouped_barplot(
    minimize,
    title="Rates of Undesirable Behaviors",
    ylabel="Rate",
    xlabel='Behavior',
    ylim=(0, 0.35),
    value_col="proportion",
    rot=45,
    fig_size=(20, 5),
)