In [14]:
import pandas as pd
from analysis import *
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import binom_test

In [15]:
surge_annotations = data.surge_evaluation.annotation_dataframe()
surge_annotations_comparative = data.surge_evaluation.comparative_annotation_dataframe()

surge_annotations

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1,2
bot,category,label,item,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
rerank_blender,likert dialogue,emotional,"(109,13)_rerank_blender",4,4.0,
rerank_blender,likert dialogue,consistent,"(109,13)_rerank_blender",4,4.0,
rerank_blender,likert dialogue,grammatical,"(109,13)_rerank_blender",4,3.0,
rerank_blender,likert dialogue,informative,"(109,13)_rerank_blender",4,4.0,
rerank_blender,likert dialogue,proactive,"(109,13)_rerank_blender",4,3.0,
rerank_blender,...,...,...,...,...,...
rerank_blender,behavior,follow up,"((438,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,topic switch,"((438,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,ignore,"((438,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,irrelevant,"((438,26)_rerank_blender, 14)",0,,


# 8 Comprehensive Analysis

### Metric Sensitivity

In [16]:
from itertools import combinations
from scipy.stats import ttest_ind

def p_vals(df: pd.DataFrame, test='t'):
    """
    :param df: (bot, data point) x 1 -> score
    :param test: statistical test function (t for t test, p for prop test, s for sign test)
    :return: p values of test on each bot pair (pd.Series)
    """
    bots = set(df.index.get_level_values(0))
    bot_pairs = list(combinations(bots, 2))
    result = {}
    for ba, bb in bot_pairs:
        a = df.xs(ba).to_numpy().squeeze()
        b = df.xs(bb).to_numpy().squeeze()
        if test == 't':
            t, p = ttest_ind(a, b, equal_var=False)
        elif test == 'p':
            z, p = proportions_ztest(count=[
                sum(a), sum(b)
            ], nobs=[
                len(a), len(b)
            ])
        elif test == 's':
            # sign test
            a = a[a==1]
            b = b[b==1]
            p = binom_test(sum(a), sum(a)+sum(b), p=0.5)
        else:
            raise ValueError('invalid arg for param "test"')
        result[(ba, bb)] = p
    result_series = pd.Series(result.values(), result)
    return result_series

@to_file
def t_test_p_values_comparing_bots(annotations):
    annotations = get_singly_annotated(annotations)
    prop_annotations = annotations.xs(
        category.behavior, level=sym.category, drop_level=False
    )
    comp_annotations = annotations.xs(
        category.comparative, level=sym.category, drop_level=False
    )
    mean_annotations = annotations.drop(
        index=category.behavior, level=sym.category
    ).drop(
        index=category.comparative, level=sym.category
    )
    mean_ps = mean_annotations.groupby(
        [sym.category, sym.label]
    ).apply(p_vals)
    prop_ps = prop_annotations.groupby(
        [sym.category, sym.label]
    ).apply(lambda x: p_vals(x, test='p'))
    comp_ps = comp_annotations.groupby(
        [sym.category, sym.label]
    ).apply(lambda x: p_vals(x, test='s'))
    result = pd.concat([prop_ps, mean_ps, comp_ps], axis=0)
    return result

t_test_p_values_comparing_bots(surge_annotations, reload='results/t_test_p_values_comparing_bots')

Unnamed: 0_level_0,Unnamed: 1_level_0,rerank_blender,rerank_blender,rerank_blender,emora,emora,blender2_3B
Unnamed: 0_level_1,Unnamed: 1_level_1,emora,blender2_3B,bart_fid_rag_bcb,blender2_3B,bart_fid_rag_bcb,bart_fid_rag_bcb
category,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
behavior,antisocial,0.002474602,0.4194563,0.01087556,0.01427387,0.3156357,0.05989715
behavior,commonsense contradiction,4.674101e-08,0.1014559,0.02194954,0.0001139109,1.149751e-14,8.311311e-05
behavior,correct fact,7.16263e-16,0.6660799,9.406697999999999e-19,1.753072e-17,2.974544e-61,2.6809620000000002e-17
behavior,empathetic,0.3697819,0.392742,5.08536e-10,0.9659565,9.043662e-08,7.085574e-08
behavior,follow up,0.2214653,2.639515e-26,4.876607e-32,2.1119940000000002e-32,9.76546e-39,0.2217655
behavior,ignore,0.001197645,0.3957589,0.005297479,0.01598324,0.6507543,0.05060281
behavior,incorrect fact,1.8516760000000002e-62,9.507153e-55,0.1503432,0.001317827,5.731725e-55,2.224882e-47
behavior,irrelevant,0.1449497,4.882563e-09,9.615384e-07,9.40247e-06,0.0005185477,0.3358462
behavior,lack of empathy,1.615134e-05,0.0006580446,2.186129e-06,2.456389e-14,0.6614582,9.700592e-16
behavior,life info,0.01880903,0.145556,0.1557004,0.0001365093,0.0001602902,0.9735843


### Predictive Validity

In [17]:
from statsmodels.miscmodels.ordinal_model import OrderedModel
from statsmodels.regression.linear_model import OLS as LinearModel
from statsmodels.tools.tools import add_constant

def dialogue_metrics(ev):
    df: pd.DataFrame = ev.annotation_dataframe()
    df = get_singly_annotated(df, seed=123)
    reindexed = df.reset_index()
    items = reindexed[sym.item]
    dialogues = [e[0] if isinstance(e, tuple) else e for e in items]
    reindexed['dialogue'] = dialogues
    reindexed.set_index(
        [sym.bot, sym.category, sym.label, 'dialogue', sym.item],
        inplace=True, verify_integrity=True
    )
    ld = reindexed.xs(category.likert_dialogue, level=sym.category)
    ld = ld.droplevel(sym.bot).droplevel(sym.item)
    ld.columns = ['score']
    ldq = ld.xs(scale.quality, level=sym.label)
    ldq.columns = ['quality']

    lt = reindexed.xs(category.likert_turn, level=sym.category)
    lt = lt.groupby([sym.label, 'dialogue']).mean()
    lt.columns = ['score']
    ltq = lt.xs(scale.quality, level=sym.label)
    ltq.columns = ['quality']

    be = reindexed.xs(category.behavior, level=sym.category)
    be = be.groupby([sym.label, 'dialogue']).mean()
    be.columns = ['score']

    ds = pd.concat(
        [lt, be, ld],
        keys=[category.likert_turn, category.behavior, category.likert_dialogue],
        names=[sym.category, sym.label, 'dialogue']
    )
    likert_dialogue_quality_features = ds.join(ldq, on='dialogue')
    likert_turn_quality_features = ds.join(ltq, on='dialogue')
    return likert_dialogue_quality_features, likert_turn_quality_features


def regressions(df, quality_column_name=None, model='linear'):
    """
    :param df: dialogue x (*features, quality) -> value
    :return: *(coef, low, high), mcfadden r^2
    """
    if not quality_column_name:
        quality_column_name = df.columns[-1]
    qualities = df[quality_column_name]
    features = [f for f in df.columns if f != quality_column_name]
    if model == 'ordinal':
        model = OrderedModel(qualities, df[features], distr='logit')
        results = model.fit()
        coefs = {f: results.params[f] for f in features}
        prsqrd = results.prsquared
        result = {stat.mcfad_r2: prsqrd, stat.p_of_llr_test: results.llr_pvalue}
    elif model == 'linear':
        x = add_constant(df[features])
        y = qualities
        model = LinearModel(y, x)
        results = model.fit()
        coefs = {f: results.params[f] for f in features}
        rsquared = results.rsquared
        result = {**coefs, stat.r2: rsquared, stat.p_of_f_test: results.f_pvalue}
    else:
        raise ValueError('Param "model" must be one of {"linear", "ordinal"}')
    return pd.Series(result.values(), result)

In [18]:
@to_file
def dialogue_quality_regressions(ev):
    ldq, ltq = dialogue_metrics(ev)
    ldq_groups = ldq.groupby(
        [sym.category, sym.label]
    )
    ltq_groups = ltq.groupby(
        [sym.category, sym.label]
    )
    names = ['Predicted', 'Metric']
    linear_result = ldq_groups.apply(lambda x: regressions(x, model='linear'))
    linear_result.columns = pd.MultiIndex.from_arrays(
        [['Likert Dialogue Quality']*3,
        ['LR Coefficient', 'LR R-Squared', stat.p_of_f_test]],
        names=names
    )
    ordinal_result = ldq_groups.apply(lambda x: regressions(x, model='ordinal'))
    ordinal_result.columns = pd.MultiIndex.from_arrays(
        [['Likert Dialogue Quality']*2,
        ['OR Pseudo R-Squared', stat.p_of_llr_test]],
        names=names
    )
    linear_turn_result = ltq_groups.apply(regressions)
    linear_turn_result.columns = pd.MultiIndex.from_arrays(
        [['Likert Turn Quality']*3,
        ['LR Coefficient', 'LR R-Squared', stat.p_of_f_test]],
        names=names
    )
    result = pd.concat((linear_turn_result, linear_result, ordinal_result), axis=1)
    return result.round(5)

regs = dialogue_quality_regressions(
    data.surge_evaluation,
    reload='results/dialogue_quality_regressions'
)
regs

Optimization terminated successfully.
         Current function value: 1.314613
         Iterations: 217
         Function evaluations: 355
Optimization terminated successfully.
         Current function value: 1.314321
         Iterations: 193
         Function evaluations: 320
Optimization terminated successfully.
         Current function value: 1.314382
         Iterations: 210
         Function evaluations: 341
Optimization terminated successfully.
         Current function value: 1.327590
         Iterations: 118
         Function evaluations: 207
Optimization terminated successfully.
         Current function value: 1.327138
         Iterations: 188
         Function evaluations: 311
Optimization terminated successfully.
         Current function value: 1.314526
         Iterations: 222
         Function evaluations: 360
Optimization terminated successfully.
         Current function value: 1.313996
         Iterations: 212
         Function evaluations: 354
Optimization termina



Unnamed: 0_level_0,Predicted,Likert Turn Quality,Likert Turn Quality,Likert Turn Quality,Likert Dialogue Quality,Likert Dialogue Quality,Likert Dialogue Quality,Likert Dialogue Quality,Likert Dialogue Quality
Unnamed: 0_level_1,Metric,LR Coefficient,LR R-Squared,P value of F-test,LR Coefficient,LR R-Squared,P value of F-test,OR Pseudo R-Squared,P value of LLR-test
category,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
likert turn,consistent,0.21707,0.04474,2e-05,0.25186,0.02999,0.0005,0.00979,0.00126
likert turn,emotional,0.18738,0.03755,0.0001,0.23505,0.02943,0.00057,0.01001,0.00111
likert turn,engaging,0.1642,0.04061,5e-05,0.19074,0.02729,0.00091,0.00996,0.00114
likert turn,grammatical,0.29094,0.06348,0.0,0.01671,0.0001,0.83866,1e-05,0.91405
likert turn,informative,0.02049,0.00046,0.66881,-0.05404,0.00159,0.42573,0.00035,0.54115
likert turn,proactive,0.22061,0.06614,0.0,0.20584,0.02868,0.00067,0.00985,0.00122
likert turn,quality,1.0,1.0,0.0,0.24156,0.02906,0.00062,0.01025,0.00097
likert turn,relevant,0.26306,0.0987,0.0,0.23057,0.03776,9e-05,0.01239,0.00029
behavior,antisocial,-3.70553,0.01164,0.031,-0.55548,0.00013,0.82001,7e-05,0.78842
behavior,commonsense contradiction,-1.2955,0.07139,0.0,-1.69743,0.06103,0.0,0.02228,0.0


In [19]:
to_plot_regs = regs[[("Likert Dialogue Quality", "LR R-Squared"), ("Likert Dialogue Quality", "P value of F-test")]]
to_plot_regs = to_plot_regs.drop(("likert dialogue", "quality"))
to_plot_regs = to_plot_regs.reset_index()
to_plot_regs

Predicted,category,label,Likert Dialogue Quality,Likert Dialogue Quality
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,LR R-Squared,P value of F-test
0,likert turn,consistent,0.02999,0.0005
1,likert turn,emotional,0.02943,0.00057
2,likert turn,engaging,0.02729,0.00091
3,likert turn,grammatical,0.0001,0.83866
4,likert turn,informative,0.00159,0.42573
5,likert turn,proactive,0.02868,0.00067
6,likert turn,quality,0.02906,0.00062
7,likert turn,relevant,0.03776,9e-05
8,behavior,antisocial,0.00013,0.82001
9,behavior,commonsense contradiction,0.06103,0.0


In [20]:
regs = prettify(regs, to_csv="results/paper/predictive_validity.csv")

# Incremental Validity

In [21]:
def drop_column_level_duplication(df: pd.DataFrame, columns, levels=None):
    if levels is None:
        levels = list(range(len(columns)))
    level_columns = df.xs(columns, axis=1, level=levels)
    unique = level_columns.iloc[:,0].to_frame()
    unique.columns = [columns]
    dropped = df.drop(columns=columns, level=levels)
    result = pd.concat([dropped, unique], axis=1)
    return result

def multivariate_regression(df: pd.DataFrame, model='linear'):
    def apply_regressions(df: pd.DataFrame):
        unstacked = df.unstack([sym.category, sym.label])
        unstacked = drop_column_level_duplication(unstacked, 'quality', 0)
        results = regressions(unstacked, quality_column_name='quality', model=model)
        return results
    result = apply_regressions(df)
    result.index = [
        (idx[1] if isinstance(idx, tuple) else idx)
        for idx in result.index.values
    ]
    return result.round(5)

from collections import namedtuple

@to_file
def incremental_regression(
        df: pd.DataFrame,
        categories,
        model='linear',
        exclude_quality=True,
        beam=1,
        select='backward'
):
    data_points = set(df.index.get_level_values('dialogue'))
    num_data_points = len(data_points)
    adjust = lambda r2, f: 1 - (1 - r2) * ((num_data_points - 1) / (num_data_points - f))
    Step: type = namedtuple('Step', ('r2', 'p', 'feature'))
    class Path(list):
        def metric(self):
            # if len(self) == 0: return 0
            # else: return self[-1].llr if len(self) == 1 else self[-1].llr / self[-2].llr
            return self.r2
        @property
        def r2(self):
            return adjust(self[-1].r2, len(self)) if self else 0
        # @property
        # def adj_r2(self):
        #     return adjust(self.r2, len(self))
        @property
        def p(self): return self[-1].p if self else 1
        @property
        def features(self): return {x.feature for x in self}
    r2_name = stat.r2 if model=='linear' else stat.mcfad_r2
    p_name = stat.p_of_f_test if model=='linear' else stat.p_of_llr_test
    frontier = [Path()]
    feature_pool = {
        x[:2] for x in df.index.values
        if (not (exclude_quality and scale.quality in x))
        and x[0] in categories
    }
    for _ in feature_pool:
        new_frontier = []
        for path in frontier:
            for candidate in feature_pool - path.features:
                if select == 'forward':
                    candidate_features = path.features | {candidate}
                elif select == 'backward':
                    candidate_features = feature_pool - path.features
                else:
                    raise ValueError('param select must be one of {"forward", "backward"}')
                row_mask = [
                    x[:2] in candidate_features
                    and (not (exclude_quality and scale.quality in x))
                    and x[0] in categories
                    for x in df.index.values
                ]
                candidate_df = df.loc[row_mask, :]
                candidate_results = multivariate_regression(candidate_df, model=model)
                r2 = candidate_results[r2_name].item()
                p = candidate_results[p_name]
                new_frontier.append(Path([*path, Step(r2, p, candidate)]))
        frontier = sorted(new_frontier, key=lambda x: x.metric(), reverse=True)[:beam]
    result = {step.feature: {'Adjusted R-Squared': step.r2, p_name: step.p} for step in frontier[0]}
    return pd.DataFrame(result.values(), result)


ldq, ltq = dialogue_metrics(data.surge_evaluation)
regs = incremental_regression(
    ldq, (category.likert_turn, category.behavior), beam=1,
    reload='results/dialogue_incremental_regressions'
)
regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
likert turn,emotional,0.17799,0.0
likert turn,relevant,0.1772,0.0
likert turn,consistent,0.17213,0.0
behavior,redundant,0.1716,0.0
behavior,empathetic,0.16777,0.0
behavior,commonsense contradiction,0.15683,0.0
likert turn,informative,0.15412,0.0
behavior,self contradiction,0.15336,0.0
behavior,life info,0.14527,0.0
likert turn,proactive,0.14519,0.0


In [22]:
behavior_regs = incremental_regression(
    ldq, (category.behavior,), beam=10,
    reload='results/behavior_incremental_regressions'
)
behavior_regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
behavior,antisocial,0.15964,0.0
behavior,life info,0.15955,0.0
behavior,preference info,0.1585,0.0
behavior,commonsense contradiction,0.15692,0.0
behavior,uninterpretable,0.15503,0.0
behavior,ignore,0.1525,0.0
behavior,correct fact,0.15197,0.0
behavior,topic switch,0.14809,0.0
behavior,follow up,0.14527,0.0
behavior,redundant,0.14366,0.0


In [23]:
likert_turn_regs = incremental_regression(
    ldq, (category.likert_turn,), beam=10,
    reload='results/likert_turn_incremental_regressions'
)
likert_turn_regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
likert turn,grammatical,0.08566,1e-05
likert turn,informative,0.08508,0.0
likert turn,engaging,0.08285,0.0
likert turn,emotional,0.07753,0.0
likert turn,consistent,0.06889,0.0
likert turn,proactive,0.05786,1e-05
likert turn,relevant,0.03776,9e-05


In [24]:
likert_dialogue_regs = incremental_regression(
    ldq, (category.likert_dialogue,), beam=10,
    reload='results/likert_dialogue_incremental_regressions'
)
likert_dialogue_regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
likert dialogue,emotional,0.58426,0.0
likert dialogue,informative,0.57832,0.0
likert dialogue,grammatical,0.56439,0.0
likert dialogue,relevant,0.54517,0.0
likert dialogue,proactive,0.51636,0.0
likert dialogue,consistent,0.48673,0.0
likert dialogue,engaging,0.38103,0.0


## Table for Paper

In [25]:
def add_delta(df):
    df['R-Squared'] = (df['Adjusted R-Squared']*100)
    df['delta'] = df['R-Squared'].diff()
    df['delta'] = df['delta'].map('{:.4f}'.format)
    df['R-Squared'] = df['R-Squared'].map('{:.4f}'.format)
    df['R-Squared delta'] = df['R-Squared'] + ' (' + df['delta'] + ')'

final_behavior_regs = behavior_regs.reset_index().rename({'level_1': 'ABC-Eval'}, axis=1).drop(['level_0', 'P value of F-test'], axis=1)
add_delta(final_behavior_regs)

final_likert_turn_regs = likert_turn_regs.reset_index().rename({'level_1': 'Likert Turn'}, axis=1).drop(['level_0', 'P value of F-test'], axis=1)
add_delta(final_likert_turn_regs)

final_likert_dialogue_regs = likert_dialogue_regs.reset_index().rename({'level_1': 'Likert Dialogue'}, axis=1).drop(['level_0', 'P value of F-test'], axis=1)
add_delta(final_likert_dialogue_regs)

final_behavior_regs

Unnamed: 0,ABC-Eval,Adjusted R-Squared,R-Squared,delta,R-Squared delta
0,antisocial,0.15964,15.964,,15.9640 (nan)
1,life info,0.15955,15.955,-0.009,15.9550 (-0.0090)
2,preference info,0.1585,15.85,-0.105,15.8500 (-0.1050)
3,commonsense contradiction,0.15692,15.692,-0.158,15.6920 (-0.1580)
4,uninterpretable,0.15503,15.503,-0.189,15.5030 (-0.1890)
5,ignore,0.1525,15.25,-0.253,15.2500 (-0.2530)
6,correct fact,0.15197,15.197,-0.053,15.1970 (-0.0530)
7,topic switch,0.14809,14.809,-0.388,14.8090 (-0.3880)
8,follow up,0.14527,14.527,-0.282,14.5270 (-0.2820)
9,redundant,0.14366,14.366,-0.161,14.3660 (-0.1610)


In [26]:
combined = pd.concat(
    [
        final_behavior_regs[['ABC-Eval', 'R-Squared delta']],
        final_likert_turn_regs[['Likert Turn', 'R-Squared delta']],
        final_likert_dialogue_regs[['Likert Dialogue', 'R-Squared delta']]
    ],
    axis=1)

combined.to_csv('results/paper/incremental_validity.csv', index=False)
combined

Unnamed: 0,ABC-Eval,R-Squared delta,Likert Turn,R-Squared delta.1,Likert Dialogue,R-Squared delta.2
0,antisocial,15.9640 (nan),grammatical,8.5660 (nan),emotional,58.4260 (nan)
1,life info,15.9550 (-0.0090),informative,8.5080 (-0.0580),informative,57.8320 (-0.5940)
2,preference info,15.8500 (-0.1050),engaging,8.2850 (-0.2230),grammatical,56.4390 (-1.3930)
3,commonsense contradiction,15.6920 (-0.1580),emotional,7.7530 (-0.5320),relevant,54.5170 (-1.9220)
4,uninterpretable,15.5030 (-0.1890),consistent,6.8890 (-0.8640),proactive,51.6360 (-2.8810)
5,ignore,15.2500 (-0.2530),proactive,5.7860 (-1.1030),consistent,48.6730 (-2.9630)
6,correct fact,15.1970 (-0.0530),relevant,3.7760 (-2.0100),engaging,38.1030 (-10.5700)
7,topic switch,14.8090 (-0.3880),,,,
8,follow up,14.5270 (-0.2820),,,,
9,redundant,14.3660 (-0.1610),,,,
