In [1]:
import pandas as pd
from analysis import *
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import binom_test

In [2]:
# `data` is not defined in this notebook; presumably it is exported by
# `from analysis import *` in the first cell -- TODO confirm.
surge_annotations = data.surge_evaluation.annotation_dataframe()
surge_annotations_comparative = data.surge_evaluation.comparative_annotation_dataframe()

# Display the annotation frame.
surge_annotations

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1,2
bot,category,label,item,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
rerank_blender,likert dialogue,emotional,"(109,13)_rerank_blender",4,4.0,
rerank_blender,likert dialogue,consistent,"(109,13)_rerank_blender",4,4.0,
rerank_blender,likert dialogue,grammatical,"(109,13)_rerank_blender",4,3.0,
rerank_blender,likert dialogue,informative,"(109,13)_rerank_blender",4,4.0,
rerank_blender,likert dialogue,proactive,"(109,13)_rerank_blender",4,3.0,
rerank_blender,...,...,...,...,...,...
rerank_blender,behavior,follow up,"((438,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,topic switch,"((438,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,ignore,"((438,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,irrelevant,"((438,26)_rerank_blender, 14)",0,,


# 8 Comprehensive Analysis

### Metric Sensitivity

In [16]:
from itertools import combinations
from scipy.stats import ttest_ind

def p_vals(df: pd.DataFrame, test='t', downsample=None):
    """
    Pairwise significance tests between bots for a single metric.

    :param df: (bot, data point) x 1 -> score; the bot must be index level 0.
        For the sign test the index must also carry the ``sym.bot`` and
        ``sym.bot_cmp`` levels; entries equal to 1 are tested against entries
        equal to -1 and every other value is ignored.
    :param test: statistical test function (t for Welch's t test, p for prop
        test, s for sign test)
    :param downsample: number of samples per bot (per bot pair for the sign
        test) to subsample without replacement for the analysis
    :return: p values of test on each bot pair (pd.Series keyed by bot pair,
        pairs listed in order of first appearance of the bots in ``df``)
    :raises ValueError: if ``test`` is not one of 't', 'p', 's'
    """
    if test not in ('t', 'p', 's'):
        raise ValueError('invalid arg for param "test"')
    seed = 123
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # pairs (and their orientation) are identical on every run. Iterating a
    # set of strings depends on hash randomisation and made the order of the
    # result change between kernel sessions.
    bots = list(dict.fromkeys(df.index.get_level_values(0)))

    def scores_of(bot):
        # scores of one bot as an array, optionally downsampled; the same seed
        # is used for every bot, exactly as before
        rows = df.xs(bot)
        if downsample:
            rows = rows.sample(downsample, random_state=seed)
        return rows.to_numpy().squeeze()

    result = {}
    for ba, bb in combinations(bots, 2):
        if test == 't':
            a, b = scores_of(ba), scores_of(bb)
            # equal_var=False -> Welch's t test
            t, p = ttest_ind(a, b, equal_var=False)
        elif test == 'p':
            a, b = scores_of(ba), scores_of(bb)
            z, p = proportions_ztest(
                count=[sum(a), sum(b)],
                nobs=[len(a), len(b)]
            )
        else:
            # sign test on the direct comparisons between the two bots
            comp_data = df.xs((ba, bb), level=[sym.bot, sym.bot_cmp])
            if downsample:
                comp_data = comp_data.sample(downsample, random_state=seed)
            outcomes = comp_data.to_numpy().squeeze()
            a = outcomes == 1
            b = outcomes == -1
            # NOTE: scipy.stats.binom_test was deprecated in SciPy 1.10 and
            # removed in 1.12; scipy.stats.binomtest(k, n, p).pvalue is the
            # replacement once the environment is upgraded.
            p = binom_test(sum(a), sum(a)+sum(b), p=0.5)
        result[(ba, bb)] = p
    # the index is built from the dict's (bot, bot) tuple keys
    result_series = pd.Series(result.values(), result)
    return result_series

@to_file
def p_values_comparing_bots(evaluation, downsample=None):
    """
    P values for every bot pair on every metric of an evaluation.

    Behavior labels are compared with the proportion test, the remaining
    non-comparative categories with the t test, and the comparative
    annotations with the sign test (see ``p_vals``).

    :param evaluation: object exposing ``annotation_dataframe()`` and
        ``comparative_annotation_dataframe()``
    :param downsample: forwarded to ``p_vals``
    :return: the proportion-test, t-test and sign-test results concatenated
        row-wise, in that order
    """
    def tester(kind):
        # bind the test kind so the callable can be handed to groupby.apply
        def run(group):
            return p_vals(group, test=kind, downsample=downsample)
        return run

    metric_keys = [sym.category, sym.label]
    comparative = get_singly_annotated(evaluation.comparative_annotation_dataframe(), seed=123)
    static = get_singly_annotated(evaluation.annotation_dataframe(), seed=123)

    behaviors = static.xs(category.behavior, level=sym.category, drop_level=False)
    scored = static.drop(index=category.behavior, level=sym.category)
    scored = scored.drop(index=category.comparative, level=sym.category)

    t_test_ps = scored.groupby(metric_keys).apply(tester('t'))
    prop_test_ps = behaviors.groupby(metric_keys).apply(tester('p'))
    sign_test_ps = comparative.groupby(sym.label).apply(tester('s'))
    # comparative results are grouped by label only; add the category level
    sign_test_ps = pd.concat({category.comparative: sign_test_ps}, names=[sym.category])
    return pd.concat([prop_test_ps, t_test_ps, sign_test_ps], axis=0)

# Downsampled comparison (32 samples per bot / bot pair).
# `reload=` is consumed by the `to_file` decorator, which is defined outside this notebook.
# NOTE(review): the `zstat = value / std` lines in the output look like divide-by-zero
# warnings from the proportion z test (zero standard error when a label never fires in
# either sample), which would explain the NaN p values in the table -- confirm.
p_values_comparing_bots(data.surge_evaluation, downsample=32, reload='results/p_values_comparing_bots_downsampled').round(4)

  zstat = value / std
  zstat = value / std
  zstat = value / std


Unnamed: 0_level_0,Unnamed: 1_level_0,bart_fid_rag_bcb,bart_fid_rag_bcb,bart_fid_rag_bcb,emora,emora,blender2_3B
Unnamed: 0_level_1,Unnamed: 1_level_1,emora,blender2_3B,rerank_blender,blender2_3B,rerank_blender,rerank_blender
category,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
behavior,antisocial,,,,,,
behavior,commonsense contradiction,0.1685,0.3202,0.5218,0.6888,0.4497,0.7192
behavior,correct fact,0.0002,0.0098,0.0211,0.1685,0.0976,0.7679
behavior,empathetic,0.6107,0.7978,0.2807,0.8002,0.1143,0.1832
behavior,follow up,0.0002,0.5986,0.0005,0.001,0.7679,0.0025
behavior,ignore,0.4911,0.1623,0.4911,0.0452,1.0,0.0452
behavior,incorrect fact,0.0101,0.0101,0.4911,,0.0389,0.0389
behavior,irrelevant,0.2002,0.6888,0.3202,0.0976,0.7679,0.1685
behavior,lack of empathy,0.5218,0.2296,0.2296,0.0722,0.0722,1.0
behavior,life info,0.7404,0.5453,0.756,0.3513,0.5218,0.7679


In [17]:
# Same comparison on the full (non-downsampled) annotations.
# NOTE(review): the cache name says "t_test" but the result also holds the proportion-test
# and sign-test p values returned by p_values_comparing_bots.
p_values_comparing_bots(data.surge_evaluation, reload='results/t_test_p_values_comparing_bots').round(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,bart_fid_rag_bcb,bart_fid_rag_bcb,bart_fid_rag_bcb,emora,emora,blender2_3B
Unnamed: 0_level_1,Unnamed: 1_level_1,emora,blender2_3B,rerank_blender,blender2_3B,rerank_blender,rerank_blender
category,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
behavior,antisocial,0.5597,0.1601,0.0119,0.0587,0.0035,0.2117
behavior,commonsense contradiction,0.0,0.0002,0.0473,0.0008,0.0,0.0917
behavior,correct fact,0.0,0.0,0.0,0.0,0.0,0.8325
behavior,empathetic,0.0,0.0,0.0,0.5285,0.2644,0.6252
behavior,follow up,0.0,0.5851,0.0,0.0,0.2512,0.0
behavior,ignore,0.3671,0.0436,0.0079,0.0035,0.0004,0.5149
behavior,incorrect fact,0.0,0.0,0.0663,0.0023,0.0,0.0
behavior,irrelevant,0.0019,0.1866,0.0,0.0,0.0457,0.0
behavior,lack of empathy,0.6253,0.0,0.0,0.0,0.0,0.0007
behavior,life info,0.0,0.9618,0.3605,0.0,0.0015,0.3851


### Predictive Validity

In [49]:
from statsmodels.miscmodels.ordinal_model import OrderedModel
from statsmodels.regression.linear_model import OLS as LinearModel
from statsmodels.tools.tools import add_constant


def all_dialogue_metrics(data):
    """
    Build per-dialogue metric tables paired with four different quality targets.

    Every metric of the surge evaluation is reduced to one score per dialogue
    (turn-level Likert and behavior annotations are averaged over the dialogue)
    and stacked into one frame indexed by (category, label, dialogue). That
    frame is then joined, on the dialogue, with a quality column.

    :param data: object exposing ``surge_evaluation`` and ``dialogue_collection``,
        each providing ``annotation_dataframe()`` and
        ``comparative_annotation_dataframe()``
    :return: tuple of four frames, each with a ``score`` and a ``quality`` column:
        (likert_dialogue_quality_features, likert_turn_quality_features,
        comparative_quality_features, interactive_dialogue_quality_features)
    """
    static: pd.DataFrame = data.surge_evaluation.annotation_dataframe()
    # presumably keeps a single annotation per item -- helper is defined elsewhere
    static = get_singly_annotated(static, seed=123)
    reindexed = static.reset_index()
    items = reindexed[sym.item]
    # tuple items contribute their first element as the dialogue id; any other
    # item is used as the dialogue id itself
    dialogues = [e[0] if isinstance(e, tuple) else e for e in items]
    reindexed['dialogue'] = dialogues
    reindexed.set_index(
        [sym.bot, sym.category, sym.label, 'dialogue', sym.item],
        inplace=True, verify_integrity=True
    )
    # dialogue-level Likert: index reduced to (label, dialogue)
    ld = reindexed.xs(category.likert_dialogue, level=sym.category)
    ld = ld.droplevel(sym.bot).droplevel(sym.item)
    ld.columns = ['score']
    # quality target 1: dialogue-level Likert quality
    ldq = ld.xs(scale.quality, level=sym.label)
    ldq.columns = ['quality']

    # turn-level Likert: mean over the rows of each (label, dialogue)
    lt = reindexed.xs(category.likert_turn, level=sym.category)
    lt = lt.groupby([sym.label, 'dialogue']).mean()
    lt.columns = ['score']
    # quality target 2: mean turn-level Likert quality
    ltq = lt.xs(scale.quality, level=sym.label)
    ltq.columns = ['quality']

    # behavior labels: mean over the rows of each (label, dialogue)
    be = reindexed.xs(category.behavior, level=sym.category)
    be = be.groupby([sym.label, 'dialogue']).mean()
    be.columns = ['score']

    # quality target 3: Likert dialogue quality from the dialogue collection
    interactive = data.dialogue_collection.annotation_dataframe()
    idq = interactive.xs((category.likert_dialogue, scale.quality), level=(sym.category, sym.label))
    # drop the outermost remaining index level so the frame can be joined on dialogue
    idq = idq.droplevel(0)

    # all per-dialogue scores, indexed by (category, label, dialogue)
    ds = pd.concat(
        [lt, be, ld],
        keys=[category.likert_turn, category.behavior, category.likert_dialogue],
        names=[sym.category, sym.label, 'dialogue']
    )
    likert_dialogue_quality_features = ds.join(ldq, on='dialogue')
    likert_turn_quality_features = ds.join(ltq, on='dialogue')
    interactive_dialogue_quality_features = ds.join(idq, on='dialogue')
    # idq's column was never renamed, so name both columns after the join
    interactive_dialogue_quality_features.columns = ['score', 'quality']

    interactive_comparisons = data.dialogue_collection.comparative_annotation_dataframe()
    surge_comparisons = get_singly_annotated(data.surge_evaluation.comparative_annotation_dataframe(), seed=123)
    compared_dialogues = surge_comparisons.index.get_level_values('dialogues')
    # de-duplicate pairs regardless of orientation, then turn each back into a tuple
    # NOTE(review): the orientation of each resulting tuple is the iteration order of a
    # frozenset, which for string ids can differ between interpreter runs -- confirm the
    # downstream selection does not depend on it.
    unique_compared_dialogues = {tuple(x) for x in {frozenset(y) for y in compared_dialogues}}
    # first dialogue of each pair -> second dialogue of that pair
    comparison_map = dict(unique_compared_dialogues)
    # keep only the comparisons whose pair appears in the chosen orientation
    compared_selector = [
        pair in unique_compared_dialogues
        for pair in interactive_comparisons.index.get_level_values('dialogues')
    ]
    comparative: pd.DataFrame = interactive_comparisons.loc[compared_selector, :]
    compared_selector = [
        pair in unique_compared_dialogues
        for pair in surge_comparisons.index.get_level_values('dialogues')
    ]
    surge_comparisons: pd.DataFrame = surge_comparisons.loc[compared_selector, :]
    # quality target 4: comparative quality, keyed by the first dialogue of the pair
    comparative_quality = comparative.xs(scale.quality, level=sym.label)
    comparative_quality.index = [first for _, _, (first, second) in comparative_quality.index.values]
    comparative_quality.columns = ['quality']
    # re-key the surge comparisons as (comparative, label, first dialogue of the pair)
    surge_comparisons.index = pd.MultiIndex.from_arrays(
        list(zip(*[
            (category.comparative, label, left)
            for _, _, label, (left, right) in surge_comparisons.index.values
        ])),
        names=[sym.category, sym.label, 'dialogue']
    )
    surge_comparisons.columns = ['score']
    # rows of ds whose dialogue is the first member of a compared pair ...
    filtered_ds = ds.loc[[(c, l, d) for c, l, d in ds.index.values if d in comparison_map]]
    # ... and, row for row, the same metric for the dialogue it was compared against
    compared_features = ds.loc[[(c, l, comparison_map[d]) for c, l, d in filtered_ds.index.values]]
    # feature = score of the first dialogue minus score of its comparison partner
    comparative_features = filtered_ds.to_numpy() - compared_features.to_numpy()
    filtered_ds['diff'] = comparative_features.squeeze().tolist()
    # replace the raw score by the difference, keeping the column name 'score'
    del filtered_ds['score']
    filtered_ds.columns = ['score']
    filtered_ds = pd.concat([filtered_ds, surge_comparisons], axis=0)
    comparative_quality_features = filtered_ds.join(comparative_quality, on='dialogue')

    return (
        likert_dialogue_quality_features,
        likert_turn_quality_features,
        comparative_quality_features,
        interactive_dialogue_quality_features
    )

# NOTE(review): the return value of this call is discarded and, since it is not the last
# expression of the cell, it is not displayed either. dialogue_quality_regressions and the
# incremental-validity cell call all_dialogue_metrics(data) again themselves. This looks
# like a leftover smoke test -- confirm before removing.
all_dialogue_metrics(data)

def regressions(df, quality_column_name=None, model='linear'):
    """
    Fit one regression that predicts a quality score from the other columns.

    :param df: dialogue x (*features, quality) -> value
    :param quality_column_name: column holding the target; defaults to the
        last column of ``df``
    :param model: 'linear' (OLS with an added constant) or 'ordinal'
        (ordered model with a logit link)
    :return: pd.Series. For 'linear': one coefficient per feature, followed by
        r^2 and the p value of the F-test. For 'ordinal': mcfadden r^2 and the
        p value of the likelihood-ratio test (no coefficients).
    :raises ValueError: if ``model`` is neither 'linear' nor 'ordinal'
    """
    if not quality_column_name:
        quality_column_name = df.columns[-1]
    qualities = df[quality_column_name]
    features = [f for f in df.columns if f != quality_column_name]
    # the fitted results get their own name; the ``model`` argument is no
    # longer rebound to a model object
    if model == 'ordinal':
        fitted = OrderedModel(qualities, df[features], distr='logit').fit()
        # coefficients are intentionally not reported for the ordinal model
        result = {stat.mcfad_r2: fitted.prsquared, stat.p_of_llr_test: fitted.llr_pvalue}
    elif model == 'linear':
        fitted = LinearModel(qualities, add_constant(df[features])).fit()
        # the intercept ('const') is fitted but not reported
        result = {f: fitted.params[f] for f in features}
        result[stat.r2] = fitted.rsquared
        result[stat.p_of_f_test] = fitted.f_pvalue
    else:
        raise ValueError('Param "model" must be one of {"linear", "ordinal"}')
    return pd.Series(result.values(), result)



In [50]:
@to_file
def dialogue_quality_regressions(data):
    """
    Univariate linear regressions of every metric against four quality targets.

    For each (category, label) metric one OLS model is fitted per target; the
    coefficient, R-squared and F-test p value become three columns under the
    target's name.

    An ordered-logit variant (``regressions(..., model='ordinal')``) is
    deliberately not fitted here: it is not part of the returned table, so
    fitting it for every metric would only cost time.

    :param data: forwarded to ``all_dialogue_metrics``
    :return: metric x (Predicted, Metric) frame rounded to 5 decimals, with the
        targets ordered Interactive Comparison, Interactive Likert,
        Likert Dialogue Quality, Likert Turn Quality
    """
    ldq, ltq, icq, idq = all_dialogue_metrics(data)
    metric_keys = [sym.category, sym.label]

    def fit_linear(features, predicted):
        # one linear fit per metric, columns labelled with the predicted target
        fits = features.groupby(metric_keys).apply(regressions)
        fits.columns = pd.MultiIndex.from_arrays(
            [[predicted] * 3,
             ['LR Coefficient', 'LR R-Squared', stat.p_of_f_test]],
            names=['Predicted', 'Metric']
        )
        return fits

    result = pd.concat(
        (
            fit_linear(icq, 'Interactive Comparison'),
            fit_linear(idq, 'Interactive Likert'),
            fit_linear(ldq, 'Likert Dialogue Quality'),
            fit_linear(ltq, 'Likert Turn Quality'),
        ),
        axis=1
    )
    return result.round(5)

# `load=` is consumed by the `to_file` decorator (defined outside this notebook);
# presumably it reads the cached table from that path instead of refitting -- TODO confirm.
regs = dialogue_quality_regressions(
    data,
    load='results/dialogue_quality_regressions'
)
regs

Unnamed: 0_level_0,Predicted,Interactive Comparison,Interactive Comparison,Interactive Comparison,Interactive Likert,Interactive Likert,Interactive Likert,Likert Dialogue Quality,Likert Dialogue Quality,Likert Dialogue Quality,Likert Turn Quality,Likert Turn Quality,Likert Turn Quality
Unnamed: 0_level_1,Metric,LR Coefficient,LR R-Squared,P value of F-test,LR Coefficient,LR R-Squared,P value of F-test,LR Coefficient,LR R-Squared,P value of F-test,LR Coefficient,LR R-Squared,P value of F-test
category,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
behavior,antisocial,-0.28946,6e-05,0.91533,-0.77421,0.00022,0.76585,-0.55548,0.00013,0.82001,-3.70553,0.01164,0.031
behavior,commonsense contradiction,-1.29356,0.06519,0.00035,-2.36858,0.1048,0.0,-1.69743,0.06103,0.0,-1.2955,0.07139,0.0
behavior,correct fact,-0.51742,0.0166,0.07488,-0.48723,0.00799,0.07411,0.05121,0.0001,0.84188,-0.34247,0.00899,0.05813
behavior,empathetic,0.46971,0.01802,0.06341,1.12296,0.05755,0.0,0.88404,0.04044,5e-05,0.4503,0.02107,0.00362
behavior,follow up,0.24425,0.00585,0.29152,0.39619,0.00828,0.069,0.27118,0.0044,0.18548,0.46015,0.02544,0.00137
behavior,ignore,-0.95919,0.01765,0.06623,-3.17356,0.10413,0.0,-1.71655,0.03455,0.00019,-1.74979,0.07209,0.0
behavior,incorrect fact,-0.84437,0.03112,0.01437,-0.36389,0.00282,0.28973,-0.89117,0.01915,0.00557,-0.83946,0.03412,0.0002
behavior,irrelevant,-0.61683,0.01176,0.13427,-2.37005,0.09497,0.0,-1.60563,0.04943,1e-05,-1.26057,0.06118,0.0
behavior,lack of empathy,-1.37372,0.0588,0.0007,-2.53345,0.10348,0.0,-1.54854,0.04384,2e-05,-1.16185,0.04956,1e-05
behavior,life info,0.41711,0.00985,0.17089,0.15641,0.00062,0.62082,0.32752,0.00306,0.26971,0.18006,0.00186,0.38999


In [51]:
# R-squared and F-test p value for the Likert dialogue-quality target only.
# The ("likert dialogue", "quality") row is dropped -- presumably because that
# metric is the target itself.
to_plot_regs = (
    regs[[
        ("Likert Dialogue Quality", "LR R-Squared"),
        ("Likert Dialogue Quality", "P value of F-test"),
    ]]
    .drop(("likert dialogue", "quality"))
    .reset_index()
)
to_plot_regs

Predicted,category,label,Likert Dialogue Quality,Likert Dialogue Quality
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,LR R-Squared,P value of F-test
0,behavior,antisocial,0.00013,0.82001
1,behavior,commonsense contradiction,0.06103,0.0
2,behavior,correct fact,0.0001,0.84188
3,behavior,empathetic,0.04044,5e-05
4,behavior,follow up,0.0044,0.18548
5,behavior,ignore,0.03455,0.00019
6,behavior,incorrect fact,0.01915,0.00557
7,behavior,irrelevant,0.04943,1e-05
8,behavior,lack of empathy,0.04384,2e-05
9,behavior,life info,0.00306,0.26971


In [52]:
# Format the table and write it to CSV via `prettify` (defined outside this notebook).
# `regs` is rebound to prettify's return value here and rebound again by the
# incremental-regression cell below.
regs = prettify(regs, to_csv="results/paper/predictive_validity.csv")

# Incremental Validity

In [53]:
def drop_column_level_duplication(df: pd.DataFrame, columns, levels=None):
    """
    Collapse every column matching ``columns`` into a single column.

    All columns selected by ``columns`` on the given column level(s) are
    removed and the first of them is appended back, after the remaining
    columns, under the label ``columns``. The other matches are discarded;
    presumably they are identical copies (the caller uses this for the
    'quality' column that ``unstack`` repeats once per metric).

    :param df: frame with MultiIndex columns
    :param columns: key to match; a string, or a tuple with one entry per level
    :param levels: column level(s) the key refers to. Defaults to level 0 for a
        string key and to the first ``len(columns)`` levels otherwise.
    :return: new frame; ``df`` itself is not modified
    """
    if levels is None:
        # A bare string is one level-0 key. Taking len() of it would count its
        # characters and ask for that many column levels.
        levels = 0 if isinstance(columns, str) else list(range(len(columns)))
    matching = df.xs(columns, axis=1, level=levels)
    kept = matching.iloc[:, 0].to_frame()
    kept.columns = [columns]
    remaining = df.drop(columns=columns, level=levels)
    return pd.concat([remaining, kept], axis=1)

def multivariate_regression(df: pd.DataFrame, model='linear'):
    """
    Regress quality on several metrics at once.

    The (category, label) index levels are unstacked into columns so each
    metric becomes one feature, the repeated 'quality' columns are collapsed
    into one, and a single model is fitted with ``regressions``.

    :param df: frame with ``sym.category`` and ``sym.label`` index levels and
        a 'quality' column
    :param model: forwarded to ``regressions``
    :return: the ``regressions`` result rounded to 5 decimals; tuple index
        entries are replaced by their second element
    """
    wide = df.unstack([sym.category, sym.label])
    wide = drop_column_level_duplication(wide, 'quality', 0)
    fit = regressions(wide, quality_column_name='quality', model=model)
    # NOTE(review): if the unstacked feature columns are (column, category, label)
    # tuples, element 1 is the category, so coefficient labels would not be unique
    # -- confirm this is intended.
    flat_labels = []
    for key in fit.index.values:
        flat_labels.append(key[1] if isinstance(key, tuple) else key)
    fit.index = flat_labels
    return fit.round(5)

from collections import namedtuple

@to_file
def incremental_regression(
        df: pd.DataFrame,
        categories,
        model='linear',
        exclude_quality=True,
        beam=1,
        select='backward'
):
    """
    Stepwise multivariate regression over metrics, explored with a beam search.

    One path is a sequence of steps; every step names one (category, label)
    feature and stores the r2 / p value of the regression fitted at that step.
    After each step only the ``beam`` best paths, ranked by ``Path.r2``, are
    kept. The best full-length path is returned.

    :param df: frame whose index starts with (category, label) and has a
        'dialogue' level -- presumably one of the frames returned by
        ``all_dialogue_metrics``
    :param categories: container of the categories whose metrics are eligible
    :param model: 'linear' reports ``stat.r2`` / ``stat.p_of_f_test``; any
        other value reports ``stat.mcfad_r2`` / ``stat.p_of_llr_test``
    :param exclude_quality: drop every row whose index contains ``scale.quality``
    :param beam: number of paths kept after each step
    :param select: 'forward' fits the features already on the path plus the
        candidate; 'backward' fits every feature not yet on the path
    :return: frame indexed by feature, in path order, with the columns
        'Adjusted R-Squared' and the p value name
    :raises ValueError: if ``select`` is neither 'forward' nor 'backward'

    NOTE(review), behavior kept as it was, to be confirmed by the author:
    * In 'backward' mode the fitted feature set does not depend on the
      candidate, so all candidates extending one path tie. The r2 stored on a
      step is that of the model which still contains the step's feature, and a
      removal is only scored one step later. With ``beam=1`` the removal order
      is therefore set-iteration order.
    * The 'Adjusted R-Squared' column holds ``Step.r2``, the statistic as
      returned by the regression. ``adjust`` is only applied by ``Path.r2``,
      i.e. for ranking paths.
    * ``adjust`` uses the path length as the number of predictors. In
      'backward' mode that is the number of features removed so far.
    """
    if select not in ('forward', 'backward'):
        raise ValueError('param select must be one of {"forward", "backward"}')
    data_points = set(df.index.get_level_values('dialogue'))
    num_data_points = len(data_points)
    adjust = lambda r2, f: 1 - (1 - r2) * ((num_data_points - 1) / (num_data_points - f))
    Step: type = namedtuple('Step', ('r2', 'p', 'feature'))
    class Path(list):
        def metric(self):
            # ranking key used by the beam
            return self.r2
        @property
        def r2(self):
            return adjust(self[-1].r2, len(self)) if self else 0
        @property
        def p(self): return self[-1].p if self else 1
        @property
        def features(self): return {x.feature for x in self}
    r2_name = stat.r2 if model=='linear' else stat.mcfad_r2
    p_name = stat.p_of_f_test if model=='linear' else stat.p_of_llr_test

    def fit(selected):
        # regression on the rows that belong to the selected (category, label) features
        row_mask = [
            x[:2] in selected
            and (not (exclude_quality and scale.quality in x))
            and x[0] in categories
            for x in df.index.values
        ]
        fitted = multivariate_regression(df.loc[row_mask, :], model=model)
        return fitted[r2_name].item(), fitted[p_name]

    frontier = [Path()]
    feature_pool = {
        x[:2] for x in df.index.values
        if (not (exclude_quality and scale.quality in x))
        and x[0] in categories
    }
    for _ in feature_pool:
        new_frontier = []
        for path in frontier:
            on_path = path.features
            remaining = feature_pool - on_path
            if not remaining:
                continue
            if select == 'backward':
                # The backward fit is the same for every candidate of this
                # path, so fit it once instead of once per candidate.
                shared_fit = fit(remaining)
            for candidate in remaining:
                if select == 'forward':
                    r2, p = fit(on_path | {candidate})
                else:
                    r2, p = shared_fit
                new_frontier.append(Path([*path, Step(r2, p, candidate)]))
        frontier = sorted(new_frontier, key=lambda x: x.metric(), reverse=True)[:beam]
    result = {step.feature: {'Adjusted R-Squared': step.r2, p_name: step.p} for step in frontier[0]}
    return pd.DataFrame(result.values(), result)


# The four feature tables, named after their quality target: Likert dialogue quality (ldq),
# Likert turn quality (ltq), interactive comparative quality (icq) and interactive Likert
# dialogue quality (idq).
ldq, ltq, icq, idq = all_dialogue_metrics(data)
# Likert-turn and behavior metrics together, predicting the ldq target; beam width 1.
regs = incremental_regression(
    ldq, (category.likert_turn, category.behavior), beam=1,
    load='results/dialogue_incremental_regressions'
)
regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
likert turn,consistent,0.17799,0.0
behavior,empathetic,0.17776,0.0
behavior,uninterpretable,0.16896,0.0
likert turn,emotional,0.16587,0.0
behavior,ignore,0.16495,0.0
behavior,preference info,0.1642,0.0
likert turn,informative,0.16244,0.0
likert turn,grammatical,0.1611,0.0
behavior,antisocial,0.15754,0.0
likert turn,relevant,0.15748,0.0


In [54]:
# Behavior metrics only, predicting the ldq (Likert dialogue quality) target.
# NOTE(review): `behavior_regs` is rebound by the next cell, so this result is not the one
# used in the "Table for Paper" section.
behavior_regs = incremental_regression(
    ldq, (category.behavior,), beam=10,
    load='results/behavior_incremental_regressions'
)
behavior_regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
behavior,antisocial,0.15964,0.0
behavior,ignore,0.15955,0.0
behavior,life info,0.15907,0.0
behavior,preference info,0.15813,0.0
behavior,commonsense contradiction,0.15663,0.0
behavior,uninterpretable,0.15466,0.0
behavior,correct fact,0.15197,0.0
behavior,topic switch,0.14809,0.0
behavior,follow up,0.14527,0.0
behavior,redundant,0.14366,0.0


In [55]:
# Behavior metrics only, predicting the idq (interactive Likert dialogue quality) target.
# NOTE(review): rebinds `behavior_regs` from the previous cell; the "Table for Paper"
# section reads this value.
behavior_regs = incremental_regression(
    idq, (category.behavior,), beam=10,
    load='results/behavior_incremental_regressions_interactive'
)
behavior_regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
behavior,antisocial,0.2603,0.0
behavior,life info,0.26013,0.0
behavior,topic switch,0.2598,0.0
behavior,preference info,0.25966,0.0
behavior,incorrect fact,0.2594,0.0
behavior,partner contradiction,0.25912,0.0
behavior,uninterpretable,0.25831,0.0
behavior,follow up,0.25716,0.0
behavior,correct fact,0.25588,0.0
behavior,ignore,0.25222,0.0


In [56]:
# Behavior metrics only, predicting the icq (interactive comparative quality) target.
behavior_regs_comp = incremental_regression(
    icq, (category.behavior,), beam=10,
    load='results/behavior_incremental_regressions_comparative'
)
behavior_regs_comp

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
behavior,antisocial,0.18339,0.00221
behavior,ignore,0.18338,0.00129
behavior,topic switch,0.18322,0.00074
behavior,preference info,0.1831,0.0004
behavior,partner contradiction,0.1823,0.00022
behavior,correct fact,0.18098,0.00012
behavior,irrelevant,0.17869,7e-05
behavior,empathetic,0.17866,3e-05
behavior,commonsense contradiction,0.17335,2e-05
behavior,life info,0.16845,1e-05


In [57]:
# Likert-turn metrics only, predicting the ldq (Likert dialogue quality) target.
# NOTE(review): `likert_turn_regs` is rebound by the next cell, so this result is not the
# one used in the "Table for Paper" section.
likert_turn_regs = incremental_regression(
    ldq, (category.likert_turn,), beam=10,
    load='results/likert_turn_incremental_regressions'
)
likert_turn_regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
likert turn,grammatical,0.08566,1e-05
likert turn,informative,0.08508,0.0
likert turn,engaging,0.08285,0.0
likert turn,emotional,0.07753,0.0
likert turn,consistent,0.06889,0.0
likert turn,proactive,0.05786,1e-05
likert turn,relevant,0.03776,9e-05


In [58]:
# Likert-turn metrics only, predicting the idq (interactive Likert dialogue quality) target.
# NOTE(review): rebinds `likert_turn_regs` from the previous cell; the "Table for Paper"
# section reads this value.
likert_turn_regs = incremental_regression(
    idq, (category.likert_turn,), beam=10,
    load='results/likert_turn_incremental_regressions_interactive'
)
likert_turn_regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
likert turn,informative,0.14708,0.0
likert turn,grammatical,0.14651,0.0
likert turn,emotional,0.1441,0.0
likert turn,consistent,0.13808,0.0
likert turn,relevant,0.12962,0.0
likert turn,proactive,0.10285,0.0
likert turn,engaging,0.06687,0.0


In [59]:
# Likert-turn metrics only, predicting the icq (interactive comparative quality) target.
likert_turn_regs_comp = incremental_regression(
    icq, (category.likert_turn,), beam=10,
    load='results/likert_turn_incremental_regressions_comparative'
)
likert_turn_regs_comp

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
likert turn,emotional,0.19089,0.0
likert turn,engaging,0.19059,0.0
likert turn,grammatical,0.19002,0.0
likert turn,relevant,0.1833,0.0
likert turn,informative,0.17103,0.0
likert turn,consistent,0.14697,0.0
likert turn,proactive,0.12035,0.0


In [60]:
# Likert-dialogue metrics only, predicting the ldq (Likert dialogue quality) target; the
# quality label itself is left out because exclude_quality defaults to True.
# NOTE(review): `likert_dialogue_regs` is rebound by the next cell, so this result is not
# the one used in the "Table for Paper" section.
likert_dialogue_regs = incremental_regression(
    ldq, (category.likert_dialogue,), beam=10,
    load='results/likert_dialogue_incremental_regressions'
)
likert_dialogue_regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
likert dialogue,emotional,0.58426,0.0
likert dialogue,informative,0.57832,0.0
likert dialogue,grammatical,0.56439,0.0
likert dialogue,relevant,0.54517,0.0
likert dialogue,proactive,0.51636,0.0
likert dialogue,consistent,0.48673,0.0
likert dialogue,engaging,0.38103,0.0


In [61]:
# Likert-dialogue metrics only, predicting the idq (interactive Likert dialogue quality)
# target.
# NOTE(review): rebinds `likert_dialogue_regs` from the previous cell; the "Table for
# Paper" section reads this value.
likert_dialogue_regs = incremental_regression(
    idq, (category.likert_dialogue,), beam=10,
    load='results/likert_dialogue_incremental_regressions_interactive'
)
likert_dialogue_regs

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
likert dialogue,informative,0.12078,0.0
likert dialogue,consistent,0.12078,0.0
likert dialogue,grammatical,0.12052,0.0
likert dialogue,engaging,0.1201,0.0
likert dialogue,proactive,0.11942,0.0
likert dialogue,emotional,0.10604,0.0
likert dialogue,relevant,0.08183,0.0


In [62]:
# Likert-dialogue metrics only, predicting the icq (interactive comparative quality) target.
likert_dialogue_regs_comp = incremental_regression(
    icq, (category.likert_dialogue,), beam=10,
    load='results/likert_dialogue_incremental_regressions_comparative'
)
likert_dialogue_regs_comp

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
likert dialogue,relevant,0.16572,2e-05
likert dialogue,grammatical,0.16572,1e-05
likert dialogue,informative,0.1647,0.0
likert dialogue,proactive,0.16335,0.0
likert dialogue,consistent,0.15861,0.0
likert dialogue,emotional,0.15367,0.0
likert dialogue,engaging,0.14639,0.0


In [63]:
# Comparative metrics only, predicting the icq (interactive comparative quality) target.
comparative_dialogue_regs_comp = incremental_regression(
    icq, (category.comparative,), beam=10,
    load='results/comparative_incremental_regressions_comparative'
)
comparative_dialogue_regs_comp

Unnamed: 0,Unnamed: 1,Adjusted R-Squared,P value of F-test
comparative,proactive,0.05369,0.1727
comparative,consistent,0.05298,0.11737
comparative,relevant,0.05138,0.07843
comparative,grammatical,0.04877,0.05184
comparative,informative,0.04748,0.02712
comparative,emotional,0.04205,0.01725
comparative,engaging,0.03525,0.00912


## Table for Paper

In [64]:
def add_delta(df):
    """
    Add formatted percentage columns to ``df`` in place (returns None).

    Reads 'Adjusted R-Squared' and writes three string columns:
    'R-Squared' (the value times 100, four decimals), 'delta' (row-to-row
    change of that percentage, four decimals, 'nan' on the first row) and
    'R-Squared delta' (both combined as "<R-Squared> (<delta>)").
    """
    four_decimals = '{:.4f}'.format
    percent = df['Adjusted R-Squared'] * 100
    change = percent.diff()
    df['R-Squared'] = percent.map(four_decimals)
    df['delta'] = change.map(four_decimals)
    df['R-Squared delta'] = df['R-Squared'] + ' (' + df['delta'] + ')'

def _paper_table(regs_table, metric_header):
    # Flatten the (category, label) index, keep the label under `metric_header`,
    # drop the category and p value columns, then add the formatted delta columns.
    table = regs_table.reset_index()
    table = table.rename({'level_1': metric_header}, axis=1)
    table = table.drop(['level_0', 'P value of F-test'], axis=1)
    add_delta(table)
    return table

final_behavior_regs = _paper_table(behavior_regs, 'ABC-Eval')

final_likert_turn_regs = _paper_table(likert_turn_regs, 'Likert Turn')

final_likert_dialogue_regs = _paper_table(likert_dialogue_regs, 'Likert Dialogue')

final_behavior_regs

Unnamed: 0,ABC-Eval,Adjusted R-Squared,R-Squared,delta,R-Squared delta
0,antisocial,0.2603,26.03,,26.0300 (nan)
1,life info,0.26013,26.013,-0.017,26.0130 (-0.0170)
2,topic switch,0.2598,25.98,-0.033,25.9800 (-0.0330)
3,preference info,0.25966,25.966,-0.014,25.9660 (-0.0140)
4,incorrect fact,0.2594,25.94,-0.026,25.9400 (-0.0260)
5,partner contradiction,0.25912,25.912,-0.028,25.9120 (-0.0280)
6,uninterpretable,0.25831,25.831,-0.081,25.8310 (-0.0810)
7,follow up,0.25716,25.716,-0.115,25.7160 (-0.1150)
8,correct fact,0.25588,25.588,-0.128,25.5880 (-0.1280)
9,ignore,0.25222,25.222,-0.366,25.2220 (-0.3660)


In [65]:
# Put the three metric families side by side for the paper table.
# The frames have different lengths, so the shorter ones are padded with missing values
# (see the empty cells in the last rows of the output).
# NOTE(review): all three value columns are named 'R-Squared delta', so the written CSV
# has duplicate header names -- confirm whoever reads the file can cope with that.
combined = pd.concat(
    [
        final_behavior_regs[['ABC-Eval', 'R-Squared delta']],
        final_likert_turn_regs[['Likert Turn', 'R-Squared delta']],
        final_likert_dialogue_regs[['Likert Dialogue', 'R-Squared delta']]
    ],
    axis=1)

combined.to_csv('results/paper/incremental_validity.csv', index=False)
combined

Unnamed: 0,ABC-Eval,R-Squared delta,Likert Turn,R-Squared delta.1,Likert Dialogue,R-Squared delta.2
0,antisocial,26.0300 (nan),informative,14.7080 (nan),informative,12.0780 (nan)
1,life info,26.0130 (-0.0170),grammatical,14.6510 (-0.0570),consistent,12.0780 (0.0000)
2,topic switch,25.9800 (-0.0330),emotional,14.4100 (-0.2410),grammatical,12.0520 (-0.0260)
3,preference info,25.9660 (-0.0140),consistent,13.8080 (-0.6020),engaging,12.0100 (-0.0420)
4,incorrect fact,25.9400 (-0.0260),relevant,12.9620 (-0.8460),proactive,11.9420 (-0.0680)
5,partner contradiction,25.9120 (-0.0280),proactive,10.2850 (-2.6770),emotional,10.6040 (-1.3380)
6,uninterpretable,25.8310 (-0.0810),engaging,6.6870 (-3.5980),relevant,8.1830 (-2.4210)
7,follow up,25.7160 (-0.1150),,,,
8,correct fact,25.5880 (-0.1280),,,,
9,ignore,25.2220 (-0.3660),,,,
