In [76]:
import pandas as pd
from analysis import *
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import binom_test

In [77]:
surge_annotations = data.surge_evaluation.annotation_dataframe()
surge_annotations_comparative = data.surge_evaluation.comparative_annotation_dataframe()

surge_annotations

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1,2
bot,category,label,item,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
rerank_blender,likert dialogue,emotional,"(109,13)_rerank_blender",4,4.0,
rerank_blender,likert dialogue,consistent,"(109,13)_rerank_blender",4,4.0,
rerank_blender,likert dialogue,grammatical,"(109,13)_rerank_blender",4,3.0,
rerank_blender,likert dialogue,informative,"(109,13)_rerank_blender",4,4.0,
rerank_blender,likert dialogue,proactive,"(109,13)_rerank_blender",4,3.0,
rerank_blender,...,...,...,...,...,...
rerank_blender,behavior,follow up,"((438,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,topic switch,"((438,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,ignore,"((438,26)_rerank_blender, 14)",0,,
rerank_blender,behavior,irrelevant,"((438,26)_rerank_blender, 14)",0,,


# 8 Comprehensive Analysis

### Metric Sensitivity

In [78]:
from itertools import combinations
from scipy.stats import ttest_ind

def p_vals(df: pd.DataFrame, test='t', downsample=None):
    """
    :param df: (bot, data point) x 1 -> score
    :param test: statistical test function (t for t test, p for prop test, s for sign test)
    :param downsample: number of samples ber bot to subsample without replacement for the analysis
    :return: p values of test on each bot pair (pd.Series)
    """
    seed = 123
    bots = set(df.index.get_level_values(0))
    num_bots = len(bots)
    bot_pairs = list(combinations(bots, 2))
    result = {}
    for ba, bb in bot_pairs:
        if test == 't':
            if downsample:
                a = df.xs(ba).sample(downsample, random_state=seed).to_numpy().squeeze()
                b = df.xs(bb).sample(downsample, random_state=seed).to_numpy().squeeze()
            else:
                a = df.xs(ba).to_numpy().squeeze()
                b = df.xs(bb).to_numpy().squeeze()
            t, p = ttest_ind(a, b, equal_var=False)
        elif test == 'p':
            if downsample:
                a = df.xs(ba).sample(downsample, random_state=seed).to_numpy().squeeze()
                b = df.xs(bb).sample(downsample, random_state=seed).to_numpy().squeeze()
            else:
                a = df.xs(ba).to_numpy().squeeze()
                b = df.xs(bb).to_numpy().squeeze()
            z, p = proportions_ztest(count=[
                sum(a), sum(b)
            ], nobs=[
                len(a), len(b)
            ])
        elif test == 's':
            # sign test
            comp_data = df.xs((ba, bb), level=[sym.bot, sym.bot_cmp])
            if downsample:
                comp_data = comp_data.sample(downsample, random_state=seed)
            a = comp_data.to_numpy().squeeze() == 1
            b = comp_data.to_numpy().squeeze() == -1
            p = binom_test(sum(a), sum(a)+sum(b), p=0.5)
        else:
            raise ValueError('invalid arg for param "test"')
        result[(ba, bb)] = p
    result_series = pd.Series(result.values(), result)
    return result_series

@to_file
def p_values_comparing_bots(evaluation, downsample=None):
    comp_annotations = get_singly_annotated(evaluation.comparative_annotation_dataframe(), seed=123)
    annotations = get_singly_annotated(evaluation.annotation_dataframe(), seed=123)
    prop_annotations = annotations.xs(
        category.behavior, level=sym.category, drop_level=False
    )
    mean_annotations = annotations.drop(
        index=category.behavior, level=sym.category
    ).drop(
        index=category.comparative, level=sym.category
    )
    mean_ps = mean_annotations.groupby(
        [sym.category, sym.label]
    ).apply(lambda x: p_vals(x, test='t', downsample=downsample))
    prop_ps = prop_annotations.groupby(
        [sym.category, sym.label]
    ).apply(lambda x: p_vals(x, test='p', downsample=downsample))
    comp_groups = comp_annotations.groupby(sym.label)
    comp_ps = comp_groups.apply(lambda x: p_vals(x, test='s', downsample=downsample))
    comp_ps = pd.concat({category.comparative: comp_ps}, names=[sym.category])
    result = pd.concat([prop_ps, mean_ps, comp_ps], axis=0)
    return result

p_values_comparing_bots(data.surge_evaluation, downsample=32, load='results/p_values_comparing_bots_downsampled').round(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,bart_fid_rag_bcb,bart_fid_rag_bcb,bart_fid_rag_bcb,emora,emora,blender2_3B
Unnamed: 0_level_1,Unnamed: 1_level_1,emora,blender2_3B,rerank_blender,blender2_3B,rerank_blender,rerank_blender
category,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
behavior,antisocial,,,,,,
behavior,commonsense contradiction,0.1685,0.3202,0.5218,0.6888,0.4497,0.7192
behavior,correct fact,0.0002,0.0098,0.0211,0.1685,0.0976,0.7679
behavior,empathetic,0.6107,0.7978,0.2807,0.8002,0.1143,0.1832
behavior,follow up,0.0002,0.5986,0.0005,0.001,0.7679,0.0025
behavior,ignore,0.4911,0.1623,0.4911,0.0452,1.0,0.0452
behavior,incorrect fact,0.0101,0.0101,0.4911,,0.0389,0.0389
behavior,irrelevant,0.2002,0.6888,0.3202,0.0976,0.7679,0.1685
behavior,lack of empathy,0.5218,0.2296,0.2296,0.0722,0.0722,1.0
behavior,life info,0.7404,0.5453,0.756,0.3513,0.5218,0.7679


In [79]:
p_values_comparing_bots(data.surge_evaluation, load='results/t_test_p_values_comparing_bots').round(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,bart_fid_rag_bcb,bart_fid_rag_bcb,bart_fid_rag_bcb,emora,emora,blender2_3B
Unnamed: 0_level_1,Unnamed: 1_level_1,emora,blender2_3B,rerank_blender,blender2_3B,rerank_blender,rerank_blender
category,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
behavior,antisocial,0.5597,0.1601,0.0119,0.0587,0.0035,0.2117
behavior,commonsense contradiction,0.0,0.0002,0.0473,0.0008,0.0,0.0917
behavior,correct fact,0.0,0.0,0.0,0.0,0.0,0.8325
behavior,empathetic,0.0,0.0,0.0,0.5285,0.2644,0.6252
behavior,follow up,0.0,0.5851,0.0,0.0,0.2512,0.0
behavior,ignore,0.3671,0.0436,0.0079,0.0035,0.0004,0.5149
behavior,incorrect fact,0.0,0.0,0.0663,0.0023,0.0,0.0
behavior,irrelevant,0.0019,0.1866,0.0,0.0,0.0457,0.0
behavior,lack of empathy,0.6253,0.0,0.0,0.0,0.0,0.0007
behavior,life info,0.0,0.9618,0.3605,0.0,0.0015,0.3851


### Predictive Validity

In [80]:
from statsmodels.miscmodels.ordinal_model import OrderedModel
from statsmodels.regression.linear_model import OLS as LinearModel
from statsmodels.discrete.discrete_model import Logit as LogisticModel
from statsmodels.tools.tools import add_constant


def all_dialogue_metrics(data):
    static: pd.DataFrame = data.surge_evaluation.annotation_dataframe()
    static = get_singly_annotated(static, seed=123)
    reindexed = static.reset_index()
    items = reindexed[sym.item]
    dialogues = [e[0] if isinstance(e, tuple) else e for e in items]
    reindexed['dialogue'] = dialogues
    reindexed.set_index(
        [sym.bot, sym.category, sym.label, 'dialogue', sym.item],
        inplace=True, verify_integrity=True
    )
    ld = reindexed.xs(category.likert_dialogue, level=sym.category)
    ld = ld.droplevel(sym.bot).droplevel(sym.item)
    ld.columns = ['score']
    ldq = ld.xs(scale.quality, level=sym.label)
    ldq.columns = ['quality']

    lt = reindexed.xs(category.likert_turn, level=sym.category)
    lt = lt.groupby([sym.label, 'dialogue']).mean()
    lt.columns = ['score']
    ltq = lt.xs(scale.quality, level=sym.label)
    ltq.columns = ['quality']

    be = reindexed.xs(category.behavior, level=sym.category)
    be = be.groupby([sym.label, 'dialogue']).mean()
    be.columns = ['score']

    interactive = data.dialogue_collection.annotation_dataframe()
    idq = interactive.xs((category.likert_dialogue, scale.quality), level=(sym.category, sym.label))
    idq = idq.droplevel(0)

    ds = pd.concat(
        [lt, be, ld],
        keys=[category.likert_turn, category.behavior, category.likert_dialogue],
        names=[sym.category, sym.label, 'dialogue']
    )
    likert_dialogue_quality_features = ds.join(ldq, on='dialogue')
    likert_turn_quality_features = ds.join(ltq, on='dialogue')
    interactive_dialogue_quality_features = ds.join(idq, on='dialogue')
    interactive_dialogue_quality_features.columns = ['score', 'quality']

    interactive_comparisons = data.dialogue_collection.comparative_annotation_dataframe()
    surge_comparisons = get_singly_annotated(data.surge_evaluation.comparative_annotation_dataframe(), seed=123)
    compared_dialogues = surge_comparisons.index.get_level_values('dialogues')
    unique_compared_dialogues = {tuple(x) for x in {frozenset(y) for y in compared_dialogues}}
    comparison_map = dict(unique_compared_dialogues)
    compared_selector = [
        pair in unique_compared_dialogues
        for pair in interactive_comparisons.index.get_level_values('dialogues')
    ]
    comparative: pd.DataFrame = interactive_comparisons.loc[compared_selector, :]
    compared_selector = [
        pair in unique_compared_dialogues
        for pair in surge_comparisons.index.get_level_values('dialogues')
    ]
    surge_comparisons: pd.DataFrame = surge_comparisons.loc[compared_selector, :]
    comparative_quality = comparative.xs(scale.quality, level=sym.label)
    comparative_quality.index = [first for _, _, (first, second) in comparative_quality.index.values]
    comparative_quality.columns = ['quality']
    surge_comparisons.index = pd.MultiIndex.from_arrays(
        list(zip(*[
            (category.comparative, label, left)
            for _, _, label, (left, right) in surge_comparisons.index.values
        ])),
        names=[sym.category, sym.label, 'dialogue']
    )
    surge_comparisons.columns = ['score']
    filtered_ds = ds.loc[[(c, l, d) for c, l, d in ds.index.values if d in comparison_map]]
    compared_features = ds.loc[[(c, l, comparison_map[d]) for c, l, d in filtered_ds.index.values]]
    comparative_features = filtered_ds.to_numpy() - compared_features.to_numpy()
    filtered_ds['diff'] = comparative_features.squeeze().tolist()
    del filtered_ds['score']
    filtered_ds.columns = ['score']
    filtered_ds = pd.concat([filtered_ds, surge_comparisons], axis=0)
    comparative_quality_features = filtered_ds.join(comparative_quality, on='dialogue')
    icq = comparative_quality_features
    icq = icq[icq['quality'] != 0]
    icq.loc[:,'quality'] = (icq['quality'] > 0).astype(int)

    return (
        likert_dialogue_quality_features,
        likert_turn_quality_features,
        icq,
        interactive_dialogue_quality_features
    )

all_dialogue_metrics(data)

def regressions(df, quality_column_name=None, model='linear', adjust_r2=False):
    """
    :param df: dialogue x (*features, quality) -> value
    :return: *(coef, low, high), mcfadden r^2
    """
    if not quality_column_name:
        quality_column_name = df.columns[-1]
    qualities = df[quality_column_name]
    features = [f for f in df.columns if f != quality_column_name]
    if model == 'ordinal':
        model = OrderedModel(qualities, df[features], distr='logit')
        results = model.fit()
        coefs = {f: results.params[f] for f in features}
        prsqrd = results.prsquared
        if adjust_r2:
            prsqrd = 1 - (results.llf - len(features)) / results.llnull
        result = {stat.mcfad_r2: prsqrd, stat.p_of_llr_test: results.llr_pvalue}
    elif model == 'linear':
        x = add_constant(df[features])
        y = qualities
        model = LinearModel(y, x)
        results = model.fit()
        coefs = {f: results.params[f] for f in features}
        rsquared = results.rsquared
        if adjust_r2:
            adjust = lambda r2, f: 1 - (1 - r2) * ((len(y) - 1) / (len(y) - f - 1))
            rsquared = adjust(rsquared, len(features))
        result = {**coefs, stat.r2: rsquared, stat.p_of_f_test: results.f_pvalue}
    elif model == 'logistic':
        x = add_constant(df[features])
        y = qualities.astype(bool)
        model = LogisticModel(y, x)
        results = model.fit()
        coefs = {f: results.params[f] for f in features}
        prsqrd = results.prsquared
        if adjust_r2:
            prsqrd = 1 - (results.llf - len(features)) / results.llnull
        result = {**coefs, stat.mcfad_r2: prsqrd, stat.p_of_llr_test: results.llr_pvalue}
    else:
        raise ValueError('Param "model" must be one of {"linear", "ordinal", "logistic"}')
    return pd.Series(result.values(), result)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  icq.loc[:,'quality'] = (icq['quality'] > 0).astype(int)


In [81]:
@to_file
def dialogue_quality_regressions(data):
    ldq, ltq, icq, idq = all_dialogue_metrics(data)
    ldq_groups = ldq.groupby([sym.category, sym.label])
    ltq_groups = ltq.groupby([sym.category, sym.label])
    icq_groups = icq.groupby([sym.category, sym.label])
    idq_groups = idq.groupby([sym.category, sym.label])
    names = ['Predicted', 'Metric']
    linear_compare_result = icq_groups.apply(lambda x: regressions(x, model='logistic'))
    linear_compare_result.columns = pd.MultiIndex.from_arrays(
        [['Interactive Comparison']*3,
        ['LC Coefficient', 'LC Pseudo R-Squared', stat.p_of_llr_test]],
        names=names
    )
    linear_result = ldq_groups.apply(lambda x: regressions(x, model='linear'))
    linear_result.columns = pd.MultiIndex.from_arrays(
        [['Likert Dialogue Quality']*3,
        ['LR Coefficient', 'LR R-Squared', stat.p_of_f_test]],
        names=names
    )
    ordinal_result = ldq_groups.apply(lambda x: regressions(x, model='ordinal'))
    ordinal_result.columns = pd.MultiIndex.from_arrays(
        [['Likert Dialogue Quality']*2,
        ['OR Pseudo R-Squared', stat.p_of_llr_test]],
        names=names
    )
    linear_turn_result = ltq_groups.apply(regressions)
    linear_turn_result.columns = pd.MultiIndex.from_arrays(
        [['Likert Turn Quality']*3,
        ['LR Coefficient', 'LR R-Squared', stat.p_of_f_test]],
        names=names
    )
    interactive_dial_result = idq_groups.apply(regressions)
    interactive_dial_result.columns = pd.MultiIndex.from_arrays(
        [['Interactive Likert']*3,
        ['LR Coefficient', 'LR R-Squared', stat.p_of_f_test]],
        names=names
    )
    result = pd.concat(( linear_compare_result, interactive_dial_result, linear_result, linear_turn_result), axis=1)
    return result.round(5)

regs = dialogue_quality_regressions(
    data,
    load='results/dialogue_quality_regressions'
)
regs

Unnamed: 0_level_0,Predicted,Interactive Comparison,Interactive Comparison,Interactive Comparison,Interactive Likert,Interactive Likert,Interactive Likert,Likert Dialogue Quality,Likert Dialogue Quality,Likert Dialogue Quality,Likert Turn Quality,Likert Turn Quality,Likert Turn Quality
Unnamed: 0_level_1,Metric,LC Coefficient,LC Pseudo R-Squared,P value of LLR-test,LR Coefficient,LR R-Squared,P value of F-test,LR Coefficient,LR R-Squared,P value of F-test,LR Coefficient,LR R-Squared,P value of F-test
category,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
behavior,antisocial,0.51392,3e-05,0.92597,-0.77421,0.00022,0.76585,-0.55548,0.00013,0.82001,-3.70553,0.01164,0.031
behavior,commonsense contradiction,-3.11386,0.05594,0.00017,-2.36858,0.1048,0.0,-1.69743,0.06103,0.0,-1.2955,0.07139,0.0
behavior,correct fact,-1.17535,0.01423,0.05782,-0.48723,0.00799,0.07411,0.05121,0.0001,0.84188,-0.34247,0.00899,0.05813
behavior,empathetic,1.04127,0.01486,0.05258,1.12296,0.05755,0.0,0.88404,0.04044,5e-05,0.4503,0.02107,0.00362
behavior,follow up,0.47388,0.00385,0.32362,0.39619,0.00828,0.069,0.27118,0.0044,0.18548,0.46015,0.02544,0.00137
behavior,ignore,-2.11469,0.01431,0.05713,-3.17356,0.10413,0.0,-1.71655,0.03455,0.00019,-1.74979,0.07209,0.0
behavior,incorrect fact,-1.81422,0.02434,0.0131,-0.36389,0.00282,0.28973,-0.89117,0.01915,0.00557,-0.83946,0.03412,0.0002
behavior,irrelevant,-1.30597,0.00915,0.1282,-2.37005,0.09497,0.0,-1.60563,0.04943,1e-05,-1.26057,0.06118,0.0
behavior,lack of empathy,-3.2325,0.04713,0.00056,-2.53345,0.10348,0.0,-1.54854,0.04384,2e-05,-1.16185,0.04956,1e-05
behavior,life info,0.82256,0.00646,0.20107,0.15641,0.00062,0.62082,0.32752,0.00306,0.26971,0.18006,0.00186,0.38999


In [82]:
to_plot_regs = regs[[("Likert Dialogue Quality", "LR R-Squared"), ("Likert Dialogue Quality", "P value of F-test")]]
to_plot_regs = to_plot_regs.drop(("likert dialogue", "quality"))
to_plot_regs = to_plot_regs.reset_index()
to_plot_regs

Predicted,category,label,Likert Dialogue Quality,Likert Dialogue Quality
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,LR R-Squared,P value of F-test
0,behavior,antisocial,0.00013,0.82001
1,behavior,commonsense contradiction,0.06103,0.0
2,behavior,correct fact,0.0001,0.84188
3,behavior,empathetic,0.04044,5e-05
4,behavior,follow up,0.0044,0.18548
5,behavior,ignore,0.03455,0.00019
6,behavior,incorrect fact,0.01915,0.00557
7,behavior,irrelevant,0.04943,1e-05
8,behavior,lack of empathy,0.04384,2e-05
9,behavior,life info,0.00306,0.26971


In [83]:
regs = prettify(regs, to_csv="results/paper/predictive_validity.csv")

# Incremental Validity

In [84]:
def drop_column_level_duplication(df: pd.DataFrame, columns, levels=None):
    if levels is None:
        levels = list(range(len(columns)))
    level_columns = df.xs(columns, axis=1, level=levels)
    unique = level_columns.iloc[:,0].to_frame()
    unique.columns = [columns]
    dropped = df.drop(columns=columns, level=levels)
    result = pd.concat([dropped, unique], axis=1)
    return result

def multivariate_regression(df: pd.DataFrame, model='linear', adjust_r2=True):
    def apply_regressions(df: pd.DataFrame):
        unstacked = df.unstack([sym.category, sym.label])
        unstacked = drop_column_level_duplication(unstacked, 'quality', 0)
        results = regressions(unstacked, quality_column_name='quality', model=model, adjust_r2=adjust_r2)
        return results
    result = apply_regressions(df)
    result.index = [
        (idx[1] if isinstance(idx, tuple) else idx)
        for idx in result.index.values
    ]
    return result.round(5)

from collections import namedtuple

@to_file
def incremental_regression(
        df: pd.DataFrame,
        categories,
        model='linear',
        beam=1,
        select='backward',
        exclusions=(),
):
    data_points = set(df.index.get_level_values('dialogue'))
    num_data_points = len(data_points)
    Step: type = namedtuple('Step', ('r2', 'p', 'feature'))
    class Path(list):
        def metric(self):
            # if len(self) == 0: return 0
            # else: return self[-1].llr if len(self) == 1 else self[-1].llr / self[-2].llr
            return self.r2
        @property
        def r2(self):
            return self[-1].r2 if self else 0
        # @property
        # def adj_r2(self):
        #     return adjust(self.r2, len(self))
        @property
        def p(self): return self[-1].p if self else 1
        @property
        def features(self): return {x.feature for x in self}
    r2_name = stat.r2 if model=='linear' else stat.mcfad_r2
    p_name = stat.p_of_f_test if model=='linear' else stat.p_of_llr_test
    frontier = [Path()]
    feature_pool = {
        x[:2] for x in df.index.values
        if (not (x in exclusions or x[1] in exclusions))
        and x[0] in categories
    }
    for _ in feature_pool:
        new_frontier = []
        for path in frontier:
            for candidate in feature_pool - path.features:
                if select == 'forward':
                    candidate_features = path.features | {candidate}
                elif select == 'backward':
                    candidate_features = feature_pool - path.features
                else:
                    raise ValueError('param select must be one of {"forward", "backward"}')
                row_mask = [
                    x[:2] in candidate_features
                    and (not (x in exclusions or x[1] in exclusions))
                    and x[0] in categories
                    for x in df.index.values
                ]
                candidate_df = df.loc[row_mask, :]
                candidate_results = multivariate_regression(candidate_df, model=model)
                r2 = candidate_results[r2_name].item()
                p = candidate_results[p_name]
                new_frontier.append(Path([*path, Step(r2, p, candidate)]))
        frontier = sorted(new_frontier, key=lambda x: x.metric(), reverse=True)[:beam]
    result = {
        step.feature: {r2_name: step.r2, p_name: step.p}
        for i, step in enumerate(frontier[0])
    }
    return pd.DataFrame(result.values(), result)


ldq, ltq, icq, idq = all_dialogue_metrics(data)
regs = incremental_regression(
    ldq, (category.likert_turn, category.behavior), beam=1, exclusions=[scale.quality],
    reload='results/dialogue_incremental_regressions'
)
regs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  icq.loc[:,'quality'] = (icq['quality'] > 0).astype(int)


Unnamed: 0,Unnamed: 1,R-Squared,P value of F-test
likert turn,emotional,0.12771,0.0
likert turn,engaging,0.12919,0.0
behavior,commonsense contradiction,0.12861,0.0
likert turn,consistent,0.13006,0.0
behavior,correct fact,0.1318,0.0
behavior,irrelevant,0.13152,0.0
behavior,redundant,0.12149,0.0
behavior,empathetic,0.11976,0.0
behavior,life info,0.10719,0.0
behavior,preference info,0.10928,0.0


In [85]:
behavior_regs = incremental_regression(
    ldq, (category.behavior,), beam=10,
    reload='results/behavior_incremental_regressions'
)
behavior_regs

Unnamed: 0,Unnamed: 1,R-Squared,P value of F-test
behavior,ignore,0.12454,0.0
behavior,preference info,0.12628,0.0
behavior,commonsense contradiction,0.12751,0.0
behavior,life info,0.12746,0.0
behavior,antisocial,0.1285,0.0
behavior,uninterpretable,0.13069,0.0
behavior,correct fact,0.13017,0.0
behavior,topic switch,0.12843,0.0
behavior,follow up,0.12778,0.0
behavior,redundant,0.12836,0.0


In [86]:
behavior_regs = incremental_regression(
    idq, (category.behavior,), beam=10,
    reload='results/behavior_incremental_regressions_interactive'
)
behavior_regs

Unnamed: 0,Unnamed: 1,R-Squared,P value of F-test
behavior,preference info,0.2294,0.0
behavior,incorrect fact,0.23104,0.0
behavior,partner contradiction,0.23257,0.0
behavior,life info,0.23381,0.0
behavior,topic switch,0.23557,0.0
behavior,antisocial,0.23743,0.0
behavior,uninterpretable,0.23925,0.0
behavior,follow up,0.24002,0.0
behavior,correct fact,0.24065,0.0
behavior,ignore,0.23886,0.0


In [87]:
behavior_regs_comp = incremental_regression(
    icq, (category.behavior,), beam=10, model='logistic',
    reload='results/behavior_incremental_regressions_comparative'
)
behavior_regs_comp

Optimization terminated successfully.
         Current function value: 0.583013
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.583013
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.583013
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.583013
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.583013
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.583013
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.583013
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.583013
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.583013
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.583013
  

Unnamed: 0,Unnamed: 1,McFadden's pseudo-R-squared,P value of LLR-test
behavior,partner contradiction,0.03212,0.00073
behavior,irrelevant,0.03964,0.00044
behavior,preference info,0.04716,0.00025
behavior,ignore,0.0546,0.00014
behavior,topic switch,0.06153,8e-05
behavior,antisocial,0.06919,4e-05
behavior,incorrect fact,0.07676,2e-05
behavior,empathetic,0.08089,1e-05
behavior,correct fact,0.08407,1e-05
behavior,follow up,0.08598,1e-05


In [88]:
likert_turn_regs = incremental_regression(
    ldq, (category.likert_turn,), beam=10, exclusions=[scale.quality],
    reload='results/likert_turn_incremental_regressions'
)
likert_turn_regs

Unnamed: 0,Unnamed: 1,R-Squared,P value of F-test
likert turn,grammatical,0.06933,1e-05
likert turn,informative,0.07111,0.0
likert turn,engaging,0.07121,0.0
likert turn,emotional,0.06819,0.0
likert turn,consistent,0.06183,0.0
likert turn,proactive,0.05311,1e-05
likert turn,relevant,0.03534,9e-05


In [89]:
likert_turn_regs = incremental_regression(
    idq, (category.likert_turn,), beam=10, exclusions=[scale.quality],
    reload='results/likert_turn_incremental_regressions_interactive'
)
likert_turn_regs

Unnamed: 0,Unnamed: 1,R-Squared,P value of F-test
likert turn,informative,0.13185,0.0
likert turn,grammatical,0.13348,0.0
likert turn,emotional,0.13324,0.0
likert turn,consistent,0.12935,0.0
likert turn,relevant,0.12302,0.0
likert turn,proactive,0.09833,0.0
likert turn,engaging,0.06453,0.0


In [90]:
likert_turn_regs_comp = incremental_regression(
    icq, (category.likert_turn,), beam=10, model='logistic', exclusions=['quality'],
    reload='results/likert_turn_incremental_regressions_comparative'
)
likert_turn_regs_comp

Optimization terminated successfully.
         Current function value: 0.584976
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.584976
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.584976
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.584976
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.584976
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.584976
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.584976
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.585421
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.585421
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.585421
  

Unnamed: 0,Unnamed: 1,McFadden's pseudo-R-squared,P value of LLR-test
likert turn,engaging,0.09995,0.0
likert turn,emotional,0.10766,0.0
likert turn,grammatical,0.11468,0.0
likert turn,relevant,0.11732,0.0
likert turn,informative,0.1162,0.0
likert turn,consistent,0.1027,0.0
likert turn,proactive,0.08611,0.0


In [91]:
likert_turn_regs_comp = incremental_regression(
    icq, (category.likert_turn,), beam=10, model='logistic', exclusions=[scale.proactive, scale.quality],
    reload='results/likert_turn_incremental_regressions_comparative_no_proactive_or_quality'
)
likert_turn_regs_comp

Optimization terminated successfully.
         Current function value: 0.624573
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624573
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624573
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624573
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624573
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624573
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624581
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624581
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624581
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624581
  

Unnamed: 0,Unnamed: 1,McFadden's pseudo-R-squared,P value of LLR-test
likert turn,emotional,0.05059,0.00036
likert turn,engaging,0.05843,0.00015
likert turn,grammatical,0.06543,6e-05
likert turn,relevant,0.07104,2e-05
likert turn,informative,0.06306,4e-05
likert turn,consistent,0.04131,0.0004


In [92]:
likert_dialogue_regs = incremental_regression(
    ldq, (category.likert_dialogue,), beam=10, exclusions=['quality'],
    reload='results/likert_dialogue_incremental_regressions'
)
likert_dialogue_regs

Unnamed: 0,Unnamed: 1,R-Squared,P value of F-test
likert dialogue,emotional,0.57684,0.0
likert dialogue,informative,0.57189,0.0
likert dialogue,grammatical,0.55887,0.0
likert dialogue,relevant,0.54057,0.0
likert dialogue,proactive,0.5127,0.0
likert dialogue,consistent,0.48414,0.0
likert dialogue,engaging,0.37947,0.0


In [93]:
likert_dialogue_regs = incremental_regression(
    idq, (category.likert_dialogue,), beam=10, exclusions=['quality'],
    reload='results/likert_dialogue_incremental_regressions_interactive'
)
likert_dialogue_regs

Unnamed: 0,Unnamed: 1,R-Squared,P value of F-test
likert dialogue,informative,0.10508,0.0
likert dialogue,consistent,0.10736,0.0
likert dialogue,grammatical,0.10936,0.0
likert dialogue,engaging,0.11119,0.0
likert dialogue,proactive,0.11275,0.0
likert dialogue,emotional,0.10154,0.0
likert dialogue,relevant,0.07952,0.0


In [94]:
likert_dialogue_regs_comp = incremental_regression(
    icq, (category.likert_dialogue,), beam=10, model='logistic', exclusions=['quality'],
    reload='results/likert_dialogue_incremental_regressions_comparative'
)
likert_dialogue_regs_comp

Optimization terminated successfully.
         Current function value: 0.601494
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601494
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601494
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601494
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601494
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601494
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601494
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601515
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601515
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601515
  

Unnamed: 0,Unnamed: 1,McFadden's pseudo-R-squared,P value of LLR-test
likert dialogue,relevant,0.07608,2e-05
likert dialogue,grammatical,0.0839,1e-05
likert dialogue,informative,0.09159,0.0
likert dialogue,consistent,0.09681,0.0
likert dialogue,proactive,0.10104,0.0
likert dialogue,emotional,0.10452,0.0
likert dialogue,engaging,0.10652,0.0


In [95]:
likert_dialogue_regs_comp = incremental_regression(
    icq, (category.likert_dialogue,), beam=10, model='logistic', exclusions=[scale.engaging, scale.quality],
    reload='results/likert_dialogue_incremental_regressions_comparative_no_engaging_or_quality'
)
likert_dialogue_regs_comp

Optimization terminated successfully.
         Current function value: 0.639971
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.639971
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.639971
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.639971
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.639971
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.639971
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.640620
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.640620
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.640620
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.640620
  

Unnamed: 0,Unnamed: 1,McFadden's pseudo-R-squared,P value of LLR-test
likert dialogue,grammatical,0.02835,0.00381
likert dialogue,relevant,0.03538,0.00191
likert dialogue,informative,0.04215,0.00089
likert dialogue,proactive,0.04828,0.00038
likert dialogue,consistent,0.05308,0.00016
likert dialogue,emotional,0.04831,0.00016


In [96]:
comparative_dialogue_regs_comp = incremental_regression(
    icq, (category.comparative,), beam=10, model='logistic', exclusions=[scale.quality],
    reload='results/comparative_incremental_regressions_comparative'
)
comparative_dialogue_regs_comp

Optimization terminated successfully.
         Current function value: 0.663635
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.663635
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.663635
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.663635
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.663635
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.663635
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.663635
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.664634
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.664634
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.664634
  

Unnamed: 0,Unnamed: 1,McFadden's pseudo-R-squared,P value of LLR-test
comparative,consistent,-0.01369,0.16135
comparative,relevant,-0.00728,0.11867
comparative,proactive,-0.00027,0.07723
comparative,grammatical,0.00643,0.047
comparative,informative,0.0125,0.02693
comparative,emotional,0.01662,0.01631
comparative,engaging,0.01771,0.01072


## Table for Paper

In [97]:
def add_delta(df):
    df['R-Squared'] = (df['Adjusted R-Squared']*100)
    df['delta'] = df['R-Squared'].diff()
    df['delta'] = df['delta'].map('{:.4f}'.format)
    df['R-Squared'] = df['R-Squared'].map('{:.4f}'.format)
    df['R-Squared delta'] = df['R-Squared'] + ' (' + df['delta'] + ')'

final_behavior_regs = behavior_regs.reset_index().rename({'level_1': 'ABC-Eval'}, axis=1).drop(['level_0', 'P value of F-test'], axis=1)
add_delta(final_behavior_regs)

final_likert_turn_regs = likert_turn_regs.reset_index().rename({'level_1': 'Likert Turn'}, axis=1).drop(['level_0', 'P value of F-test'], axis=1)
add_delta(final_likert_turn_regs)

final_likert_dialogue_regs = likert_dialogue_regs.reset_index().rename({'level_1': 'Likert Dialogue'}, axis=1).drop(['level_0', 'P value of F-test'], axis=1)
add_delta(final_likert_dialogue_regs)

final_behavior_regs

KeyError: 'Adjusted R-Squared'

In [None]:
combined = pd.concat(
    [
        final_behavior_regs[['ABC-Eval', 'R-Squared delta']],
        final_likert_turn_regs[['Likert Turn', 'R-Squared delta']],
        final_likert_dialogue_regs[['Likert Dialogue', 'R-Squared delta']]
    ],
    axis=1)

combined.to_csv('results/paper/incremental_validity.csv', index=False)
combined