In [None]:
import pandas as pd
from analysis import *
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import binom_test

In [None]:
# Load the two annotation tables of the Surge evaluation.
# NOTE(review): `data` is not defined in this notebook; presumably it is
# provided by `from analysis import *` — confirm.
surge_annotations = data.surge_evaluation.annotation_dataframe()
surge_annotations_comparative = data.surge_evaluation.comparative_annotation_dataframe()

# Bare expression so the notebook shows the annotation table as the cell output.
surge_annotations

# 8 Comprehensive Analysis

### Metric Sensitivity

In [None]:
from itertools import combinations
from scipy.stats import ttest_ind

def p_vals(df: pd.DataFrame, test='t'):
    """
    :param df: (bot, data point) x 1 -> score
    :param test: statistical test function (t for t test, p for prop test, s for sign test)
    :return: p values of test on each bot pair (pd.Series)
    """
    bots = set(df.index.get_level_values(0))
    bot_pairs = list(combinations(bots, 2))
    result = {}
    for ba, bb in bot_pairs:
        a = df.xs(ba).to_numpy().squeeze()
        b = df.xs(bb).to_numpy().squeeze()
        if test == 't':
            t, p = ttest_ind(a, b, equal_var=False)
        elif test == 'p':
            z, p = proportions_ztest(count=[
                sum(a), sum(b)
            ], nobs=[
                len(a), len(b)
            ])
        elif test == 's':
            # sign test
            a = a[a==1]
            b = b[b==1]
            p = binom_test(sum(a), sum(a)+sum(b), p=0.5)
        else:
            raise ValueError('invalid arg for param "test"')
        result[(ba, bb)] = p
    result_series = pd.Series(result.values(), result)
    return result_series

@to_file
def t_test_p_values_comparing_bots(annotations):
    """
    Pairwise bot-comparison p values for every (category, label) group.

    Rows of the behavior category are compared with the proportions test,
    rows of the comparative category with the sign test, and all remaining
    rows with the t test (the codes 'p', 's' and 't' of ``p_vals``).

    :param annotations: annotation dataframe; it is passed through
        ``get_singly_annotated`` before any test is run
    :return: the behavior, remaining and comparative results concatenated
        row-wise, in that order
    """
    group_keys = [sym.category, sym.label]

    def pairwise(frame, test):
        # one p_vals call per (category, label) group
        return frame.groupby(group_keys).apply(
            lambda group: p_vals(group, test=test)
        )

    single = get_singly_annotated(annotations)

    behavior_rows = single.xs(
        category.behavior, level=sym.category, drop_level=False
    )
    comparative_rows = single.xs(
        category.comparative, level=sym.category, drop_level=False
    )
    # everything that is neither behavior nor comparative
    without_behavior = single.drop(index=category.behavior, level=sym.category)
    remaining_rows = without_behavior.drop(
        index=category.comparative, level=sym.category
    )

    remaining_ps = pairwise(remaining_rows, 't')
    behavior_ps = pairwise(behavior_rows, 'p')
    comparative_ps = pairwise(comparative_rows, 's')
    return pd.concat([behavior_ps, remaining_ps, comparative_ps], axis=0)

# `load` is not among the parameters declared on t_test_p_values_comparing_bots,
# so it is presumably handled by the `to_file` decorator — confirm in `analysis`.
t_test_p_values_comparing_bots(surge_annotations, load='results/t_test_p_values_comparing_bots')

### Predictive Validity

In [26]:
from statsmodels.miscmodels.ordinal_model import OrderedModel
from statsmodels.regression.linear_model import OLS as LinearModel
from statsmodels.tools.tools import add_constant

def dialogue_metrics(ev):
    """
    Build per-dialogue metric scores and pair them with a quality target.

    Likert-dialogue scores are used as they are; likert-turn and behavior
    scores are averaged over each (label, dialogue). The three tables are
    stacked under a category level and joined, on the dialogue, with the
    quality label taken from the likert-dialogue scores and from the averaged
    likert-turn scores respectively.

    :param ev: evaluation object providing ``annotation_dataframe()``
    :return: (features joined with likert dialogue quality,
        features joined with likert turn quality); each frame has the columns
        'score' and 'quality'
    """
    annotations: pd.DataFrame = get_singly_annotated(
        ev.annotation_dataframe(), seed=123
    )
    flat = annotations.reset_index()
    # An item that is a tuple contributes its first element as the dialogue;
    # any other item is used as the dialogue directly.
    flat['dialogue'] = [
        item[0] if isinstance(item, tuple) else item
        for item in flat[sym.item]
    ]
    by_dialogue = flat.set_index(
        [sym.bot, sym.category, sym.label, 'dialogue', sym.item],
        verify_integrity=True,
    )

    def mean_per_dialogue(cat):
        # average every label of one category within each dialogue
        selected = by_dialogue.xs(cat, level=sym.category)
        means = selected.groupby([sym.label, 'dialogue']).mean()
        means.columns = ['score']
        return means

    def quality_of(scores):
        # slice out the quality label and name its single column accordingly
        quality = scores.xs(scale.quality, level=sym.label)
        quality.columns = ['quality']
        return quality

    dialogue_scores = by_dialogue.xs(category.likert_dialogue, level=sym.category)
    dialogue_scores = dialogue_scores.droplevel(sym.bot).droplevel(sym.item)
    dialogue_scores.columns = ['score']
    dialogue_quality = quality_of(dialogue_scores)

    turn_scores = mean_per_dialogue(category.likert_turn)
    turn_quality = quality_of(turn_scores)

    behavior_scores = mean_per_dialogue(category.behavior)

    stacked = pd.concat(
        [turn_scores, behavior_scores, dialogue_scores],
        keys=[category.likert_turn, category.behavior, category.likert_dialogue],
        names=[sym.category, sym.label, 'dialogue']
    )
    with_dialogue_quality = stacked.join(dialogue_quality, on='dialogue')
    with_turn_quality = stacked.join(turn_quality, on='dialogue')
    return with_dialogue_quality, with_turn_quality


def regressions(df, quality_column_name=None, model='linear'):
    """
    Regress the quality column on the remaining column(s) of ``df``.

    :param df: dialogue x (*features, quality) -> value
    :param quality_column_name: label of the column to predict; when None,
        the last column of ``df`` is used
    :param model: 'linear' for ordinary least squares with an added constant
        term, 'ordinal' for an ordered model with logit link
    :return: pd.Series. For 'linear': one coefficient per feature (the
        constant term is not reported), 'R-Squared' and 'P value of F-test'.
        For 'ordinal': the pseudo R-squared (keyed by ``stat.mcfad_r2``) and
        'P value of LLR-test'.
    :raises ValueError: if ``model`` is neither 'linear' nor 'ordinal'
    """
    # Compare against None explicitly: a falsy but valid column label (such as
    # the integer 0) must not silently fall back to the last column.
    if quality_column_name is None:
        quality_column_name = df.columns[-1]
    qualities = df[quality_column_name]
    features = [f for f in df.columns if f != quality_column_name]
    if model == 'ordinal':
        fit = OrderedModel(qualities, df[features], distr='logit').fit()
        result = {
            stat.mcfad_r2: fit.prsquared,
            'P value of LLR-test': fit.llr_pvalue,
        }
    elif model == 'linear':
        fit = LinearModel(qualities, add_constant(df[features])).fit()
        coefs = {f: fit.params[f] for f in features}
        result = {
            **coefs,
            'R-Squared': fit.rsquared,
            'P value of F-test': fit.f_pvalue,
        }
    else:
        raise ValueError('Param "model" must be one of {"linear", "ordinal"}')
    return pd.Series(list(result.values()), index=list(result.keys()))

@to_file
def dialogue_quality_regressions(ev):
    """
    Regress dialogue quality on each (category, label) metric.

    For every metric group three fits are made via ``regressions``: a linear
    and an ordinal fit against likert dialogue quality and a linear fit
    against likert turn quality.

    :param ev: evaluation object handed to ``dialogue_metrics``
    :return: one row per (category, label); columns form a
        ('Predicted', 'Metric') MultiIndex with the turn-quality results
        first, then the dialogue-quality linear and ordinal results;
        values are rounded to 5 decimals
    """
    level_names = ['Predicted', 'Metric']
    linear_metrics = ['LR Coefficient', 'LR R-Squared', 'P value of F-test']
    ordinal_metrics = ['OR Pseudo R-Squared', 'P value of LLR-test']

    def with_header(table, predicted, metrics):
        # replace the columns by (predicted target, metric name) pairs
        table.columns = pd.MultiIndex.from_arrays(
            [[predicted] * len(metrics), metrics],
            names=level_names
        )
        return table

    dialogue_features, turn_features = dialogue_metrics(ev)
    group_keys = [sym.category, sym.label]
    dialogue_groups = dialogue_features.groupby(group_keys)
    turn_groups = turn_features.groupby(group_keys)

    dialogue_linear = with_header(
        dialogue_groups.apply(lambda group: regressions(group, model='linear')),
        'Likert Dialogue Quality',
        linear_metrics,
    )
    dialogue_ordinal = with_header(
        dialogue_groups.apply(lambda group: regressions(group, model='ordinal')),
        'Likert Dialogue Quality',
        ordinal_metrics,
    )
    turn_linear = with_header(
        turn_groups.apply(regressions),
        'Likert Turn Quality',
        linear_metrics,
    )

    combined = pd.concat((turn_linear, dialogue_linear, dialogue_ordinal), axis=1)
    return combined.round(5)

# `load` is not among the parameters declared on dialogue_quality_regressions,
# so it is presumably handled by the `to_file` decorator — confirm in `analysis`.
dialogue_quality_regressions(
    data.surge_evaluation,
    load='results/dialogue_quality_regressions'
)

Unnamed: 0_level_0,Predicted,Likert Turn Quality,Likert Turn Quality,Likert Turn Quality,Likert Dialogue Quality,Likert Dialogue Quality,Likert Dialogue Quality,Likert Dialogue Quality,Likert Dialogue Quality
Unnamed: 0_level_1,Metric,LR Coefficient,LR R-Squared,P value of F-test,LR Coefficient,LR R-Squared,P value of F-test,OR Pseudo R-Squared,P value of LLR-test
category,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
likert turn,consistent,0.22018,0.04654,1e-05,0.24864,0.0276,0.00085,0.00862,0.00225
likert turn,emotional,0.16635,0.02928,0.00059,0.22243,0.02434,0.00175,0.00802,0.0032
likert turn,engaging,0.16493,0.04114,4e-05,0.19437,0.02657,0.00107,0.00887,0.00194
likert turn,grammatical,0.29904,0.06781,0.0,0.00659,2e-05,0.93777,0.0,0.9573
likert turn,informative,0.01924,0.0004,0.68836,-0.05261,0.00141,0.45444,0.00046,0.47933
likert turn,proactive,0.21331,0.06253,0.0,0.22051,0.03107,0.0004,0.01032,0.00083
likert turn,quality,1.0,1.0,0.0,0.22776,0.02412,0.00184,0.00825,0.0028
likert turn,relevant,0.25511,0.09226,0.0,0.21768,0.03124,0.00038,0.01012,0.00093
behavior,antisocial,-3.60729,0.00827,0.06928,-0.00927,0.0,0.99747,0.0,0.96702
behavior,commonsense contradiction,-1.19415,0.06063,0.0,-2.06686,0.08446,0.0,0.02971,0.0


In [None]:
def stepwise_regression(df: pd.DataFrame, quality_column_name=None, model='linear'):
    if model == 'linear':
        ...
    elif model == 'ordinal':
        ...
    left = ...
