In [None]:
import pandas as pd
from analysis import *

# 3 Behavior Evaluation Procedure

### Behavior Examples

In [None]:
# Print one marked example per behavior label from the Surge evaluation.
surge_annotations = data.surge_evaluation.annotation_dataframe()
# One sampling seed per printed example; zip() stops at the shorter sequence,
# so at most 16 behaviors are shown.
seeds = [1] * 16
for s, b in zip(seeds, behavior):
    print(get_example(data.surge_evaluation, category.behavior, b,
                      context=0, mark=1, seed=s, annotations=surge_annotations))

### Pilot Filtering

In [None]:
from copy import deepcopy
def filter_pilot(pilot, qualifications):
    """Return a copy of ``pilot`` restricted to work from qualified workers.

    A work unit is kept when its task name contains neither ``'likert'`` nor
    ``'comparative'`` and ``qualifications[worker_id][task]`` is truthy.
    Work units of likert/comparative tasks are always dropped, and the
    qualification table is not consulted for them.

    :param pilot: object exposing ``work_units`` (id -> unit with ``worker_id``
        and ``task``) and ``dialogues`` (id -> dialogue with ``turns``, each
        having a ``behavior_annotations`` mapping of label -> annotations).
    :param qualifications: nested mapping ``worker_id -> task -> bool``.
    :return: a deep copy of ``pilot`` whose behavior annotations and
        ``work_units`` contain only the kept work units. ``pilot`` itself is
        left unmodified; the kept annotation and work-unit objects in the
        copy are the original objects, not copies.
    :raises KeyError: if a non-likert/comparative unit's worker or task is
        absent from ``qualifications``.
    """
    kept_ids = {
        unit_id
        for unit_id, unit in pilot.work_units.items()
        if not ('likert' in unit.task or 'comparative' in unit.task)
        and qualifications[unit.worker_id][unit.task]
    }

    filtered = deepcopy(pilot)

    # Walk the original structure and overwrite the matching slot in the copy.
    for dialogue_id, dialogue in pilot.dialogues.items():
        copied_turns = filtered.dialogues[dialogue_id].turns
        for position, turn in enumerate(dialogue.turns):
            for label, annotations in turn.behavior_annotations.items():
                copied_turns[position].behavior_annotations[label] = [
                    annotation for annotation in annotations
                    if annotation.work_unit_id in kept_ids
                ]

    filtered.work_units = {
        unit_id: unit
        for unit_id, unit in pilot.work_units.items()
        if unit_id in kept_ids
    }
    return filtered

In [None]:
def get_task(dialogue_id):
    """Derive the task name from a dialogue id.

    :param dialogue_id: dialogue identifier string, normally of the form
        ``'<task>_<suffix>'``.
    :return: ``'personal_information'`` if that substring occurs in the id
        (its name contains an underscore, so prefix splitting would cut it
        short); ``'interpretability'`` if the id contains ``'grammar'``;
        otherwise everything before the first underscore. An id without any
        underscore is returned unchanged.
    """
    if 'personal_information' in dialogue_id:
        return 'personal_information'
    if 'grammar' in dialogue_id:
        return 'interpretability'
    # partition() yields the whole string when there is no '_' — slicing with
    # str.find() would return -1 there and silently drop the last character.
    return dialogue_id.partition('_')[0]

In [None]:
# Pilots 0 and 1: keep only the work of the PhD+ annotators.
phd_plus = ["liyan", "zihao", "jinho", "han", "greg", "sichang"]
under = ["sophy", "jessica", "samir", "angela", "chen", "dan"]

# Qualification tables map worker -> task -> bool: True for every task of a
# PhD+ worker, False for every task of the remaining listed workers.
tasks_pilot0 = set(u.task for u in data.annotation_pilots[0].work_units.values())
qualifications_pilot0 = {
    **{w: dict.fromkeys(tasks_pilot0, True) for w in phd_plus},
    **{w: dict.fromkeys(tasks_pilot0, False) for w in under},
}
grad_filtered_pilot0 = filter_pilot(data.annotation_pilots[0], qualifications_pilot0)

tasks_pilot1 = set(u.task for u in data.annotation_pilots[1].work_units.values())
qualifications_pilot1 = {
    **{w: dict.fromkeys(tasks_pilot1, True) for w in phd_plus},
    **{w: dict.fromkeys(tasks_pilot1, False) for w in under},
}
grad_filtered_pilot1 = filter_pilot(data.annotation_pilots[1], qualifications_pilot1)

In [None]:
# Filtered pilots 0 and 1 first, then every later pilot as loaded.
grad_pilots = [grad_filtered_pilot0, grad_filtered_pilot1] + list(data.annotation_pilots[2:])

In [None]:
# Mirror image of the PhD+ filtering: here the `under` workers are qualified
# and the `phd_plus` workers are not. Note that this rebinds
# qualifications_pilot0 / qualifications_pilot1.
qualifications_pilot0 = {
    **{w: dict.fromkeys(tasks_pilot0, True) for w in under},
    **{w: dict.fromkeys(tasks_pilot0, False) for w in phd_plus},
}
undergrad_filtered_pilot0 = filter_pilot(data.annotation_pilots[0], qualifications_pilot0)

qualifications_pilot1 = {
    **{w: dict.fromkeys(tasks_pilot1, True) for w in under},
    **{w: dict.fromkeys(tasks_pilot1, False) for w in phd_plus},
}
undergrad_filtered_pilot1 = filter_pilot(data.annotation_pilots[1], qualifications_pilot1)

In [None]:
# Filtered pilots 0 and 1 first, then every later pilot as loaded.
undergrad_pilots = [undergrad_filtered_pilot0, undergrad_filtered_pilot1] + list(data.annotation_pilots[2:])

In [None]:
# Raw annotation label strings recorded for pilot 0 (short snake_case codes).
pilot0_raw = {
    "use_e_know", "cont_e_know", "cont_common", "use_profile", "cont_profile", "use_s_ctxt", "cont_s_ctxt", "use_p_ctxt", "cont_p_ctxt",
    "request", "present", "ignore_request", "ignore_present",
    "grammar_error", "lack_of_sociality", "repetition", "irrelevant"
}

# pilot0 was loaded from Excel files rather than JSON files, so it did not go through the label standardization process in the Research repo's converter.py.
# Labels that were initially missing:
#   use profile (done), contradict profile (done)
#   ignore request (done), ignore present (done)
# pilot0 is now fully captured in data.json (9/14/2022).


# Raw annotation label strings recorded for pilot 1: short codes plus the
# full answer-option texts shown to annotators.
pilot1_raw = {
    "use_profile", "cont_profile", "cont_s_ctxt", "cont_p_ctxt", "redundant",
    "This response is uninterpretable",
    # NOTE(review): the other label sets spell this
    # "This response exhibits antisocial behavior." — confirm this shorter
    # form is what pilot 1 actually recorded.
    "exhibits antisocial behavior",
    # The trailing comma here is required: without it Python concatenates this
    # string with the next literal and both labels are lost from the set.
    "This response contradicts common knowledge.",

    "Yes, SPEAKER_X is asking a question or making a request.",
    "No, SPEAKER_X is just sharing something with SPEAKER_Y.",
    "Yes, SPEAKER_X is asking for SPEAKER_Y to elaborate on the ideas presented in the previous turn.",
    "No, SPEAKER_X changes to a different talking point, discussion, or topic.",
    "Yes, SPEAKER_X is ONLY building on, exploring, or responding to what SPEAKER_Y said in the previous turn.",
    "Yes, the new talking point is relevant to the current discussion, OR appropriately transitions to a new topic.",
    "No, introducing the new talking point is abrupt and interrupts the current discussion.",
    "Yes, SPEAKER_X or acknowledges what SPEAKER_Y just said, OR her response implies that she understood what SPEAKER_Y just said.",
    "Yes, SPEAKER_X directly responds to or acknowledges what SPEAKER_Y just said, OR SPEAKER_X's response implies that she understood what SPEAKER_Y just said.",
    "No, SPEAKER_X ignored SPEAKER_Y.",
    "Yes, what SPEAKER_X said does not necessarily require an acknowledgement or follow-up from SPEAKER_Y.",
    "No, there is an unspoken expectation that SPEAKER_Y responds to or acknowledges what SPEAKER_X said this turn.",

    "SPEAKER_X's response incorporates or assumes a fact.",
    "Yes, I know for sure the fact is accurate.",
    "The fact is accurate; a credible source verified the fact in my search.",
    "No, I know for sure the fact is inaccurate, false, or highly implausible.",
    "The fact is inaccurate; a credible source falsified the fact or revealed that it is highly implausible.",
    "It is misleading for SPEAKER_X to claim or assume the fact, because there is no way that SPEAKER_X or anyone else has tested whether the fact is accurate.",
    "My search revealed multiple credible sources that disagreed about whether the fact was true.",
    "I couldn't find enough credible evidence in my search to either verify or falsify the fact."
}

# Raw annotation label strings recorded for pilot 2: short codes plus the
# full answer-option texts. The strings are matched verbatim, so punctuation
# and spacing are significant.
pilot2_raw = {
    "use_profile", "cont_profile", "cont_s_ctxt", "cont_p_ctxt", "redundant",
    "This response contradicts common knowledge.",
    "This response exhibits antisocial behavior.",
    "This response is uninterpretable",

    # NOTE(review): the next label has no trailing period, unlike the
    # otherwise identical string in pilot1_raw / full_raw — confirm that this
    # matches the recorded data.
    "Yes, SPEAKER_X directly responds to or acknowledges what SPEAKER_Y just said, OR SPEAKER_X's response implies that she understood what SPEAKER_Y just said",
    "No, SPEAKER_X ignored SPEAKER_Y.",
    "Not applicable, what SPEAKER_Y just said does not require a response or acknowledgement from SPEAKER_X.",
    "Yes, SPEAKER_X is asking a question or making a request.",
    "No, SPEAKER_X is just sharing something with SPEAKER_Y.",
    "Yes, the response is relevant to the current discussion, OR appropriately transitions to a new talking point.",
    "No, the response feels abrupt and interrupts the current discussion.",
    "Yes, SPEAKER_X is asking for SPEAKER_Y to elaborate on the ideas presented in the previous turn.",
    "Yes, SPEAKER_X is ONLY responding to, building on, or further exploring what SPEAKER_Y said in the previous turn.",
    "No, SPEAKER_X changes to a different talking point, discussion, or topic.",

    "SPEAKER_X's response incorporates or assumes at least one fact.",
    "Yes, I know for sure ALL facts are accurate.",
    "No, I know for sure that one of the facts is inaccurate, false, or highly implausible.",
    "It is misleading for SPEAKER_X to claim or assume one of the facts, because there is no way that SPEAKER_X knows whether that fact is accurate.",
    "ALL facts are accurate; a credible source verified the facts in my search.",
    "One of the facts is inaccurate; a credible source falsified the fact or revealed that it is highly implausible.",
    "My search revealed multiple credible sources that disagreed about whether one of the facts was true.",
    "I couldn't find enough credible evidence in my search to either verify or falsify one of the facts."
}

# pilot1 was missing the use-profile and contradict-profile labels after processing by the Research repo's converter.py.
# pilot1 is now fully captured in data.json (9/14/2022).

# pilot3 was missing the use-profile and contradict-profile labels.
# pilot3 is now fully captured in data.json (9/14/2022).

# Raw annotation label strings for the full evaluation: short codes plus the
# full answer-option texts. The strings are matched verbatim, so punctuation
# and spacing are significant.
full_raw = {
    "cont_s_ctxt", "cont_p_ctxt", "redundant",
    "This response contradicts common knowledge.",
    "This response exhibits antisocial behavior.",
    "This response is uninterpretable",

    "Yes, SPEAKER_X directly responds to or acknowledges what SPEAKER_Y just said, OR SPEAKER_X's response implies that she understood what SPEAKER_Y just said.",
    "No, SPEAKER_X ignored SPEAKER_Y.",
    "Not applicable, what SPEAKER_Y just said does not require a response or acknowledgement from SPEAKER_X.",
    "Yes, SPEAKER_X is changing the topic of the conversation.",
    # NOTE(review): the double space after SPEAKER_X below is part of the
    # literal — confirm it matches the recorded label before normalizing.
    "No, SPEAKER_X  is introducing a new talking point but it is still within the current topic of conversation.",
    "No, SPEAKER_X is ONLY responding to, building on, or further exploring what SPEAKER_Y said in the previous turn.",
    "Yes, the response naturally continues the current discussion with relevant questions or ideas, OR the response appropriately transitions to a new discussion if the current discussion has reached a natural conclusion.",
    "No, the response feels abrupt, and interrupts the current discussion because it is irrelevant",

    "SPEAKER_X's response incorporates or assumes at least one fact.",
    "Yes, I know for sure ALL facts are accurate.",
    "No, I know for sure that one of the facts is inaccurate, false, or highly implausible.",
    "It is misleading for SPEAKER_X to claim or assume one of the facts, because there is no way that SPEAKER_X knows whether that fact is accurate.",
    "ALL facts are accurate; a credible source verified the facts in my search.",
    "One of the facts is inaccurate; a credible source falsified the fact or revealed that it is highly implausible.",
    "My search revealed multiple credible sources that disagreed about whether one of the facts was true.",
    "I couldn't find enough credible evidence in my search to either verify or falsify one of the facts."
}

### Pilot Work Distributions

In [None]:
def distributions(evaluations):
    """Summarize how annotation work is distributed in each evaluation.

    :param evaluations: iterable of evaluation objects exposing ``dialogues``
        (id -> dialogue with ``turns``, each with a ``behavior_annotations``
        mapping of label -> list of annotations) and ``work_units``
        (id -> unit with a ``worker_id``).
    :return: ``pandas.DataFrame`` with one row per evaluation and the columns
        ``"Dialogues"`` (number of dialogues), ``"Annotators per Dialogue"``
        (the distinct annotation-list lengths seen over all turns and labels,
        ascending and comma-joined), ``"Annotators List"`` (distinct worker
        ids, sorted and comma-joined) and ``"Annotators"`` (number of distinct
        worker ids). An empty input yields an empty frame with these columns.
    """
    columns = ["Dialogues", "Annotators per Dialogue", "Annotators List", "Annotators"]
    summaries = []
    for evaluation in evaluations:
        num_dia = len(evaluation.dialogues)
        # Distinct number of annotations attached to any (turn, label) pair.
        annotation_counts = {
            len(annotations)
            for dialogue in evaluation.dialogues.values()
            for turn in dialogue.turns
            for annotations in turn.behavior_annotations.values()
        }
        annotators = {unit.worker_id for unit in evaluation.work_units.values()}
        # Sort before joining: set iteration order is not stable across runs,
        # which would make the rendered table differ between kernel restarts.
        summaries.append([
            num_dia,
            ', '.join(str(count) for count in sorted(annotation_counts)),
            ', '.join(sorted(annotators)),
            len(annotators),
        ])

    # Name the columns at construction time; DataFrame.set_axis(inplace=True)
    # was removed in pandas 2.0 and raises TypeError there.
    return pd.DataFrame(summaries, columns=columns)

### Behavior Annotation Pilot Agreements

In [None]:
@to_file
def agreement_summaries(evaluations):
    """Build a table of macro-averaged agreement statistics, one row per evaluation.

    For each evaluation, the agreement table produced by
    ``agreement_dataframe`` is reduced by dropping rows that contain NaN and
    taking the column-wise mean.

    :param evaluations: iterable of evaluation objects exposing
        ``annotation_dataframe()``.
    :return: ``pandas.DataFrame`` whose two columns are labelled
        ``stat.kripp_alpha`` and ``stat.n``.
        NOTE(review): assumes ``agreement_dataframe`` returns exactly these
        two columns in this order — confirm against its definition.
    :raises ValueError: from ``pandas.concat`` when ``evaluations`` is empty.
    """
    summaries = []
    for evaluation in evaluations:
        annotations = evaluation.annotation_dataframe()
        agreement = agreement_dataframe(annotations, ci=False, k=100, dropna=False)
        # Macro average over the rows that have a value in every column.
        summaries.append(agreement.dropna().mean())
    sum_df = pd.concat(summaries, axis=1).transpose()
    # Assign the labels directly; DataFrame.set_axis(inplace=True) was removed
    # in pandas 2.0 and raises TypeError there.
    sum_df.columns = [stat.kripp_alpha, stat.n]
    return sum_df

In [None]:
# undergraduates: pilots 0 and 1 restricted to the `under` annotators, later pilots unfiltered
distributions(undergrad_pilots)

In [None]:
# agreement summary for the undergraduate view of the pilots
agreement_summaries(undergrad_pilots)

In [None]:
# graduates: pilots 0 and 1 restricted to the `phd_plus` annotators, later pilots unfiltered
distributions(grad_pilots)

In [None]:
# agreement summary for the graduate view of the pilots
agreement_summaries(grad_pilots)

In [None]:
# all: every annotation pilot as loaded, without any worker filtering
distributions(data.annotation_pilots)

In [None]:
# agreement summary over every annotation pilot as loaded
agreement_summaries(data.annotation_pilots)

### Screened Pilot Agreements

In [None]:
# Maximum number of mistakes an onboarding attempt may contain, per task,
# before the worker is marked unqualified for that task.
screening_threshold = dict(
    commonsense=2,
    consistency=2,
    empathy=2,
    interpretability=1,
    knowledge=2,
    personal_information=2,
    sociality=1,
    transitions=2,
)

def check_onboarding(project):
    """Decide, per worker and task, whether onboarding screening was passed.

    Every worker found in ``project.work_units`` starts out qualified for
    every task derived (via ``get_task``) from the project's dialogue ids.
    Only dialogues whose id contains ``'_2'`` are inspected; any attempt on
    such a dialogue with more mistakes than ``screening_threshold[task]``
    disqualifies the attempting worker for that task.
    NOTE(review): ``'_2'`` presumably marks the graded onboarding round —
    confirm against the dialogue-id naming scheme.

    :param project: onboarding project exposing ``work_units`` (id -> unit
        with ``worker_id``) and ``dialogues`` (id -> dialogue with
        ``attempts``, each having ``work_unit_id`` and ``mistakes``).
    :return: nested dict ``worker_id -> task -> bool``.
    """
    worker_ids = {unit.worker_id for unit in project.work_units.values()}
    task_names = {get_task(dialogue_id) for dialogue_id in project.dialogues.keys()}
    qualifications = {
        worker_id: dict.fromkeys(task_names, True) for worker_id in worker_ids
    }

    for dialogue_id, onboarding_dialogue in project.dialogues.items():
        if '_2' not in dialogue_id:
            continue
        task = get_task(dialogue_id)
        for attempt in onboarding_dialogue.attempts:
            worker = project.work_units[attempt.work_unit_id].worker_id
            # A single over-threshold attempt is enough to fail the task.
            if len(attempt.mistakes) > screening_threshold[task]:
                qualifications[worker][task] = False

    return qualifications

# Annotation pilots 2, 3 and 4 (named lab, student and surge here).
lab_pilot = data.annotation_pilots[2]
student_pilot = data.annotation_pilots[3]
surge_pilot = data.annotation_pilots[4]

# The matching onboarding projects, taken from the same indices.
lab_pilot_with_training = data.annotation_pilots_onboarding[2]
student_pilot_with_training = data.annotation_pilots_onboarding[3]
surge_pilot_with_training = data.annotation_pilots_onboarding[4]

# Worker -> task -> passed? tables derived from the onboarding attempts.
lab_qualifications = check_onboarding(lab_pilot_with_training)
student_qualifications = check_onboarding(student_pilot_with_training)
surge_qualifications = check_onboarding(surge_pilot_with_training)

In [None]:
# Build screened copies of each pilot that exclude work units from workers
# who failed onboarding for the corresponding task, then summarize agreement.
filtered_lab_pilot, filtered_student_pilot, filtered_surge_pilot = (
    filter_pilot(pilot, qualification_table)
    for pilot, qualification_table in (
        (lab_pilot, lab_qualifications),
        (student_pilot, student_qualifications),
        (surge_pilot, surge_qualifications),
    )
)
agreement_summaries([
    filtered_lab_pilot,
    filtered_student_pilot,
    filtered_surge_pilot,
])

### Behavior Annotation Pilot Screening

In [None]:
# across_evaluations(
#     data.annotation_pilots_onboarding[2:5],
#     screening_rates_by_label,
#     load='results/annotation_pilot_screening'
# )

### Krippendorff's Alpha Verification

In [None]:
from analysis import krippendorff
import numpy as np

# Sanity check: compare alpha on data that includes rows with a missing
# second value against alpha on the same data with those rows removed.

# Case 1: 11 rows of two values each; the first four rows have NaN in the
# second position.
x = np.array([
    [1, np.nan],
    [1, np.nan],
    [0, np.nan],
    [0, np.nan],
    [1, 1],
    [1, 1],
    [1, 1],
    [1, 0],
    [1, 0],
    [1, 0],
    [1, 0]
])

# The array is transposed before the call — presumably alpha expects one row
# per annotator and one column per unit; TODO confirm against analysis.krippendorff.
print(krippendorff.alpha(x.T, level_of_measurement='ordinal'))

# Case 2: only the seven fully annotated rows from case 1.
x = np.array([
    [1, 1],
    [1, 1],
    [1, 1],
    [1, 0],
    [1, 0],
    [1, 0],
    [1, 0]
])

print(krippendorff.alpha(x.T, level_of_measurement='ordinal'))

# Author's conclusion from comparing the two printed values: .alpha ignores
# units for which only one annotation is available, so no special handling is needed.