In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from analysis import *

In [None]:
# Load the Surge evaluation annotations in two forms: the regular annotation
# table and the comparative-annotation table.
# NOTE(review): `data` is not defined in this notebook; presumably it is
# exported by the star import from `analysis` — confirm.
surge_annotations = data.surge_evaluation.annotation_dataframe()
surge_annotations_comparative = data.surge_evaluation.comparative_annotation_dataframe()

# Display the regular annotation table as the cell output.
surge_annotations

# 6 Evaluation


### Worker Group Completed Work


In [None]:
# Show the annotation counts reported for the Surge evaluation project.
data.surge_evaluation.annotation_counts()

In [None]:
# Show the annotation counts reported for the Student evaluation project.
data.student_evaluation.annotation_counts()

In [None]:
# Show the annotation counts reported for the MTurk evaluation project.
data.mturk_evaluation.annotation_counts()

### Work Completed Over Time by Worker Group

In [None]:
from collections import Counter, defaultdict
from datetime import datetime
def get_date_counts(project):
    """Count completed work units per day, relative to the first completion day.

    Args:
        project: object whose ``work_units`` mapping holds units with a
            ``completion_date`` string formatted ``'%m-%d-%Y %H:%M:%S'``
            (only the date part before the first space is used).

    Returns:
        dict mapping day offset (0 = earliest completion day) to the number
        of units completed on that day. Every day between the first and the
        last completion is present; days without completions map to 0.
        An empty dict is returned when the project has no work units.
    """
    # Parse before sorting: '%m-%d-%Y' strings do not sort chronologically
    # across a year boundary ('01-01-2022' < '12-31-2021' as text).
    completion_days = sorted(
        datetime.strptime(u.completion_date.split(' ')[0], '%m-%d-%Y')
        for _, u in project.work_units.items()
    )
    if not completion_days:
        return {}
    start_datetime = completion_days[0]
    counts = Counter((day - start_datetime).days for day in completion_days)
    # Counter returns 0 for missing keys, which zero-fills the idle days.
    return {day_n: counts[day_n] for day_n in range(max(counts) + 1)}

In [None]:
def get_cumulative_date_counts(project):
    """Running total of completed work units per day since the first completion.

    Args:
        project: object whose ``work_units`` mapping holds units with a
            ``completion_date`` string formatted ``'%m-%d-%Y %H:%M:%S'``
            (only the date part before the first space is used).

    Returns:
        dict mapping day offset (0 = earliest completion day) to the total
        number of units completed up to and including that day. Every day
        between the first and the last completion is present; idle days
        carry the previous total forward. An empty dict is returned when the
        project has no work units.
    """
    # Parse before sorting: '%m-%d-%Y' strings do not sort chronologically
    # across a year boundary ('01-01-2022' < '12-31-2021' as text).
    completion_days = sorted(
        datetime.strptime(u.completion_date.split(' ')[0], '%m-%d-%Y')
        for _, u in project.work_units.items()
    )
    if not completion_days:
        return {}
    start_datetime = completion_days[0]
    counts = Counter((day - start_datetime).days for day in completion_days)

    relative_counts = {}
    total = 0
    for day_n in range(max(counts) + 1):
        total += counts[day_n]  # Counter yields 0 on idle days
        relative_counts[day_n] = total
    return relative_counts

In [None]:
def get_hour_counts(project):
    """Count completed work units per whole hour since the first completion.

    Each unit's ``completion_date`` is parsed as ``'%m-%d-%Y %H:%M:%S'``.
    The result is a ``defaultdict(int)`` keyed by the number of whole hours
    elapsed since the earliest completion; hours between the first and last
    completion that saw no work are present with a count of 0.
    """
    completions = [
        datetime.strptime(unit.completion_date, '%m-%d-%Y %H:%M:%S')
        for _, unit in project.work_units.items()
    ]
    completions.sort()
    first_completion = completions[0]

    hourly = defaultdict(int)
    last_filled = 0
    for moment in completions:
        elapsed_seconds = (moment - first_completion).total_seconds()
        hour_n = int(elapsed_seconds / 60 / 60)
        # Zero-fill every hour skipped since the previous completion.
        for idle_hour in range(last_filled + 1, hour_n + 1):
            hourly[idle_hour] = 0
        last_filled = max(last_filled, hour_n)
        hourly[hour_n] += 1
    return hourly

In [None]:
# Aggregation used for every worker group; swap in get_date_counts or
# get_hour_counts to plot per-day or per-hour counts instead.
perform_count = get_cumulative_date_counts
surger_dates = perform_count(data.surge_evaluation)
mturk_dates = perform_count(data.mturk_evaluation)
student_dates = perform_count(data.student_evaluation)

# Use the union of all groups' time offsets so that days on which only MTurk
# or Student workers completed work are not dropped from the table.
all_days = sorted(set(surger_dates) | set(mturk_dates) | set(student_dates))

# One row per offset; a group with no entry for that offset gets NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
all_dates = {
    day_n: {
        'Surge': surger_dates.get(day_n, np.nan),
        'MTurk': mturk_dates.get(day_n, np.nan),
        'Student': student_dates.get(day_n, np.nan),
    }
    for day_n in all_days
}

# Transpose so that rows are offsets and columns are worker groups.
all_dates_df = pd.DataFrame(all_dates).T
all_dates_df

In [None]:
# Font sizes (in points) used for this figure.
SMALL_SIZE = 16
MEDIUM_SIZE = 20
BIGGER_SIZE = 24

plt.rcParams.update({
    'font.size': SMALL_SIZE,            # default text size
    'axes.titlesize': SMALL_SIZE,       # axes title
    'axes.labelsize': MEDIUM_SIZE,      # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,      # x tick labels
    'ytick.labelsize': SMALL_SIZE,      # y tick labels
    'legend.fontsize': SMALL_SIZE,      # legend entries
    'figure.titlesize': BIGGER_SIZE,    # figure title
    'figure.figsize': (20, 10),
})

# One line per worker group, all drawn on the same axes.
fig, ax = plt.subplots()
for worker_group in ('MTurk', 'Student', 'Surge'):
    all_dates_df[worker_group].plot(ax=ax)

ax.set_xlabel('Days', labelpad=20)
ax.set_ylabel('Assignments', labelpad=20)

ax.legend()

### Worker Group Screening


In [None]:
# Compute screening statistics for each onboarding round.
# NOTE(review): `across_evaluations` and `screening_rates_by_label` are defined
# outside this notebook. The code below indexes the result by (round, label)
# pairs; presumably `round` is the position in the list passed here
# (0 = second-to-last annotation pilot, 1 = student, 2 = MTurk, 3 = Surge) — confirm.
screening = across_evaluations(
    [data.annotation_pilots_onboarding[-2], data.student_onboarding, data.mturk_onboarding, data.surge_onboarding],
    screening_rates_by_label,
    reload='results/evaluation_screening'
)
# Keep only the raw counts and transpose so that each (round, label) pair
# becomes a column that can be combined arithmetically.
to_plot_screening = screening[['attempted', 'passed']]
to_plot_screening = to_plot_screening.transpose()
# Merge rounds 0 and 1 into a new synthetic round 4 (labelled 'Student' below).
for dim in ['knowledge', 'sociality', 'interpretability', 'personal_information', 'consistency', 'commonsense', 'empathy', 'transitions']:
    if dim != 'transitions':
        to_plot_screening[(4, dim)] = to_plot_screening[(0, dim)] + to_plot_screening[(1, dim)]
    else:
        # NOTE(review): 'transitions' is taken from round 0 only; presumably
        # round 1 has no 'transitions' column — confirm.
        to_plot_screening[(4, dim)] = to_plot_screening[(0, dim)]
# Drop the two source rounds now that they are merged into round 4.
to_plot_screening.drop(0, axis='columns', inplace=True)
to_plot_screening.drop(1, axis='columns', inplace=True)
# Back to one row per (round, label), then derive the pass rate.
to_plot_screening = to_plot_screening.transpose()
to_plot_screening['screen-rate'] = to_plot_screening['passed'] / to_plot_screening['attempted']
# After reset_index the round is in column 'round' and the label in 'level_1'.
to_plot_screening = to_plot_screening.reset_index()
# Replace numeric round ids and one label with display names.
to_plot_screening.replace({'round': {2: 'MTurk', 3: 'Surge', 4: 'Student'}}, inplace=True)
to_plot_screening.replace({'level_1': {'personal_information': 'personal info'}}, inplace=True)
to_plot_screening

In [None]:
plt.rcParams["figure.figsize"] = (10,5)

# One row per annotation subtask, one column per worker group, holding the
# fraction of screening attempts that passed.
df = to_plot_screening.pivot(index='level_1', columns='round', values='screen-rate')

# Grouped bar chart with the legend placed outside the axes, to the right.
ax = df.plot.bar(rot=45)
ax.legend(loc='center left',bbox_to_anchor=(1.0, 0.5))
ax.set_ylabel('Proportion')
ax.set_xlabel('Annotation Subtask')

In [None]:
# Font sizes (in points); same values as the work-over-time figure cell.
SMALL_SIZE = 16
MEDIUM_SIZE = 20
BIGGER_SIZE = 24

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

plt.rcParams["figure.figsize"] = (20,10)

# Reshape to one row per subtask with (measure, worker group) column pairs.
df0 = to_plot_screening
df0 = df0.set_index(['level_1', 'round'])
df0 = df0.unstack(level=-1)
fig, ax = plt.subplots()

# Distinct worker groups in column order.
# NOTE(review): `groups` is not used again in this cell.
groups = []
for i in df0.columns:
    if i[1] not in groups:
        groups.append(i[1])

# Draw 'attempted' as translucent bars and 'passed' as solid bars on the same
# axes, so the solid bar overlays the translucent one for each group.
df0['attempted'].plot(kind='bar', alpha=0.2, rot=0, ax=ax)
df0['passed'].plot(kind='bar', rot=0, ax=ax)

# Group the legend handles by worker group. zip() stops at the shortest input,
# so only as many columns as there are handles are consumed.
# NOTE(review): this assumes df0.columns lists the 'attempted' columns first,
# then 'passed', matching the order the handles were created in — confirm.
# The loop variables shadow the h/l lists; zip() has already captured them.
h, l = ax.get_legend_handles_labels()
markers = {}
for h, l, (label, category) in zip(h, l, df0.columns):
    markers.setdefault(category, []).append((h,l))
# Invisible placeholder artists (no marker, no line) used for legend entries
# that show text only. Each list holds the same artist object three times.
wtl_dummies = [plt.plot([],marker="", ls="")[0]]*3
bot_dummies = [plt.plot([],marker="", ls="")[0]]*3
# `handles` aliases wtl_dummies, so the appends below extend that same list.
handles = wtl_dummies
# First legend column: row titles.
# NOTE(review): the translucent 'attempted' bars are titled "Failed:".
labels = ["", "Failed:", "Passed:"]
# One further legend column per worker group: its name, then its two bar
# handles with empty labels.
for i, (bot, symbols) in enumerate(markers.items()):
    handles.append(bot_dummies[i])
    labels.append(bot)
    handles.extend([s[0] for s in symbols])
    labels.extend(["" for s in symbols])
leg = plt.legend(handles, labels, ncol=4, loc='upper right', bbox_to_anchor=(0.67, -0.35), labelspacing=0.25)
# Adjust handle widths so that text-only entries take no handle space.
# NOTE(review): `_legend_handle_box` is a private matplotlib attribute and its
# child layout may differ between matplotlib versions — confirm on upgrade.
for i, vpack in enumerate(leg._legend_handle_box.get_children()):
    if i == 0: # row titles
        for hpack in vpack.get_children():
            hpack.get_children()[0].set_width(0)
    else:
        for j, hpack in enumerate(vpack.get_children()):
            if j > 0: # category (attempted vs passed) markers
                hpack.get_children()[0].get_children()[0].set_width(50)
            else: # column titles
                hpack.get_children()[0].set_width(0)

ax.set_ylabel('Frequency', labelpad=30)
ax.set_xlabel('Annotation Subtask', labelpad=30)
# Label each bar group with its subtask name from the frame index.
ax.set_xticklabels([d for d in df0.index])

plt.tight_layout()
plt.show()

### Agreements


In [None]:
# Compute the agreement table for the Surge annotations and format it in a
# single step; the formatted table is exported via the `to_csv` argument.
agreements = prettify(
    agreement_dataframe(surge_annotations, reload='results/surge_agreements'),
    float_prec=3,
    sort_by=["category", "Krippendorff's alpha"],
    col_types={"n": int},
    to_csv='results/paper/surge_agreements',
    index=False,
)
agreements

In [None]:
# Build the plot
# Set the default figure size to 20x10 inches and create the figure and axes
# that the agreement points are drawn on.
plt.rcParams["figure.figsize"] = (20,10)

fig, ax = plt.subplots()

def plot_by_category(ax, df, category, color, xaxis_start):
    """Draw one category's agreement scores as points with error bars.

    Rows of ``df`` whose ``"category"`` equals ``category`` are plotted at
    consecutive integer x positions starting at ``xaxis_start``. The y value
    is ``"Krippendorff's alpha"`` and the asymmetric error bars reach down to
    ``"CI low"`` and up to ``"CI high"``.

    Returns:
        The x position immediately after the last plotted point, i.e. where
        the next category should start.
    """
    rows = df.loc[df["category"] == category]
    alpha = rows["Krippendorff's alpha"]
    xaxis_end = xaxis_start + len(rows)
    ax.errorbar(
        np.arange(xaxis_start, xaxis_end),
        alpha,
        # errorbar expects distances from the point, not absolute bounds
        yerr=[alpha - rows["CI low"], rows["CI high"] - alpha],
        fmt='o',
        elinewidth=1,
        color=color,
    )
    return xaxis_end

# Colour of each evaluation category; used for the points and for the
# matching x tick labels.
likert_turn_color = "blue"
likert_dialogue_color = "red"
comparative_color = "green"
behavior_color = "orange"

# Drop the last four columns of the agreement table before plotting.
# NOTE(review): which columns these are is not visible in this notebook — confirm.
krip_agreements = agreements.iloc[: , :-4]
krip_agreements = krip_agreements.reset_index()

# Plot the categories left to right; each call returns the x position at
# which the next category starts.
likert_dialogue_start = plot_by_category(ax, krip_agreements, "likert turn", likert_turn_color, 0)
comparative_start = plot_by_category(ax, krip_agreements, "likert dialogue", likert_dialogue_color, likert_dialogue_start)
behavior_start = plot_by_category(ax, krip_agreements, "comparative", comparative_color, comparative_start)
misc_start = plot_by_category(ax, krip_agreements, "behavior", behavior_color, behavior_start)

# Map every plotted x position to the colour of the category drawn there.
category_range = {likert_dialogue_start: likert_turn_color, comparative_start: likert_dialogue_color, behavior_start: comparative_color, misc_start: behavior_color}
xaxis_colors = {}
prev_idx = 0
for idx, color in category_range.items():
    for i in range(prev_idx, idx):
        xaxis_colors[i] = color
    prev_idx = idx

# Collect the tick labels in the same category order the points were plotted
# in, so each label sits under its own point regardless of how the agreement
# table happens to be sorted.
plotted_categories = ["likert turn", "likert dialogue", "comparative", "behavior"]
xaxis_labels = [
    label
    for plotted_category in plotted_categories
    for label in krip_agreements.loc[krip_agreements["category"] == plotted_category, "label"]
]

ax.set_ylabel("Krippendorff's alpha", labelpad=20)
# Only positions that actually hold a point get a tick; rows belonging to a
# category that is not plotted would otherwise have no colour entry.
xpos = np.arange(misc_start)
ax.set_xlabel("Evaluation Label", labelpad=20)
ax.set_xticks(xpos)
ax.set_xticklabels(xaxis_labels, rotation=90)
for tickloc, ticklabel in zip(xpos.tolist(), ax.get_xticklabels()):
    ticklabel.set_color(xaxis_colors[tickloc])
ax.set_title('Interannotator Agreement')
ax.yaxis.grid(True)

# Lay out and display the figure (nothing is written to disk here).
plt.tight_layout()
plt.show()