In [None]:
import os
import pickle
from utils import Text, apply_history, calc_percentage_agreement, calc_cohen_kappa, calc_kendallstau,  calc_rank_correlation, get_all_users, apply_ranking_to_scores, apply_scores, scale_scores, scale_ranked_scores, calc_rbo, calc_krippendorfs_alpha, calc_fleiss_kappa
from sklearn import preprocessing
import numpy as np
import glob
import re
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import cohen_kappa_score

from collections import Counter

In [None]:
# Load the text data and the determined pairs for the chosen corpus size.
num_texts = 94

# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
# The `with` blocks close each file handle as soon as its content has been read.
with open(f"/workspace/data/ARTS_only_texts_{num_texts}.pkl", "rb") as texts_file:
    data = pickle.load(texts_file)

# The pairs file name carries four times the number of texts.
with open(f"/workspace/data/determined_pairs_{num_texts*4}.pkl", "rb") as pairs_file:
    determined_pairs = pickle.load(pairs_file)

In [None]:
# Collect every user reported by get_all_users(), sort the names, and display them.
all_users = list(get_all_users())
all_users.sort()
all_users

In [None]:
# Build, for the two users under comparison, a fresh set of Text objects and
# replay each user's stored history onto them.

user1, user2 = "Gold-94", "Rater1-94"


def _load_history(user):
    """Return the pickled history of ``user`` from /workspace/Histories.

    The file is opened in a ``with`` block so the handle is closed right after
    reading. NOTE: unpickling executes arbitrary code; only load trusted files.
    """
    with open(f"/workspace/Histories/{user}_history.pkl", "rb") as history_file:
        return pickle.load(history_file)


# Each user gets an independent dict of Text objects keyed by the row index of
# `data`; the first value of every row is handed to the Text constructor.
texts = {
    user: {t_id: Text(t_id, text[0]) for t_id, text in data.iterrows()}
    for user in (user1, user2)
}

histories = {user: _load_history(user) for user in (user1, user2)}

# Same order as before: all texts built, all histories loaded, then applied.
for user in (user1, user2):
    apply_history(histories[user], texts[user])


#if user only ranks, just apply scores to the texts
#apply_ranking_to_scores(histories[user2], texts[user2])

#if user only estimates individual scores for texts
#apply_scores(histories[user2], texts[user2])

In [None]:

#%load_ext autoreload
#%autoreload 2


# Report agreement between user1 and user2: first on their raw histories,
# then on the text objects the histories were applied to.
print(f"Given the following users: {user1, user2}")

history_a, history_b = histories[user1], histories[user2]

agreement = calc_percentage_agreement(history_a, history_b)
print(f"Agreement percentage:\t {round(agreement, 4)}")

cohen_kappa = calc_cohen_kappa(history_a, history_b)
print(f"Cohens kappa: \t\t {round(cohen_kappa, 4)}")


print("-" * 70)

texts_a, texts_b = texts[user1], texts[user2]

# Each helper returns a (statistic, p-value) pair that is printed unrounded.
stat, p_val_rank = calc_rank_correlation(texts_a, texts_b)
print(f"Rank correlation: \t r={stat}, p-value={p_val_rank}")

tau, p_val_tau = calc_kendallstau(texts_a, texts_b)
print(f"Kendalls tau: \t\t t={tau}, p-value={p_val_tau}")

rbo = round(calc_rbo(texts_a, texts_b), 4)
print(f"Rbo: \t\t\t {rbo}")

In [None]:
# Time analysis: seconds elapsed between consecutive history entries of every
# human rater (users whose name contains 'gpt' or 'Gold' are excluded).
SECONDS_PER_DAY = 24 * 60 * 60


def _history_for_timing(user):
    """Return the history of ``user`` without relying on cell execution order.

    An entry already present in ``histories`` is used as is. Otherwise the
    history is read from ``/workspace/{user}_history.pkl``, the same location the
    gold-label cell loads rater histories from, and ``histories`` is left
    untouched. NOTE: unpickling executes arbitrary code; only load trusted files.
    """
    if user in histories:
        return histories[user]
    with open(f"/workspace/{user}_history.pkl", "rb") as history_file:
        return pickle.load(history_file)


def _decision_durations(history):
    """Return the seconds between each pair of consecutive entries of ``history``.

    The last element of every entry is parsed as an "%H:%M:%S" clock time. No
    date is stored, so an entry recorded after midnight has a smaller clock time
    than its predecessor; the difference is taken modulo 24 hours so that such a
    step does not contribute a negative duration.
    """
    durations = []
    for i in range(1, len(history)):
        d_start = datetime.strptime(history[i - 1][-1], "%H:%M:%S")
        d_end = datetime.strptime(history[i][-1], "%H:%M:%S")
        durations.append((d_end - d_start).total_seconds() % SECONDS_PER_DAY)
    return durations


users = [user for user in all_users if 'gpt' not in user and 'Gold' not in user]

decision_durations = []
for user in users:
    decision_durations.extend(_decision_durations(_history_for_timing(user)))

In [None]:
# Report the mean and the median of the decision durations, each under its own
# label, so the printed name always matches the statistic that was computed.
print(f"Mean effort for one annotation in seconds: {np.mean(decision_durations)}")
print(f"Median effort for one annotation in seconds: {np.median(decision_durations)}")

In [None]:
# Generate gold labels: for every determined pair, take the label chosen most
# often among the human raters in `users`.
#
# NOTE: `texts` and `histories` are rebuilt from scratch here, so entries that
# earlier cells stored under other keys (e.g. "Gold-94") are gone afterwards.


def _load_rater_history(user):
    """Return the pickled history of ``user`` from ``/workspace``.

    The file is opened in a ``with`` block so the handle is closed right after
    reading. NOTE: unpickling executes arbitrary code; only load trusted files.
    """
    # NOTE(review): the pairwise-comparison cell reads histories from
    # /workspace/Histories/ instead -- confirm both locations are intended.
    with open(f"/workspace/{user}_history.pkl", "rb") as history_file:
        return pickle.load(history_file)


texts = {}
histories = {}

for user in users:
    texts[user] = {t_id : Text(t_id, text[0]) for t_id, text in data.iterrows()}
    histories[user] = _load_rater_history(user)
    apply_history(histories[user], texts[user])

# Gold user: same Text objects as everyone else, history filled in below.
texts['Gold'] = {t_id : Text(t_id, text[0]) for t_id, text in data.iterrows()}
histories['Gold'] = {}

for i in range(len(determined_pairs)):
    # Second element of each rater's i-th history entry is that rater's label.
    user_labels = [histories[user][i][1] for user in users]
    # On a tie, Counter.most_common keeps the label that was encountered first,
    # i.e. the one given by the earliest rater in `users`.
    most_common = Counter(user_labels).most_common(1)[0][0]
    # Gold entries carry a placeholder timestamp.
    histories['Gold'][i] = (determined_pairs[i], most_common, '00:00:00')

In [None]:
#pickle.dump(histories['Gold'], open("/workspace/Gold_history.pkl", "wb"))

In [None]:
# Inter-rater agreement over everything currently stored in `histories`.
# NOTE(review): after the gold-label cell has run, `histories` also contains the
# derived 'Gold' entry -- confirm the helpers below are meant to include it.
krippendorff_alpha = calc_krippendorfs_alpha(histories, determined_pairs)
print(f"krippendorfs alpha: {round(krippendorff_alpha, 4)}")

fleiss_kappa = calc_fleiss_kappa(histories, determined_pairs)
print(f"fleiss kappa: {round(fleiss_kappa, 4)}")

In [None]:
#scaled_ranked_scores = scale_ranked_scores(texts[user1])
#pickle.dump(scaled_ranked_scores, open(f"/workspace/{user1}_scores.pkl", "wb"))

In [None]:
# Find cases with bad agreement: pairs whose first element was chosen by exactly
# two or exactly three of the raters in `users`.
# NOTE(review): the counts 2 and 3 presumably mark a split vote for the current
# number of raters -- confirm when the rater set changes.
all_votes = []
for i in range(len(determined_pairs)):
    pair = determined_pairs[i]
    # Second element of each rater's i-th history entry is that rater's vote.
    votes = [histories[user][i][1] for user in users]

    first_element_votes = votes.count(pair[0])
    if first_element_votes == 2 or first_element_votes == 3:
        all_votes.append((pair, votes))


In [None]:
# Rating of every text of user1, in the iteration order of the texts dict.
# NOTE(review): the gold-label cell rebuilds `texts` with keys taken from `users`
# plus 'Gold'; user1 ("Gold-94") is filtered out of `users`, so this lookup needs
# the cell that builds texts[user1] to have been run after that one.
raw_scores = [text_obj.get_rating() for text_obj in texts[user1].values()]

In [None]:
# Collect the text of both members of every low-agreement pair.
#
# The gold-label cell rebuilds `texts` without a 'Gold-94' entry, so fall back to
# the 'Gold' entry it creates; both are built from the same rows of `data`.
# Assumes get_text() returns the constructor text regardless of any applied
# history -- TODO confirm.
text_lookup = texts['Gold-94'] if 'Gold-94' in texts else texts['Gold']

hard_pairs = []

for vote in all_votes:
    # Each entry of all_votes is (pair, votes); only the pair is needed here.
    t1, t2 = vote[0]
    hard_pairs.append([text_lookup[t1].get_text(), text_lookup[t2].get_text()])

In [None]:
# Persist the low-agreement cases. The `with` block guarantees the file is
# flushed and closed once the dump has finished.
with open("/workspace/hard_cases_rater.pkl", "wb") as hard_cases_file:
    pickle.dump(all_votes, hard_cases_file)

In [None]:
# Find the indices of the determined pairs on which gpt-4 and the gold labels
# chose differently.


def _history_or_load(user):
    """Return ``histories[user]``, reading it from disk when it is not loaded.

    The gold-label cell resets ``histories`` to the human raters plus 'Gold', so
    the two histories compared here may be missing. In that case the file is read
    from /workspace/Histories (the location the pairwise-comparison cell uses for
    "Gold-94") and ``histories`` itself is left unchanged.
    NOTE: unpickling executes arbitrary code; only load trusted files.
    """
    if user in histories:
        return histories[user]
    with open(f"/workspace/Histories/{user}_history.pkl", "rb") as history_file:
        return pickle.load(history_file)


gold_history = _history_or_load('Gold-94')
gpt_history = _history_or_load('gpt-4-1106-preview-94')

# Second element of each history entry is the chosen label.
disagrees_gpt = [
    i
    for i in range(len(determined_pairs))
    if gold_history[i][1] != gpt_history[i][1]
]


In [None]:
#pickle.dump(disagrees_gpt, open("/workspace/disagrees_gpt.pkl", "wb"))

In [None]:
# Load the stored scores for Gold-94; the `with` block closes the file handle
# right after reading. NOTE: unpickling executes arbitrary code; only load
# trusted files.
with open("/workspace/Gold-94_Scores.pkl", "rb") as scores_file:
    scores = pickle.load(scores_file)