In [1]:
import json
import numpy as np
from sklearn.metrics import cohen_kappa_score
from scipy.stats import kendalltau
import matplotlib.pyplot as plt

In [2]:
# Systems whose outputs were rated. Order matters: the per-group JSON files
# store integer indices into this list.
# The LED variants (led_abstract, led_context, led_oracle, led_keyword) are
# currently disabled.
configs = ["target", "fid_abstract", "fid_context", "fid_oracle", "fid_keyword"]

In [3]:
# Rating dimensions recorded for every (example, system, annotator) triple.
metrics = [
    "fluency",
    "coherence",
    "relevance",
    "overall",
]

In [4]:
# Map every example ID to the list of system names associated with it.
# Each JSON file maps an example ID to a list of indices into `configs`;
# entries from the second file overwrite same-ID entries from the first.
group_config = {}
for config_path in ("group1_configs.json", "group2_configs.json"):
    with open(config_path) as f:
        group_config_idx = json.load(f)
    group_config.update(
        (ID, [configs[idx] for idx in indices])
        for ID, indices in group_config_idx.items()
    )

In [5]:
# (annotator name, ratings CSV) pairs for each annotator group.
# To analyse a subset of annotators, trim these two lists.
group1_annotators = [
    ("Ximeng", "Ximeng.csv"),
    ("Lily", "Lily.csv"),
    ("Eric", "EricNgo.csv"),
]
group2_annotators = [
    ("Gerardo", "gerardo.csv"),
    ("Biswadip", "biswadip.csv"),
    ("Tanmay", "TanmayVakare.csv"),
]

# Parallel lists: position i of a name list matches position i of the
# corresponding file-name list.
group1_names = [annotator for annotator, _ in group1_annotators]
group1_file_names = [csv_name for _, csv_name in group1_annotators]
group2_names = [annotator for annotator, _ in group2_annotators]
group2_file_names = [csv_name for _, csv_name in group2_annotators]

group_names = [group1_names, group2_names]
group_file_names = [group1_file_names, group2_file_names]

In [6]:
# Ratings from the main annotation files, laid out as
# responses[example ID][system name][metric][annotator name] -> int score.
responses = {
    example_id: {config: {metric: {} for metric in metrics} for config in configs}
    for example_id in group_config.keys()
}
for annotators, csv_paths in zip(group_names, group_file_names):
    for name, csv_path in zip(annotators, csv_paths):
        with open(csv_path) as f:
            for row_number, line in enumerate(f):
                fields = line.strip().split(",")
                # Rows that do not split into exactly six fields are ignored.
                if len(fields) != 6:
                    continue
                ID, system, *raw_scores = fields
                # Row 0 (presumably the header) and rows with an empty ID are skipped.
                if row_number == 0 or ID == "":
                    continue
                # `system` is a 1-based position in this example's system list.
                per_metric = responses[ID][group_config[ID][int(system) - 1]]
                # The four remaining columns are fluency, coherence, relevance
                # and overall, in that order; int() raises on a blank field.
                for metric_name, raw in zip(
                    ("fluency", "coherence", "relevance", "overall"), raw_scores
                ):
                    per_metric[metric_name][name] = int(raw)

In [7]:
# Per-annotator "*_target.csv" rating files, one inner list per annotator
# group, in the same annotator order as the name lists.
group_target_file_names = [
    ["Ximeng_target.csv", "lily_target.csv", "EricNgo_target.csv"],
    ["gerardo_target.csv", "biswadip_target.csv", "TanmayVakare_target.csv"],
]
group1_target_file_names, group2_target_file_names = group_target_file_names
group_names = [group1_names, group2_names]

In [8]:
# Ratings from the "*_target.csv" files, laid out as
# responses_target[example ID][system name][metric][annotator name] -> int score.
responses_target = {
    example_id: {config: {metric: {} for metric in metrics} for config in configs}
    for example_id in group_config.keys()
}

for annotators, csv_paths in zip(group_names, group_target_file_names):
    for name, csv_path in zip(annotators, csv_paths):
        with open(csv_path) as f:
            for row_number, line in enumerate(f):
                fields = line.strip().split(",")
                # Rows that do not split into exactly six fields are ignored.
                if len(fields) != 6:
                    continue
                ID, system, *raw_scores = fields
                # Row 0 (presumably the header) and rows with an empty ID are skipped.
                if row_number == 0 or ID == "":
                    continue
                # Some IDs in these files carry one trailing "$"; drop it so
                # the ID matches the keys of group_config.
                if ID.endswith("$"):
                    ID = ID[:-1]
                # `system` is a 1-based position in this example's system list.
                per_metric = responses_target[ID][group_config[ID][int(system) - 1]]
                # Blank score fields are not defaulted: int() raises ValueError.
                for metric_name, raw in zip(
                    ("fluency", "coherence", "relevance", "overall"), raw_scores
                ):
                    per_metric[metric_name][name] = int(raw)

In [9]:
def pair_agreement(name1, name2):
    """Kendall's tau between two annotators over the items both of them rated.

    Walks every (config, metric, example ID) cell of ``responses`` and then of
    ``responses_target``. A cell contributes one paired observation only when
    both annotators have a score there and both scores are positive; scores
    that are not positive are treated as missing.

    Pairing per cell keeps the two score lists aligned: position ``i`` in
    both lists always refers to the same (example, system, metric). Collecting
    each annotator's scores independently would shift the lists against each
    other as soon as one annotator lacks a valid score for some cell.

    Args:
        name1: Annotator name used as a key in the per-metric score dicts.
        name2: Second annotator name, same kind of key.

    Returns:
        The result object of ``scipy.stats.kendalltau`` computed on the two
        aligned score lists.
    """
    scores1 = []
    scores2 = []
    # Ratings from `responses` come first, then those from `responses_target`.
    for source in (responses, responses_target):
        for config in configs:
            for metric in metrics:
                for ID in group_config.keys():
                    ratings = source[ID][config][metric]
                    if name1 not in ratings or name2 not in ratings:
                        continue
                    score1 = ratings[name1]
                    score2 = ratings[name2]
                    # Keep the observation only when both sides are valid.
                    if score1 > 0 and score2 > 0:
                        scores1.append(score1)
                        scores2.append(score2)
    return kendalltau(scores1, scores2)

In [11]:
# Kendall's tau for every unordered pair of annotators within each group,
# keyed by "<name1> <name2>".
all_kendall = {}
for members in group_names:
    for position, name1 in enumerate(members):
        for name2 in members[position + 1:]:
            all_kendall[name1 + " " + name2] = pair_agreement(name1, name2).correlation

In [14]:
# Show the tau value stored for each annotator pair.
all_kendall

{'Ximeng Lily': 0.20090085672615393,
 'Ximeng Eric': 0.15006789504157894,
 'Lily Eric': 0.22760719375294797,
 'Gerardo Biswadip': 0.2868042008605897,
 'Gerardo Tanmay': 0.30872041451010324,
 'Biswadip Tanmay': 0.26041109229171566}

In [13]:
# Unweighted mean of the pairwise tau values.
np.mean(list(all_kendall.values()))

0.23908527553051492

In [16]:
# Example IDs that the aggregation cells further down skip.
exclude = {"67855635_1_0_2@748227@85501317"}

In [21]:
# Mean rating per system and metric over `responses`: each example is first
# averaged over its annotators (only positive scores count), then the
# per-example means are averaged and rounded to two decimals.
responses_across_examples = {config: {metric: {} for metric in metrics} for config in configs}
for config in configs:
    for metric in metrics:
        example_means = []
        for ID in group_config.keys():
            if ID in exclude:
                continue
            valid = [value for value in responses[ID][config][metric].values() if value > 0]
            # Examples without any valid score contribute nothing.
            if valid:
                example_means.append(np.mean(valid))
        responses_across_examples[config][metric] = np.round(np.mean(example_means), 2)

In [22]:
# Mean rating per system and metric over `responses_target`: each example is
# first averaged over its annotators (only positive scores count), then the
# per-example means are averaged and rounded to two decimals.
responses_target_across_examples = {config: {metric: {} for metric in metrics} for config in configs}
for config in configs:
    for metric in metrics:
        example_means = []
        for ID in group_config.keys():
            if ID in exclude:
                continue
            valid = [value for value in responses_target[ID][config][metric].values() if value > 0]
            # Unrated examples and examples without any valid score contribute nothing.
            if valid:
                example_means.append(np.mean(valid))
        responses_target_across_examples[config][metric] = np.round(np.mean(example_means), 2)

In [23]:
# Show the per-system, per-metric means computed from `responses`.
responses_across_examples

{'target': {'fluency': 4.71,
  'coherence': 4.23,
  'relevance': 4.04,
  'overall': 3.87},
 'fid_abstract': {'fluency': 4.71,
  'coherence': 4.06,
  'relevance': 4.29,
  'overall': 3.77},
 'fid_context': {'fluency': 4.82,
  'coherence': 4.0,
  'relevance': 4.41,
  'overall': 3.95},
 'fid_oracle': {'fluency': 4.8,
  'coherence': 4.07,
  'relevance': 4.18,
  'overall': 3.86},
 'fid_keyword': {'fluency': 4.84,
  'coherence': 4.07,
  'relevance': 4.21,
  'overall': 3.84}}

In [24]:
# Show the per-system, per-metric means computed from `responses_target`.
responses_target_across_examples

{'target': {'fluency': 4.74,
  'coherence': 4.41,
  'relevance': 4.18,
  'overall': 4.16},
 'fid_abstract': {'fluency': 4.72,
  'coherence': 3.96,
  'relevance': 4.26,
  'overall': 3.63},
 'fid_context': {'fluency': 4.8,
  'coherence': 3.96,
  'relevance': 4.31,
  'overall': 3.85},
 'fid_oracle': {'fluency': 4.8,
  'coherence': 4.09,
  'relevance': 4.14,
  'overall': 3.8},
 'fid_keyword': {'fluency': 4.84,
  'coherence': 4.07,
  'relevance': 4.16,
  'overall': 3.76}}

In [None]:
# Per-example mean rating for every system and metric, rounded to two
# decimals: average_responses[example ID][system][metric] -> float.
# Excluded example IDs are skipped.
# Only positive scores are averaged, matching pair_agreement and the
# across-example aggregation cells, which all treat non-positive scores as
# missing. A metric with no valid score is stored as NaN explicitly rather
# than through np.mean([]), which would also emit a RuntimeWarning.
average_responses = {}
for ID, results in responses.items():
    if ID in exclude:
        continue
    this_example = {}
    for system, metric_results in results.items():
        this_system = {}
        for metric, scores in metric_results.items():
            valid_scores = [score for score in scores.values() if score > 0]
            if valid_scores:
                this_system[metric] = np.round(np.mean(valid_scores), 2)
            else:
                this_system[metric] = np.nan
        this_example[system] = this_system
    average_responses[ID] = this_example

In [None]:
# Show the per-example averages for every system and metric.
average_responses

In [None]:
# Print every example where fid_oracle's mean "overall" rating equals the
# lowest mean "overall" rating among all systems (ties included).
for ID, results in average_responses.items():
    overall_by_system = {
        system: metric_results["overall"] for system, metric_results in results.items()
    }
    lowest = np.min(list(overall_by_system.values()))
    if results["fid_oracle"]["overall"] == lowest:
        print(ID)
        print(overall_by_system)

In [None]:
# Print every example where fid_oracle's mean "overall" rating equals the
# highest mean "overall" rating among all systems (ties included).
for ID, results in average_responses.items():
    overall_by_system = {
        system: metric_results["overall"] for system, metric_results in results.items()
    }
    highest = np.max(list(overall_by_system.values()))
    if results["fid_oracle"]["overall"] == highest:
        print(ID)
        print(overall_by_system)

In [None]:
# Inspect the raw per-annotator ratings recorded for one example ID.
responses["173990267_1_0_4@5068376"]

In [None]:
# For each system, gather the per-example mean "overall" ratings.
score_distribution = {config: [] for config in configs}
for ratings in average_responses.values():
    for config in ratings:
        score_distribution[config].append(ratings[config]["overall"])

In [None]:
# Overlaid histograms of the per-example mean "overall" ratings, one
# semi-transparent histogram per system.
fig, ax = plt.subplots(figsize=(6, 4.5))
cfgs = []
for config, scores in score_distribution.items():
    ax.hist(scores, alpha=0.5)
    cfgs.append(config)
# Legend labels are given in plotting order, one per histogram.
ax.legend(cfgs, loc='best')

# Label the axes so the figure can be read without the surrounding code.
ax.set_xlabel('Mean overall rating per example')
ax.set_ylabel('Number of examples')
# Hide the top/right frame lines and keep ticks on the bottom/left only.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')

In [None]:
# Spread of the per-example mean "overall" ratings, one line per system.
for system_name in score_distribution:
    print(system_name, np.std(score_distribution[system_name]))

In [None]:
# One figure per system: histogram of its per-example mean "overall" ratings.
# This cell only reads score_distribution; it does not touch any list owned
# by another cell, so it can be re-run on its own.
for config, scores in score_distribution.items():
    fig, ax = plt.subplots(figsize=(6, 4.5))
    ax.hist(scores, alpha=0.5)
    # Single-entry legend naming the system shown in this figure.
    ax.legend([config], loc='best')

    # Label the axes so each figure can be read without the surrounding code.
    ax.set_xlabel('Mean overall rating per example')
    ax.set_ylabel('Number of examples')
    # Hide the top/right frame lines and keep ticks on the bottom/left only.
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')