# Analysis
* DCT-based performance comparison
* Length-based performance comparison
* #events/doc-based performance comparison
* Topic-based performance comparison

In [None]:
from importlib import reload
import sys
import json
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
from typing import Any
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import numpy as np
from matplotlib.lines import Line2D

In [None]:
sys.path.append("../src/")
import utils_eval

In [None]:
# Metadata of the test split; later cells look entries up as metadata[filename].
metadata_path = Path("../data/preprocess/timeset-metadata/test.json")
metadata = json.loads(metadata_path.read_text())

In [None]:
# Display the metadata entry stored under the key "health_1".
metadata["health_1"]

In [None]:
# Evaluate `metadata`; the trailing semicolon keeps the notebook from echoing it.
metadata;

A plot with a date axis does not seem easy to interpret, so just do binary comparisons.
* Consider only Llama-2-7b

In [None]:
import statistics
def calculate_f1_per_doc(scores):
    """Collect the weighted-average F1 scores of every document.

    Args:
        scores: Nested mapping ``template_id -> filename -> report`` in which
            every report provides ``report["weighted avg"]["f1-score"]``.

    Returns:
        A ``defaultdict`` mapping each filename to the list of its F1 scores,
        one entry per template that contains the file, in the iteration order
        of ``scores``.
    """
    filename2allscores = defaultdict(list)
    # Only the per-file reports are needed; the template ids are not used.
    for filename2scores in scores.values():
        # Named `report` so the `scores` parameter is not rebound while it is
        # still being iterated over.
        for filename, report in filename2scores.items():
            filename2allscores[filename].append(report["weighted avg"]["f1-score"])

    # The float default factory is kept: a missing filename yields 0.0, not an
    # empty list. The score-loading cell only iterates over .items().
    filename2score = defaultdict(float)
    filename2score.update(filename2allscores)

    return filename2score

In [None]:
# Gather the Llama-2-7b per-document F1 scores of every formulation, binned by
# the binary date category stored in the document metadata.
# NOTE: `calculate_f1_per_doc` must be the variant that returns a *list* of
# scores per document, because the values are merged with `extend` below.
dirpath = Path("../output_score/comparison/")
scores = defaultdict(lambda: defaultdict(list))
formulations = ["nli", "pairwise", "mrc", "timeline"]
score_key = "document-and-pair-wise-scores"
for formulation in formulations:
    pattern = f"{formulation}/Llama-2-7b-hf*/*mention*eid*"
    for filepath in dirpath.glob(pattern):
        with open(filepath, "r") as f:
            data = json.load(f)
        individuals = data["individuals"]
        if score_key not in individuals:
            # Report score files that lack per-document scores and move on.
            print(filepath)
            continue
        per_doc_scores = calculate_f1_per_doc(individuals[score_key])
        for filename, doc_scores in per_doc_scores.items():
            bucket = metadata[filename]["date(binary)"]
            scores[formulation][bucket].extend(doc_scores)

In [None]:
def define_box_properties(plot_name, color_code, label):
    """Color all artists of one box-plot group and add a legend entry for it.

    Args:
        plot_name: Dict of artist collections (here the value returned by
            ``plt.boxplot``); it must contain the key ``"medians"``.
        color_code: Color applied to every artist in ``plot_name``.
        label: Legend label attached to an empty proxy line of that color.
    """
    for artists in plot_name.values():
        plt.setp(artists, color=color_code, alpha=0.7)

    # An empty line carries the legend entry for this group.
    plt.plot([], c=color_code, label=label)

    # Medians are recolored last, presumably to keep them visible on the
    # colored boxes.
    for median_line in plot_name["medians"]:
        median_line.set_color('black')

# Grouped box plot of per-document F1: for every formulation, one box per
# date category, drawn side by side around the formulation's tick.
ticks = ["NLI", "Pairwise", "MRC", "Timeline"]
category_offsets = {"Old": 0, "New": 0.4}
category_colors = {"Old": "#0072b2", "New": "#d55e00"}

plt.figure(figsize=(4, 3))
for category in ["Old", "New"]:
    per_formulation = [scores[name][category] for name in formulations]
    # Shift each group left/right of the integer tick positions.
    centers = np.arange(len(per_formulation)) - 0.2 + category_offsets[category]
    boxes = plt.boxplot(
        per_formulation,
        positions=centers,
        widths=0.3,
        showfliers=False,
        patch_artist=True,
    )
    define_box_properties(boxes, category_colors[category], category)

plt.xticks(np.arange(len(ticks)), ticks)
plt.grid(axis='y', color='g', linestyle=':', linewidth=0.3)
plt.ylim(-0.05, 1.0)
plt.legend()
plt.box(False)
plt.ylabel("F1")
# plt.xlabel("Formulation")
plt.savefig("./figures/result_formulation_comparison_date.pdf", format="pdf", bbox_inches='tight')
plt.show()

In [None]:
import statistics
def calculate_f1_per_doc(scores):
    """Return the median weighted-average F1 score of every document.

    Args:
        scores: Nested mapping ``template_id -> filename -> report`` in which
            every report provides ``report["weighted avg"]["f1-score"]``.

    Returns:
        A ``defaultdict(float)`` mapping each filename to the median of its
        F1 scores across all templates that contain the file.
    """
    filename2allscores = defaultdict(list)
    # Only the per-file reports are needed; the template ids are not used.
    for filename2scores in scores.values():
        # Named `report` so the `scores` parameter is not rebound while it is
        # still being iterated over.
        for filename, report in filename2scores.items():
            filename2allscores[filename].append(report["weighted avg"]["f1-score"])

    filename2score = defaultdict(float)
    for filename, all_scores in filename2allscores.items():
        # Every list holds at least one score, so median() cannot see empty data.
        filename2score[filename] = statistics.median(all_scores)

    return filename2score

In [None]:
# Gather per-document F1 scores for every model and formulation, grouped by
# the number of events ("#event") recorded in the document metadata.
dirpath = Path("../output_score/comparison/")
scores = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
formulations = ["nli", "pairwise", "mrc", "timeline"]
model_ids = ["Llama-2-7b-hf", "flan-t5-xl"]
score_key = "document-and-pair-wise-scores"
for model_id in model_ids:
    for formulation in formulations:
        pattern = f"{formulation}/{model_id}*/*mention*eid*"
        for filepath in dirpath.glob(pattern):
            with open(filepath, "r") as f:
                data = json.load(f)
            individuals = data["individuals"]
            if score_key not in individuals:
                # Report score files that lack per-document scores and move on.
                print(filepath)
                continue
            per_doc_f1 = calculate_f1_per_doc(individuals[score_key])
            for filename, doc_f1 in per_doc_f1.items():
                num_event = metadata[filename]["#event"]
                scores[model_id][formulation][num_event].append(doc_f1)

In [None]:
# Evaluate the flan-t5-xl scores; the trailing semicolon keeps the notebook from echoing them.
scores['flan-t5-xl'];

In [None]:
# Scatter per-document F1 against "#event" and overlay one least-squares
# trend line per (model, formulation) pair.
configs = [
    ("nli", 'o', "-"),
    ("pairwise", 'x', "--"),
    ("mrc", 's', ":"),
    ("timeline", '^', "-."),
]
colors = ['#ffb000', '#648fff']

plt.figure(figsize=(6, 4))
for model_id, color in zip(model_ids, colors):
    # Flatten {num: [scores]} into two parallel lists so that position i of
    # both lists always refers to the same score entry.
    formulation2nums = defaultdict(list)
    formulation2scores = defaultdict(list)
    for formulation, num2scores in scores[model_id].items():
        for num, _scores in num2scores.items():
            formulation2nums[formulation].extend([num]*len(_scores))
            formulation2scores[formulation].extend(_scores)

    # Plotting
    for formulation, marker, linestyle in configs:
        x = np.array(formulation2nums[formulation], dtype=float)
        y = np.array(formulation2scores[formulation], dtype=float)
        plt.scatter(x, y, marker=marker, alpha=0.2, color=color)

        # A line needs at least two distinct x values; otherwise only the
        # scatter points are shown.
        if np.unique(x).size < 2:
            continue

        # Fit on the *paired* (x, y) data. Sorting x before the fit would
        # detach every x from its y and corrupt the regression, so x is
        # sorted only for drawing the fitted line.
        a, b = np.polyfit(x, y, 1)
        x_line = np.sort(x)
        plt.plot(x_line, a*x_line+b, linestyle=linestyle, color=color)

plt.xlabel('#event')
plt.ylabel('F1')

plt.box(False)
plt.grid(axis='y', color='g', linestyle=':', linewidth=0.3)
plt.ylim(-0.0, 1.05)

# Proxy artists: colors encode the model, marker + line style the formulation.
legend_elements = [
    Line2D([0], [0], color=colors[0], label='Llama 2 (7B)'),
    Line2D([0], [0], color=colors[1], label='Flan-T5 (3B)'),
    Line2D([0], [0], marker='o', color='grey', label='NLI', markerfacecolor='grey', linestyle='-'),
    Line2D([0], [0], marker='x', color='grey', label='Pairwise', markerfacecolor='grey', linestyle='--'),
    Line2D([0], [0], marker='s', color='grey', label='MRC', markerfacecolor='grey', linestyle=':'),
    Line2D([0], [0], marker='^', color='grey', label='Timeline', markerfacecolor='grey', linestyle='-.'),
]
plt.legend(handles=legend_elements, ncol=3)

plt.savefig("./figures/result_formulation_comparison_num_event.pdf", format="pdf", bbox_inches='tight')
plt.show()

In [None]:
# Gather per-document F1 scores for every model and formulation, grouped by
# the document length in words ("#word") recorded in the document metadata.
dirpath = Path("../output_score/comparison/")
scores = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
formulations = ["nli", "pairwise", "mrc", "timeline"]
model_ids = ["Llama-2-7b-hf", "flan-t5-xl"]
score_key = "document-and-pair-wise-scores"
for model_id in model_ids:
    for formulation in formulations:
        pattern = f"{formulation}/{model_id}*/*mention*eid*"
        for filepath in dirpath.glob(pattern):
            with open(filepath, "r") as f:
                data = json.load(f)
            individuals = data["individuals"]
            if score_key not in individuals:
                # Report score files that lack per-document scores and move on.
                print(filepath)
                continue
            per_doc_f1 = calculate_f1_per_doc(individuals[score_key])
            for filename, doc_f1 in per_doc_f1.items():
                num_word = metadata[filename]["#word"]
                scores[model_id][formulation][num_word].append(doc_f1)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import numpy as np
from matplotlib.lines import Line2D

# Scatter per-document F1 against "#word" and overlay one least-squares
# trend line per (model, formulation) pair.
configs = [
    ("nli", 'o', "-"),
    ("pairwise", 'x', "--"),
    ("mrc", 's', ":"),
    ("timeline", '^', "-."),
]
colors = ['#ffb000', '#648fff']

plt.figure(figsize=(6, 4))
for model_id, color in zip(model_ids, colors):
    # Flatten {num: [scores]} into two parallel lists so that position i of
    # both lists always refers to the same score entry.
    formulation2nums = defaultdict(list)
    formulation2scores = defaultdict(list)
    for formulation, num2scores in scores[model_id].items():
        for num, _scores in num2scores.items():
            formulation2nums[formulation].extend([num]*len(_scores))
            formulation2scores[formulation].extend(_scores)

    # Plotting
    for formulation, marker, linestyle in configs:
        x = np.array(formulation2nums[formulation], dtype=float)
        y = np.array(formulation2scores[formulation], dtype=float)
        plt.scatter(x, y, marker=marker, alpha=0.2, color=color)

        # A line needs at least two distinct x values; otherwise only the
        # scatter points are shown.
        if np.unique(x).size < 2:
            continue

        # Fit on the *paired* (x, y) data. Sorting x before the fit would
        # detach every x from its y and corrupt the regression, so x is
        # sorted only for drawing the fitted line.
        a, b = np.polyfit(x, y, 1)
        x_line = np.sort(x)
        plt.plot(x_line, a*x_line+b, linestyle=linestyle, color=color)

plt.xlabel('#word')
plt.ylabel('F1')

plt.box(False)
plt.grid(axis='y', color='g', linestyle=':', linewidth=0.3)
plt.ylim(0.0, 1.05)

# Proxy artists: colors encode the model, marker + line style the formulation.
legend_elements = [
    Line2D([0], [0], color=colors[0], label='Llama-2-7b'),
    Line2D([0], [0], color=colors[1], label='flan-t5-xl'),
    Line2D([0], [0], marker='o', color='grey', label='NLI', markerfacecolor='grey', linestyle='-'),
    Line2D([0], [0], marker='x', color='grey', label='Pairwise', markerfacecolor='grey', linestyle='--'),
    Line2D([0], [0], marker='s', color='grey', label='MRC', markerfacecolor='grey', linestyle=':'),
    Line2D([0], [0], marker='^', color='grey', label='Timeline', markerfacecolor='grey', linestyle='-.'),
]
plt.legend(handles=legend_elements, ncol=3)

plt.savefig("./figures/result_formulation_comparison_num_word.pdf", format="pdf", bbox_inches='tight')
plt.show()

# Statistics

In [None]:
# Load the metadata of the test and dev splits from the same directory.
metadata_dir = Path("../data/preprocess/timeset-metadata/")
metadata_test = json.loads((metadata_dir / "test.json").read_text())
metadata_dev = json.loads((metadata_dir / "dev.json").read_text())

In [None]:
# Dataset statistics over the test and dev splits: event / relation / argument
# counts (overall and per split), word and sentence counts per document, and
# the topic labels grouped by their first topic component.
dirpath = Path("../data/preprocess/timeset-sample/")
metadata_dirpath = Path("../data/preprocess/timeset-metadata/")

stats = defaultdict(int)
stats_each = defaultdict(lambda: defaultdict(int))
words = defaultdict(list)
topic = defaultdict(list)
sents = defaultdict(list)
for split in ["test", "dev"]:
    # Use a split-local name: rebinding the global `metadata` here would leave
    # it pointing at the dev split, and the score-loading cells index
    # `metadata[filename]` with test-set filenames.
    with open(metadata_dirpath / f"{split}.json", "r") as f:
        split_metadata = json.load(f)
    with open(dirpath / f"{split}.json", "r") as f:
        data = json.load(f)
    for one_document in data:
        events = one_document['annotation']['events']
        relations = one_document['annotation']['relations']
        # Compute each count once and add it to both the overall and the
        # per-split tallies.
        counts = {
            'num_event': len(events),
            'num_relation': len(relations),
            'num_argument': sum(len(x['arguments']) for x in events.values()),
        }
        for name, count in counts.items():
            stats[name] += count
            stats_each[split][name] += count

        _metadata = split_metadata[one_document['filename']]
        # Group the full topic label under its first topic component.
        topic[_metadata['topic'][0]].append('_'.join(_metadata['topic']))
        words[split].append(_metadata['#word'])
        words['all'].append(_metadata['#word'])
        sents[split].append(_metadata['#sent'])
        sents['all'].append(_metadata['#sent'])

stats

In [None]:
# Display the per-split counts accumulated in `stats_each`.
stats_each

In [None]:
# Mean "#word" per document for the test split, the dev split and both
# combined. The divisor is the number of collected documents rather than a
# hard-coded count, so the means stay correct if a split changes size
# (an empty split raises ZeroDivisionError instead of silently reporting 0).
tuple(sum(words[key]) / len(words[key]) for key in ("test", "dev", "all"))

In [None]:
# Mean "#sent" per document for the test split, the dev split and both
# combined. The divisor is the number of collected documents rather than a
# hard-coded count, so the means stay correct if a split changes size
# (an empty split raises ZeroDivisionError instead of silently reporting 0).
tuple(sum(sents[key]) / len(sents[key]) for key in ("test", "dev", "all"))

In [None]:
# Display the topic labels grouped by their first topic component.
topic

In [None]:
# Number of distinct first topic components (the keys of `topic`).
len(topic)

In [None]:
# Alphabetical list of topic labels, de-duplicated within each group, with
# underscores shown as spaces.
sorted(
    label.replace("_", " ")
    for group in topic.values()
    for label in set(group)
)