# Create boxplot for results

In [None]:
import matplotlib.pyplot as plt
from pathlib import Path
import json
from collections import defaultdict
import numpy as np
import random

In [None]:
# Root directory of the score files; the loaders search it with the glob
# pattern "*/<model_id>*/*.json".
dirpath = Path("../output_score/comparison/")


def get_scores(model_id, root=None):
    """Collect the individual scores of one model from its score JSON files.

    Only files whose name contains both "mention" and "eid" are read.
    Files without a "document-and-pair-wise-scores" entry under "average"
    are skipped (previously they re-used the scores of the preceding file,
    or raised NameError when they came first).

    Args:
        model_id: Prefix of the per-model directory name to search.
        root: Optional directory to search instead of the module-level
            ``dirpath``.

    Returns:
        Nested defaultdict ``dataset_name -> num_demonstration -> list`` of
        the values found under
        ``average["document-and-pair-wise-scores"]["individual"]``.
    """
    base = dirpath if root is None else Path(root)
    scores = defaultdict(lambda: defaultdict(list))

    for filepath in base.glob(f"*/{model_id}*/*.json"):
        if "mention" not in filepath.name or "eid" not in filepath.name:
            continue

        with open(filepath, "r") as f:
            data = json.load(f)

        if "document-and-pair-wise-scores" not in data["average"]:
            # Nothing to collect from this file; do not fall through with
            # a stale or undefined score list.
            print(f"skipping {filepath}: no 'document-and-pair-wise-scores' entry")
            continue

        individual = data["average"]["document-and-pair-wise-scores"]["individual"]
        args = data["args"]
        scores[args["dataset_name"]][args["num_demonstration"]].extend(individual.values())
    return scores

In [None]:
def define_box_properties(plot_name, color_code, label):
    """Colour every artist of a boxplot and register a legend entry for it.

    Args:
        plot_name: Dict of artist lists as returned by ``plt.boxplot``
            (it must contain a "medians" entry).
        color_code: Colour applied to all artists of the plot.
        label: Legend label for this group of boxes.
    """
    for artist_group in plot_name.values():
        plt.setp(artist_group, color=color_code, alpha=0.7)
    # An empty line draws nothing but gives the legend a handle in this colour.
    plt.plot([], c=color_code, label=label)
    # Medians are recoloured last so they stay visible on the filled boxes.
    for median_line in plot_name["medians"]:
        median_line.set_color('black')

In [None]:
# Base comparison: dataset names (one per formulation) and their tick labels.
formulations = [
    "ctf-nli",
    "ctf-pairwise",
    "ctf-mrc",
    "ctf-timeline",
]
ticks = [
    "NLI",
    "Pairwise",
    "MRC",
    "Timeline",
]
# Parallel lists: display name, directory prefix and colour of each model.
model_names = [
    "Llama 2 (7B)",
    "Llama 2 Chat (7B)",
    "CodeLlama (7B)",
    "T5 (3B)",
    "Flan-T5 (3B)",
]
model_ids = [
    "Llama-2-7b-hf",
    "Llama-2-7b-chat-hf",
    "CodeLlama-7b-hf",
    "t5-3b",
    "flan-t5-xl",
]
colors = [
    "#ffb000",
    "#fe6100",
    "#dc267f",
    "#785ef0",
    "#648fff",
]

In [None]:
# Load the individual scores of every configured model, keyed by model id.
scores = {}
for model_id in model_ids:
    scores[model_id] = get_scores(model_id)

In [None]:
# Grouped boxplot: one group per formulation, one box per model.

# savefig does not create missing directories, so make sure the target exists.
Path("./figures").mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(10, 5))

# `diff` shifts each model's box horizontally inside a formulation group.
for model_id, color, diff, model_name in zip(model_ids, colors, [0, 0.35, 0.7, 1.05, 1.4], model_names):
    # Pool the scores of all demonstration counts for each formulation.
    points_flat = [
        [x for xs in scores[model_id][formulation].values() for x in xs]
        for formulation in formulations
    ]
    # Groups are 2.0 apart; -0.7 centres the five offsets (0 .. 1.4) on the tick.
    positions_x = np.arange(len(points_flat)) * 2.0 - 0.7 + diff
    plot = plt.boxplot(
        points_flat,
        positions=positions_x,
        widths=0.3,
        showfliers=False,
        patch_artist=True
    )
    define_box_properties(plot, color, model_name)
    # Disabled overlay of the individual points per demonstration count:
    # for idx, formulation in enumerate(formulations):
    #     for num, marker, diff in zip([0,1,2,3], ['o', 'x', 's', '^'], [0,0.1,0.3,0.4]):
    #         points = [x for x in scores[model_id][formulation][num]]
    #         positions = [positions_x[idx]-0.2+diff for _ in points]
    #         plt.scatter(positions, points, c=color, marker=marker, alpha=0.5)

# Baseline values: check notebooks/random-guess-perf.ipynb
plt.axhline(y=0.39, color='#0D98BA', label="random", alpha=0.5, linestyle='dotted')
plt.axhline(y=0.29788, color='#000000', label="majority", alpha=0.5, linestyle='dashed')

plt.grid(axis='y', color='g', linestyle=':', linewidth=0.3)

# One tick per formulation group, at the group centre.
plt.xticks(np.arange(0, len(ticks) * 2, 2), ticks)
# set the limit for y axis
plt.ylim(0.0, 0.68)

plt.legend(ncol=2)

plt.box(False)

plt.ylabel("F1")
plt.xlabel("Formulation")

plt.savefig("./figures/result_formulation_comparison_base.pdf", format="pdf", bbox_inches='tight')
plt.show()

In [None]:
# Prompt comparison: MRC and timeline formulations with their prompt variants.
formulations = [
    "ctf-mrc",
    "ctf-mrc-cot",
    "ctf-timeline",
    "ctf-timeline-cot",
    "ctf-timeline-code",
]
ticks = [
    "MRC",
    "MRC(CoT)",
    "Timeline",
    "Timeline\n(CoT)",
    "Timeline\n(code)",
]
# Parallel lists: directory prefix, display name and colour of each model.
model_ids = [
    "Llama-2-7b-hf",
    "flan-t5-xl",
]
model_names = [
    "Llama 2 (7B)",
    "Flan-T5 (3B)",
]
colors = [
    '#ffb000',
    '#648fff',
]

In [None]:
# Reload the scores for the models selected for the prompt comparison.
scores = {}
for model_id in model_ids:
    scores[model_id] = get_scores(model_id)

In [None]:
# Grouped boxplot of the prompt variants: one group per formulation,
# one box per model.

# savefig does not create missing directories, so make sure the target exists.
Path("./figures").mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(5, 4))

# zip stops at the shortest input, so only as many offsets as models are used.
for model_id, color, diff, model_name in zip(model_ids, colors, [0, 0.35, 0.7, 1.05, 1.4], model_names):
    # Pool the scores of all demonstration counts for each formulation.
    points_flat = [
        [x for xs in scores[model_id][formulation].values() for x in xs]
        for formulation in formulations
    ]
    # Groups are 1.0 apart; each model is shifted by -0.2 + diff from the tick.
    positions_x = np.arange(len(points_flat)) * 1.0 - 0.2 + diff
    plot = plt.boxplot(
        points_flat,
        positions=positions_x,
        widths=0.3,
        showfliers=False,
        patch_artist=True
    )
    define_box_properties(plot, color, model_name)
    # Disabled overlay of the individual points per demonstration count:
    # for idx, formulation in enumerate(formulations):
    #     for num, marker, diff in zip([0,1,2,3], ['o', 'x', 's', '^'], [0,0.1,0.3,0.4]):
    #         points = [x for x in scores[model_id][formulation][num]]
    #         positions = [positions_x[idx]-0.2+diff for _ in points]
    #         plt.scatter(positions, points, c=color, marker=marker, alpha=0.5)

# Baselines are disabled for this figure:
# plt.axhline(y=0.35, color='#0D98BA', label="random", alpha=0.5, linestyle='dotted')
# plt.axhline(y=0.387, color='#000000', label="majority", alpha=0.5, linestyle='dashed')

plt.grid(axis='y', color='g', linestyle=':', linewidth=0.3)

# One tick per formulation group.
plt.xticks(np.arange(0, len(ticks) * 1, 1), ticks)
# set the limit for y axis
plt.ylim(0.0, 0.45)

plt.legend(ncol=2)

plt.box(False)
plt.xticks(rotation=0, ha='center')

plt.ylabel("F1")
# plt.xlabel("Formulation")
# set the title
# plt.title('Grouped boxplot using matplotlib')
plt.savefig("./figures/result_formulation_comparison_prompt.pdf", format="pdf", bbox_inches='tight')
plt.show()

In [None]:
# Event-representation comparison: timeline formulation, Llama 2 (7B) only.
formulations = [
    "ctf-timeline",
]
# One tick per marker/representation combination.
ticks = [
    "eid\nmention",
    "eid\nstructured",
    "star\nmention",
    "star\nstructured",
]
model_ids = [
    "Llama-2-7b-hf",
]
model_names = [
    "Llama 2 (7B)",
]
colors = [
    "#ffb000",
]

In [None]:
def get_scores_all(model_id, root=None):
    """Collect the individual scores of one model, split by event representation.

    Unlike ``get_scores`` this reads every matching JSON file, without
    filtering on the file name. Files without a
    "document-and-pair-wise-scores" entry under "average" are skipped
    (previously they re-used the scores of the preceding file, or raised
    NameError when they came first).

    Args:
        model_id: Prefix of the per-model directory name to search.
        root: Optional directory to search instead of the module-level
            ``dirpath``.

    Returns:
        Nested defaultdict
        ``dataset_name -> marker -> representation -> num_demonstration -> list``
        of the values found under
        ``average["document-and-pair-wise-scores"]["individual"]``.
    """
    base = dirpath if root is None else Path(root)
    scores = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))

    for filepath in base.glob(f"*/{model_id}*/*.json"):
        with open(filepath, "r") as f:
            data = json.load(f)

        if "document-and-pair-wise-scores" not in data["average"]:
            # Nothing to collect from this file; do not fall through with
            # a stale or undefined score list.
            print(f"skipping {filepath}: no 'document-and-pair-wise-scores' entry")
            continue

        individual = data["average"]["document-and-pair-wise-scores"]["individual"]
        args = data["args"]
        bucket = scores[args["dataset_name"]][args["marker"]][args["representation"]]
        bucket[args["num_demonstration"]].extend(individual.values())
    return scores

In [None]:
# Load the scores split by marker and representation for every configured model.
scores = {}
for model_id in model_ids:
    scores[model_id] = get_scores_all(model_id)
# Trailing semicolon: notebook idiom to evaluate without echoing the value.
scores;

In [None]:
# Look up the eid/mention timeline scores of Llama 2 (7B); the trailing
# semicolon is the notebook idiom for not echoing the value.
scores["Llama-2-7b-hf"]["ctf-timeline"]["eid"]["mention"];

In [None]:
# Boxplot of Llama 2 (7B) scores for each marker/representation combination.

# savefig does not create missing directories, so make sure the target exists.
Path("./figures").mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(4, 3))
color = "#ffb000"
model_id = "Llama-2-7b-hf"
model_name = "Llama 2 (7B)"

# One box per (marker, representation) pair, in the same order as `ticks`;
# each box pools all formulations and demonstration counts.
points_flat = []
for marker in ["eid", "star"]:
    for representation in ["mention", "structured"]:
        _points_flat = [
            x
            for formulation in formulations
            for xs in scores[model_id][formulation][marker][representation].values()
            for x in xs
        ]
        points_flat.append(_points_flat)
positions_x = np.array(np.arange(len(points_flat)))*1.0

plot = plt.boxplot(
    points_flat,
    positions=positions_x,
    widths=0.3,
    showfliers=False,
    patch_artist=True
)
define_box_properties(plot, color, model_name)

plt.grid(axis='y', color='g', linestyle=':', linewidth=0.3)

plt.xticks(np.arange(0, len(ticks) * 1, 1), ticks, ha='center')
plt.ylim(0, 0.41)

plt.legend()

plt.box(False)
# plt.xticks(rotation=45, ha='right')

plt.ylabel("F1")
# plt.xlabel("Event representations")
# set the title
# plt.title('Grouped boxplot using matplotlib')
plt.savefig("./figures/result_formulation_comparison_repr.pdf", format="pdf", bbox_inches='tight')
plt.show()

In [None]:
# size with llama 2 and flan t5
def get_median(model_id, root=None):
    """Collect the median score of each score file of one model.

    Only files whose name contains both "mention" and "eid" are read.
    Files without a "document-and-pair-wise-scores" entry under "average"
    are skipped (previously they re-used the median of the preceding file,
    or raised NameError when they came first).

    Args:
        model_id: Prefix of the per-model directory name to search.
        root: Optional directory to search instead of the module-level
            ``dirpath``.

    Returns:
        Nested defaultdict ``dataset_name -> num_demonstration -> list`` with
        one ``average["document-and-pair-wise-scores"]["range"]["median"]``
        value per file read.
    """
    base = dirpath if root is None else Path(root)
    scores = defaultdict(lambda: defaultdict(list))

    for filepath in base.glob(f"*/{model_id}*/*.json"):
        if "mention" not in filepath.name or "eid" not in filepath.name:
            continue

        with open(filepath, "r") as f:
            data = json.load(f)

        if "document-and-pair-wise-scores" not in data["average"]:
            # Nothing to collect from this file; do not fall through with
            # a stale or undefined median.
            print(f"skipping {filepath}: no 'document-and-pair-wise-scores' entry")
            continue

        median = data["average"]["document-and-pair-wise-scores"]["range"]["median"]
        args = data["args"]
        scores[args["dataset_name"]][args["num_demonstration"]].append(median)
    return scores

In [None]:
# Model-size study: three Llama 2 checkpoints followed by three Flan-T5 ones.
model_ids = [
    "Llama-2-7b-hf",
    "Llama-2-13b-hf",
    "Llama-2-70b-hf",
    "flan-t5-large",
    "flan-t5-xl",
    "flan-t5-xxl",
]
# (disabled) formulations = ["ctf-nli", "ctf-pairwise", "ctf-mrc", "ctf-timeline"]
ticks = [
    "NLI",
    "Pairwise",
    "MRC",
    "Timeline",
]
colors = [
    "#dc267f",
    "#fe6100",
    "#648fff",
    "#785ef0",
]

In [None]:
# Load the per-file median scores of every model in the size study.
scores = {}
for model_id in model_ids:
    scores[model_id] = get_median(model_id)

In [None]:
# Evaluate the loaded medians; the trailing semicolon is the notebook idiom
# for not echoing the value.
scores;

In [None]:
from matplotlib.lines import Line2D

# Line plot of the best median F1 per formulation against model size.

# savefig does not create missing directories, so make sure the target exists.
Path("./figures").mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(6, 4))
# (dataset name, marker, line style) of each formulation.
configs = [
    ("ctf-nli", 'o', "-"),
    ("ctf-pairwise", 'x', "--"),
    ("ctf-mrc", 's', ":"),
    ("ctf-timeline", '^', "-.")
]
# (model ids, x positions, colour) of each model family. The x positions are
# not to scale: the 70B model is drawn at 35 and only labelled '70B'.
families = [
    (["Llama-2-7b-hf", "Llama-2-13b-hf", "Llama-2-70b-hf"], [7, 13, 35], '#ffb000'),
    (["flan-t5-large", "flan-t5-xl", "flan-t5-xxl"], [0.7, 3, 11], '#648fff'),
]
for formulation, marker, linestyle in configs:
    for family_ids, x_positions, line_color in families:
        nums = []
        for model_id in family_ids:
            # get_median stores a list of medians per demonstration count, so
            # flatten before taking the maximum; comparing the lists directly
            # would compare them lexicographically and return a list.
            nums.append(max(x for xs in scores[model_id][formulation].values() for x in xs))
        ax.plot(x_positions, nums, label=formulation, marker=marker, color=line_color, linestyle=linestyle, alpha=0.7)

plt.xticks([0.7, 3, 7, 11, 13, 35])
plt.gca().set_xticklabels(['700M', '3B','7B', '11B', '13B', '70B'])

# Hand-built legend: colour encodes the model family, marker and line style
# encode the formulation.
legend_elements = [
    Line2D([0], [0], color='#ffb000', label='Llama 2'),
    Line2D([0], [0], color='#648fff', label='Flan-T5'),
    Line2D([0], [0], marker='o', color='grey', label='NLI', markerfacecolor='grey', linestyle='-'),
    Line2D([0], [0], marker='x', color='grey', label='Pairwise', markerfacecolor='grey', linestyle='--'),
    Line2D([0], [0], marker='s', color='grey', label='MRC', markerfacecolor='grey', linestyle=':'),
    Line2D([0], [0], marker='^', color='grey', label='Timeline', markerfacecolor='grey', linestyle='-.'),
]
plt.xticks(rotation=15, ha='center')
plt.ylabel("F1")
# plt.xlabel("Model size")
plt.box(False)
plt.grid(axis='y', color='g', linestyle=':', linewidth=0.3)

plt.legend(handles=legend_elements, ncol=3)
plt.savefig("./figures/result_formulation_comparison_size.pdf", format="pdf", bbox_inches='tight')
plt.show()

In [None]:
from matplotlib.lines import Line2D

# Line plot of the median F1 per formulation against the number of
# demonstrations, for Llama 2 (7B) and Flan-T5 (3B).

# savefig does not create missing directories, so make sure the target exists.
Path("./figures").mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(4, 3))
markers = ["o", "x", "s", "^"]
linestyles = ["-", "--", ":", "-."]
formulations = ["ctf-nli", "ctf-pairwise", "ctf-mrc", "ctf-timeline"]
# Built here so the cell does not depend on `configs` left over from the
# model-size cell; the resulting tuples are the same.
configs = list(zip(formulations, markers, linestyles))
for formulation, marker, linestyle in configs:
    for model_id, line_color in [("Llama-2-7b-hf", '#ffb000'), ("flan-t5-xl", '#648fff')]:
        # NOTE(review): each entry is the list of medians collected by
        # get_median for that demonstration count - presumably one value
        # per setting; confirm before relying on the plot.
        ax.plot(
            [0,1,2,3],
            [scores[model_id][formulation][num] for num in [0,1,2,3]],
            label=formulation, marker=marker, color=line_color, alpha=0.7, linestyle=linestyle
        )

plt.xticks([0,1,2,3])

# Hand-built legend: colour encodes the model, marker and line style encode
# the formulation.
legend_elements = [
    Line2D([0], [0], color='#ffb000', label='Llama 2 (7B)'),
    Line2D([0], [0], color='#648fff', label='Flan-T5 (3B)'),
    Line2D([0], [0], marker='o', color='grey', label='NLI', markerfacecolor='grey', linestyle='-'),
    Line2D([0], [0], marker='x', color='grey', label='Pairwise', markerfacecolor='grey', linestyle='--'),
    Line2D([0], [0], marker='s', color='grey', label='MRC', markerfacecolor='grey', linestyle=':'),
    Line2D([0], [0], marker='^', color='grey', label='Timeline', markerfacecolor='grey', linestyle='-.'),
]
# plt.xticks(rotation=45, ha='right')
plt.ylabel("F1")
# plt.xlabel("#demonstration")
plt.box(False)
plt.grid(axis='y', color='g', linestyle=':', linewidth=0.3)
plt.ylim(0.0, 0.95)

plt.legend(handles=legend_elements, ncol=2)
plt.savefig("./figures/result_formulation_comparison_demo.pdf", format='pdf', bbox_inches='tight')
plt.show()