In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

from src.reader import read_lusa, read_timebank

# Draw all figure text with the sans-serif family, listing Arial as the
# sans-serif face to use.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
})


# Absolute path two directory levels above the current working directory;
# the results folders used below are looked up relative to it.
ROOT = Path.cwd().resolve().parent.parent

In [2]:
# Model identifiers in display order. The order matters: it is reused as the
# category order when the `model` column is turned into an ordered categorical.
# Every Llama 2 size is listed as base model first, then its chat variant.
models = [
    f'llama2-{size}{variant}'
    for size in ('7b', '13b', '70b')
    for variant in ('', '-chat')
] + ['gpt3', 'chatgpt', 'gpt4']

## Prompt Selection

In [3]:
# Folder holding the prompt-selection results (one sub-folder per language).
results_path = ROOT.joinpath("results", "prompt_selection")

In [4]:
# Read each language's results table and tag its rows with the language name.
df_pt = pd.read_csv(results_path / "portuguese" / "results.csv").assign(language="Portuguese")
df_en = pd.read_csv(results_path / "english" / "results.csv").assign(language="English")

# Stack both tables under a single fresh 0..n-1 index.
df = pd.concat([df_pt, df_en], ignore_index=True)

# Turn `model` into an ordered categorical whose order follows `models`,
# so sorting and grouping list the models in that order.
df["model"] = df["model"].astype(pd.CategoricalDtype(categories=models, ordered=True))
df

Unnamed: 0,model,template,entity,precision,recall,f1,f1_r,language
0,chatgpt,ext,event triggers,0.456522,0.260331,0.331579,0.605356,Portuguese
1,chatgpt,cls,event triggers,0.433511,0.336777,0.379070,0.617449,Portuguese
2,chatgpt,cls_def,event triggers,0.601266,0.392562,0.475000,0.784746,Portuguese
3,chatgpt,ext_def,event triggers,0.612903,0.392562,0.478589,0.782263,Portuguese
4,chatgpt,ext_exp,event triggers,0.490265,0.572314,0.528122,0.638241,Portuguese
...,...,...,...,...,...,...,...,...
355,llama2-7b-chat,cls_def_exp,time expressions,0.065421,0.113821,0.083086,0.138516,English
356,llama2-7b-chat,cls_exp,time expressions,0.067961,0.113821,0.085106,0.118924,English
357,llama2-7b-chat,ext_def_exp,time expressions,0.066390,0.130081,0.087912,0.109416,English
358,llama2-7b-chat,ext_exp,time expressions,0.077206,0.170732,0.106329,0.132826,English


In [5]:
# Distinct values of the `language` and `entity` columns, as plain Python lists.
languages = df["language"].unique().tolist()
entities = df["entity"].unique().tolist()
# Prompt template name -> compact label, in presentation order. A label shows
# a letter in the slot whose keyword appears in the template name
# (C = cls, D = def, E = exp) and "_" where it does not.
_template_labels = {
    "ext": "_ _ _",
    "cls": "C _ _",
    "ext_def": "_ D _",
    "ext_exp": "_ _ E",
    "cls_def": "C D _",
    "ext_def_exp": "_ D E",
    "cls_exp": "C _ E",
    "cls_def_exp": "C D E",
}
templates = list(_template_labels)
labels = list(_template_labels.values())

In [6]:
# Mean F1 per (language, entity, model), with one column per prompt template.
#
# * The `f1` column is selected explicitly before averaging. The first
#   positional parameter of GroupBy.mean is `numeric_only`, so the earlier
#   `.mean("f1")` did not pick a column; it only worked because a non-empty
#   string is truthy.
# * `observed=False` is spelled out because `model` is categorical: it keeps
#   a row for every model category, including combinations with no data
#   (shown as NaN), regardless of the pandas default.
data = (
    df.groupby(["language", "entity", "model", "template"], observed=False)[["f1"]]
    .mean()
    .unstack("template")
)

# Put the template columns in the presentation order defined by `templates`.
data = data[[("f1", template) for template in templates]]

# Display as percentages with two decimals.
(data * 100).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1,f1,f1,f1,f1,f1,f1,f1
Unnamed: 0_level_1,Unnamed: 1_level_1,template,ext,cls,ext_def,ext_exp,cls_def,ext_def_exp,cls_exp,cls_def_exp
language,entity,model,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
English,event triggers,llama2-7b,0.0,1.34,0.0,10.3,0.0,15.4,11.06,7.75
English,event triggers,llama2-7b-chat,6.41,0.0,10.9,28.8,0.0,31.15,28.3,29.67
English,event triggers,llama2-13b,0.0,1.91,2.93,8.85,0.0,10.53,6.28,2.1
English,event triggers,llama2-13b-chat,2.28,0.31,2.43,20.9,0.3,16.79,18.68,18.15
English,event triggers,llama2-70b,8.34,3.31,14.96,26.44,3.1,25.45,13.77,26.29
English,event triggers,llama2-70b-chat,1.17,3.3,2.29,41.22,4.95,42.13,18.12,18.74
English,event triggers,gpt3,0.3,3.92,3.04,41.12,4.85,42.12,24.59,38.01
English,event triggers,chatgpt,8.29,33.93,35.96,55.08,40.6,56.32,57.3,59.28
English,event triggers,gpt4,20.45,57.82,21.36,72.81,33.33,72.93,72.6,74.68
English,participants,llama2-7b,,,,,,,,


## Test

In [21]:
# Folder holding the test-set results (one sub-folder per language).
# Note: this rebinds `results_path`, replacing the prompt-selection folder.
results_path = ROOT.joinpath("results", "test")

In [25]:
# Read each language's test results table and tag its rows with the language name.
df_pt = pd.read_csv(results_path / "portuguese" / "results.csv").assign(language="Portuguese")
df_en = pd.read_csv(results_path / "english" / "results.csv").assign(language="English")

# Stack both tables under a single fresh 0..n-1 index.
df = pd.concat([df_pt, df_en], ignore_index=True)

# Turn `model` into an ordered categorical whose order follows `models`,
# so sorting and grouping list the models in that order.
df["model"] = df["model"].astype(pd.CategoricalDtype(categories=models, ordered=True))
df

Unnamed: 0,model,template,entity,precision,recall,f1,f1_r,language
0,chatgpt,cls_def_exp,event triggers,0.487685,0.571492,0.526273,0.585791,Portuguese
1,gpt3,cls_def_exp,event triggers,0.439163,0.549734,0.488267,0.509444,Portuguese
2,gpt4,cls_exp,event triggers,0.597454,0.666963,0.630298,0.68447,Portuguese
3,llama2-13b,ext_def,event triggers,0.193182,0.098135,0.130153,0.183512,Portuguese
4,llama2-13b-chat,ext_def,event triggers,0.205496,0.076377,0.111363,0.384928,Portuguese
5,llama2-70b,cls_def_exp,event triggers,0.264538,0.309059,0.285071,0.273665,Portuguese
6,llama2-70b-chat,cls,event triggers,0.16857,0.137655,0.151552,0.252957,Portuguese
7,llama2-7b,ext_def,event triggers,0.087687,0.041741,0.056558,0.306139,Portuguese
8,llama2-7b-chat,ext_def,event triggers,0.169525,0.114121,0.136412,0.24664,Portuguese
9,chatgpt,cls_def,participants,0.241742,0.130206,0.169251,0.619661,Portuguese


In [26]:
# Mean precision / recall / F1 / f1_r per (language, entity, model).
# `observed=False` is spelled out because `model` is categorical: it keeps a
# row for every model category, including combinations with no data (shown
# as NaN), regardless of the pandas default.
data = (
    df[["language", "entity", "model", "precision", "recall", "f1", "f1_r"]]
    .groupby(["language", "entity", "model"], observed=False)
    .mean()
)

# Display as percentages with two decimals.
(data * 100).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,precision,recall,f1,f1_r
language,entity,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
English,event triggers,llama2-7b,0.0,0.0,0.0,1.31
English,event triggers,llama2-7b-chat,15.14,7.81,10.3,24.91
English,event triggers,llama2-13b,11.4,2.0,3.41,13.01
English,event triggers,llama2-13b-chat,9.13,2.57,4.02,27.15
English,event triggers,llama2-70b,32.02,25.36,28.3,26.25
English,event triggers,llama2-70b-chat,17.37,14.91,16.05,14.24
English,event triggers,gpt3,6.89,6.18,6.52,17.88
English,event triggers,chatgpt,79.38,39.12,52.42,72.09
English,event triggers,gpt4,83.31,62.81,71.62,81.95
English,participants,llama2-7b,,,,
