In [3]:
from pathlib import Path

import pandas as pd
import matplotlib.ticker as mtick

from src.reader import read_lusa, read_timebank

# Absolute path of the directory one level above the current working directory.
# NOTE(review): presumably the repository root when the notebook is launched
# from a sub-folder -- confirm against the project layout.
ROOT = Path().resolve().parents[0]

## Prompt Selection

In [4]:
# Directory holding the prompt-selection result files, one sub-folder per language.
results_path = ROOT.joinpath("results", "prompt_selection")

In [9]:
# Load the per-language result tables and tag every row with its language so
# both sets can be analysed together in a single frame.
df_pt = pd.read_csv(results_path / "portuguese" / "results.csv").assign(language="Portuguese")
df_en = pd.read_csv(results_path / "english" / "results.csv").assign(language="English")

# Stack both languages and renumber the rows 0..n-1 in one step.
df = pd.concat([df_pt, df_en], ignore_index=True)

In [13]:
# Keep only the grouping keys and the F1 score used by the aggregations.
plot_columns = ["model", "template", "entity", "language", "f1"]
to_plot = df.loc[:, plot_columns]
print(to_plot)

              model     template            entity    language        f1
0           chatgpt      ext_exp    event triggers  Portuguese  0.317295
1           chatgpt          ext    event triggers  Portuguese  0.331579
2           chatgpt          cls    event triggers  Portuguese  0.379070
3           chatgpt      cls_exp    event triggers  Portuguese  0.410072
4           chatgpt  ext_def_exp    event triggers  Portuguese  0.452592
..              ...          ...               ...         ...       ...
376  llama2-7b-chat  cls_def_exp  time expressions     English  0.032432
377  llama2-7b-chat      ext_exp  time expressions     English  0.070270
378  llama2-7b-chat          ext  time expressions     English  0.070652
379  llama2-7b-chat  ext_def_exp  time expressions     English  0.118380
380  llama2-7b-chat      ext_def  time expressions     English  0.118380

[381 rows x 5 columns]


In [31]:
# Mean F1 for every (model, template, entity, language) combination.
full_keys = ["model", "template", "entity", "language"]
to_plot.groupby(full_keys)[["f1"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,f1
model,template,entity,language,Unnamed: 4_level_1
chatgpt,cls,event triggers,English,0.339321
chatgpt,cls,event triggers,Portuguese,0.379070
chatgpt,cls,participants,Portuguese,0.191159
chatgpt,cls,time expressions,English,0.263804
chatgpt,cls,time expressions,Portuguese,0.394558
...,...,...,...,...
llama2-7b-chat,ext_exp,event triggers,English,0.064133
llama2-7b-chat,ext_exp,event triggers,Portuguese,0.034934
llama2-7b-chat,ext_exp,participants,Portuguese,0.105747
llama2-7b-chat,ext_exp,time expressions,English,0.070270


In [32]:
# Mean F1 per (entity, language, template), averaged over all models.
# The first positional parameter of GroupBy.mean is `numeric_only`, not a
# column name, so `.mean("f1")` only worked because a non-empty string is
# truthy. Select the "f1" column explicitly instead; the double brackets keep
# the result a DataFrame with a single "f1" column.
to_plot.groupby(["entity", "language", "template"])[["f1"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1
entity,language,template,Unnamed: 3_level_1
event triggers,English,cls,0.11761
event triggers,English,cls_def,0.096826
event triggers,English,cls_def_exp,0.100079
event triggers,English,cls_exp,0.120322
event triggers,English,ext,0.052497
event triggers,English,ext_def,0.1043
event triggers,English,ext_def_exp,0.098714
event triggers,English,ext_exp,0.053181
event triggers,Portuguese,cls,0.127442
event triggers,Portuguese,cls_def,0.115902


In [41]:
# Mean F1 per (entity, language), with one column per prompt template.
# The first positional parameter of GroupBy.mean is `numeric_only`, not a
# column name, so `.mean("f1")` only worked because a non-empty string is
# truthy. Select the "f1" column explicitly; the double brackets keep the
# ("f1", template) column hierarchy after unstacking.
(
    to_plot
    .groupby(["entity", "language", "template"])[["f1"]]
    .mean()
    .unstack("template")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f1,f1,f1,f1,f1,f1,f1
Unnamed: 0_level_1,template,cls,cls_def,cls_def_exp,cls_exp,ext,ext_def,ext_def_exp,ext_exp
entity,language,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
event triggers,English,0.11761,0.096826,0.100079,0.120322,0.052497,0.1043,0.098714,0.053181
event triggers,Portuguese,0.127442,0.115902,0.121493,0.131735,0.060636,0.114655,0.111746,0.060961
participants,Portuguese,0.089373,0.101725,0.101002,0.089539,0.071431,0.090701,0.091239,0.073143
time expressions,English,0.161693,0.170661,0.170981,0.169233,0.229087,0.231129,0.230646,0.235521
time expressions,Portuguese,0.18724,0.18507,0.175436,0.181652,0.257012,0.273949,0.271718,0.29409


In [42]:
# Mean F1 per (model, language), with one column per prompt template,
# averaged over all entity types.
# The first positional parameter of GroupBy.mean is `numeric_only`, not a
# column name, so `.mean("f1")` only worked because a non-empty string is
# truthy. Select the "f1" column explicitly; the double brackets keep the
# ("f1", template) column hierarchy after unstacking.
(
    to_plot
    .groupby(["model", "language", "template"])[["f1"]]
    .mean()
    .unstack("template")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f1,f1,f1,f1,f1,f1,f1
Unnamed: 0_level_1,template,cls,cls_def,cls_def_exp,cls_exp,ext,ext_def,ext_def_exp,ext_exp
model,language,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
chatgpt,English,0.301563,0.350961,0.365969,0.301149,0.220568,0.332837,0.333036,0.243943
chatgpt,Portuguese,0.321595,0.370899,0.350555,0.312415,0.305727,0.357072,0.350809,0.295945
falcon-7b,Portuguese,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gpt3,English,0.154656,0.132139,0.13249,0.161751,0.189555,0.211054,0.210736,0.191727
gpt3,Portuguese,0.266368,0.286933,0.269211,0.275862,0.213911,0.210546,0.210723,0.220192
gpt4,English,0.610016,0.494253,0.494067,0.649743,0.394497,0.375876,0.342687,0.40488
gpt4,Portuguese,0.410106,0.26461,0.276687,0.404509,0.189536,0.187797,0.19918,0.206339
llama2-13b,English,0.009539,0.0,0.0,0.009539,0.006623,0.031141,0.031141,0.006623
llama2-13b,Portuguese,0.008578,0.01065,0.01065,0.008578,0.021505,0.107334,0.107334,0.021505
llama2-13b-chat,English,0.046526,0.048372,0.048372,0.046413,0.116867,0.123269,0.123269,0.116867
