In [1]:
import pandas as pd
import os
import json
from src.data import load_splits
from src.data import create_metaphor_classification_prompt
from src.data import create_source_domain_v2_prompt
from src.data import create_target_domain_v2_prompt
from src.data import create_source_lexeme_prompt
from src.data import create_target_lexeme_prompt

# Directories used by this notebook; the results directory is created on demand.
RESULTS_DIR = "./Results"
os.makedirs(RESULTS_DIR, exist_ok=True)
DATASET_DIR = "./Data"

# Name of the column that holds the sentence text in each dataset.
# An explicit mapping keeps the lookup correct if the dataset list is ever
# reordered or extended (the column was previously picked from the dataset's
# position in the list).
TEXT_COLUMN_BY_DATASET = {
    "trofi": "text",
    "vua_verb": "text",
    "vua_pos": "text",
    "metaphor_list": "example",
    "lcc_en_subset": "example",
}
datasets = list(TEXT_COLUMN_BY_DATASET)

# Collect one row per dataset: split sizes plus the average sentence length.
dataset_rows = []
for d in datasets:
    train, test, dev = load_splits(d)
    key = TEXT_COLUMN_BY_DATASET[d]
    entire = pd.concat([train, test, dev])
    print(f"{d} - train len:", len(train))
    print(f"{d} - test len:", len(test))
    print(f"{d} - dev len:", len(dev))
    # Mean number of whitespace-separated tokens per sentence over all three splits,
    # rounded to the nearest integer.
    mean_length = round(entire[key].str.split().str.len().mean())
    dataset_rows.append([d, len(train), len(test), len(dev), mean_length])

# Build the table in one go; the index starts at 1 so the displayed row
# numbers stay the same as when rows were appended one at a time.
df = pd.DataFrame(
    dataset_rows,
    columns=["Dataset", "Train", "Test", "Dev", "Length"],
    index=range(1, len(dataset_rows) + 1),
)

df.to_csv(f"{RESULTS_DIR}/dataset_info.csv", index=False)
df

trofi - train len: 3838
trofi - test len: 1096
trofi - dev len: 548
vua_verb - train len: 2294
vua_verb - test len: 656
vua_verb - dev len: 328
vua_pos - train len: 3506
vua_pos - test len: 1002
vua_pos - dev len: 502
metaphor_list - train len: 132
metaphor_list - test len: 244
metaphor_list - dev len: 120
lcc_en_subset - train len: 1206
lcc_en_subset - test len: 345
lcc_en_subset - dev len: 172


Unnamed: 0,Dataset,Train,Test,Dev,Length
1,trofi,3838,1096,548,28
2,vua_verb,2294,656,328,27
3,vua_pos,3506,1002,502,29
4,metaphor_list,132,244,120,7
5,lcc_en_subset,1206,345,172,25


In [58]:
# Overview table: one row per task with a sample prompt and the value the
# prompt creator returns alongside it.
df_tasks = pd.DataFrame(columns=["Task", "Datasets", "Labels", "Prompt", "Prediction"])

# Three index-aligned lists: the dataset the sample is drawn from, the task
# name, and the human-readable dataset names listed for that task.
datasets = ["trofi", "metaphor_list", "metaphor_list", "lcc_en_subset", "lcc_en_subset"]
tasks = ["Classification", "SD Prediction", "TD Prediction", "SL Prediction", "TL Prediction"]
datasets_for_tasks = [["TroFi", "VUA Verb", "VUA POS"], ["Metaphor List", "LCC (en)"], ["Metaphor List", "LCC (en)"], ["Metaphor List", "LCC (en)"], ["Metaphor List", "LCC (en)"]]

# Prompt builder per task; any task not listed falls back to the
# classification prompt.
prompt_builders = {
    "SD Prediction": create_source_domain_v2_prompt,
    "TD Prediction": create_target_domain_v2_prompt,
    "SL Prediction": create_source_lexeme_prompt,
    "TL Prediction": create_target_lexeme_prompt,
}

for task_name, dataset_name, display_names in zip(tasks, datasets, datasets_for_tasks):
    train_split, _, _ = load_splits(dataset_name)
    # The training row at position 17 serves as the illustrative example.
    sample_row = train_split.iloc[17]
    print(sample_row)
    build_prompt = prompt_builders.get(task_name, create_metaphor_classification_prompt)
    prompt_text, completion = build_prompt(sample_row)
    # Escape newlines so the prompt is stored as a single line.
    prompt_text = prompt_text.replace("\n", "\\n")
    print(prompt_text)
    label_set = "yes/no" if task_name == "Classification" else "-"
    df_tasks.loc[len(df_tasks) + 1] = [
        task_name,
        "; ".join(display_names),
        label_set,
        prompt_text,
        completion,
    ]

df_tasks.to_csv(f"{RESULTS_DIR}/tasks_info.csv", index=False)
df_tasks

text     For off - duty shifts , the Air Force is start...
span1                                             [28, 29]
label                                                    1
Name: 17, dtype: object
Sentence: For off - duty shifts , the Air Force is starting to build concrete dugouts for about 80 persons , where one takes off the chem suit and rests on a cot\nQuestion: Is the sentence metaphoric?\nAnswer:
Unnamed: 0                              1662
Title                      Time Is A Changer
Metaphor                   Time Is A Changer
source_domain                        changer
target_domain                           time
example          Time had made her look old.
Name: 17, dtype: object
Context: In linguistics, conceptual metaphors consists of understanding a given concept in terms of another\nTask: Extract the source domain from the sentence\nSentence: Time had made her look old.\nTarget domain: time\nAnswer:
Unnamed: 0                              1662
Title                 

Unnamed: 0,Task,Datasets,Labels,Prompt,Prediction
1,Classification,TroFi; VUA Verb; VUA POS,yes/no,"Sentence: For off - duty shifts , the Air Forc...",yes
2,SD Prediction,Metaphor List; LCC (en),-,"Context: In linguistics, conceptual metaphors ...",changer
3,TD Prediction,Metaphor List; LCC (en),-,"Context: In linguistics, conceptual metaphors ...",time
4,SL Prediction,Metaphor List; LCC (en),-,"Context: In linguistics, conceptual metaphors ...",demonizing
5,TL Prediction,Metaphor List; LCC (en),-,"Context: In linguistics, conceptual metaphors ...",guns


### Results

#### Classification

In [69]:
# Locations of the stored evaluation results.
TEST_RESULTS_DIR = "./test"
FEW_SHOT_RESULTS = f"{TEST_RESULTS_DIR}/few_shot"
FINE_TUNING_RESULTS = f"{TEST_RESULTS_DIR}/fine_tuning"

df_class_res = pd.DataFrame(columns=["Model", "Dataset", "F1", "Precision", "Recall", "Acc"])

# Best-configuration result files per dataset. The dataset's display name is
# turned into its directory name below (lower-cased, spaces -> underscores).
test_res = {
    "TroFi": ["1701264636_gpt3.5_best_config_test_result.json", "1700961412_llama2-7b_best_config_test_result.json"],
    "VUA Verb": ["1701269735_gpt3.5_best_config_test_result.json", "1701131923_llama2-7b_best_config_test_result.json"],
    "VUA POS": ["1701267713_gpt3.5_best_config_test_result.json", "1701136723_llama2-7b_best_config_test_result.json"]
}

for k, v in test_res.items():
    dataset_dir = k.lower().replace(' ', '_')
    for path in v:
        # JSON is read as UTF-8 explicitly instead of the platform default encoding.
        with open(f"{FEW_SHOT_RESULTS}/classification/{dataset_dir}/{path}", "r", encoding="utf-8") as f:
            res = json.load(f)
        # Metrics are rounded to two decimals for the summary table.
        df_class_res.loc[len(df_class_res)+1] = [res["model"], k, round(res["f1"], 2), round(res["precision"], 2), round(res["recall"], 2), round(res["acc"], 2)]

# Stable sort: rows sharing a model keep their insertion (dataset) order.
df_class_res = df_class_res.sort_values(by=["Model"], ascending=False, kind="stable")
# Written to its own file: another cell in this notebook saves a different
# table to "test_classification_info.csv", and sharing that path meant one
# table overwrote the other depending on execution order.
df_class_res.to_csv(f"{RESULTS_DIR}/test_classification_best_config_info.csv", index=False)
df_class_res


Unnamed: 0,Model,Dataset,F1,Precision,Recall,Acc
2,llama2-7b,TroFi,0.59,0.53,0.67,0.54
4,llama2-7b,VUA Verb,0.63,0.59,0.67,0.61
6,llama2-7b,VUA POS,0.55,0.51,0.59,0.51
1,gpt3.5,TroFi,0.6,0.59,0.62,0.59
3,gpt3.5,VUA Verb,0.56,0.56,0.56,0.56
5,gpt3.5,VUA POS,0.6,0.57,0.63,0.58


In [18]:
# Two summary tables filled from the same result files: classification
# metrics, and the source/target domain/lexeme prediction metrics.
df_class_met_res = pd.DataFrame(columns=["Model", "Dataset","F1", "Precision", "Recall", "Acc"])
df_pred_domain_res = pd.DataFrame(columns=["Model", "Dataset", "Task", "Similarity", "Std", "Acc"])

test_files = ["gpt_few_shot.json", "llama_few_shot.json", "llama_fine_tuned.json"]

# Gather the entries of every result file into one flat list.
test_results = []
for file in test_files:
    with open(f"./test/{file}", "r") as f:
        test_results.extend(json.load(f))


def _metric_or_dash(entry, metric):
    """Return ``entry[metric]`` rounded to two decimals, or "-" if the key is absent."""
    return round(entry[metric], 2) if metric in entry else "-"


for entry in test_results:
    model_name = entry["model"]
    # Fine-tuned runs get a suffix so they are listed separately from few-shot runs.
    if entry.get("fine_tuning") is True:
        model_name += "_tuned"
    dataset_name = entry["dataset_name"]
    task_name = entry["task_name"]

    f1, prec, rec, sim, std, acc = (
        _metric_or_dash(entry, metric)
        for metric in ("f1", "precision", "recall", "mean_em", "std_em", "acc")
    )

    # Entries that report an F1 score go to the classification table;
    # everything else goes to the prediction table.
    if f1 == "-":
        df_pred_domain_res.loc[len(df_pred_domain_res) + 1] = [model_name, dataset_name, task_name, sim, std, acc]
    else:
        df_class_met_res.loc[len(df_class_met_res) + 1] = [model_name, dataset_name, f1, prec, rec, acc]

df_class_met_res = df_class_met_res.sort_values(["Model", "Dataset"])
df_class_met_res.to_csv(f"{RESULTS_DIR}/test_classification_info.csv", index=False)
df_class_met_res

Unnamed: 0,Model,Dataset,F1,Precision,Recall,Acc
1,gpt3.5,trofi,0.59,0.57,0.61,0.58
2,gpt3.5,vua_pos,0.56,0.53,0.6,0.54
3,gpt3.5,vua_verb,0.59,0.57,0.61,0.57
4,llama2-7b,trofi,0.61,0.55,0.7,0.56
5,llama2-7b,vua_pos,0.55,0.52,0.58,0.52
6,llama2-7b,vua_verb,0.6,0.57,0.62,0.58
7,llama2-7b_tuned,trofi,0.67,0.5,1.0,0.5
8,llama2-7b_tuned,vua_pos,0.58,0.59,0.57,0.59
9,llama2-7b_tuned,vua_verb,0.65,0.64,0.67,0.65


#### Source/target domain/lexeme prediction results

In [19]:
# Order the prediction results by model, then dataset, then task; save them
# to CSV without the index column and display the table.
prediction_sort_keys = ["Model", "Dataset", "Task"]
df_pred_domain_res = df_pred_domain_res.sort_values(prediction_sort_keys)
df_pred_domain_res.to_csv(f"{RESULTS_DIR}/test_prediction_info.csv", index=False)
df_pred_domain_res

Unnamed: 0,Model,Dataset,Task,Similarity,Std,Acc
3,gpt3.5,lcc_en_subset,source_domain_prediction,0.65,0.26,0.32
5,gpt3.5,lcc_en_subset,source_lexeme_prediction,0.84,0.27,0.7
4,gpt3.5,lcc_en_subset,target_domain_prediction,0.84,0.24,0.64
6,gpt3.5,lcc_en_subset,target_lexeme_prediction,0.88,0.26,0.8
1,gpt3.5,metaphor_list,source_domain_prediction,0.51,0.21,0.12
2,gpt3.5,metaphor_list,target_domain_prediction,0.6,0.24,0.19
9,llama2-7b,lcc_en_subset,source_domain_prediction,0.55,0.13,0.0
11,llama2-7b,lcc_en_subset,source_lexeme_prediction,0.58,0.18,0.0
10,llama2-7b,lcc_en_subset,target_domain_prediction,0.64,0.13,0.0
12,llama2-7b,lcc_en_subset,target_lexeme_prediction,0.63,0.17,0.0
