# Model evaluations

In [1]:
import pandas as pd
import json
import os
import re

In [2]:
# Root directory that is listed for experiment runs; each entry is expected to
# be a sub-directory named after the run and containing "all_results.json".
MODEL_OUTPUT = "experiments/"

In [3]:
def parse_model_name(model):
    """Extract the run configuration encoded in an experiment directory name.

    The name is expected to follow the pattern
    ``model_<name>_max_sequence_length_<int>_epochs_<int><suffix>``, e.g.
    ``model_bert_base_cased_max_sequence_length_64_epochs_3_de``.

    Parameters
    ----------
    model : str
        Name of one experiment directory.

    Returns
    -------
    dict
        Keys ``model``, ``max_sequence_length``, ``epochs`` and ``suffix``.
        All values are strings; numeric fields are not converted here.

    Raises
    ------
    IndexError
        If one of the fields cannot be located in ``model``.
    """
    params = dict()
    # Anchor on the full "_max_sequence_length_" marker: the greedy group would
    # otherwise stop at the *last* "_max" in the name (e.g. one in the suffix).
    params["model"] = re.findall(r"model_(.*)_max_sequence_length_", model)[0]
    params["max_sequence_length"] = re.findall(r"max_sequence_length_([0-9]*)_epochs", model)[0]
    params["epochs"] = re.findall(r"epochs_([0-9]*)", model)[0]
    # "[0-9]+" consumes the whole epoch count; a single "[0-9]" would leave the
    # remaining digits of a multi-digit count (e.g. 10) at the start of the suffix.
    params["suffix"] = re.findall(r"epochs_[0-9]+(.*)", model)[0]
    return params

def parse_global_data(global_dict, model_name):
    """Combine a run's name-derived fields and its global metrics into one row.

    Parameters
    ----------
    global_dict : dict
        Global metrics of a run; on a key clash these values override the
        fields parsed from ``model_name``.
    model_name : str
        Experiment directory name, decoded with ``parse_model_name``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index ``0``) with the name fields first, followed
        by the entries of ``global_dict``.
    """
    row = {**parse_model_name(model_name), **global_dict}
    return pd.DataFrame(row, index=[0])

def parse_class_report(report):
    """Convert a whitespace-aligned, plain-text classification report to a DataFrame.

    The first non-empty line is taken as the header (with ``-score`` removed,
    so ``f1-score`` becomes ``f1``) and is prefixed with a ``label`` column.
    Every following non-empty line becomes one row of whitespace-separated
    tokens. All cell values are kept as strings.

    Parameters
    ----------
    report : str
        The report text, one entry per line.

    Returns
    -------
    pandas.DataFrame
        One row per report line, columns ``label`` plus the header tokens.
    """
    non_empty = [line for line in (raw.strip() for raw in report.split("\n")) if line]
    head_line, body_lines = non_empty[0], non_empty[1:]

    columns = ["label"] + re.findall(r"([^\s]+)", head_line.replace("-score", ""))

    def split_row(line):
        # Glue "<kind> avg" labels into a single token so they stay in one column.
        line = line.replace(" avg", "_avg")
        # The accuracy line has two blank cells; pad them with "_" placeholders
        # so the row has as many tokens as the other rows.
        line = line.replace("accuracy", "accuracy _ _")
        return re.findall(r"([^\s]+)", line)

    return pd.DataFrame(data=[split_row(line) for line in body_lines], columns=columns)



def parse_all_class_reports(report_list):
    """Parse several classification report texts.

    Parameters
    ----------
    report_list : iterable of str
        Report texts, each handed to ``parse_class_report``.

    Returns
    -------
    list of pandas.DataFrame
        The parsed reports, in the same order as ``report_list``.
    """
    return [parse_class_report(report) for report in report_list]

## Token-Level Classification

In [12]:
# Load the results of every run found under MODEL_OUTPUT.
#   ne_global_metrics : one "dev" and one "test" row of global metrics per run
#   ne_test_metrics   : run name -> parsed token-level test report
#   sent_test_metrics : run name -> parsed sentence-level test report
global_frames = []  # collected per-run frames, concatenated once after the loop
ne_test_metrics = dict()
sent_test_metrics = dict()

for model in os.listdir(MODEL_OUTPUT):
    try:
        with open(os.path.join(MODEL_OUTPUT, model, "all_results.json"), "r") as f:
            model_res = json.load(f)

        # The dev entry is a list; only its last element is used
        # (presumably the final evaluation - TODO confirm).
        dev_global = parse_global_data(model_res["dev"]["global"][-1], model)
        dev_global.insert(4, "set", "dev")  # insert column at 5th position
        test_global = parse_global_data(model_res["test"]["global"], model)
        test_global["set"] = "test"
        token_report = parse_class_report(model_res["test"]["token-level"])
        sent_report = parse_class_report(model_res["test"]["sent-level"])
    except Exception as err:
        # Best effort: skip runs that cannot be read or parsed, but say why.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        print("Was not able to get results for model:", model, "-", repr(err))
        continue

    # Record a run only once everything was parsed, so a run reported as
    # failed never shows up partially in the result tables.
    global_frames += [dev_global, test_global]
    ne_test_metrics[model] = token_report
    sent_test_metrics[model] = sent_report

ne_global_metrics = pd.concat(global_frames, ignore_index=True) if global_frames else pd.DataFrame()

# parse_model_name returns strings; make the numeric fields filterable as ints.
if not ne_global_metrics.empty:
    ne_global_metrics = ne_global_metrics.astype({"max_sequence_length": int, "epochs": int})

Was not able to get results for model: model_dbmdz_distilbert_base_german_europeana_cased_max_sequence_length_64_epochs_3_de
Was not able to get results for model: model_xlm_roberta_base_max_sequence_length_64_epochs_3_multilingual


In [5]:
# Debugging aid: uncomment the next line to inspect the raw results of the most recently loaded model.
#model_res

In [13]:
# Test-set results of the runs with max_sequence_length 64 and 3 epochs, best F1 first.
ne_global_metrics[
    ne_global_metrics["set"].eq("test")
    & ne_global_metrics["max_sequence_length"].eq(64)
    & ne_global_metrics["epochs"].eq(3)
].sort_values(by="f1", ascending=False)

Unnamed: 0,model,max_sequence_length,epochs,suffix,set,loss,precision,recall,f1
9,bert_base_german_cased,64,3,_de,test,0.067219,0.849057,0.882353,0.865385
15,bert_base_multilingual_cased,64,3,_multilingual,test,0.086044,0.786408,0.81407,0.8
33,dbmdz_bert_base_german_europeana_cased,64,3,_de,test,0.092032,0.763636,0.823529,0.792453
23,bert_base_cased,64,3,_de,test,0.086433,0.75,0.823529,0.785047
25,dbmdz_bert_base_french_europeana_cased,64,3,_fr,test,0.058745,0.766234,0.781457,0.77377
27,bert_base_cased,64,3,_multilingual,test,0.086364,0.771574,0.771574,0.771574
13,dbmdz_bert_base_historic_multilingual_cased,64,3,_multilingual,test,0.080329,0.713043,0.81592,0.761021
3,bert_base_multilingual_cased,64,3,_fr,test,0.073821,0.713376,0.756757,0.734426
5,xlm_roberta_base,64,3,_de,test,0.078416,0.74,0.72549,0.732673
19,bert_base_cased,64,3,_fr,test,0.083427,0.702703,0.712329,0.707483


In [15]:
# Test-set results of all runs, best F1 first.
ne_global_metrics.loc[ne_global_metrics["set"].eq("test")].sort_values(by="f1", ascending=False)

Unnamed: 0,model,max_sequence_length,epochs,suffix,set,loss,precision,recall,f1
29,bert_base_german_cased,64,5,_de,test,0.068613,0.884615,0.901961,0.893204
31,bert_base_cased,64,5,_de,test,0.085425,0.865385,0.882353,0.873786
9,bert_base_german_cased,64,3,_de,test,0.067219,0.849057,0.882353,0.865385
7,bert_base_cased,128,3,_de,test,0.08668,0.843137,0.843137,0.843137
15,bert_base_multilingual_cased,64,3,_multilingual,test,0.086044,0.786408,0.81407,0.8
33,dbmdz_bert_base_german_europeana_cased,64,3,_de,test,0.092032,0.763636,0.823529,0.792453
23,bert_base_cased,64,3,_de,test,0.086433,0.75,0.823529,0.785047
25,dbmdz_bert_base_french_europeana_cased,64,3,_fr,test,0.058745,0.766234,0.781457,0.77377
27,bert_base_cased,64,3,_multilingual,test,0.086364,0.771574,0.771574,0.771574
21,bert_base_cased,64,5,_fr,test,0.095649,0.737179,0.787671,0.761589


In [8]:
# Full table: the dev and test rows of every run whose results were loaded.
ne_global_metrics

Unnamed: 0,model,max_sequence_length,epochs,suffix,set,loss,precision,recall,f1
0,bert_base_multilingual_cased,64,3,_de,dev,0.052511,0.36,0.391304,0.375
1,bert_base_multilingual_cased,64,3,_de,test,0.082316,0.490909,0.529412,0.509434
2,bert_base_multilingual_cased,64,3,_fr,dev,0.066178,0.661654,0.807339,0.727273
3,bert_base_multilingual_cased,64,3,_fr,test,0.073821,0.713376,0.756757,0.734426
4,xlm_roberta_base,64,3,_de,dev,0.070952,0.695652,0.695652,0.695652
5,xlm_roberta_base,64,3,_de,test,0.078416,0.74,0.72549,0.732673
6,bert_base_cased,128,3,_de,dev,0.067827,0.6,0.652174,0.625
7,bert_base_cased,128,3,_de,test,0.08668,0.843137,0.843137,0.843137
8,bert_base_german_cased,64,3,_de,dev,0.078318,0.484848,0.695652,0.571429
9,bert_base_german_cased,64,3,_de,test,0.067219,0.849057,0.882353,0.865385


## Sentence-Level Classification

In [16]:
# Build one table with the sentence-level test rows for labels "0", "1" and
# "weighted_avg" of every run, prefixed with the run's configuration columns.
sent_frames = []  # collected per-run frames, concatenated once after the loop

for model, test_metrics in sent_test_metrics.items():
    # Explicit copy: the inserts below must not touch the report stored in sent_test_metrics.
    chosen_metrics = test_metrics[test_metrics["label"].isin(["0", "1", "weighted_avg"])].copy()

    #add model specifications at beginning of dataframe
    for i, (col, value) in enumerate(parse_model_name(model).items()):
        chosen_metrics.insert(i, col, value)

    sent_frames.append(chosen_metrics)

# The original row indices are kept on purpose (no ignore_index).
sent_global_metrics = pd.concat(sent_frames) if sent_frames else pd.DataFrame()

In [17]:
# Weighted-average sentence-level results of all runs, sorted by F1 in descending order.
sent_global_metrics.loc[sent_global_metrics["label"].eq("weighted_avg")].sort_values(by="f1", ascending=False)

Unnamed: 0,model,max_sequence_length,epochs,suffix,label,precision,recall,f1,support
4,dbmdz_bert_base_french_europeana_cased,64,3,_fr,weighted_avg,0.9919,0.992,0.9919,2881
4,bert_base_german_cased,64,5,_de,weighted_avg,0.9909,0.9907,0.9908,1077
4,dbmdz_bert_base_historic_multilingual_cased,64,3,_fr,weighted_avg,0.9891,0.9892,0.9892,2881
4,bert_base_german_cased,64,3,_de,weighted_avg,0.9887,0.9879,0.9882,1077
4,dbmdz_bert_base_historic_multilingual_cased,64,3,_multilingual,weighted_avg,0.9882,0.9881,0.9882,3958
4,bert_base_cased,64,5,_fr,weighted_avg,0.9879,0.9882,0.9879,2881
4,bert_base_multilingual_cased,64,3,_fr,weighted_avg,0.9877,0.9879,0.9878,2881
4,bert_base_cased,128,3,_fr,weighted_avg,0.9875,0.9879,0.9876,2881
4,bert_base_multilingual_cased,64,3,_multilingual,weighted_avg,0.9867,0.9869,0.9868,3958
4,bert_base_cased,64,5,_de,weighted_avg,0.9865,0.9861,0.9863,1077
