In [1]:
import sys

sys.path.insert(0, '../')

from src.utils import ModelInformation
import re
import matplotlib.pyplot as plt

import pandas as pd


def overview(experiments, selected_runs):
    from src.mlflow_utils import mlflow, get_run_list, download_run_data

    selected_runs += tuple(get_run_list(experiments))
    selected_runs = tuple(set(selected_runs))

    collected_data = dict()

    for run in selected_runs:
        artifacts = mlflow.artifacts.list_artifacts(run_id=run.name)
        for artifact in artifacts:
            if artifact_name := re.match(r"evaluation_[0-9a-z]*.json", artifact.path):
                artifact_name = artifact_name.group(0)
                dataset_id = artifact_name.replace('evaluation_', '').replace('.json', '')
                run_name = mlflow.get_run(run.name).info.run_name
                exp_id = mlflow.get_run(run.name).info.experiment_id
                evaluation_data = download_run_data(run.name, artifact_name)
                if not evaluation_data:
                    raise RuntimeError(f"Could not download evaluation data for run {run.name}")

                df_name = f"{run.type}"
                collected_data[df_name] = pd.DataFrame(data=evaluation_data['data'], columns=evaluation_data['columns'])

    collected_data = dict(sorted(collected_data.items(), key=lambda x: x[0]))
    return collected_data

In [2]:
selected_runs = [  # ModelInformation("wiktionary", "2bba85fab20b4ed6a1a89b0503fd5274", "rep-penality=1.5"),
    ModelInformation("wiktionary", "a8fa79a989fc4a9fb7c876be01ac42d2", "rep-penality=1.5"),
    ModelInformation("distillation", "d0a3f0265f404e48a07b56d7dc9e3f1b", "rep-penality=1.5")]

data = overview([], selected_runs)

In [5]:
import os
from itertools import permutations

metrics = [
    "meteor_score",
    "rouge_L",
    "bertscore_f1",
    "moverscore_score",
    "sentence-embedding_score",
]

metrics_names = [
    "MeteorScore",
    "ROUGE-L",
    "BERTscore-F1",
    "MoverScore",
    "Sentence Embedding",

]

mapping = {a: b for a, b in zip(metrics, metrics_names)}

metric_pairs = list(permutations(metrics, 2))

output_dir = "../diagrams/5_results"

for model in selected_runs:
    out_dir = os.path.join(output_dir, model.type)
    os.makedirs(out_dir, exist_ok=True)
    for metric, name in zip(metrics, metrics_names):
        plt.hist(data[model.type][metric], bins="auto")
        plt.xlabel(name)
        plt.ylabel("Frequency")
        plt.savefig(os.path.join(out_dir, f"hist_{metric}.pdf"))
        # plt.savefig(os.path.join(out_dir, f"hist_{metric}.png"))
        plt.close()
    for m_x, m_y in metric_pairs:
        plt.scatter(data[model.type][m_x], data[model.type][m_y], s=1)
        plt.xlabel(mapping[m_x])
        plt.ylabel(mapping[m_y])
        plt.savefig(os.path.join(out_dir, f"scatter_{m_x}-{m_y}.pdf"))
        # plt.savefig(os.path.join(out_dir, f"scatter_{m_x}-{m_y}.png"))
        plt.close()


In [45]:
data['wiktionary'].corr('spearman', numeric_only=True)

Unnamed: 0,bertscore_precision,bertscore_recall,bertscore_f1,bleurt_score,meteor_score,moverscore_score,rouge_1,rouge_2,rouge_L,rouge_Lsum,sentence-embedding_score
bertscore_precision,1.0,0.768089,0.945935,0.639742,0.648965,0.715327,0.737133,0.603541,0.738484,0.738484,0.74608
bertscore_recall,0.768089,1.0,0.930008,0.623136,0.685411,0.792021,0.681836,0.535508,0.685546,0.685546,0.721861
bertscore_f1,0.945935,0.930008,1.0,0.668596,0.705923,0.797134,0.753019,0.59902,0.755743,0.755743,0.778515
bleurt_score,0.639742,0.623136,0.668596,1.0,0.573208,0.639857,0.611224,0.499947,0.610382,0.610382,0.731321
meteor_score,0.648965,0.685411,0.705923,0.573208,1.0,0.705281,0.834261,0.659543,0.833685,0.833685,0.663449
moverscore_score,0.715327,0.792021,0.797134,0.639857,0.705281,1.0,0.748874,0.577651,0.749334,0.749334,0.75621
rouge_1,0.737133,0.681836,0.753019,0.611224,0.834261,0.748874,1.0,0.718501,0.992259,0.992259,0.689833
rouge_2,0.603541,0.535508,0.59902,0.499947,0.659543,0.577651,0.718501,1.0,0.722952,0.722952,0.533588
rouge_L,0.738484,0.685546,0.755743,0.610382,0.833685,0.749334,0.992259,0.722952,1.0,1.0,0.690151
rouge_Lsum,0.738484,0.685546,0.755743,0.610382,0.833685,0.749334,0.992259,0.722952,1.0,1.0,0.690151


In [69]:
print(data['wiktionary'][[    "meteor_score",
    "rouge_L",
    "bertscore_f1",
    "moverscore_score",
    "sentence-embedding_score",
]].corr('kendall', numeric_only=True).to_latex(float_format="%.2f"))

\begin{tabular}{lrrrrr}
\toprule
 & meteor_score & rouge_L & bertscore_f1 & moverscore_score & sentence-embedding_score \\
\midrule
meteor_score & 1.00 & 0.68 & 0.53 & 0.53 & 0.49 \\
rouge_L & 0.68 & 1.00 & 0.59 & 0.59 & 0.53 \\
bertscore_f1 & 0.53 & 0.59 & 1.00 & 0.61 & 0.59 \\
moverscore_score & 0.53 & 0.59 & 0.61 & 1.00 & 0.57 \\
sentence-embedding_score & 0.49 & 0.53 & 0.59 & 0.57 & 1.00 \\
\bottomrule
\end{tabular}



In [78]:
data['distillation'][data['distillation']["rouge_L"] == 0][data['distillation']["sentence-embedding_score" ] > 0.9]

  data['distillation'][data['distillation']["rouge_L"] == 0][data['distillation']["sentence-embedding_score" ] > 0.9]


Unnamed: 0,title,context_sentence,context_word,gt,prediction,bertscore_precision,bertscore_recall,bertscore_f1,bleurt_score,meteor_score,moverscore_score,rouge_1,rouge_2,rouge_L,rouge_Lsum,sentence-embedding_score
6234,Steinchen,"""Die Fini wirft Steinchen nach ihm.""",Steinchen,ein kleiner Stein,kleine Steine,0.841904,0.845217,0.843558,0.682295,0.172414,0.651513,0.0,0.0,0.0,0.0,0.946059
11197,Ökonomik,"""Natürlich ist die Erwerbsweise, die Produkte ...",Ökonomik,Wirtschaftswissenschaft,Wissenschaft von der Wirtschaft,0.746718,0.756224,0.751441,0.543742,0.0,0.605374,0.0,0.0,0.0,0.0,0.923528
13287,Dürreperiode,"""Afghanistan wird häufig von lang anhaltenden ...",Dürreperioden,längere Zeit andauernder Trockenheit,lange anhaltende Trockenperioden,0.74379,0.763609,0.753569,0.863182,0.0,0.603257,0.0,0.0,0.0,0.0,0.932457
23457,Kistchen,"""Die Kellnerin hatte wirklich hinten im Zelt e...",Kistchen,kleine Kiste,ein kleines Kistchen,0.793034,0.832755,0.812409,0.735068,0.238095,0.616662,0.0,0.0,0.0,0.0,0.903205
30246,Frauenschrei,"""Ich tätschele mich durch die Körperteile auf ...",Frauenschreie,Schrei einer Frau,Schreie von Frauen,0.92537,0.92537,0.92537,0.722119,0.0,0.716725,0.0,0.0,0.0,0.0,0.921304
33937,Friedhofsbesuch,"""Seither ging Henni, wann immer es ihre Zeit e...",Friedhofsbesuchen,Besuch eines Friedhofs,Besuche an einem Friedhof,0.85685,0.862754,0.859792,0.802104,0.322581,0.698008,0.0,0.0,0.0,0.0,0.976724


In [87]:
len(data['wiktionary'][data['wiktionary']["bertscore_f1"] > 0.5][data['wiktionary']["bertscore_f1"] < 0.7]) / len(data['wiktionary'])

  len(data['wiktionary'][data['wiktionary']["bertscore_f1"] > 0.5][data['wiktionary']["bertscore_f1"] < 0.7]) / len(data['wiktionary'])


0.5146000351103048

In [89]:
data['wiktionary'][data['wiktionary']["bertscore_f1"] > 0.5][data['wiktionary']["bertscore_f1"] < 0.7]["sentence-embedding_score"].max()

  data['wiktionary'][data['wiktionary']["bertscore_f1"] > 0.5][data['wiktionary']["bertscore_f1"] < 0.7]["sentence-embedding_score"].max()


0.9157649875

In [97]:
data['wiktionary'][data['wiktionary']["bertscore_f1"] >= 0.6][data['wiktionary']["bertscore_f1"] < 0.61]["sentence-embedding_score"].min()

  data['wiktionary'][data['wiktionary']["bertscore_f1"] >= 0.6][data['wiktionary']["bertscore_f1"] < 0.61]["sentence-embedding_score"].min()


-0.038461417