In [2]:
import sys

sys.path.insert(0, '../')

from src.utils import ModelInformation
from src.mlflow_utils import mlflow, download_run_data
import re
import pandas as pd
import numpy as np
import json

In [3]:
def sample_to_sw(sample, model_information: ModelInformation):
    ret_dict = {"model_information": model_information.to_dict()}
    selection = sample[['title', 'context_sentence', 'context_word', 'gt', 'prediction']].to_dict(orient='split')
    ret_dict.update(selection)
    return ret_dict

statistic_fns = {
    "Average": pd.Series.mean,
    "StdDev": pd.Series.std,
    "Median": pd.Series.median,
    "Max": pd.Series.max,
    "Min": pd.Series.min,
}


def calc_statistics(df):
    stats = dict()
    for column in df.columns[5:]:
        for k, v in statistic_fns.items():
            stats[(column, k)] = v(df[column])
    return stats


In [4]:
selected_runs = [# ModelInformation("wiktionary", "2bba85fab20b4ed6a1a89b0503fd5274", "rep-penality=1.5"),
                 ModelInformation("wiktionary", "a8fa79a989fc4a9fb7c876be01ac42d2", "rep-penality=1.5"),
                 ModelInformation("distillation", "d0a3f0265f404e48a07b56d7dc9e3f1b", "rep-penality=1.5")
                ]

In [5]:
selected_runs = tuple(set(selected_runs))
exports = []

for run in selected_runs:
    artifacts = mlflow.artifacts.list_artifacts(run_id=run.name)
    for artifact in artifacts:
        if artifact_name := re.match(r"evaluation_[0-9a-z]*.json", artifact.path):
            artifact_name = artifact_name.group(0)
            evaluation_data = download_run_data(run.name, artifact_name)
            if not evaluation_data:
                raise RuntimeError(f"Could not download evaluation data for run {run}")

            collected_data = pd.DataFrame(data=evaluation_data['data'], columns=evaluation_data['columns'])
            sample = collected_data.sample(frac=0.01, random_state=42)
            res = sample_to_sw(sample, run)
            exports.append((run, sample))
            with open(f"export/export_{run.name}_{run.type}.json", "w+") as file:
                json.dump(res, file)

In [113]:
exports[0]

{'model_information': {'type': 'distillation',
  'name': 'd0a3f0265f404e48a07b56d7dc9e3f1b',
  'example_prompt': 'rep-penality=1.5',
  'question_prompt': '',
  'system_prompt': ''},
 'index': [7104,
  18986,
  20011,
  11052,
  29716,
  5183,
  31333,
  11227,
  32933,
  14358,
  31959,
  12802,
  24742,
  3510,
  6049,
  27888,
  10475,
  25715,
  20077,
  1045,
  15416,
  25063,
  19526,
  10217,
  19793,
  17964,
  31540,
  9254,
  12957,
  566,
  19562,
  17808,
  30671,
  28738,
  10951,
  18049,
  24793,
  12238,
  22037,
  16171,
  3507,
  9584,
  9430,
  30083,
  9899,
  25132,
  29845,
  6138,
  16815,
  3067,
  11724,
  24843,
  6912,
  20405,
  8612,
  6888,
  2726,
  19208,
  1796,
  5460,
  1662,
  16685,
  12362,
  22583,
  12365,
  17877,
  7775,
  23687,
  3382,
  1520,
  15016,
  32804,
  952,
  31067,
  15778,
  23306,
  31945,
  2111,
  20305,
  13590,
  7302,
  16287,
  19931,
  14992,
  926,
  24757,
  14117,
  5688,
  28154,
  12959,
  9731,
  31113,
  25048,
  31

In [74]:
sample_b = collected_data.sample(frac=0.01, random_state=42)
sample_b

Unnamed: 0,title,context_sentence,context_word,gt,prediction,bertscore_precision,bertscore_recall,bertscore_f1,bleurt_score,meteor_score,moverscore_score,rouge_1,rouge_2,rouge_L,rouge_Lsum,sentence-embedding_score
7104,Spielschöpfung,"""Die Grafiken zeigen sowohl tradierte Spiele a...",Spielschöpfungen,kreative Neugestaltung einer Spielform oder ei...,"Ereignisse oder Handlungen, die während eines ...",0.706347,0.708531,0.707437,0.523777,0.136986,0.555608,0.235294,0.000000,0.235294,0.235294,0.535478
18986,Universitätsbildung,"""Ich würde gerne daran festhalten, dass verfüg...",Universitätsbildung,"Bildung, die in einem Universitätsstudium erwo...",Ausbildung an einer Universität,0.806967,0.674842,0.735014,0.568951,0.000000,0.552810,0.133333,0.000000,0.133333,0.133333,0.664373
20011,Kehrseite,"""Es bleibt also nur die Kehrseite des Guten, a...",Kehrseite,"die andere, entgegengesetzte, oftmals negative...",Die Kehrseite bezieht sich auf die äußere Seit...,0.671636,0.735254,0.702007,0.415403,0.273529,0.546962,0.250000,0.052632,0.200000,0.200000,0.308328
11052,mehrmalig,"""(...) nur durch ein mehrmaliges Werfen des sc...",mehrmaliges,"attributiv gebraucht, auch adverbial: mehr als...","einmalig bedeutet in diesem Kontext ""mehrmals""...",0.646891,0.628518,0.637572,0.448108,0.040650,0.531657,0.000000,0.000000,0.000000,0.000000,0.431801
29716,Teotihuacan,Die archäologische Stätte Teotihuacans gehört ...,Teotihuacans,prähistorische Ruinenstadt im Zentralen Hochla...,eine archäologische Stätte,0.760008,0.677525,0.716400,0.430773,0.000000,0.509784,0.000000,0.000000,0.000000,0.000000,0.232325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11502,Lehnübersetzung,Ein Beispiel für eine Lehnübersetzung ist das ...,Lehnübersetzung,"Linguistik: ein (auch zusammengesetztes) Wort,...",Umsetzung eines Wortes aus einer anderen Sprac...,0.708310,0.637986,0.671311,0.378624,0.057252,0.539513,0.000000,0.000000,0.000000,0.000000,0.599027
24673,Chorraum,"""Im Chorraum der heutigen, im Jahre 1934 gewei...",Chorraum,Architektur: Altarraum in einer Kirche,"ein Raum in einem Gebäude, in dem sich die Kir...",0.737418,0.767538,0.752176,0.508865,0.151515,0.547370,0.235294,0.000000,0.235294,0.235294,0.758891
11044,Selm,Selm liegt im Westen Deutschlands,Selm,"eine Stadt in Nordrhein-Westfalen, Deutschland",eine Stadt,0.812981,0.621065,0.704181,0.237533,0.334821,0.584785,0.500000,0.333333,0.500000,0.500000,0.460503
5893,Felsenpinguin,"""Wie die meisten Pinguinarten nistet der Felse...",Felsenpinguin,Zoologie: eine Art aus der Familie der Pinguine,eine Vogelart,0.703371,0.632380,0.665989,0.162500,0.060241,0.538057,0.200000,0.000000,0.200000,0.200000,0.323958


In [56]:

sample_stats = calc_statistics(sample)
total_stats = calc_statistics(collected_data)

In [60]:
def compare_stats(total, subset):
    for k, v_total in total.items():
        v_subset = subset[k]
        print(f"{k}: {np.abs(v_total - v_subset) / v_total * 100:.2f} %")
        
compare_stats(total_stats, sample_stats)

('bertscore_precision', 'Average'): 0.88 %
('bertscore_precision', 'StdDev'): 1.37 %
('bertscore_precision', 'Median'): 0.85 %
('bertscore_precision', 'Max'): 0.00 %
('bertscore_precision', 'Min'): 9.14 %
('bertscore_recall', 'Average'): 0.67 %
('bertscore_recall', 'StdDev'): 0.84 %
('bertscore_recall', 'Median'): 0.62 %
('bertscore_recall', 'Max'): 0.00 %
('bertscore_recall', 'Min'): 0.00 %
('bertscore_f1', 'Average'): 0.77 %
('bertscore_f1', 'StdDev'): 0.13 %
('bertscore_f1', 'Median'): 0.66 %
('bertscore_f1', 'Max'): 0.00 %
('bertscore_f1', 'Min'): 0.00 %
('bleurt_score', 'Average'): 2.04 %
('bleurt_score', 'StdDev'): 0.05 %
('bleurt_score', 'Median'): 1.73 %
('bleurt_score', 'Max'): 0.00 %
('bleurt_score', 'Min'): -183.83 %
('meteor_score', 'Average'): 4.40 %
('meteor_score', 'StdDev'): 2.02 %
('meteor_score', 'Median'): 6.60 %
('meteor_score', 'Max'): 0.04 %
('meteor_score', 'Min'): nan %
('moverscore_score', 'Average'): 0.55 %
('moverscore_score', 'StdDev'): 2.62 %
('moverscore_s

  print(f"{k}: {np.abs(v_total - v_subset) / v_total * 100:.2f} %")


In [61]:
sample_stats

{('bertscore_precision', 'Average'): 0.7147142674546519,
 ('bertscore_precision', 'StdDev'): 0.09685577550322344,
 ('bertscore_precision', 'Median'): 0.6991021633,
 ('bertscore_precision', 'Max'): 1.0000002384,
 ('bertscore_precision', 'Min'): 0.4883389771,
 ('bertscore_recall', 'Average'): 0.6991255719491516,
 ('bertscore_recall', 'StdDev'): 0.09220142517053445,
 ('bertscore_recall', 'Median'): 0.6800351143,
 ('bertscore_recall', 'Max'): 1.0000002384,
 ('bertscore_recall', 'Min'): 0.3554219306,
 ('bertscore_f1', 'Average'): 0.705756886808777,
 ('bertscore_f1', 'StdDev'): 0.0904252456024084,
 ('bertscore_f1', 'Median'): 0.6888597012,
 ('bertscore_f1', 'Max'): 1.0000002384,
 ('bertscore_f1', 'Min'): 0.4423691332,
 ('bleurt_score', 'Average'): 0.3996637937767116,
 ('bleurt_score', 'StdDev'): 0.20525684982405243,
 ('bleurt_score', 'Median'): 0.3832337856,
 ('bleurt_score', 'Max'): 1.0809522867,
 ('bleurt_score', 'Min'): 0.0103823692,
 ('meteor_score', 'Average'): 0.16725934202656526,
 ('m

In [62]:
from scipy import stats

t_stat, p_value = stats.ttest_ind(collected_data["sentence-embedding_score"], sample["sentence-embedding_score"], equal_var=False)
t_stat, p_value

(0.8881104369218126, 0.3750961021751934)

In [93]:
def sample_to_sw(sample, model_information: ModelInformation):
    ret_dict = {"model_information": model_information.to_dict()}
    selection = sample[['title', 'context_sentence', 'context_word', 'gt', 'prediction']].to_dict(orient='split')
    ret_dict.update(selection)
    return ret_dict

In [97]:
import json


with open("export_example.json", "w+") as f:
    json.dump(sample_to_sw(sample_a, ModelInformation("Wiktionary", "2bba85fab20b4ed6a1a89b0503fd5274")), f)

{'index': [7104,
  18986,
  20011,
  11052,
  29716,
  5183,
  31333,
  11227,
  32933,
  14358,
  31959,
  12802,
  24742,
  3510,
  6049,
  27888,
  10475,
  25715,
  20077,
  1045,
  15416,
  25063,
  19526,
  10217,
  19793,
  17964,
  31540,
  9254,
  12957,
  566,
  19562,
  17808,
  30671,
  28738,
  10951,
  18049,
  24793,
  12238,
  22037,
  16171,
  3507,
  9584,
  9430,
  30083,
  9899,
  25132,
  29845,
  6138,
  16815,
  3067,
  11724,
  24843,
  6912,
  20405,
  8612,
  6888,
  2726,
  19208,
  1796,
  5460,
  1662,
  16685,
  12362,
  22583,
  12365,
  17877,
  7775,
  23687,
  3382,
  1520,
  15016,
  32804,
  952,
  31067,
  15778,
  23306,
  31945,
  2111,
  20305,
  13590,
  7302,
  16287,
  19931,
  14992,
  926,
  24757,
  14117,
  5688,
  28154,
  12959,
  9731,
  31113,
  25048,
  31956,
  25205,
  13118,
  32297,
  31929,
  1786,
  23119,
  6690,
  26902,
  22594,
  24621,
  1092,
  12623,
  6483,
  8617,
  1291,
  31876,
  22454,
  5139,
  20879,
  19372,
  31