In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; every data
# path used further down in this notebook lives below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json
import itertools
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import regex as re
import numpy as np
import math



# Fehleranalyse

In [3]:
# Neuer Code

import os
import pandas as pd
from glob import glob
import plotly.graph_objects as go


def erroranalyse_relation_new(folder_path, relation_title=None):
    """Show a colour-coded Plotly table of the prediction errors of one relation.

    Every ``*.csv`` file in ``folder_path`` is read as one shot configuration.
    The text after the last ``_`` of the file stem (e.g. ``"3shot"``) becomes
    the column holding that file's ``predicted_top1_token`` values.  The gold
    answers (``answer_token``) come from the first file; the last line of each
    ``prompt`` is taken from the first file whose stem contains ``"1shot"``
    and stored as ``last_template``.

    A prediction is correct when its whitespace-separated tokens equal those
    of the gold answer.  The table lists every row whose template occurs in
    at least one row with a wrong prediction; correct cells are green, wrong
    cells red.  The first table row shows the total number of samples and,
    per shot column, the number of wrong cells among the listed rows.

    All CSV cells are read as text.  Letting pandas infer types would turn a
    numeric prediction column containing an empty cell into floats (``1890``
    becomes ``"1890.0"`` and is then counted as an error) and would replace
    literal tokens such as ``null`` or ``NA`` by NaN.

    Parameters
    ----------
    folder_path : str
        Directory containing one result CSV per shot configuration.
    relation_title : str, optional
        Title suffix of the figure; defaults to the base name of
        ``folder_path``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    FileNotFoundError
        If ``folder_path`` contains no ``*.csv`` file.
    KeyError
        If no template/prompt column is available (e.g. no 1-shot file).
    """
    # ------------------------------------------------------------
    # 1. Merge all CSV files into one frame
    # ------------------------------------------------------------
    csv_files = sorted(glob(os.path.join(folder_path, "*.csv")))
    if not csv_files:
        raise FileNotFoundError(f"Keine CSV-Dateien in {folder_path} gefunden!")

    merged_data = {}
    for file in csv_files:
        shot_label = os.path.basename(file).split(".")[0]
        # Tokens are text: disable numeric/NaN inference so that "1890",
        # "null" or an empty prediction survive exactly as written.
        df = pd.read_csv(file, dtype=str, keep_default_na=False,
                         encoding="utf-8-sig")

        # Take the gold column once
        if "answer_token" not in merged_data:
            merged_data["answer_token"] = df["answer_token"]

        # Store the last prompt line of one 1-shot file as the template
        if "last_template" not in merged_data and "1shot" in shot_label:
            merged_data["last_template"] = (
                df["prompt"].apply(lambda x: x.strip().split("\n")[-1])
            )

        # Predictions of this shot configuration
        merged_data[shot_label.split("_")[-1]] = df["predicted_top1_token"]

    merged_df = pd.DataFrame(merged_data)

    # ------------------------------------------------------------
    # 2. Bring the columns into a sensible order
    # ------------------------------------------------------------
    ordered_cols = ["answer_token"]
    if "last_template" in merged_df.columns:
        ordered_cols.append("last_template")

    # Numeric sort so that "10shot" comes after "2shot"
    shot_cols = sorted(
        [c for c in merged_df.columns if c.endswith("shot")],
        key=lambda x: int(x.replace("shot", ""))
    )
    ordered_cols.extend(shot_cols)
    merged_df = merged_df[ordered_cols]

    # ------------------------------------------------------------
    # 3. Find the wrong predictions
    # ------------------------------------------------------------
    def tokenize(s):
        # A missing cell (files of different length) counts as "no tokens".
        return [] if pd.isna(s) else str(s).split()

    gold_col = "answer_token"
    pred_cols = shot_cols

    # One boolean per prediction cell, computed once and reused for the
    # row filter, the error counts and the cell colours.
    gold_tokens = [tokenize(g) for g in merged_df[gold_col]]
    mismatch = pd.DataFrame(
        {
            c: [tokenize(p) != g for p, g in zip(merged_df[c], gold_tokens)]
            for c in pred_cols
        },
        index=merged_df.index,
    )
    row_wrong = mismatch.any(axis=1).astype(bool)

    total_samples = len(merged_df)

    # Column that identifies the prompt/template of a row
    group_col_candidates = ["prompt", "last_template", "template"]
    group_col = next((c for c in group_col_candidates if c in merged_df.columns), None)
    if group_col is None:
        raise KeyError("Keine geeignete Template-/Prompt-Spalte gefunden!")

    # Keep every row that shares its template with at least one wrong row
    prompts_with_error = merged_df.loc[row_wrong, group_col].unique()
    in_table = merged_df[group_col].isin(prompts_with_error)
    wrong_df = merged_df[in_table]
    wrong_mismatch = mismatch[in_table]

    # Number of wrong cells per shot column among the listed rows
    error_counts = {col: int(wrong_mismatch[col].sum()) for col in pred_cols}

    # ------------------------------------------------------------
    # 4. Assemble the Plotly table (first cell of every column = summary row)
    # ------------------------------------------------------------
    values = [[""] + list(wrong_df.index)]          # first column = frame index
    colors = [["white"] * (len(wrong_df) + 1)]

    for col in wrong_df.columns:
        if col in pred_cols:
            summary = f"Error: {error_counts[col]}"
            body = [" ".join(tokenize(v)) for v in wrong_df[col]]
            fills = ["#f8d7da" if bad else "#d4edda" for bad in wrong_mismatch[col]]
        else:
            summary = f"Token/Subtoken: {total_samples}" if col == gold_col else ""
            body = list(wrong_df[col])
            fills = ["white"] * len(wrong_df)
        values.append([summary] + body)
        colors.append(["white"] + fills)

    # Render the table
    title = relation_title or os.path.basename(folder_path)
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=["Index"] + list(wrong_df.columns),
            fill_color="#333333",
            font=dict(color="white", size=12)
        ),
        cells=dict(
            values=values,
            fill_color=colors,
            font=dict(color="black", size=11),
            height=25
        )
    )])

    fig.update_layout(
        height=min(600, 40 + 25 * (len(wrong_df) + 1)),
        title=f"Erroranalyse: {title}",
        margin=dict(l=0, r=0, t=40, b=0)
    )
    fig.show()


In [5]:
# ---------- Abhängigkeiten (falls noch nicht installiert) ----------
!pip install -q -U plotly==6.1.1 kaleido==0.2.1 pandas

import os, re, pandas as pd, plotly.graph_objects as go
from glob import glob

def export_compact_table(folder_path, rows_to_show, out_name,
                         out_dir="/content/drive/MyDrive/master_thesis/ploted_data_hi",
                         only_errors=False,
                         show_title=False):
    """Build a colour-coded Plotly table from several result CSVs and export it.

    Every ``*.csv`` file in ``folder_path`` is one shot configuration; the
    text after the last ``_`` of the file stem (e.g. ``"3shot"``) names the
    column with that file's ``predicted_top1_token`` values.  The selected
    rows are rendered as a table, written to ``out_dir`` as PDF and PNG and
    shown in the notebook.  Prediction cells are green when their
    whitespace-separated tokens equal those of the gold answer, red otherwise.

    All CSV cells are read as text so that pandas' type inference cannot
    alter tokens (``1890`` -> ``1890.0``, ``null`` -> NaN).

    Parameters
    ----------
    folder_path : str
        Directory with the CSV results (one file per shot setting).
    rows_to_show : list[int]
        Row indices that should appear in the table.  Indices that are not
        available (e.g. filtered out by ``only_errors``) are skipped.
    out_name : str
        Base name of the output files; also used as the title if
        ``show_title`` is true.
    out_dir : str, optional
        Target directory for the PDF/PNG files; created if missing.
    only_errors : bool, optional
        If True, only rows with at least one wrong prediction are eligible.
    show_title : bool, optional
        Whether the plot gets a title.

    Raises
    ------
    FileNotFoundError
        If ``folder_path`` contains no ``*.csv`` file.
    """
    # Local import: keeps the function independent of which module the
    # notebook-level name `re` is currently bound to.
    import re

    # 1) Merge the CSV files ----------------------------------------------
    csv_files = sorted(glob(os.path.join(folder_path, "*.csv")))
    if not csv_files:
        raise FileNotFoundError(f"Keine CSV-Dateien in {folder_path} gefunden!")

    merged = {}
    for f in csv_files:
        shot = os.path.basename(f).split(".")[0]
        # Tokens are text: no numeric/NaN inference.
        shot_df = pd.read_csv(f, dtype=str, keep_default_na=False,
                              encoding="utf-8-sig")
        merged.setdefault("answer_token", shot_df["answer_token"])
        if "last_template" not in merged and "1shot" in shot:
            # The template is the last line of the prompt
            merged["last_template"] = shot_df["prompt"].str.strip().str.split("\n").str[-1]
        merged[shot.split("_")[-1]] = shot_df["predicted_top1_token"]
    df = pd.DataFrame(merged)

    # 2) Error filter ------------------------------------------------------
    def tok(s):
        # A missing cell (files of different length) counts as "no tokens".
        return [] if pd.isna(s) else str(s).split()

    # Shot columns in numeric order (10shot after 2shot)
    shot_cols = sorted(
        [c for c in df if c.endswith("shot")],
        key=lambda s: int(re.search(r"\d+", s).group())
    )

    # One boolean per prediction cell; reused for the filter and the colours
    gold_tokens = [tok(g) for g in df["answer_token"]]
    mismatch = pd.DataFrame(
        {c: [tok(p) != g for p, g in zip(df[c], gold_tokens)] for c in shot_cols},
        index=df.index,
    )
    wrong = mismatch.any(axis=1).astype(bool)
    base = df[wrong] if only_errors else df

    # 3) Requested rows ----------------------------------------------------
    wanted = [i for i in rows_to_show if i in base.index]
    sub = base.loc[wanted].rename(columns={"answer_token": "Gold Token",
                                           "last_template": "Template"})
    sub_mismatch = mismatch.loc[wanted]

    # Nicer shot headers: 0-Shot, 1-Shot, ...
    rename_map = {c: c.replace("shot", "-Shot") for c in shot_cols}
    sub = sub.rename(columns=rename_map)

    # "Template" only exists when a 1-shot file was found
    text_cols = [c for c in ("Gold Token", "Template") if c in sub.columns]
    sub = sub[text_cols + list(rename_map.values())]

    # 4) Cell values and colours -------------------------------------------
    col_values = [sub[c].tolist() for c in text_cols]
    colors = [["white"] * len(sub) for _ in text_cols]
    for c in shot_cols:
        col_values.append([" ".join(tok(v)) for v in sub[rename_map[c]]])
        colors.append(["#f8d7da" if bad else "#d4edda" for bad in sub_mismatch[c]])

    # 5) Plotly table ------------------------------------------------------
    fig = go.Figure(go.Table(
        header=dict(values=list(sub.columns), fill_color="#333",
                    font=dict(color="white", size=12)),
        cells=dict(values=col_values, fill_color=colors,
                   font=dict(color="black", size=11), height=26)))

    # Compact size: 150 px per text column, 90 px per shot column
    w = 150 * len(text_cols) + 90 * len(rename_map)   # width in px
    h = 50 * (len(sub) + 1) + 40                      # height in px

    if show_title:
        fig.update_layout(width=w, height=h, title=out_name,
                          margin=dict(l=0, r=0, t=40, b=0))
    else:
        fig.update_layout(width=w, height=h,
                          margin=dict(l=0, r=0, t=20, b=0))

    # 6) Save --------------------------------------------------------------
    os.makedirs(out_dir, exist_ok=True)
    # File-system safe base name: everything but [0-9A-Za-z_-] becomes "_"
    safe = re.sub(r"[^0-9A-Za-z_\-]+", "_", out_name).strip("_")
    pdf  = os.path.join(out_dir, f"{safe}.pdf")
    png  = os.path.join(out_dir, f"{safe}.png")
    fig.write_image(pdf, width=w, height=h, scale=1)
    fig.write_image(png, width=w, height=h, scale=1)
    print("✔︎ PDF :", pdf)
    print("✔︎ PNG :", png)
    fig.show()



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m950.8 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.1 which is incompatible.
dask-cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.1 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.1 which is incompatible.[0m[31m
[0m

# factual



### Personenbezogene Fakten


#### person university

In [20]:
# Show the error table for the Hindi (hi) factual relation "person_university", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/hi/result_10_accuracy/logits/permutation_0/person_university")

### Orts- und länderbezogene Fakten

#### landmark on continent

In [None]:
"landmark_on_continent"): [
        ("Eiffel Tower", "Europe"),
         ("Statue of Liberty", "North America"),
          ("Big Ben", "Europe"),
        ("Table Mountain", "Africa"),
         ("Mount Fuji", "Asia"),
          ("Machu Picchu", "South America"),
        ("Sydney Opera House", "Australia"),
         ("Colosseum", "Europe"),
          ("Mount Erebus", "Antarctica"),
        ("Uluru", "Australia")

        # Von insgesamt 947 Tokens tragen 916 das Label „Antarctica“.

        # 700-704-705 ---> häufigster Fehler: „North America“ statt „Antarctica“

In [19]:
# Show the error table for the Hindi (hi) factual relation "landmark_on_continent", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/hi/result_10_accuracy/logits/permutation_0/landmark_on_continent")


#### food from country

In [None]:
 "food from country": [
    [
      "Souvlaki",
      "Griechenland"
    ],
    [
      "Ratatouille",
      "Frankreich"
    ],
    [
      "Kebab",
      "Türkei"
    ],
    [
      "Chatschapuri",
      "Georgia"
    ],
    [
      "Brigadeiro",
      "Brasilien"
    ],
    [
      "Gazpacho",
      "Spanien"
    ],
    [
      "Moules-frites",
      "Belgien"
    ],
    [
      "Poffertjes",
      "Niederlande"
    ],
    [
      "Tajine",
      "Marokko"
    ],
    [
      "Picarones",
      "Peru"
    ]
  ],

In [None]:


# Show the error table for the Hindi (hi) factual relation "food_from_country", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/hi/result_10_accuracy/logits/permutation_0/food_from_country")

#### city in country

In [None]:
  normalize("city_in_country"): [
        ("Paris", "France"),
        ("Tokyo", "Japan"),
        ("Berlin", "Germany"),
        ("Toronto", "Canada"),
        ("Madrid", "Spain"),
        ("Lisbon", "Portugal"),
        ("Cairo", "Egypt"),
        ("Rome", "Italy"),
        ("Seoul", "South Korea"),
        ("Athens", "Greece")
    ],

Hindi zeigt insgesamt eine sehr schwache Performance: Im Zero-Shot-Szenario sind fast alle Vorhersagen fehlerhaft. Selbst mit Few-Shot-Beispielen bleibt die Fehlerrate durchgehend hoch, ohne deutliche oder stabile Verbesserung.

In [None]:
# Show the error table for the Hindi (hi) factual relation "city_in_country", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/hi/result_10_accuracy/logits/permutation_0/city_in_country")

#### country capital city

In [None]:
 normalize("country_capital_city"): [
    ("Norway", "Oslo"), ("Finland", "Helsinki"), ("Denmark", "Copenhagen"),
    ("Netherlands", "Amsterdam"), ("Switzerland", "Bern"),
    ("Austria", "Vienna"), ("Belgium", "Brussels"), ("Czech Republic", "Prague"),
    ("Ireland", "Dublin"), ("Portugal", "Lisbon")
    ],

In [None]:
# Show the error table for the Hindi (hi) factual relation "country_capital_city", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/hi/result_10_accuracy/logits/permutation_0/country_capital_city")



#### country language

In [27]:
# Export those of rows 0-7 that contain at least one wrong prediction
# for the Hindi (hi) "country_language" results.
rows = list(range(8))

export_compact_table(
    folder_path="/content/drive/MyDrive/master_thesis/dataset_multilingual/"
                "factual/hi/result_10_accuracy/logits/permutation_0/country_language",
    rows_to_show=rows,
    out_name="country_language_hi",
    only_errors=True,
    show_title=False,  # no title in the plot
)

✔︎ PDF : /content/drive/MyDrive/master_thesis/ploted_data_hi/country_language_hi.pdf
✔︎ PNG : /content/drive/MyDrive/master_thesis/ploted_data_hi/country_language_hi.png


In [23]:
# Show the error table for the Hindi (hi) factual relation "country_language", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/hi/result_10_accuracy/logits/permutation_0/country_language")

#### country largest city

In [None]:
# Show the error table for the Hindi (hi) factual relation "country_largest_city", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/hi/result_10_accuracy/logits/permutation_0/country_largest_city")

#### country currency

In [None]:
# Show the error table for the Hindi (hi) factual relation "country_currency", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/hi/result_10_accuracy/logits/permutation_0/country_currency")

#### product by company

### Historische/Zeit- & sonstige Fakten

#### president birth year

In [None]:
("presidents_birth_year"): [
        ("Dwight D. Eisenhower", "1890"), ("Theodore Roosevelt", "1858"), ("Woodrow Wilson", "1856"),
        ("William McKinley", "1843"), ("Herbert Hoover", "1874"), ("Grover Cleveland", "1837"),
        ("Ulysses S. Grant", "1822"), ("Abraham Lincoln", "1809"), ("Chester A. Arthur", "1829"),
        ("Benjamin Harrison", "1833")

        # Zero-Shot: 35 Fehler auf Subtoken-Ebene; 12 von 19 Samples sind im Zero-Shot fehlerhaft.
        # 0, 6: typische Fehler (z. B. „John Adams was born in:“ -> Antwort: „Braintree (now Quincy), Massachusetts“)
        # 19-20: falsche Zahlen



In [28]:
# Export those of rows 52-56 and 64-67 that contain at least one wrong
# prediction for the Hindi (hi) "presidents_birth_year" results.
rows = [*range(52, 57), *range(64, 68)]

export_compact_table(
    folder_path="/content/drive/MyDrive/master_thesis/dataset_multilingual/"
                "factual/hi/result_10_accuracy/logits/permutation_0/presidents_birth_year",
    rows_to_show=rows,
    out_name="presidents_birth_year_hi",
    only_errors=True,
    show_title=False,  # no title in the plot
)

✔︎ PDF : /content/drive/MyDrive/master_thesis/ploted_data_hi/presidents_birth_year_hi.pdf
✔︎ PNG : /content/drive/MyDrive/master_thesis/ploted_data_hi/presidents_birth_year_hi.png


In [None]:
# Show the error table for the Hindi (hi) factual relation "presidents_birth_year", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/hi/result_10_accuracy/logits/permutation_0/presidents_birth_year")

#### president election year

In [None]:
normalize("presidents_election_year"): [
        ("Dwight D. Eisenhower", "1952"), ("Theodore Roosevelt", "1904"), ("Woodrow Wilson", "1912"),
        ("William McKinley", "1896"), ("Herbert Hoover", "1928"), ("Grover Cleveland", "1885"),
        ("Ulysses S. Grant", "1869"), ("Abraham Lincoln", "1860"), ("Donald Trump", "2016"),
        ("Benjamin Harrison", "1888")

        # Von 19 Samples sind mit 0 Shots 2 falsch,
        # mit 1 Shot gibt es 0 Fehler,
        # mit 2-5 Shots ist ein Sample falsch,
        # mit 7 Shots sind 2 Samples falsch
        # und mit 10 Shots gibt es keinen Fehler.


In [None]:
# Show the error table for the Hindi (hi) factual relation "presidents_election_year", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/hi/result_10_accuracy/logits/permutation_0/presidents_election_year")

# linguistic

#### Adj antonym

In [7]:
# Export rows 15-21 (correct and wrong ones) of the Hindi (hi)
# "adjective_antonym" results.
rows = list(range(15, 22))

export_compact_table(
    folder_path="/content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic"
                "/hi/result_10_accuracy/logits/permutation_0/adjective_antonym",
    rows_to_show=rows,
    out_name="adjective_antonym_hi",
    only_errors=False,
    show_title=False,
)

✔︎ PDF : /content/drive/MyDrive/master_thesis/ploted_data_hi/adjective_antonym_hi.pdf
✔︎ PNG : /content/drive/MyDrive/master_thesis/ploted_data_hi/adjective_antonym_hi.png


In [None]:
# Show the error table for the Hindi (hi) linguistic relation "adjective_antonym", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic/hi/result_10_accuracy/logits/permutation_0/adjective_antonym")
# 49-51  (NOTE(review): presumably table rows of interest — confirm)


#### adjective_comparative

In [8]:
# Export rows 0-9 (correct and wrong ones) of the Hindi (hi)
# "adjective_comparative" results.
rows = list(range(10))

export_compact_table(
    folder_path="/content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic"
                "/hi/result_10_accuracy/logits/permutation_0/adjective_comparative",
    rows_to_show=rows,
    out_name="adjective_comparative_hi",
    only_errors=False,
    show_title=False,
)

✔︎ PDF : /content/drive/MyDrive/master_thesis/ploted_data_hi/adjective_comparative_hi.pdf
✔︎ PNG : /content/drive/MyDrive/master_thesis/ploted_data_hi/adjective_comparative_hi.png


In [None]:
# Show the error table for the Hindi (hi) linguistic relation "adjective_comparative", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic/hi/result_10_accuracy/logits/permutation_0/adjective_comparative")


#### adjective_superlative

In [None]:
# Show the error table for the Hindi (hi) linguistic relation "adjective_superlative", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic/hi/result_10_accuracy/logits/permutation_0/adjective_superlative")




#### verb_past_tense

In [None]:
# Show the error table for the Hindi (hi) linguistic relation "verb_past_tense", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic/hi/result_10_accuracy/logits/permutation_0/verb_past_tense")


#### word_first_letter

In [None]:
# Show the error table for the Hindi (hi) linguistic relation "word_first_letter", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic/hi/result_10_accuracy/logits/permutation_0/word_first_letter")

#### word_last_letter

In [None]:
# Show the error table for the Hindi (hi) linguistic relation "word_last_letter", permutation 0.
erroranalyse_relation_new("/content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic/hi/result_10_accuracy/logits/permutation_0/word_last_letter")
# hi  (NOTE(review): presumably the language code this notebook analyses — confirm)