In [68]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, classification_report

# Function to load and preprocess a CSV file
def load_and_preprocess_csv(file_path, column_names, has_header=False):
    """
    Read a CSV file into a DataFrame whose columns are named ``column_names``.

    Args:
        file_path (str): Location of the CSV file to read.
        column_names (list): Names to give the columns, in file order.
        has_header (bool): True when the first row of the file is a header
            row; that row is then dropped in favour of ``column_names``.
            False (default) when every row is data.

    Returns:
        pd.DataFrame: The file contents with the requested column names.
    """
    # header=0 tells pandas that row 0 is a header to be replaced by `names`;
    # header=None tells it there is no header row at all.
    header_row = 0 if has_header else None
    return pd.read_csv(file_path, header=header_row, names=column_names)

# Function to extract specific columns as lists
def extract_columns(df, columns):
    """
    Pull the requested columns out of a DataFrame as plain Python lists.

    Args:
        df (pd.DataFrame): Source DataFrame.
        columns (list): Names of the columns to pull out.

    Returns:
        dict: Maps each requested column name to the list of that column's values.
    """
    extracted = {}
    for name in columns:
        extracted[name] = df[name].tolist()
    return extracted

# Function to clean annotations
def clean_annotations(annotations):
    """
    Normalizes a list of annotations: trims surrounding whitespace and lowercases.

    Missing values (None, NaN, pd.NA) are mapped to the empty string instead of
    raising, because pandas reads empty CSV cells as float NaN, which has no
    ``strip`` method. Any other non-string value is converted with ``str()``
    before being normalized.

    Args:
        annotations (list): Annotations to clean.

    Returns:
        list: Cleaned annotations, same length and order as the input.
    """
    cleaned = []
    for annotation in annotations:
        if isinstance(annotation, str):
            cleaned.append(annotation.strip().lower())
        elif pd.isna(annotation):
            # Empty CSV cell: keep the row so list lengths stay aligned.
            cleaned.append("")
        else:
            cleaned.append(str(annotation).strip().lower())
    return cleaned

# Read the two annotation sources. The LLM result file is read as having a
# header row; the group taxonomy file is read as having none.
llm_results = load_and_preprocess_csv(
    "Query_Ollama_Test_updated.csv",
    ["Prompt", "Answer1", "Answer2"],
    True,
)
group_results = load_and_preprocess_csv(
    "Taxonomie_Einteilung.csv",
    ["Prompt", "Answer"],
    False,
)

# Turn the needed columns into plain lists
group_columns = extract_columns(group_results, ["Prompt", "Answer"])
llm_columns = extract_columns(llm_results, ["Prompt", "Answer1", "Answer2"])

# Normalized labels: annotations1 is the LLM's first answer, annotations2 is
# the group's answer. Rows are paired by position, not by matching prompts.
annotations2 = clean_annotations(group_columns["Answer"])
annotations1 = clean_annotations(llm_columns["Answer1"])
queries = llm_columns["Prompt"]

# All three lists must have one common length, otherwise the positional
# pairing of the two annotation sources is meaningless.
if len({len(queries), len(annotations1), len(annotations2)}) != 1:
    raise ValueError("Die Listen (queries, annotations1, annotations2) müssen gleich lang sein.")



In [69]:
# Collect every distinct label used by either annotator, in sorted order
categories = sorted(set(annotations1) | set(annotations2))

In [70]:
# Statistical evaluation
def evaluate_annotations(annotations1, annotations2, labels=None):
    """
    Compares two annotation lists and returns agreement statistics.

    The lists are paired by position. ``annotations1`` is passed to
    scikit-learn as the first argument (``y_true`` for the classification
    report) and ``annotations2`` as the second (``y_pred``).

    Args:
        annotations1 (list): Labels from the first annotator.
        annotations2 (list): Labels from the second annotator.
        labels (list, optional): Label set to evaluate. When omitted, the
            sorted set of all labels occurring in either list is used, so the
            function does not depend on any notebook-level variable.

    Returns:
        dict: With the keys "Total" (number of items), "Agreement" (number of
        identical pairs), "Agreement Rate" (Agreement / Total),
        "Cohen's Kappa", and "Detailed Report" (the ``classification_report``
        output as a dict).

    Raises:
        ValueError: If the lists differ in length or are empty.
    """
    total = len(annotations1)

    # Validate up front: zip() below would silently drop unmatched items, and
    # an empty input would otherwise fail with a ZeroDivisionError.
    if total != len(annotations2):
        raise ValueError("Die Annotationslisten müssen gleich lang sein.")
    if total == 0:
        raise ValueError("Die Annotationslisten dürfen nicht leer sein.")

    if labels is None:
        labels = sorted(set(annotations1) | set(annotations2))

    # Basic metrics
    agreement = sum(a1 == a2 for a1, a2 in zip(annotations1, annotations2))
    agreement_rate = agreement / total

    # Cohen's kappa
    kappa = cohen_kappa_score(annotations1, annotations2, labels=labels)

    # Per-label precision / recall / F1; zero_division=0 reports 0.0 instead
    # of warning when a label has no predicted or no true samples.
    report = classification_report(
        annotations1,
        annotations2,
        labels=labels,
        output_dict=True,
        zero_division=0,
    )

    return {
        "Total": total,
        "Agreement": agreement,
        "Agreement Rate": agreement_rate,
        "Cohen's Kappa": kappa,
        "Detailed Report": report,
    }

In [71]:
# Run the evaluation on the two cleaned annotation lists
ergebnisse = evaluate_annotations(
    annotations1=annotations1,
    annotations2=annotations2,
)

In [72]:
# Print the headline figures, one per line
for line in (
    "Statistische Auswertungen:",
    f"Gesamtanzahl: {ergebnisse['Total']}",
    f"Übereinstimmungen: {ergebnisse['Agreement']}",
    f"Übereinstimmungsrate: {ergebnisse['Agreement Rate']:.2f}",
    "Cohen's Kappa: {:.2f}".format(ergebnisse["Cohen's Kappa"]),
):
    print(line)


# Print the per-label metrics from the classification report
print("\nDetaillierter Report:")
detailed_report = ergebnisse["Detailed Report"]
for label, metrics in detailed_report.items():
    if not isinstance(metrics, dict):
        # Only entries that are metric dicts are printed; anything else is skipped.
        continue
    print(f"Kategorie '{label}': {metrics}")

Statistische Auswertungen:
Gesamtanzahl: 50
Übereinstimmungen: 27
Übereinstimmungsrate: 0.54
Cohen's Kappa: 0.14

Detaillierter Report:
Kategorie 'ambiguous': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12.0}
Kategorie 'event': {'precision': 0.6666666666666666, 'recall': 0.6666666666666666, 'f1-score': 0.6666666666666666, 'support': 3.0}
Kategorie 'time-independent': {'precision': 0.75, 'recall': 0.7058823529411765, 'f1-score': 0.7272727272727273, 'support': 34.0}
Kategorie 'timeliness': {'precision': 0.09090909090909091, 'recall': 1.0, 'f1-score': 0.16666666666666666, 'support': 1.0}
Kategorie 'macro avg': {'precision': 0.37689393939393934, 'recall': 0.5931372549019608, 'f1-score': 0.3901515151515152, 'support': 50.0}
Kategorie 'weighted avg': {'precision': 0.5518181818181818, 'recall': 0.54, 'f1-score': 0.5378787878787878, 'support': 50.0}
