In [7]:
!pip install krippendorff
!pip install scikit-learn
import krippendorff
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, classification_report

#Function to load and preprocess csv file
def load_and_preprocess_csv(file_path, column_names, has_header=False):
    """
    Read a CSV file into a DataFrame whose columns are named by the caller.

    Args:
        file_path (str): Location of the CSV file.
        column_names (list): Names to give the columns, in file order.
        has_header (bool): True when the first line of the file is a header row.
            That row is then dropped and replaced by ``column_names``; otherwise
            the first line is read as data.

    Returns:
        pd.DataFrame: The file contents with ``column_names`` as column labels.
    """
    # header=0 tells pandas to consume the first line as a header (which the
    # explicit names then override); header=None keeps the first line as data.
    header_row = 0 if has_header else None
    return pd.read_csv(file_path, header=header_row, names=column_names)

#Function to extract specific columns as lists
def extract_columns(df, columns):
    """
    Pull the requested columns out of a DataFrame as plain Python lists.

    Args:
        df (pd.DataFrame): Source DataFrame.
        columns (list): Names of the columns to pull out.

    Returns:
        dict: Maps each requested column name to the list of that column's values.
    """
    extracted = {}
    for name in columns:
        extracted[name] = df[name].tolist()
    return extracted

#Function to clean annotations
def clean_annotations(annotations):
    """
    Cleans a list of annotations by removing leading/trailing spaces and normalizing case.

    Args:
        annotations (list): List of annotation strings to clean.

    Returns:
        list: Cleaned annotations (stripped and lower-cased), in the original order.

    Raises:
        ValueError: If an entry is not a string, e.g. the NaN that pandas produces
            for an empty CSV cell. The message names the offending position.
    """
    cleaned = []
    for index, annotation in enumerate(annotations):
        # Fail with a pointer to the bad row instead of an opaque
        # "'float' object has no attribute 'strip'" on missing values.
        if not isinstance(annotation, str):
            raise ValueError(
                f"Annotation at index {index} is not a string (got {annotation!r}); "
                "empty CSV cells are read by pandas as NaN."
            )
        cleaned.append(annotation.strip().lower())
    return cleaned

#Load and preprocess the csv files
#has_header=True: each file's own header row is replaced by the column names given here
llm_results = load_and_preprocess_csv(
    "Query2_sampled_queries_2022-07_updated.csv",
    column_names=["qid", "query", "Answer"],
    has_header=True
)

group_results = load_and_preprocess_csv(
    "Annotatedv2_sampled_queries_2022-07.csv",
    column_names=["qid", "query", "Answer"],
    has_header=True
)

#Extract relevant columns
llm_columns = extract_columns(llm_results, ["qid", "query", "Answer"])
group_columns = extract_columns(group_results, ["qid", "query", "Answer"])

#Assign extracted columns to variables for clarity
queries = llm_columns["query"]
#Clean annotations (annotations1 comes from llm_results, annotations2 from group_results)
annotations1 = clean_annotations(llm_columns["Answer"])
annotations2 = clean_annotations(group_columns["Answer"])

#Ensure that both files contributed the same number of rows
if not (len(queries) == len(annotations1) == len(annotations2)):
    raise ValueError("Die Listen (queries, annotations1, annotations2) müssen gleich lang sein.")

#The annotations are compared position by position further down, so both files
#must list the same qids in the same order; otherwise every metric is meaningless.
qid_mismatches = [
    (llm_qid, group_qid)
    for llm_qid, group_qid in zip(llm_columns["qid"], group_columns["qid"])
    if llm_qid != group_qid
]
if qid_mismatches:
    raise ValueError(
        f"Die qid-Spalten der beiden Dateien stimmen in {len(qid_mismatches)} Zeilen nicht überein "
        f"(erste Abweichung: {qid_mismatches[0]}); die Annotationen können nicht zeilenweise verglichen werden."
    )



In [8]:
# Collect every label that occurs in either annotation list, sorted
categories = sorted(set(annotations1) | set(annotations2))

In [9]:
# Statistical evaluation
def evaluate_annotations(annotations1, annotations2, labels=None):
    """
    Compare two annotation lists and return agreement statistics.

    Args:
        annotations1 (list): Labels from the first annotator. Passed as ``y_true`` to
            ``classification_report``, so the per-class support counts refer to this list.
        annotations2 (list): Labels from the second annotator, aligned item by item
            with ``annotations1``. Passed as ``y_pred`` to ``classification_report``.
        labels (list, optional): Label set used for Cohen's kappa and the detailed
            report. Defaults to the sorted union of the labels found in both lists,
            so the function does not depend on notebook-level state.

    Returns:
        dict: Keys "Total", "Agreement", "Agreement Rate", "Cohen's Kappa",
            "Krippendorff's Alpha" and "Detailed Report" (the dict produced by
            ``classification_report(..., output_dict=True)``).

    Raises:
        ValueError: If the two lists differ in length or are empty.
    """
    total = len(annotations1)
    # zip() below would silently drop the surplus items of the longer list while
    # the rate is still divided by len(annotations1), so reject mismatches up front.
    if total != len(annotations2):
        raise ValueError(
            f"Annotation lists must have the same length (got {total} and {len(annotations2)})."
        )
    if total == 0:
        raise ValueError("Annotation lists must not be empty.")

    if labels is None:
        labels = sorted(set(annotations1) | set(annotations2))

    # Basic metrics: raw count and share of items on which both lists agree
    agreement = sum(a1 == a2 for a1, a2 in zip(annotations1, annotations2))
    agreement_rate = agreement / total

    # Cohen's Kappa
    kappa = cohen_kappa_score(annotations1, annotations2, labels=labels)

    # Krippendorff's Alpha
    # Stack the two lists into a 2 x N matrix (one row per annotator)
    data_matrix = np.array([annotations1, annotations2])
    alpha = krippendorff.alpha(reliability_data=data_matrix, level_of_measurement='nominal')

    # Detailed per-label report; zero_division=0 reports 0 instead of warning
    # when a label has no predicted or no true samples
    report = classification_report(annotations1, annotations2, labels=labels, output_dict=True, zero_division=0)

    return {
        "Total": total,
        "Agreement": agreement,
        "Agreement Rate": agreement_rate,
        "Cohen's Kappa": kappa,
        "Krippendorff's Alpha": alpha,
        "Detailed Report": report,
    }

In [10]:
# Run the evaluation on the two cleaned annotation lists
ergebnisse = evaluate_annotations(annotations1, annotations2)

In [11]:
# Print the results: summary statistics first, rates and coefficients rounded to two decimals
print("Statistische Auswertungen:")
print(f"Gesamtanzahl: {ergebnisse['Total']}")
print(f"Übereinstimmungen: {ergebnisse['Agreement']}")
print(f"Übereinstimmungsrate: {ergebnisse['Agreement Rate']:.2f}")
print("Cohen's Kappa: {:.2f}".format(ergebnisse["Cohen's Kappa"]))
print("Krippendorff's Alpha: {:.2f}".format(ergebnisse["Krippendorff's Alpha"]))


# Then the detailed report, one line per entry
print("\nDetaillierter Report:")
for label, metrics in ergebnisse['Detailed Report'].items():
    # Only dict-valued entries are printed; entries whose value is not a dict are skipped
    if isinstance(metrics, dict):
        print(f"Kategorie '{label}': {metrics}")

Statistische Auswertungen:
Gesamtanzahl: 100
Übereinstimmungen: 80
Übereinstimmungsrate: 0.80
Cohen's Kappa: 0.45
Krippendorff's Alpha: 0.44

Detaillierter Report:
Kategorie 'explicit-temporal': {'precision': 1.0, 'recall': 0.5, 'f1-score': 0.6666666666666666, 'support': 2.0}
Kategorie 'implicit-temporal': {'precision': 0.39285714285714285, 'recall': 0.7857142857142857, 'f1-score': 0.5238095238095238, 'support': 14.0}
Kategorie 'not-temporal': {'precision': 0.9577464788732394, 'recall': 0.8095238095238095, 'f1-score': 0.8774193548387097, 'support': 84.0}
Kategorie 'macro avg': {'precision': 0.7835345405767941, 'recall': 0.6984126984126983, 'f1-score': 0.6892985151049666, 'support': 100.0}
Kategorie 'weighted avg': {'precision': 0.8795070422535212, 'recall': 0.8, 'f1-score': 0.8236989247311829, 'support': 100.0}


In [12]:
# Show the distinct labels that occur in each annotation list
print(set(annotations1))
print(set(annotations2))

{'implicit-temporal', 'explicit-temporal', 'not-temporal'}
{'implicit-temporal', 'explicit-temporal', 'not-temporal'}
