# DIGI405 Lab 4.3: Facilitator Notebook

This notebook can be used by the class facilitator or tutor to collate the class annotations, measure agreement with Krippendorff's Alpha, and find the texts with the lowest and highest average confidence and the greatest spread in confidence (interquartile range).

You don't need to change any code apart from adding the zip file path.

1. Upload this notebook to JupyterHub along with the zip file of labelled CSV files from the class
2. Enter the path to the zip as the `zip_file_path` variable
3. Run the cells - do a visual check of output to make sure it is as expected
4. Share the `=== CONFIDENCE ANALYSIS PER TEXT ===` results and the box plot with the class
5. Share the `Krippendorff's alpha:` score with the class, along with `encoded_annotations.csv`, which is in a format that students can upload directly to the online K-Alpha calculator (test this first - the result using the 'Nominal' data type option should be the same as calculated here).



In [None]:
import pandas as pd
import numpy as np
import zipfile
import re
import io
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Union

In [None]:
# Path to the zip archive of labelled CSV files collected from the class.
# This is the only value that needs editing before running the notebook.
zip_file_path = "df_labelled_test.zip"

In [None]:
def process_csv_zip(zip_file_path):
    """
    Given multiple CSV files in a zip archive, concatenate into a single dataframe.

    Every archive member whose path contains "df_labelled_" and ends with ".csv"
    is treated as one annotator's file. The files are merged horizontally on the
    "text" column (outer join). Each file's "label" and "confidence" columns are
    renamed with a suffix taken from the filename (df_labelled_<suffix>.csv), and
    an "encoded_<suffix>" column is added that maps the text labels to integers
    (NEGATIVE=0, NEUTRAL=1, POSITIVE=2; any other label becomes missing).

    Args:
        zip_file_path: Path to the zip archive.

    Returns:
        The merged dataframe, with rows in the order of the first file (texts
        that only appear in later files are placed at the end), confidence
        columns converted to nullable integers ("Int64"), and a "tweet_id"
        column ("Tweet 1", "Tweet 2", ...). Returns None if the archive
        contains no matching CSV files.
    """
    suffix_pattern = re.compile(r"df_labelled_([^\.]+)\.csv")
    label_mapping = {"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}

    def is_archive_metadata(member):
        # Zips created on macOS carry "__MACOSX/.../._<name>" resource-fork
        # entries that match the filename filter but are not CSV data.
        parts = member.split("/")
        return "__MACOSX" in parts or parts[-1].startswith("._")

    def annotator_suffix(member):
        # Prefer the file's own name so that a parent folder called
        # "df_labelled_*" does not leak into the suffix; fall back to the full
        # archive path to keep accepting files matched only through their path.
        file_name = member.rsplit("/", 1)[-1]
        match = suffix_pattern.search(file_name) or suffix_pattern.search(member)
        return match.group(1) if match else None

    def read_member(zip_ref, member):
        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM.
        with zip_ref.open(member) as f:
            return pd.read_csv(io.TextIOWrapper(f, encoding="utf-8-sig"))

    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        csv_files = [
            f for f in zip_ref.namelist()
            if "df_labelled_" in f and f.endswith(".csv") and not is_archive_metadata(f)
        ]

        if not csv_files:
            print("No matching CSV files found in the zip archive.")
            return None

        first_file = csv_files[0]
        first_suffix = annotator_suffix(first_file) or "unknown"
        result_df = read_member(zip_ref, first_file)

        # Store original text order
        original_text_order = result_df["text"].tolist()

        # Rename columns in first dataframe
        result_df = result_df.rename(columns={
            "label": f"label_{first_suffix}",
            "confidence": f"confidence_{first_suffix}",
        })

        # Create encoded column for first dataframe
        result_df[f"encoded_{first_suffix}"] = result_df[f"label_{first_suffix}"].map(label_mapping)

        # Process remaining CSV files
        for file in csv_files[1:]:
            suffix = annotator_suffix(file)
            if suffix is None:
                # No annotator suffix can be derived from this name; skip it.
                continue

            df = read_member(zip_ref, file)

            # Keep only the columns needed for the merge, renamed per annotator
            temp_df = df[["text", "label", "confidence"]].rename(columns={
                "label": f"label_{suffix}",
                "confidence": f"confidence_{suffix}",
            })
            temp_df[f"encoded_{suffix}"] = temp_df[f"label_{suffix}"].map(label_mapping)

            # Outer join so texts missing from some files are still kept
            result_df = pd.merge(result_df, temp_df, on="text", how="outer")

    # Restore original row order (for texts present in the first file).
    # A dict gives the first position of each text in O(1) per row.
    first_position = {}
    for position, text in enumerate(original_text_order):
        first_position.setdefault(text, position)
    not_in_first_file = len(original_text_order)
    result_df["_original_order"] = result_df["text"].apply(
        lambda x: first_position.get(x, not_in_first_file)
    )

    # Sort by original order; new texts (not in first file) go at the end.
    # A stable sort keeps rows with equal keys in a deterministic order.
    result_df = (
        result_df
        .sort_values("_original_order", kind="stable")
        .drop(columns=["_original_order"])
        .reset_index(drop=True)
    )

    confidence_cols = [col for col in result_df.columns if "confidence_" in col]

    for col in confidence_cols:
        # Values may be strings such as "80%" or numbers; numeric columns that
        # contain blanks are floats (80.0), so parse numerically instead of
        # casting the string form directly to an integer.
        present = result_df[col].notna()
        cleaned = result_df[col].astype(str).str.strip().str.rstrip("%").str.strip()
        numeric = pd.to_numeric(cleaned.where(present))
        result_df[col] = numeric.astype("Int64")  # Capital I, allows NA

    result_df["tweet_id"] = [f"Tweet {i + 1}" for i in range(len(result_df))]

    return result_df

In [None]:
# Collate every annotator's CSV from the zip into one wide dataframe.
# process_csv_zip prints a message and returns None when the archive has no
# matching CSV files; the cells below need a dataframe here.
merged_df = process_csv_zip(zip_file_path)

In [None]:
# Visual check of the merged dataframe: expect one row per text and, for each
# annotator, a label_*, confidence_* and encoded_* column, plus tweet_id.
merged_df

In [None]:
def analyse_confidence_per_text(df):
    """
    Summarise annotator confidence for every text and print the extremes.

    Adds per-row summary columns to ``df`` in place (average, min, max,
    first/third quartile, interquartile range and range across all
    "confidence_*" columns), then prints the text(s) with the highest
    average, the lowest average, and the largest interquartile range.
    Ties are all listed. Nothing is returned.
    """
    score_columns = [name for name in df.columns if "confidence_" in name]
    scores = df[score_columns]

    # Row-wise summaries across annotators
    df["avg_confidence"] = scores.mean(axis=1)
    df["min_confidence"] = scores.min(axis=1)
    df["max_confidence"] = scores.max(axis=1)
    df["q1_confidence"] = scores.quantile(0.25, axis=1)
    df["q3_confidence"] = scores.quantile(0.75, axis=1)
    df["iqr_confidence"] = df["q3_confidence"] - df["q1_confidence"]
    df["range_confidence"] = df["max_confidence"] - df["min_confidence"]

    top_avg = df["avg_confidence"].max()
    bottom_avg = df["avg_confidence"].min()
    top_iqr = df["iqr_confidence"].max()

    def list_texts(column, target, with_spread=False):
        # Print every row whose summary value equals the target (keeps ties)
        for _, entry in df[df[column] == target].iterrows():
            print(f"{entry['tweet_id']}: '{entry['text']}'")
            if with_spread:
                print(f"  Q1: {entry['q1_confidence']:.2f}, Q3: {entry['q3_confidence']:.2f}, Range: {entry['range_confidence']:.2f}")

    print("\n=== CONFIDENCE ANALYSIS PER TEXT ===\n")

    print(f"TEXT(S) WITH HIGHEST AVERAGE CONFIDENCE ({top_avg:.2f}):")
    list_texts("avg_confidence", top_avg)
    print()

    print(f"TEXT(S) WITH LOWEST AVERAGE CONFIDENCE ({bottom_avg:.2f}):")
    list_texts("avg_confidence", bottom_avg)
    print()

    print(f"TEXT(S) WITH BIGGEST CONFIDENCE INTERQUARTILE RANGE (IQR = {top_iqr:.2f}):")
    list_texts("iqr_confidence", top_iqr, with_spread=True)


In [None]:
# Print the per-text confidence report to share with the class.
# Note: this also adds the summary columns (avg/min/max/q1/q3/iqr/range
# _confidence) to merged_df in place.
analyse_confidence_per_text(merged_df)

In [None]:
def plot_confidence_boxplots(merged_df):
    """
    Draw one box per tweet showing the spread of annotators' confidence scores.

    Every "confidence_*" column of ``merged_df`` is reshaped to long form
    (one row per tweet/annotator pair) and plotted against "tweet_id".
    The input dataframe is not modified. Returns the matplotlib figure.
    """
    score_columns = [name for name in merged_df.columns if "confidence_" in name]

    # Wide -> long: one row per (tweet, annotator) confidence score
    long_scores = merged_df.melt(
        id_vars=["tweet_id"],
        value_vars=score_columns,
        var_name="Annotator",
        value_name="Confidence",
    )
    long_scores["Annotator"] = long_scores["Annotator"].str.replace("confidence_", "")

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(
        data=long_scores,
        x="tweet_id",
        y="Confidence",
        color="royalblue",
        ax=ax,
    )

    ax.set_title("Confidence Scores: Distribution by Tweet", fontsize=10)
    ax.set_ylabel("Confidence Score", fontsize=9)
    # Rotate the tweet labels so they do not overlap
    plt.xticks(rotation=90, ha="center", fontsize=9)
    plt.yticks(fontsize=9)
    fig.tight_layout()

    return fig

In [None]:
# Build the per-tweet confidence box plot and render it; `fig` is kept so the
# figure can be reused after display.
fig = plot_confidence_boxplots(merged_df)
plt.show()

In [None]:
# Calculate Krippendorff's alpha
# Developed with help from Claude 3.7 Sonnet
# Using method 'C. Nominal data, any number of observers, missing data' from:
# Krippendorff, K. (2011, January 25). Computing Krippendorff’s Alpha-Reliability. https://repository.upenn.edu/handle/20.500.14332/2089

def calculate_krippendorff_alpha(data: Union[pd.DataFrame, np.ndarray], verbose=True) -> float:
    """
    Calculate Krippendorff's alpha for nominal data.

    Handles any number of observers and missing values. Units (rows) with
    fewer than two valid values cannot be paired and are ignored.

    Args:
        data: DataFrame or array where rows are units and columns are observers.
            Missing values are NaN / pd.NA.
        verbose: Whether to print detailed information about the calculation

    Returns:
        Krippendorff's alpha coefficient. Returns NaN when alpha is undefined,
        i.e. when the expected disagreement is zero because the pairable
        values all fall in a single category or no unit has two valid values.
    """
    # Convert to numpy array with NaN for missing values
    if isinstance(data, pd.DataFrame):
        data = data.to_numpy(dtype="float64", na_value=np.nan)
    else:
        data = np.array(data, dtype="float64")

    value_counts = {}  # n_c: number of pairable values per category
    total_values = 0   # n: total number of pairable values
    total_pairs = 0    # number of ordered value pairs compared within units
    Do = 0.0           # observed disagreement (off-diagonal coincidences)

    # For each unit (row)
    for unit in data:
        # Get valid values in this unit
        valid_values = unit[~np.isnan(unit)]
        mu = len(valid_values)

        if mu <= 1:
            continue  # Skip units with 0 or 1 valid value

        total_values += mu
        unit_pairs = mu * (mu - 1)
        total_pairs += unit_pairs

        categories, counts = np.unique(valid_values, return_counts=True)
        for category, count in zip(categories.tolist(), counts.tolist()):
            value_counts[category] = value_counts.get(category, 0) + count

        # Ordered pairs that agree: c * (c - 1) for each category count c.
        # Every other ordered pair disagrees; each pair carries weight
        # 1/(mu-1) in the coincidence matrix.
        agreeing_pairs = sum(count * (count - 1) for count in counts.tolist())
        Do += (unit_pairs - agreeing_pairs) / (mu - 1)

    # Sum of n_c * n_k over all ordered pairs of different categories
    cross_products = total_values ** 2 - sum(count ** 2 for count in value_counts.values())

    if cross_products == 0:
        # Expected disagreement is zero, so 1 - Do/De cannot be evaluated.
        alpha = float("nan")
        if verbose:
            print("Krippendorff's alpha is undefined: the pairable values show no variation.")
    else:
        # Expected disagreement (De); total_values >= 2 whenever cross_products > 0
        De = cross_products / (total_values - 1)
        alpha = 1 - (Do / De)

    if verbose:
        print(f"Total pairable values: {total_values}")
        print(f"Total pairs compared: {total_pairs}")
        print(f"Krippendorff's alpha: {alpha:.3f}")

    return alpha

In [None]:
confidence_cols = [col for col in merged_df.columns if "confidence_" in col]
encoded_cols = [col for col in merged_df.columns if "encoded_" in col]
encoded_df = merged_df[encoded_cols].copy()

# For each annotator, check their confidence scores
# Replace encoded values with NA where confidence is 0
for enc_col in encoded_cols:
    # Pair columns by annotator suffix rather than by list position, so the
    # right confidence column is used whatever the column order is.
    conf_col = enc_col.replace("encoded_", "confidence_", 1)
    # Missing confidence compares as NA; treat it explicitly as "not zero".
    zero_confidence_mask = (merged_df[conf_col] == 0).fillna(False)
    if zero_confidence_mask.any():
        print(f"Found {zero_confidence_mask.sum()} zero confidence scores for {enc_col}")
        encoded_df.loc[zero_confidence_mask, enc_col] = pd.NA  # Use pd.NA, not np.nan
    # Ensure column is Int64 after assignment
    encoded_df[enc_col] = encoded_df[enc_col].astype("Int64")

print(f"Calculating Krippendorff's alpha for {len(encoded_cols)} encoded columns")
print(f"Number of texts analysed: {len(encoded_df)}")

alpha = calculate_krippendorff_alpha(encoded_df, verbose=True)

In [None]:
# Preview the first five rows of the encoded labels (one column per annotator).
encoded_df.head(5)

In [None]:
# Export the encoded labels only: no header row, no index column, and missing
# values written as the literal string "NA".
encoded_df.to_csv("encoded_annotations.csv", header=False, index=False, na_rep="NA")
print("Saved encoded values to encoded_annotations.csv (no headers, no index)\nDownload this file and share with the class for testing with online K-Alpha calculator\nPlease test first!")

In [None]:
# Now get a single annotation per tweet for the class based on the mode for each tweet.
# Missing labels are ignored (dropna=True); [0] keeps the first mode column.
# NOTE(review): when several labels tie, this presumably keeps the smallest
# encoded value, since pandas returns the modes in sorted order - confirm.
row_modes = encoded_df.mode(axis=1, dropna=True)[0]

In [None]:
# Compare the class mode label to the original dataset label
orig_labels = pd.read_csv("/srv/source-data/tweets_original_labels.csv", header = 0)
row_modes_df = row_modes.to_frame().reset_index(drop=True)

# Concatenate horizontally. concat aligns on the row index, so row i of the
# class modes is paired with row i of the original labels file.
# NOTE(review): assumes the original labels file lists the same tweets in the
# same order as merged_df - TODO confirm.
combined_class_orig = pd.concat([row_modes_df, orig_labels], axis=1)
combined_class_orig = combined_class_orig.rename(columns={0: "class_label", "label": "orig_label"})

# Keep only the label columns for the agreement calculation.
# NOTE(review): presumably orig_label uses the same 0/1/2 encoding as the
# class labels - verify against the source file.
combined_class_orig_enc = combined_class_orig.drop(columns=["text"])

In [None]:
# Agreement between the class's mode label and the original dataset label,
# treating the two label columns as two observers.
alpha_2 = calculate_krippendorff_alpha(combined_class_orig_enc, verbose=True)

In [None]:
# Display the class mode label next to the original label and text for each tweet.
combined_class_orig

In [None]:
# Save the label columns (text column already dropped) without header or
# index, matching the layout used for encoded_annotations.csv.
combined_class_orig_enc.to_csv("row_modes_and_orig_labels.csv", header=False, index=False)