# In [1]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from sklearn.metrics import cohen_kappa_score

# Define the function to analyze annotation scores
def analyze_annotation_scores(file_path) -> dict:
    """
    Compute inter-annotator agreement statistics from an annotation CSV.

    The CSV is expected to contain the columns "Jacob's Score" and
    "Tim's Score". Rows in which either score is missing are excluded
    before anything is computed, so that every metric is based on exactly
    the same set of items.

    Parameters
    ----------
    file_path
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    dict
        Maps a human-readable metric name to a built-in ``float``/``int``:
        - "Jacob's Average Score", "Tim's Average Score",
          "Overall Average Score": mean scores per annotator and pooled.
        - "Pearson Correlation", "Spearman Correlation",
          "Kendall Tau Correlation": correlation coefficients only
          (p-values are discarded).
        - "Cohen's Kappa": unweighted Cohen's kappa.
        - "Mean Score Difference": mean absolute difference per item.
        - "Exact Score Matches": number of items with identical scores.
        - "Close Matches (±1)": number of items whose scores differ by at
          most 1 (this count includes the exact matches).

    Raises
    ------
    KeyError
        If either score column is absent from the CSV.
    ValueError
        If fewer than two rows carry both scores (the correlations are
        undefined in that case).
    """
    # Load the CSV file
    annotations_df = pd.read_csv(file_path)

    # Keep only items scored by BOTH annotators. Without this, the
    # per-annotator means silently skip NaN while the pooled mean, the
    # correlations and kappa do not, so the metrics would disagree about
    # which items they describe.
    paired_scores = annotations_df[["Jacob's Score", "Tim's Score"]].dropna()
    if len(paired_scores) < 2:
        raise ValueError(
            "At least two rows with both annotators' scores are required; "
            f"found {len(paired_scores)}."
        )

    jacob_scores = paired_scores["Jacob's Score"]
    tim_scores = paired_scores["Tim's Score"]

    # Basic statistics (both columns have equal length after the pairing
    # step above, so the pooled mean weights the annotators equally).
    avg_jacob_score = np.mean(jacob_scores)
    avg_tim_score = np.mean(tim_scores)
    overall_avg_score = np.mean([jacob_scores, tim_scores])

    # Correlations: only the coefficient is kept, the p-value is dropped.
    pearson_corr, _ = pearsonr(jacob_scores, tim_scores)
    spearman_corr, _ = spearmanr(jacob_scores, tim_scores)
    kendall_corr, _ = kendalltau(jacob_scores, tim_scores)

    # Cohen's kappa with the library defaults, i.e. unweighted.
    # NOTE(review): unweighted kappa treats a one-point disagreement like
    # any other disagreement; if the scores are ordinal, consider passing
    # weights="linear" or weights="quadratic" -- confirm with the authors.
    cohen_kappa = cohen_kappa_score(jacob_scores, tim_scores)

    # Absolute per-item differences drive the match counts below.
    score_differences = np.abs(jacob_scores - tim_scores)
    mean_difference = np.mean(score_differences)
    num_exact_matches = np.sum(score_differences == 0)
    num_close_matches = np.sum(score_differences <= 1)

    # Cast NumPy scalars to built-in types so the result prints cleanly
    # and can be serialised (e.g. with json.dumps) without extra handling.
    return {
        "Jacob's Average Score": float(avg_jacob_score),
        "Tim's Average Score": float(avg_tim_score),
        "Overall Average Score": float(overall_avg_score),
        "Pearson Correlation": float(pearson_corr),
        "Spearman Correlation": float(spearman_corr),
        "Kendall Tau Correlation": float(kendall_corr),
        "Cohen's Kappa": float(cohen_kappa),
        "Mean Score Difference": float(mean_difference),
        "Exact Score Matches": int(num_exact_matches),
        "Close Matches (±1)": int(num_close_matches),
    }

# Absolute location of the annotation CSV on the author's machine.
file_path = "/Users/jacob/Desktop/MDS/COLX_565/COLX_565_Project_Jacob-Tim/milestone2/Detoxification Annotations.csv"

# Run the agreement analysis once and keep the resulting metrics dict.
annotation_metrics = analyze_annotation_scores(file_path=file_path)

# A bare expression as the last statement makes the notebook render it.
annotation_metrics

{"Jacob's Average Score": 8.2,
 "Tim's Average Score": 7.2,
 'Overall Average Score': 7.7,
 'Pearson Correlation': 0.8781179573797714,
 'Spearman Correlation': 0.8088710220015727,
 'Kendall Tau Correlation': 0.7048020519376482,
 "Cohen's Kappa": 0.1361256544502616,
 'Mean Score Difference': 1.1333333333333333,
 'Exact Score Matches': 4,
 'Close Matches (±1)': 11}