In [2]:
# ============================
# 1. LOADING & NORMALIZING DATA
# ============================

import pandas as pd
from sklearn.metrics import cohen_kappa_score
import numpy as np

# --- Option A: upload in Colab (uncomment if using upload widget) ---
# from google.colab import files
# uploaded = files.upload()
# CSV_PATH = next(iter(uploaded.keys()))

# --- Option B: hard-code path if the CSV is already in the environment ---
CSV_PATH = "/content/Final_annotation_result.csv"  # TODO: update to your new CSV file name

df = pd.read_csv(CSV_PATH)
print("Loaded:", CSV_PATH)
df.head()

Loaded: /content/Final_annotation_result.csv


Unnamed: 0,#,title,paragraph,paragraph_length,michelle_label,aigerim_label,chelsea_label,Unnamed: 7,any match,all match,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,0,A Room with a View,"“I have been a failure,” said Miss Bartlett, a...",50,low,high,high,,Match,No Match,,"no match (no, high, low)",61,,
1,1,A Room with a View,Lucy paused. “Cecil said one day—and I thought...,63,high,none,low,,No Match,No Match,,all match,236,0.3933333333,
2,2,A Room with a View,"Miss Bartlett, who was poor at figures, became...",61,high,high,high,,Match,Match,,,Michelle,Aigerim,Chelsea
3,3,A Room with a View,An engagement is so potent a thing that sooner...,128,low,low,high,,Match,No Match,,High,242,181,198
4,4,A Room with a View,“In the course of conversation they said that ...,56,high,none,low,,No Match,No Match,,Low,114,121,183


In [3]:
# ============================
# 2. STANDARDIZE COLUMN NAMES
# ============================

df = df.rename(columns={
    df.columns[0]: "id",         # TODO: check that first column really is an ID
    "ID": "id",
    "Text": "paragraph",         # TODO: update if your text column has a different name
    "aigerim_label": "Aigerim_label",
    "chelsea_label": "Chelsea_label",
    "michelle_label": "Michelle_label"
})

In [4]:
df = df[["id", "title", "paragraph", "Aigerim_label", "Chelsea_label", "Michelle_label"]]
df=df.dropna()
df.head()


Unnamed: 0,id,title,paragraph,Aigerim_label,Chelsea_label,Michelle_label
0,0,A Room with a View,"“I have been a failure,” said Miss Bartlett, a...",high,high,low
1,1,A Room with a View,Lucy paused. “Cecil said one day—and I thought...,none,low,high
2,2,A Room with a View,"Miss Bartlett, who was poor at figures, became...",high,high,high
3,3,A Room with a View,An engagement is so potent a thing that sooner...,low,high,low
4,4,A Room with a View,“In the course of conversation they said that ...,none,low,high


In [5]:
# ============================
# 3. NORMALIZE LABELS: high / low / none
# ============================

# Canonical label set for the new task
LABELS = ["none", "low", "high"]

def norm_label(x):
    """
    Map messy/free-form labels into canonical: 'none', 'low', or 'high'.
    Extend this mapping based on how annotators actually wrote labels.
    """
    s = str(x).strip().lower()

    # TODO: add any extra variants your annotators used
    if s in ["none", "no", "0", "absent"]:
        return "none"
    if s in ["low", "1", "mild"]:
        return "low"
    if s in ["high", "2", "strong"]:
        return "high"

    # Fallback: if unknown, treat as 'none' (or raise an error)
    # TODO: Decide how you want to handle unknown labels.
    return np.nan

for col in ["Aigerim_label", "Chelsea_label", "Michelle_label"]:
    df[col] = df[col].apply(norm_label)

df.head()

Unnamed: 0,id,title,paragraph,Aigerim_label,Chelsea_label,Michelle_label
0,0,A Room with a View,"“I have been a failure,” said Miss Bartlett, a...",high,high,low
1,1,A Room with a View,Lucy paused. “Cecil said one day—and I thought...,none,low,high
2,2,A Room with a View,"Miss Bartlett, who was poor at figures, became...",high,high,high
3,3,A Room with a View,An engagement is so potent a thing that sooner...,low,high,low
4,4,A Room with a View,“In the course of conversation they said that ...,none,low,high


In [6]:
# convert label to numeric values
annotator_cols = ["Aigerim_label", "Chelsea_label", "Michelle_label"]
label_map = {
    "none": 0,
    "low": 1,
    "high": 2
}

# none/low/high → 0/1/2
df_numeric = df[annotator_cols].replace(label_map).astype(float)
df_numeric.head()


  df_numeric = df[annotator_cols].replace(label_map).astype(float)


Unnamed: 0,Aigerim_label,Chelsea_label,Michelle_label
0,2.0,2.0,1.0
1,0.0,1.0,2.0
2,2.0,2.0,2.0
3,1.0,2.0,1.0
4,0.0,1.0,2.0


In [7]:
!pip install krippendorff


Collecting krippendorff
  Downloading krippendorff-0.8.2-py3-none-any.whl.metadata (3.0 kB)
Downloading krippendorff-0.8.2-py3-none-any.whl (18 kB)
Installing collected packages: krippendorff
Successfully installed krippendorff-0.8.2


In [8]:
#!pip install krippendorff

import krippendorff

# need shape = (num_annotators, num_items)
ratings_matrix = df_numeric.to_numpy().T  # annotators x items

# Compute Krippendorff’s alpha（ordinal）
alpha_ordinal = krippendorff.alpha(
    reliability_data=ratings_matrix,
    level_of_measurement='ordinal'
)

print("Krippendorff's alpha (ordinal) =", alpha_ordinal)

Krippendorff's alpha (ordinal) = 0.48439895013392387


In [9]:
# ============================
# 4. AGREEMENT METRICS (PAIRWISE & FLEISS' KAPPA)
# ============================



A, C, M = "Aigerim_label", "Chelsea_label", "Michelle_label"

# ---- 4.1 Pairwise percent agreement & Cohen’s kappa (multiclass) ----
pairs = [(A, C), (A, M), (C, M)]
pair_stats = []

for x, y in pairs:
    agree = (df[x] == df[y]).mean()
    kappa = cohen_kappa_score(df[x], df[y])  # supports >2 classes
    pair_stats.append({
        "pair": f"{x} vs {y}",
        "percent_agree": round(agree, 3),
        "kappa": round(kappa, 3)
    })

pair_stats_df = pd.DataFrame(pair_stats)
print(pair_stats_df)

# Mean pairwise agreement
mean_pair_agree = np.mean([(df[x] == df[y]).mean() for x, y in pairs])
print("Mean pairwise agreement:", round(mean_pair_agree, 3))

TypeError: '<' not supported between instances of 'str' and 'float'

In [None]:
# ---- 4.2 Fleiss' kappa for 3 raters, 3 categories ----

def fleiss_kappa(counts):
    """
    Generic Fleiss' kappa for N items, k categories.
    counts: N x k matrix with counts per item per category.
    Assumes the same number of raters for all items.
    """
    counts = np.asarray(counts)
    N, k = counts.shape
    n = counts.sum(axis=1)[0]  # number of raters per item (assume constant)

    # Proportion of all assignments to each category
    p = counts.sum(axis=0) / (N * n)

    # Agreement for each item
    P = ( (counts * (counts - 1)).sum(axis=1) / (n * (n - 1)) )

    Pbar = P.mean()
    PbarE = (p ** 2).sum()
    return (Pbar - PbarE) / (1 - PbarE)

# Build N x k counts matrix for ['none','low','high']
label_order = LABELS  # ['none','low','high']
counts = []

for _, row in df[[A, C, M]].iterrows():
    counts.append([(row == lab).sum() for lab in label_order])

fk = fleiss_kappa(counts)
print("Fleiss' kappa:", round(float(fk), 3))

In [10]:
# ============================
# 5. GOLD LABEL (MAJORITY VOTE) & LABEL BALANCE
# ============================

from collections import Counter

def majority_vote_row(row):
    """
    Majority vote across A, C, M for a multiclass label.
    If there is a tie (e.g., 'none','low','high' all different),
    we need a tie-breaking rule.
    """
    labels = [row[A], row[C], row[M]]
    counts = Counter(labels).most_common()

    # counts is like [('low', 2), ('high', 1)]
    # If clear majority:
    if len(counts) == 1 or counts[0][1] > counts[1][1]:
        return counts[0][0]
    else:
        return "low"

    # Tie case: e.g., ('none',1),('low',1),('high',1) or ('low',1),('high',1)
    # TODO: Decide tie-breaking policy. Some options:
    #   - return "low"
    #   - return "none"
    #   - return a special label like "tie" and exclude from modeling

    #return "low"  # <-- placeholder choice; change if needed

df["gold_label"] = df.apply(majority_vote_row, axis=1)

print("Label counts:\n", df["gold_label"].value_counts())
print("Label balance (%):\n", df["gold_label"].value_counts(normalize=True).round(3))


Label counts:
 gold_label
none    237
high    204
low     156
Name: count, dtype: int64
Label balance (%):
 gold_label
none    0.397
high    0.342
low     0.261
Name: proportion, dtype: float64


In [11]:
# clean gold for future models
df[['title','paragraph','gold_label']].to_csv('interiority_gold.csv', index=False)
print("Saved: interiority_gold.csv")

Saved: interiority_gold.csv


In [12]:
# ============================
# 6. ROW-LEVEL AGREEMENT SCORE (UNCHANGED)
# ============================

# 1 = all three agree, 2/3 = two agree, 0 = all different
df["row_agreement"] = (
    ((df[A] == df[C]).astype(int) +
     (df[A] == df[M]).astype(int) +
     (df[C] == df[M]).astype(int)) / 3
)

disagreements = df[df["row_agreement"] < 1].copy()
disagreements[["id", "paragraph", A, C, M, "row_agreement"]].head()

# Save for manual review
disagreements[["id", "paragraph", A, C, M, "row_agreement"]].to_csv(
    "NEW_TASK_disagreements.csv", index=False  # TODO: rename file if you want
)
print("Saved: NEW_TASK_disagreements.csv")


Saved: NEW_TASK_disagreements.csv


In [13]:
# ============================
# 7. BASELINES (SKELETON FOR 3-CLASS TASK)
# ============================

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

y = df["gold_label"]
texts = df["paragraph"].fillna("")

def report_multi(y_true, y_pred, title):
    print(f"\n=== {title} ===")
    acc = accuracy_score(y_true, y_pred)
    # Macro-average over the three classes
    pr, rc, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=LABELS, average="macro", zero_division=0
    )
    print("Accuracy:", round(acc, 3))
    print("Macro Precision:", round(pr, 3))
    print("Macro Recall:", round(rc, 3))
    print("Macro F1:", round(f1, 3))

    print("\nConfusion matrix (rows=true, cols=pred):\n",
          pd.DataFrame(
              confusion_matrix(y_true, y_pred, labels=LABELS),
              index=[f"true_{l}" for l in LABELS],
              columns=[f"pred_{l}" for l in LABELS]
          ))
    return {"Model": title, "Accuracy": round(acc, 3), "Macro_F1": round(f1, 3)}

results = []

# ---- 7.1 Majority-class baseline (3-class) ----
maj = y.mode()[0]
y_pred = np.full(len(y), maj)
results.append(report_multi(y, y_pred, f"Majority baseline (always '{maj}')"))

# ---- 7.2 Rule-based baseline (OPTIONAL / TODO) ----
# For now, just a placeholder that always predicts 'low'.
# TODO: Replace with a task-specific heuristic if you have one.
rule_pred = np.full(len(y), "low")
results.append(report_multi(y, rule_pred, "Simple rule baseline (always 'low')"))

# ---- 7.3 Logistic Regression (TF-IDF 1–2 grams, multiclass) ----
X_train, X_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42, stratify=y
)

vec = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
Xtr, Xte = vec.fit_transform(X_train), vec.transform(X_test)

# Multiclass logistic regression (one-vs-rest by default)
clf = LogisticRegression(max_iter=1000)
clf.fit(Xtr, y_train)
lr_pred = clf.predict(Xte)
results.append(report_multi(y_test, lr_pred, "LogReg (TF-IDF 1–2 grams, 3-class)"))

pd.DataFrame(results)


=== Majority baseline (always 'none') ===
Accuracy: 0.397
Macro Precision: 0.132
Macro Recall: 0.333
Macro F1: 0.189

Confusion matrix (rows=true, cols=pred):
            pred_none  pred_low  pred_high
true_none        237         0          0
true_low         156         0          0
true_high        204         0          0

=== Simple rule baseline (always 'low') ===
Accuracy: 0.261
Macro Precision: 0.087
Macro Recall: 0.333
Macro F1: 0.138

Confusion matrix (rows=true, cols=pred):
            pred_none  pred_low  pred_high
true_none          0       237          0
true_low           0       156          0
true_high          0       204          0

=== LogReg (TF-IDF 1–2 grams, 3-class) ===
Accuracy: 0.55
Macro Precision: 0.435
Macro Recall: 0.504
Macro F1: 0.445

Confusion matrix (rows=true, cols=pred):
            pred_none  pred_low  pred_high
true_none         30         5         13
true_low          18         1         12
true_high          6         0         35


Unnamed: 0,Model,Accuracy,Macro_F1
0,Majority baseline (always 'none'),0.397,0.189
1,Simple rule baseline (always 'low'),0.261,0.138
2,"LogReg (TF-IDF 1–2 grams, 3-class)",0.55,0.445
