In [184]:
import pandas as pd
import json
import numpy as np
from scipy import stats

from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from pyprojroot import here

import krippendorff

import pickle
from sklearn.metrics import jaccard_score
from itertools import combinations

# Intercoder reliability for the second round of annotation 

This notebook computes the intercoder reliability for the second round of annotation using the bootstrapping method.
Additionally, this notebook processes 


## Load data

In [185]:

# Read the annotation JSON file into a DataFrame
complete_df = pd.read_json(here("data/individual_policy_data/policy-recoding-2025-05-21-complete.json"))
df_with_inner_id = pd.read_json(here("data/individual_policy_data/policy-at-2025-04-27.json"))

In [186]:
# Extract internal_id from the nested 'data' field
df_with_inner_id['internal_id'] = df_with_inner_id['data'].apply(lambda x: x['internal_id'])


In [187]:
# Make a mapping from internal_id -> inner_id
internal_to_inner = dict(zip(df_with_inner_id['internal_id'], df_with_inner_id['inner_id']))


In [188]:
# Count the number of annotators per transcript
annotator_counts = complete_df.groupby("internal_id")["annotator"].nunique()

# Count transcripts where 2 or more annotators annotated the same transcript
num_transcripts_with_multiple_annotators = (annotator_counts >= 2).sum()

print(f"Number of unique transcripts with two or more annotators: {num_transcripts_with_multiple_annotators}")




Number of unique transcripts with two or more annotators: 750


## Compute intercoder reliability

In [189]:
# Normalize policy column to always be a list
def standardize_policy(entry):
    if isinstance(entry, dict) and 'choices' in entry:
        return entry['choices']
    elif isinstance(entry, str):
        return [entry]
    elif isinstance(entry, list):
        return entry
    else:
        return []

# Apply normalization
complete_df['policy'] = complete_df['policy'].apply(standardize_policy)

In [190]:
# Ensure 'updated_at' is in datetime format
complete_df['updated_at'] = pd.to_datetime(complete_df['updated_at'])

# Sort by internal_id, annotator, and updated_at (newest last)
df_sorted = complete_df.sort_values(by=['internal_id', 'annotator', 'updated_at'])

# Keep only the most recent annotation for each (internal_id, annotator) pair
df_deduped = df_sorted.drop_duplicates(subset=['internal_id', 'annotator'], keep='last')

In [191]:
# df = complete_df[['internal_id','annotator','policy']]
df = df_deduped[['internal_id','annotator','policy']]

In [192]:
## define set of policies
all_policies = [ "Paris Agreement", "Green New Deal", "Executive action", "Emergency declaration", "(De)regulation and laws", "Renewable energy", "Emission reduction", "Oil and gas industry", "Other climate policy"]

In [193]:
# One-hot encode each policy annotation
def one_hot_encode(policies, all_policies):
    return [1 if policy in policies else 0 for policy in all_policies]

In [194]:
# Ensure `policy_encoded` is a DataFrame-friendly list
df["policy_encoded"] = df["policy"].apply(lambda x: one_hot_encode(x, all_policies))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["policy_encoded"] = df["policy"].apply(lambda x: one_hot_encode(x, all_policies))


In [195]:
# Convert list column into separate one-hot encoded columns
policy_df = pd.DataFrame(df["policy_encoded"].tolist(), columns=all_policies)
df_binary = pd.concat([df.drop(columns=["policy_encoded"]), policy_df], axis=1)


In [196]:
# List of one-hot encoded policy columns
policy_columns = all_policies  # Ensure this contains your policy labels

# Pivot DataFrame (internal_id × annotator)
df_pivot = df_binary.pivot(index="internal_id", columns="annotator", values=policy_columns)

# Drop rows where all annotator responses are missing
df_pivot = df_pivot.dropna(how="all")


## Jaccard assessment

In [197]:

# To store results
jaccard_results = []

# Group by transcript
for transcript_id, group in df_binary.groupby('internal_id'):
    # Each annotator's policy vector
    annotator_vectors = group.set_index('annotator')[policy_columns]
    
    # Skip if only one annotator
    if len(annotator_vectors) < 2:
        continue

    # Compute pairwise Jaccard for all annotator pairs
    pair_scores = []
    for a1, a2 in combinations(annotator_vectors.index, 2):
        v1 = annotator_vectors.loc[a1].values
        v2 = annotator_vectors.loc[a2].values
        score = jaccard_score(v1, v2)
        pair_scores.append(score)
    
    # Average pairwise Jaccard for the transcript
    jaccard_results.append({
        'internal_id': transcript_id,
        'avg_jaccard': sum(pair_scores) / len(pair_scores),
        'n_annotators': len(annotator_vectors)
    })

# Convert to DataFrame
jaccard_df = pd.DataFrame(jaccard_results)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [198]:
## fix zeros
jaccard_df.loc[jaccard_df['avg_jaccard'] == 0, 'avg_jaccard'] = 1


In [199]:
jaccard_df['avg_jaccard'].mean()

np.float64(0.9406730158730159)

## By-policy Krippendorff's alpha

In [200]:

def bootstrap_alpha(data, level_of_measurement='nominal', n_boot=10000, conf_level=0.95):
    """
    Bootstraps Krippendorff’s alpha with confidence intervals.
    
    Parameters:
    - data: 2D numpy array (coders × items)
    - level_of_measurement: 'nominal', 'ordinal', etc.
    - n_boot: number of bootstrap samples
    - conf_level: confidence level for CI (e.g., 0.95)

    Returns:
    - alpha_hat: point estimate of alpha
    - ci_lower, ci_upper: confidence interval bounds
    """
    n_items = data.shape[1]
    alphas = []

    for _ in range(n_boot):
        sample_indices = np.random.choice(n_items, n_items, replace=True)
        sample = data[:, sample_indices]
        try:
            alpha = krippendorff.alpha(reliability_data=sample, level_of_measurement=level_of_measurement)
        except Exception:
            alpha = np.nan
        alphas.append(alpha)

    alphas = np.array(alphas)
    alphas = alphas[~np.isnan(alphas)]
    
    alpha_hat = krippendorff.alpha(reliability_data=data, level_of_measurement=level_of_measurement)
    ci_lower = np.percentile(alphas, (1 - conf_level) / 2 * 100)
    ci_upper = np.percentile(alphas, (1 + conf_level) / 2 * 100)

    return alpha_hat, ci_lower, ci_upper


In [201]:
# Dictionary to store results
policy_results = {}

# Iterate through each policy
for policy in policy_columns:
    policy_data = np.array(df_pivot[policy].T)  # Transpose to (coders × items)

    # Bootstrap Krippendorff’s alpha and CI
    alpha, ci_lower, ci_upper = bootstrap_alpha(policy_data, level_of_measurement='nominal')

    # Identify disagreements
    disagreements = []
    for i, transcript in enumerate(df_pivot.index):
        annotator_responses = policy_data[:, i]
        annotator_responses = annotator_responses[~np.isnan(annotator_responses)]
        if len(set(annotator_responses)) > 1:
            disagreements.append(transcript)

    # Store results
    policy_results[policy] = {
        "krippendorff_alpha": alpha,
        "alpha_ci_lower": ci_lower,
        "alpha_ci_upper": ci_upper,
        "num_disagreements": len(disagreements),
        "disagreeing_transcripts": disagreements
    }

# Print results
for policy, results in policy_results.items():
    print(f"\n{policy}")
    print(f"  Krippendorff’s Alpha: {results['krippendorff_alpha']:.3f}")
    print(f"  95% CI: ({results['alpha_ci_lower']:.3f}, {results['alpha_ci_upper']:.3f})")
    print(f"  Number of Disagreements: {results['num_disagreements']}")



Paris Agreement
  Krippendorff’s Alpha: 0.958
  95% CI: (0.936, 0.978)
  Number of Disagreements: 15

Green New Deal
  Krippendorff’s Alpha: 0.983
  95% CI: (0.966, 0.996)
  Number of Disagreements: 4

Executive action
  Krippendorff’s Alpha: 0.967
  95% CI: (0.945, 0.986)
  Number of Disagreements: 9

Emergency declaration
  Krippendorff’s Alpha: 0.943
  95% CI: (0.845, 1.000)
  Number of Disagreements: 2

(De)regulation and laws
  Krippendorff’s Alpha: 0.987
  95% CI: (0.970, 1.000)
  Number of Disagreements: 3

Renewable energy
  Krippendorff’s Alpha: 0.909
  95% CI: (0.876, 0.938)
  Number of Disagreements: 32

Emission reduction
  Krippendorff’s Alpha: 0.918
  95% CI: (0.888, 0.946)
  Number of Disagreements: 29

Oil and gas industry
  Krippendorff’s Alpha: 0.918
  95% CI: (0.888, 0.945)
  Number of Disagreements: 30

Other climate policy
  Krippendorff’s Alpha: 0.810
  95% CI: (0.749, 0.866)
  Number of Disagreements: 38


### Overall policy Krippendorff's alpha

In [202]:


# Step 1: Melt the binary frame into long format
policy_columns = ['Paris Agreement', 'Green New Deal', 'Executive action', 'Emergency declaration',
                  '(De)regulation and laws', 'Renewable energy', 'Emission reduction',
                  'Oil and gas industry', 'Other climate policy']

df_long = df_binary.melt(
    id_vars=['internal_id', 'annotator'],
    value_vars=policy_columns,
    var_name='policy',
    value_name='value'
)

# Step 2: Define each item as a transcript-policy pair
df_long['item'] = df_long['internal_id'].astype(str) + "::" + df_long['policy']

# Step 3: Pivot to get item × annotator matrix
df_matrix = df_long.pivot(index='item', columns='annotator', values='value')

# Step 4: Check number of valid annotations per item
annotation_counts = df_matrix.notna().sum(axis=1)

# Step 5: Filter out items with fewer than 2 annotations
df_filtered = df_matrix[annotation_counts >= 2]

df_filtered = df_filtered.dropna(axis=1, how='all')

# Step 6: Convert to numpy and compute alpha
overall_alpha = krippendorff.alpha(
    reliability_data=df_filtered.to_numpy().T,
    level_of_measurement='nominal'
)

print(f"Krippendorff’s alpha (overall): {overall_alpha:.3f}")


Krippendorff’s alpha (overall): 0.945


In [203]:
# Create kripp_df from existing policy_results
kripp_df = pd.DataFrame.from_dict(policy_results, orient='index').reset_index()
kripp_df = kripp_df.rename(columns={'index': 'Policy'})

# Compute number of unique transcripts per policy using df_binary
num_transcripts_per_policy = {
    policy: df_binary.loc[df_binary[policy] == 1, 'internal_id'].nunique()
    for policy in policy_columns
}

# Add accurate Num_transcripts based on df_binary counts
kripp_df['num_transcripts'] = kripp_df['Policy'].map(num_transcripts_per_policy)

# Drop the list of disagreements
kripp_df = kripp_df.drop(columns=['disagreeing_transcripts'], errors='ignore')

# Compute number of disagreements across all items
overall_disagreements = df_filtered.apply(lambda row: len(set(row.dropna())) > 1, axis=1).sum()

# Convert to numpy and compute alpha with CI
matrix = df_filtered.to_numpy().T  # coders × items
alpha, ci_lower, ci_upper = bootstrap_alpha(matrix, level_of_measurement='nominal', n_boot=10000)


# Add overall alpha row
kripp_df = pd.concat([
    kripp_df,
    pd.DataFrame([{
        'Policy': 'TOTAL',
        'krippendorff_alpha': overall_alpha,
        "alpha_ci_lower": ci_lower,
        "alpha_ci_upper": ci_upper,
        'num_disagreements': overall_disagreements,
        'num_transcripts': complete_df['internal_id'].nunique()
    }])
], ignore_index=True)

# Save to CSV
kripp_df.to_csv("output/individual_policy_krippendorff_results.csv", index=False)


In [204]:


print(f"Krippendorff’s alpha (overall): {alpha:.3f}")
print(f"95% CI: ({ci_lower:.3f}, {ci_upper:.3f})")


Krippendorff’s alpha (overall): 0.945
95% CI: (0.937, 0.953)


In [205]:
# Find transcripts with disagreements for policies with alpha < 0.8
disagreement_transcripts = set()

for policy, results in policy_results.items():
    if results["krippendorff_alpha"] < 0.8:
        disagreement_transcripts.update(results["disagreeing_transcripts"])

# Get the count of unique transcripts
num_unique_disagreement_transcripts = len(disagreement_transcripts)

print(f"Number of unique transcripts with disagreements (Krippendorff < 0.8): {num_unique_disagreement_transcripts}")


Number of unique transcripts with disagreements (Krippendorff < 0.8): 0


## Analysis of multiple mentions and "other climate policy" binary flags

In [206]:
# Convert "yes"/"no" to True/False
complete_df['multiple-answer'] = complete_df['multiple-answer'].str.lower() == 'yes'
complete_df['outside-answer'] = complete_df['outside-answer'].str.lower() == 'yes'

In [207]:
# Count unique internal_id where "multiple-answer" is True
unique_multiple_answer = complete_df.loc[complete_df['multiple-answer'], 'internal_id'].nunique()

# Count unique internal_id where "outside-answer" is True
unique_outside_answer = complete_df.loc[complete_df['outside-answer'], 'internal_id'].nunique()

print(f"Unique internal_id count where multiple views are expressed is Yes: {unique_multiple_answer}")
print(f"Unique internal_id count where policy is mentioned outside the relevant chunk is Yes: {unique_outside_answer}")


Unique internal_id count where multiple views are expressed is Yes: 79
Unique internal_id count where policy is mentioned outside the relevant chunk is Yes: 46


In [208]:
inner_id = internal_to_inner.get(transcript_id, None)

In [209]:
# Filter rows where 'multiple-answer' is True
multiple_explained_df = complete_df.loc[complete_df['multiple-answer'], ['internal_id', 'q3_coder_response', 'multiple_explained']]
multiple_explained_df['inner_id'] = multiple_explained_df['internal_id'].map(internal_to_inner)

cols = ['inner_id', 'internal_id', 'q3_coder_response', 'multiple_explained']
multiple_explained_df = multiple_explained_df[cols]

In [211]:

# Display the filtered DataFrame
multiple_explained_df.to_csv('output/multiple_policies_explained.csv')

In [212]:
# Filter rows where 'multiple-answer' is True
outside_explained_df = complete_df.loc[complete_df['outside-answer'], ['internal_id', 'q3_coder_response', 'outside-policy', 'more-comments-explained']]
outside_explained_df['inner_id'] = outside_explained_df['internal_id'].map(internal_to_inner)

cols = ['inner_id','internal_id', 'q3_coder_response', 'outside-policy','more-comments-explained']
outside_explained_df = outside_explained_df[cols]

In [213]:
outside_explained_df.head()

Unnamed: 0,inner_id,internal_id,q3_coder_response,outside-policy,more-comments-explained
1,1,0,Supports,"{'choices': ['Paris Agreement', 'Green New Dea...",didn't add UN climate accord as other climate ...
17,9,36,,Oil and gas industry,May have to throw this one out because it does...
24,13,77,Opposes,"{'choices': ['Renewable energy', 'Oil and gas ...","Agree with Kate except for ""emergency declarat..."
25,13,77,Opposes,"{'choices': ['Renewable energy', 'Oil and gas ...",Disagree with Zade because not referring to an...
42,22,187,Opposes,(De)regulation and laws,Changed my coding to reflect election timeline...


In [214]:
outside_explained_df.to_csv('output/outside_policy_explained.csv')

In [215]:
# Convert boolean values to integers (True -> 1, False -> 0)
complete_df['multiple-answer'] = complete_df['multiple-answer'].astype(int)
complete_df['outside-answer'] = complete_df['outside-answer'].astype(int)

df_kripp = complete_df.pivot(index="internal_id", columns="annotator", values=['multiple-answer', 'outside-answer'])

# Drop rows where all annotator responses are missing (NaN)
df_kripp = df_kripp.dropna(how="all")

# Convert to numpy arrays and transpose for Krippendorff’s alpha (coders x items)
alpha_multiple = krippendorff.alpha(reliability_data=np.array(df_kripp['multiple-answer']).T, level_of_measurement='nominal')
alpha_outside = krippendorff.alpha(reliability_data=np.array(df_kripp['outside-answer']).T, level_of_measurement='nominal')

# Print results
print(f"Krippendorff’s Alpha for 'multiple-answer': {alpha_multiple:.3f}")
print(f"Krippendorff’s Alpha for 'outside-answer': {alpha_outside:.3f}")


Krippendorff’s Alpha for 'multiple-answer': 0.620
Krippendorff’s Alpha for 'outside-answer': 0.543
