In [2]:
import pandas as pd
import itertools
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa

# Load the dataset
file_path = '/content/datasethumanattribution.xlsx'
data = pd.read_excel(file_path)

# Ensuring annotator columns are strings to avoid type mismatch issues
annotator_columns = ['label', 'gemini_label', 'gpt4_label']
data[annotator_columns] = data[annotator_columns].astype(str)


cohen_kappa_results = {}
for (annotator1, annotator2) in itertools.combinations(annotator_columns, 2):
    kappa = cohen_kappa_score(data[annotator1], data[annotator2])
    cohen_kappa_results[(annotator1, annotator2)] = kappa

# Display the Cohen's Kappa results
print("Cohen's Kappa Results:")
for pair, kappa in cohen_kappa_results.items():
    print(f"{pair}: {kappa}")

# Fleiss' Kappa
label_counts = data[annotator_columns].apply(lambda x: x.value_counts(), axis=1).fillna(0).astype(int)

# Calculate Fleiss' Kappa using the label counts
fleiss_kappa_value = fleiss_kappa(label_counts.values, method='fleiss')

# Display Fleiss' Kappa
print("\nFleiss' Kappa:", fleiss_kappa_value)


Cohen's Kappa Results:
('label', 'gemini_label'): 0.41102537894298996
('label', 'gpt4_label'): 0.4677206851119895
('gemini_label', 'gpt4_label'): 0.6789511408433879

Fleiss' Kappa: 0.5170788145567219


In [6]:
# Required Libraries
import pandas as pd
import itertools
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa

# Load the dataset
file_path = '/content/datasethumanattribution.xlsx'
data = pd.read_excel(file_path)

# Ensuring annotator columns are strings to avoid type mismatch issues
annotator_columns = ['label', 'gemini_label', 'gpt4_label']
data[annotator_columns] = data[annotator_columns].astype(str)

# Function to calculate Cohen's and Fleiss' Kappa within each category
def calculate_kappas_by_category(data, annotator_columns):
    # Initialize dictionaries to store results
    category_kappa_results = {}

    # Iterate over each unique category
    for category, group_data in data.groupby('CATEGORY'):
        print(f"\nCategory: {category}")

        # 1. Cohen's Kappa for Annotator Pairs in the current category
        cohen_kappa_results = {}
        for (annotator1, annotator2) in itertools.combinations(annotator_columns, 2):
            kappa = cohen_kappa_score(group_data[annotator1], group_data[annotator2])
            cohen_kappa_results[(annotator1, annotator2)] = kappa

        # Display Cohen's Kappa results
        print("Cohen's Kappa Results:")
        for pair, kappa in cohen_kappa_results.items():
            print(f"{pair}: {kappa}")

        # 2. Fleiss' Kappa for All Annotators in the current category
        # Transform the subset data to count each label per claim across annotators
        label_counts = group_data[annotator_columns].apply(lambda x: x.value_counts(), axis=1).fillna(0).astype(int)
        fleiss_kappa_value = fleiss_kappa(label_counts.values, method='fleiss')

        # Display Fleiss' Kappa
        print("Fleiss' Kappa:", fleiss_kappa_value)

        # Store results in dictionary
        category_kappa_results[category] = {
            "cohen_kappa": cohen_kappa_results,
            "fleiss_kappa": fleiss_kappa_value
        }

    return category_kappa_results

# Run the function and capture results
category_kappa_results = calculate_kappas_by_category(data, annotator_columns)

# Print out all results
print("\nSummary of Kappa Results by Category:")
for category, results in category_kappa_results.items():
    print(f"\nCategory: {category}")
    print("Cohen's Kappa:", results['cohen_kappa'])
    print("Fleiss' Kappa:", results['fleiss_kappa'])



Category: Blood
Cohen's Kappa Results:
('label', 'gemini_label'): 0.5553623188405796
('label', 'gpt4_label'): 0.5949656750572082
('gemini_label', 'gpt4_label'): 0.5741254858411994
Fleiss' Kappa: 0.5738722757222502

Category: Bone health
Cohen's Kappa Results:
('label', 'gemini_label'): 0.31746031746031744
('label', 'gpt4_label'): 0.30152284263959395
('gemini_label', 'gpt4_label'): 0.7722457627118644
Fleiss' Kappa: 0.4513793103448274

Category: COVID
Cohen's Kappa Results:
('label', 'gemini_label'): 0.6500000000000001
('label', 'gpt4_label'): 0.6500000000000001
('gemini_label', 'gpt4_label'): 1.0
Fleiss' Kappa: 0.7576923076923076

Category: Cancer
Cohen's Kappa Results:
('label', 'gemini_label'): nan
('label', 'gpt4_label'): nan
('gemini_label', 'gpt4_label'): nan
Fleiss' Kappa: nan

Category: Cardiovascular Health


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)


Cohen's Kappa Results:
('label', 'gemini_label'): 0.5227272727272727
('label', 'gpt4_label'): 0.475
('gemini_label', 'gpt4_label'): 0.8223350253807107
Fleiss' Kappa: 0.6020573903627503

Category: Dental Health
Cohen's Kappa Results:
('label', 'gemini_label'): 0.014285714285714346
('label', 'gpt4_label'): 0.36614173228346447
('gemini_label', 'gpt4_label'): 0.5740740740740741
Fleiss' Kappa: 0.30420168067226916

Category: Diabetes
Cohen's Kappa Results:
('label', 'gemini_label'): 0.44615384615384623
('label', 'gpt4_label'): 0.5764705882352941
('gemini_label', 'gpt4_label'): 0.4098360655737706
Fleiss' Kappa: 0.4620174346201744

Category: Ear
Cohen's Kappa Results:
('label', 'gemini_label'): 0.2749003984063745
('label', 'gpt4_label'): 0.28627450980392166
('gemini_label', 'gpt4_label'): 0.6988416988416988
Fleiss' Kappa: 0.4141384388807066

Category: Eye
Cohen's Kappa Results:
('label', 'gemini_label'): 0.17021276595744683
('label', 'gpt4_label'): 0.3085106382978724
('gemini_label', 'gpt4_lab

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)


Fleiss' Kappa: 0.5245509793295654

Category: Hair
Cohen's Kappa Results:
('label', 'gemini_label'): 0.3928571428571429
('label', 'gpt4_label'): 0.44805194805194803
('gemini_label', 'gpt4_label'): 0.7888198757763976
Fleiss' Kappa: 0.5395833333333333

Category: Kidney
Cohen's Kappa Results:
('label', 'gemini_label'): 0.0
('label', 'gpt4_label'): 0.0
('gemini_label', 'gpt4_label'): nan
Fleiss' Kappa: -0.5000000000000001

Category: Men's health
Cohen's Kappa Results:
('label', 'gemini_label'): 0.417004048582996
('label', 'gpt4_label'): 0.37662337662337664
('gemini_label', 'gpt4_label'): 0.654054054054054
Fleiss' Kappa: 0.4623217922606924

Category: Mental Health
Cohen's Kappa Results:
('label', 'gemini_label'): 0.7428571428571429
('label', 'gpt4_label'): 0.7272727272727273
('gemini_label', 'gpt4_label'): 0.7692307692307692


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


Fleiss' Kappa: 0.7452830188679245

Category: Muscles
Cohen's Kappa Results:
('label', 'gemini_label'): 0.2857142857142857
('label', 'gpt4_label'): 0.014084507042253502
('gemini_label', 'gpt4_label'): 0.5918367346938775
Fleiss' Kappa: 0.2193308550185874

Category: Neurological health
Cohen's Kappa Results:
('label', 'gemini_label'): 0.4125874125874126
('label', 'gpt4_label'): 0.36019242333132884
('gemini_label', 'gpt4_label'): 0.48712077847738977
Fleiss' Kappa: 0.4173537495092264

Category: Skin
Cohen's Kappa Results:
('label', 'gemini_label'): 0.38250272826482346
('label', 'gpt4_label'): 0.5257227891156462
('gemini_label', 'gpt4_label'): 0.5324434556915091
Fleiss' Kappa: 0.47170953101361535

Category: Throat
Cohen's Kappa Results:
('label', 'gemini_label'): 0.5528846153846154
('label', 'gpt4_label'): 0.6105527638190955
('gemini_label', 'gpt4_label'): 0.9138888888888889
Fleiss' Kappa: 0.6813249571673327

Category: Vascular
Cohen's Kappa Results:
('label', 'gemini_label'): 0.153846153846