## Error rate evaluation

In [4]:
import pandas as pd
import ast
# Load the per-trial annotation table.
final_df = pd.read_csv("fixation_annotations.csv")

# These columns hold the text form of Python lists (e.g. "[36, 42, 43]")
# and need to be decoded back into real list objects.
list_columns = [
    'incorrect_line_annotator1',
    'incorrect_line_annotator2',
    'other_annotator1',
    'other_annotator2',
]

# Decode each list column and swap the parsed version in under the same name.
# ast.literal_eval only accepts Python literals, so nothing read from the
# file is executed as code.
final_df = final_df.assign(
    **{col: final_df[col].map(ast.literal_eval) for col in list_columns}
)
final_df.head()

Unnamed: 0,file_name,num_lines,number_of_fixations,participant_id,trial_index,other_annotator1,other_annotator2,incorrect_line_annotator1,incorrect_line_annotator2
0,10_056_l2_324_40.pdf,10,56,l2_324,40,"[5, 9, 12, 18, 19, 20, 21, 26, 31, 32, 33, 41,...","[5, 9, 12, 18, 19, 26, 31, 33, 40, 41, 46, 49,...",[36],[]
1,10_056_l48_535_44.pdf,10,56,l48_535,44,"[6, 11, 12, 14, 15, 16, 22, 27, 31, 38, 39, 44...","[6, 11, 12, 16, 17, 22, 27, 31, 38, 39, 44, 50...","[36, 42, 43]","[36, 42, 43]"
2,10_057_l36_280_56.pdf,10,57,l36_280,56,"[8, 9, 12, 16, 22, 26, 27, 29, 36, 42, 46, 47,...","[8, 9, 12, 13, 16, 22, 26, 27, 32, 36, 37, 42,...","[7, 20, 41]","[7, 20, 41]"
3,10_059_l52_477_34.pdf,10,59,l52_477,34,"[5, 10, 12, 14, 18, 24, 25, 30, 31, 36, 37, 42...","[5, 10, 12, 14, 18, 24, 25, 30, 31, 36, 37, 42...","[6, 11]","[6, 11]"
4,10_060_l48_140_33.pdf,10,60,l48_140,33,"[8, 16, 17, 23, 24, 29, 35, 39, 43, 49, 50, 56]","[8, 16, 17, 23, 24, 29, 35, 39, 43, 49, 50, 56]",[11],[11]


In [5]:
# Merge both annotators' labels into one final label set per trial.
#
# * incorrect_line: a fixation is a mistake if EITHER annotator listed it
#   (set union).
# * other: a fixation is "other" if either annotator listed it as other and
#   NEITHER listed it as a mistake. Removing the merged mistakes keeps the two
#   lists disjoint, so a fixation that one annotator marked incorrect and the
#   other marked "other" is counted once (as a mistake) rather than in both
#   mistake_count and other_count.
#
# Columns are iterated directly with zip instead of indexing row by row with
# .iloc, which would build a full row Series for every lookup.
final_df['incorrect_line'] = [
    sorted(set(incorrect_1) | set(incorrect_2))
    for incorrect_1, incorrect_2 in zip(
        final_df['incorrect_line_annotator1'],
        final_df['incorrect_line_annotator2'],
    )
]

final_df['other'] = [
    sorted((set(other_1) | set(other_2)) - set(incorrect))
    for other_1, other_2, incorrect in zip(
        final_df['other_annotator1'],
        final_df['other_annotator2'],
        final_df['incorrect_line'],
    )
]

# Per-trial counts and error rate (mistakes as a fraction of all fixations).
final_df['mistake_count'] = final_df['incorrect_line'].apply(len)
final_df['other_count'] = final_df['other'].apply(len)
final_df['error_rate'] = final_df['mistake_count'] / final_df['number_of_fixations']

In [7]:
from sklearn.metrics import cohen_kappa_score

def _label_fixations(n_fixations, incorrect, other):
    """Return one label code per fixation index in ``range(n_fixations)``.

    Label codes: 1 = mistake (index listed in ``incorrect``), 2 = other (index
    listed in ``other`` but not in ``incorrect``), 0 = correct (in neither).
    """
    # Sets give constant-time membership checks inside the per-fixation loop.
    incorrect, other = set(incorrect), set(other)
    return [
        1 if idx in incorrect else 2 if idx in other else 0
        for idx in range(n_fixations)
    ]


def _label_counts(labels):
    """Count how often each label code occurs, keyed by a readable name."""
    return {
        "correct": labels.count(0),
        "mistake": labels.count(1),
        "other": labels.count(2),
    }


# Flatten every trial into one three-class label sequence per annotator
# (correct=0, mistake=1, other=2). Both sequences are built in the same
# trial/fixation order, so position k refers to the same fixation in each.
# NOTE(review): annotated indices are matched against range(number_of_fixations),
# i.e. treated as 0-based; an index equal to number_of_fixations would be
# silently ignored here — confirm the annotation indexing convention.
annotator1_overall = []
annotator2_overall = []

for n_fixations, incorrect_1, other_1, incorrect_2, other_2 in zip(
    final_df['number_of_fixations'],
    final_df['incorrect_line_annotator1'],
    final_df['other_annotator1'],
    final_df['incorrect_line_annotator2'],
    final_df['other_annotator2'],
):
    annotator1_overall.extend(_label_fixations(n_fixations, incorrect_1, other_1))
    annotator2_overall.extend(_label_fixations(n_fixations, incorrect_2, other_2))

# Compute Cohen's Kappa for overall agreement
overall_agreement = cohen_kappa_score(annotator1_overall, annotator2_overall)

print(f"Total fixations annotated: {len(annotator1_overall)}")
print(f"Overall inter-annotator agreement (Cohen's Kappa): {overall_agreement:.3f}")

# Count occurrences of each label per annotator
a1_counts = _label_counts(annotator1_overall)
a2_counts = _label_counts(annotator2_overall)

for title, counts, n_labels in (
    ("Annotator 1", a1_counts, len(annotator1_overall)),
    ("Annotator 2", a2_counts, len(annotator2_overall)),
):
    print(f"\n{title} label distribution:")
    for label, count in counts.items():
        print(f"  {label}: {count} ({count/n_labels:.1%})")

# Overall distribution based on the combined annotations (the per-trial
# mistake_count and other_count columns); "correct" is whatever remains.
total_fixations = sum(final_df['number_of_fixations'])
total_mistakes = sum(final_df['mistake_count'])
total_other = sum(final_df['other_count'])
total_correct = total_fixations - total_mistakes - total_other

print("\nOverall distribution based on final (combined) annotations:")
print(f"  correct: {total_correct} ({total_correct/total_fixations:.1%})")
print(f"  mistake: {total_mistakes} ({total_mistakes/total_fixations:.1%})")
print(f"  other: {total_other} ({total_other/total_fixations:.1%})")

# Mean across the two annotators of the share of fixations each labelled
# correct, and of the share each labelled other.
avg_correct_rate = (a1_counts['correct']/len(annotator1_overall) + a2_counts['correct']/len(annotator2_overall)) / 2
avg_other_rate = (a1_counts['other']/len(annotator1_overall) + a2_counts['other']/len(annotator2_overall)) / 2
print(f"\nAverage correct rate between annotators: {avg_correct_rate:.1%}")
print(f"Average other rate between annotators: {avg_other_rate:.1%}")


Total fixations annotated: 1296
Overall inter-annotator agreement (Cohen's Kappa): 0.945

Annotator 1 label distribution:
  correct: 929 (71.7%)
  mistake: 50 (3.9%)
  other: 317 (24.5%)

Annotator 2 label distribution:
  correct: 949 (73.2%)
  mistake: 48 (3.7%)
  other: 299 (23.1%)

Overall distribution based on final (combined) annotations:
  correct: 924 (71.3%)
  mistake: 50 (3.9%)
  other: 322 (24.8%)

Average correct rate between annotators: 72.5%
Average other rate between annotators: 23.8%
