### Import libraries and preprocess manual annotation files


In [37]:
# Import libraries
import os

import openpyxl
import pandas as pd
from sklearn.metrics import cohen_kappa_score

In [38]:
# Locate the HB and UY annotation workbooks.
# The folder defaults to the original author's local directory, but it can be
# redirected on another machine by setting the INTERRATER_ANNOTATION_DIR
# environment variable before the kernel starts, so the notebook no longer has
# to be edited to run elsewhere.
ANNOTATION_DIR = os.environ.get(
    "INTERRATER_ANNOTATION_DIR",
    r"C:\Users\seohy\nlplearnerdata\interrater_reliability",
)
HB_PATH = os.path.join(ANNOTATION_DIR, "HB_annotation.xlsx")
UY_PATH = os.path.join(ANNOTATION_DIR, "UY_annotation.xlsx")

In [39]:
# Load every worksheet of the HB workbook in one call
# (sheet_name=None yields one DataFrame per sheet)
HB_sheets = pd.read_excel(HB_PATH, sheet_name=None)

# Stack the sheets into a single frame with a fresh running index, keep only
# the annotation columns, and tag each column name with the annotator suffix
HB_combined = (
    pd.concat(HB_sheets.values(), axis="index", ignore_index=True)
    .loc[:, ["ID", "FORM", "POS", "HEAD", "DEPREL"]]
    .add_suffix("_HB")
)

In [40]:
# Preview the first five rows to confirm the HB preprocessing worked
HB_combined.head()

Unnamed: 0,ID_HB,FORM_HB,POS_HB,HEAD_HB,DEPREL_HB
0,1,I,PRP,2,nsubj
1,2,introduce,VBP,0,root
2,3,things,NNS,2,obj
3,4,what,WP,6,obj
4,5,l,PRP,6,nsubj


In [41]:
# Optional export, left disabled: uncomment to write the combined HB annotations to CSV
# HB_combined.to_csv("HB_combined.csv", encoding="utf-8-sig")

In [42]:
# List every HB row that has a missing value in any column.
# The result must be empty before interrater reliability is computed.
HB_combined.loc[HB_combined.isna().any(axis="columns")]

Unnamed: 0,ID_HB,FORM_HB,POS_HB,HEAD_HB,DEPREL_HB


In [43]:
# Load every worksheet of the UY workbook in one call
# (sheet_name=None yields one DataFrame per sheet)
UY_sheets = pd.read_excel(UY_PATH, sheet_name=None)

# Stack the sheets into a single frame with a fresh running index, keep only
# the annotation columns, and tag each column name with the annotator suffix
UY_combined = (
    pd.concat(UY_sheets.values(), axis="index", ignore_index=True)
    .loc[:, ["ID", "FORM", "POS", "HEAD", "DEPREL"]]
    .add_suffix("_UY")
)

In [44]:
# Cast the UY head indices to 64-bit integers
# (per the original note, this column is loaded as floats)
UY_combined = UY_combined.astype({"HEAD_UY": "int64"})

In [45]:
# Show any UY rows that contain a missing value; an empty frame is expected
UY_combined.loc[UY_combined.isna().any(axis="columns")]

Unnamed: 0,ID_UY,FORM_UY,POS_UY,HEAD_UY,DEPREL_UY


In [46]:
# Place the two annotators' frames side by side (rows are matched on the
# shared index), then interleave the columns so that each field's HB and UY
# values sit next to each other: ID_HB, ID_UY, FORM_HB, FORM_UY, ...
df_combined = pd.concat([HB_combined, UY_combined], axis="columns").loc[
    :,
    [
        f"{field}_{rater}"
        for field in ("ID", "FORM", "POS", "HEAD", "DEPREL")
        for rater in ("HB", "UY")
    ],
]

In [47]:
# Preview the first five rows of the side-by-side frame
df_combined.head()

Unnamed: 0,ID_HB,ID_UY,FORM_HB,FORM_UY,POS_HB,POS_UY,HEAD_HB,HEAD_UY,DEPREL_HB,DEPREL_UY
0,1,1,I,I,PRP,PRP,2,2,nsubj,nsubj
1,2,2,introduce,introduce,VBP,VBP,0,0,root,root
2,3,3,things,things,NNS,NNS,2,2,obj,obj
3,4,4,what,what,WP,WP,6,6,obj,mark
4,5,5,l,l,PRP,PRP,6,6,nsubj,nsubj


### !!Sanity check!!


In [48]:
# Sanity check: both annotators must have annotated identical tokens.
# Every row shown is a position where the two FORM columns differ; an empty
# result means the tokenization matches and the rows are aligned.
df_combined.loc[df_combined["FORM_HB"].ne(df_combined["FORM_UY"])]

Unnamed: 0,ID_HB,ID_UY,FORM_HB,FORM_UY,POS_HB,POS_UY,HEAD_HB,HEAD_UY,DEPREL_HB,DEPREL_UY


### Compute interrater reliability for POS tagging and dependency parsing (DP)


In [49]:
# Agreement between the two annotators on POS tags, measured with Cohen's kappa
pos_kappa = cohen_kappa_score(y1=df_combined["POS_HB"], y2=df_combined["POS_UY"])
print(f"Cohen's kappa for POS annotation: {pos_kappa}")

Cohen's kappa for POS annotation: 0.938468391659714


In [None]:
# Cohen's kappa for the dependency annotation: heads and relation labels are
# both compared as strings so each distinct value is treated as a category
head_kappa = cohen_kappa_score(
    df_combined["HEAD_HB"].astype(str),
    df_combined["HEAD_UY"].astype(str),
)
deprl_kappa = cohen_kappa_score(
    df_combined["DEPREL_HB"].astype(str),
    df_combined["DEPREL_UY"].astype(str),
)

# Every row of the combined frame is one token
tokens = df_combined.shape[0]

# UAS: percentage of tokens on which both annotators chose the same head
same_heads = df_combined["HEAD_HB"].eq(df_combined["HEAD_UY"]).sum()
UAS = same_heads / tokens * 100

# LAS: percentage of tokens on which both the head and the relation label agree
same_heads_and_relation = (
    df_combined["HEAD_HB"].eq(df_combined["HEAD_UY"])
    & df_combined["DEPREL_HB"].eq(df_combined["DEPREL_UY"])
).sum()
LAS = same_heads_and_relation / tokens * 100

# Report the kappa values together with UAS and LAS
print(f"Cohen's kappa for heads: {head_kappa}")
print(f"Cohen's kappa for relationships: {deprl_kappa}")
print(f"UAS for DP: {UAS}%")
print(f"LAS for DP: {LAS}%")

Cohen's kappa for heads: 0.9140524923094185
Cohen's kappa for relationships: 0.9079822063201939
UAS for DP: 92.16710182767625%
LAS for DP: 86.29242819843343%


### Compute interrater reliability for learner errors
