### Import libraries and preprocess manual annotation files


In [1]:
# Import libraries
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import openpyxl

In [2]:
# Helper function for stripping whitespace from whole dataframe
# Source 1: https://www.geeksforgeeks.org/pandas/pandas-strip-whitespace-from-entire-dataframe/
# Source 2: https://pandas.pydata.org/docs/reference/api/pandas.Series.str.strip.html

def whitespaceremove(df):
    """Strip leading/trailing whitespace from every column of ``df`` in place.

    Each non-missing value is cast to ``str`` and stripped, so numeric columns
    come back as strings (e.g. ``2`` becomes ``"2"``). Missing values
    (NaN/None) are kept as missing instead of being turned into the literal
    strings ``"nan"``/``"None"``, so ``isnull()`` checks still find them after
    this function has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    # Iterate over each column in the dataframe
    for column in df.columns:
        values = df[column]
        # Remember where values are missing before the str cast hides them
        missing = values.isna()
        df[column] = values.astype(str).str.strip().mask(missing)

In [3]:
# Locations of the annotation workbooks: two files per rater (HB and UY).
# All four live in the same folder, so the folder is spelled out only once.
ANNOTATION_DIR = r"C:\Users\seohy\nlplearnerdata\interrater_reliability"

HB_PATH_1 = ANNOTATION_DIR + r"\HB-annotation-1.xlsx"
UY_PATH_1 = ANNOTATION_DIR + r"\UY-annotation-1.xlsx"
HB_PATH_2 = ANNOTATION_DIR + r"\HB-annotation-2.xlsx"
UY_PATH_2 = ANNOTATION_DIR + r"\UY-annotation-2.xlsx"

In [4]:
# Load every sheet of both HB workbooks (sheet_name=None loads all sheets at once)
HB_sheets_1 = pd.read_excel(HB_PATH_1, sheet_name=None)
HB_sheets_2 = pd.read_excel(HB_PATH_2, sheet_name=None)

# Stack the sheets of each workbook row-wise into a single frame
HB_combined_1 = pd.concat(HB_sheets_1.values(), ignore_index=True)
HB_combined_2 = pd.concat(HB_sheets_2.values(), ignore_index=True)

# Stack both workbooks, keep only the annotation columns, and tag each
# column with the rater suffix so it stays distinguishable after merging
hb_fields = ["ID", "FORM", "POS", "HEAD", "DEPREL"]
HB_combined = (
    pd.concat([HB_combined_1, HB_combined_2], ignore_index=True)
    .loc[:, hb_fields]
    .rename(columns={field: f"{field}_HB" for field in hb_fields})
)

In [5]:
# Preview the first five rows to verify the HB preprocessing
HB_combined.head()

Unnamed: 0,ID_HB,FORM_HB,POS_HB,HEAD_HB,DEPREL_HB
0,1,I,PRP,2,nsubj
1,2,introduce,VBP,0,root
2,3,things,NNS,2,obj
3,4,what,WP,6,obj
4,5,l,PRP,6,nsubj


In [6]:
# Show every HB row that has at least one missing value.
# The result must be empty before interrater reliability is computed.
HB_combined.loc[HB_combined.isna().any(axis="columns")]

Unnamed: 0,ID_HB,FORM_HB,POS_HB,HEAD_HB,DEPREL_HB


In [7]:
# Strip leading/trailing whitespace from every HB column.
# This runs only after the NaN check in the previous cell, so missing values
# are inspected on the raw data before the columns are cast to strings.
whitespaceremove(HB_combined)

In [8]:
# Read in and process UY annotation (sheet_name=None loads all sheets at once)
UY_sheets_1 = pd.read_excel(UY_PATH_1, sheet_name=None)
UY_sheets_2 = pd.read_excel(UY_PATH_2, sheet_name=None)

# Combine sheets in each excel file into one df
UY_combined_1 = pd.concat(UY_sheets_1.values(), axis=0, ignore_index=True)
UY_combined_2 = pd.concat(UY_sheets_2.values(), axis=0, ignore_index=True)

# Combine the two combined dfs into one total combined df
UY_combined = pd.concat([UY_combined_1, UY_combined_2], axis=0, ignore_index=True)

# Drop and select only relevant columns
UY_combined = UY_combined[["ID", "FORM", "POS", "HEAD", "DEPREL"]]

# NOTE: whitespace is deliberately NOT stripped in this cell. whitespaceremove
# casts every column to str, which would hide missing values from the NaN
# check in the next cell and turn numeric heads into strings before they are
# cast to int64. Stripping happens once, after those steps (same order as the
# HB pipeline).

# Rename columns to include UY
UY_combined = UY_combined.rename(
    columns={
        "ID": "ID_UY",
        "FORM": "FORM_UY",
        "POS": "POS_UY",
        "HEAD": "HEAD_UY",
        "DEPREL": "DEPREL_UY",
    }
)

In [9]:
# Show every UY row that has at least one missing value.
# The result must be empty before interrater reliability is computed.
UY_combined.loc[UY_combined.isna().any(axis="columns")]

Unnamed: 0,ID_UY,FORM_UY,POS_UY,HEAD_UY,DEPREL_UY


In [10]:
# Cast rater UY's head column to int64
# (per the original note the raw values are floats - TODO confirm)
UY_combined = UY_combined.astype({"HEAD_UY": "int64"})

In [11]:
# Strip leading/trailing whitespace from every UY column.
# This runs only after the NaN check and the integer cast above, so missing
# values are inspected on the raw data before the columns are cast to strings.
whitespaceremove(UY_combined)

In [12]:
# Place the two raters' frames side by side; rows are matched on the index,
# which both frames received from ignore_index=True when they were built
df_combined = pd.concat([HB_combined, UY_combined], axis="columns")

# Interleave the columns so each HB field sits next to its UY counterpart
paired_columns = [
    f"{field}_{rater}"
    for field in ("ID", "FORM", "POS", "HEAD", "DEPREL")
    for rater in ("HB", "UY")
]
df_combined = df_combined[paired_columns]

In [13]:
# Preview the first five rows of the merged frame
df_combined.head()

Unnamed: 0,ID_HB,ID_UY,FORM_HB,FORM_UY,POS_HB,POS_UY,HEAD_HB,HEAD_UY,DEPREL_HB,DEPREL_UY
0,1,1,I,I,PRP,PRP,2,2,nsubj,nsubj
1,2,2,introduce,introduce,VBP,VBP,0,0,root,root
2,3,3,things,things,NNS,NNS,2,2,obj,obj
3,4,4,what,what,WP,WP,6,6,obj,mark
4,5,5,l,l,PRP,PRP,6,6,nsubj,nsubj


### !!Sanity check!!


In [14]:
# Sanity check - both raters must have annotated the same token sequence.
# Lists every row whose word form differs between HB and UY; an empty result
# means the tokenization matches and the rows are aligned.
df_combined.loc[df_combined["FORM_HB"].ne(df_combined["FORM_UY"])]

Unnamed: 0,ID_HB,ID_UY,FORM_HB,FORM_UY,POS_HB,POS_UY,HEAD_HB,HEAD_UY,DEPREL_HB,DEPREL_UY


### Compute interrater reliability for POS and DP


In [15]:
# Agreement between the two raters on POS tags (Cohen's kappa)
pos_kappa = cohen_kappa_score(y1=df_combined["POS_HB"], y2=df_combined["POS_UY"])
print(f"Cohen's kappa for POS annotation: {pos_kappa}")

Cohen's kappa for POS annotation: 0.9399872110444109


In [36]:
# Recompute the POS kappa after mapping UY's "KOR" tag onto "FW".
# NOTE: this overwrites POS_UY inside df_combined, so every cell executed
# after this one (including the mismatch flags and the csv export) sees the
# remapped tags, and pos_kappa now holds the adjusted score.
df_combined["POS_UY"] = df_combined["POS_UY"].replace({"KOR": "FW"})

pos_kappa = cohen_kappa_score(y1=df_combined["POS_HB"], y2=df_combined["POS_UY"])
print(f"Cohen's kappa for POS annotation: {pos_kappa}")

Cohen's kappa for POS annotation: 0.9503537376898976


In [17]:
# Token-level agreement masks, computed once and reused for UAS and LAS
heads_agree = df_combined["HEAD_HB"] == df_combined["HEAD_UY"]
relations_agree = df_combined["DEPREL_HB"] == df_combined["DEPREL_UY"]

# Cohen's kappa for heads and for dependency relations (labels compared as strings)
head_kappa = cohen_kappa_score(
    df_combined["HEAD_HB"].astype(str),
    df_combined["HEAD_UY"].astype(str),
)
deprl_kappa = cohen_kappa_score(
    df_combined["DEPREL_HB"].astype(str),
    df_combined["DEPREL_UY"].astype(str),
)

# Total number of tokens (one row per token)
tokens = len(df_combined)

# UAS: percentage of tokens on which both raters chose the same head
same_heads = heads_agree.sum()
UAS = same_heads / tokens * 100

# LAS: percentage of tokens on which both head and relation label agree
same_heads_and_relation = (heads_agree & relations_agree).sum()
LAS = same_heads_and_relation / tokens * 100

# Report kappa, UAS, and LAS
print(f"Cohen's kappa for heads: {head_kappa}")
print(f"Cohen's kappa for relationships: {deprl_kappa}")
print(f"UAS for DP: {UAS}%")
print(f"LAS for DP: {LAS}%")

Cohen's kappa for heads: 0.9082238493361722
Cohen's kappa for relationships: 0.9219984221672534
UAS for DP: 91.63208852005532%
LAS for DP: 86.86030428769018%


In [18]:
# Flag tokens whose POS tag differs between the raters:
# "O" marks a mismatch, an empty string marks agreement
pos_mis = df_combined["POS_HB"].ne(df_combined["POS_UY"])
df_combined["POS_MIS"] = pos_mis.map({True: "O", False: ""})

In [19]:
# Flag tokens whose head differs between the raters:
# "O" marks a mismatch, an empty string marks agreement
head_mis = df_combined["HEAD_HB"].ne(df_combined["HEAD_UY"])
df_combined["HEAD_MIS"] = head_mis.map({True: "O", False: ""})

In [20]:
# Flag tokens whose dependency relation differs between the raters:
# "O" marks a mismatch, an empty string marks agreement
dep_mis = df_combined["DEPREL_HB"].ne(df_combined["DEPREL_UY"])
df_combined["DEPREL_MIS"] = dep_mis.map({True: "O", False: ""})

In [21]:
# Display the merged frame together with the three mismatch flag columns
df_combined

Unnamed: 0,ID_HB,ID_UY,FORM_HB,FORM_UY,POS_HB,POS_UY,HEAD_HB,HEAD_UY,DEPREL_HB,DEPREL_UY,POS_MIS,HEAD_MIS,DEPREL_MIS
0,1,1,I,I,PRP,PRP,2,2,nsubj,nsubj,,,
1,2,2,introduce,introduce,VBP,VBP,0,0,root,root,,,
2,3,3,things,things,NNS,NNS,2,2,obj,obj,,,
3,4,4,what,what,WP,WP,6,6,obj,mark,,,O
4,5,5,l,l,PRP,PRP,6,6,nsubj,nsubj,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1441,1,1,it,it,PRP,PRP,3,3,nsubj,nsubj,,,
1442,2,2,is,is,VBZ,VBZ,3,3,cop,cop,,,
1443,3,3,funny,funny,JJ,JJ,0,0,root,root,,,
1444,4,4,to,to,TO,TO,5,5,case,case,,,


In [22]:
# Reorder columns and save to csv.
# Each mismatch flag is placed right after the pair of columns it describes.
# Columns are selected by name rather than by position so that re-running
# this cell (or adding a column in an earlier cell) cannot scramble the order.
export_columns = [
    "ID_HB", "ID_UY",
    "FORM_HB", "FORM_UY",
    "POS_HB", "POS_UY", "POS_MIS",
    "HEAD_HB", "HEAD_UY", "HEAD_MIS",
    "DEPREL_HB", "DEPREL_UY", "DEPREL_MIS",
]
df_combined = df_combined[export_columns]

# utf-8-sig writes a byte-order mark at the start of the file
df_combined.to_csv("interrater_results.csv", encoding="utf-8-sig")

### Compute interrater reliability for learner errors
