# Inter-annotator Agreement Scores
1. Preprocess labeled data.
    1. Collapse labels
    2. Rename labels
    3. Convert rows with no labels to "not_informative"
    4. Fill NAs

2. Match to the actual training and test sets
3. Calculate the Cohen's kappa score

## Import Libraries and Data

In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np

from pathlib import Path
from collections import defaultdict

# Show current working directory (the relative data path set further down,
# "../00_Data", is resolved against it)
print(os.getcwd())

/Users/kyuan/DPhil/Projects/EHR-Indication-Processing/03_Evaluation


Specify paths and other runtime options

In [2]:
# --- Set Paths
# Set base data path (relative to the current working directory)
base_data_path = Path("../00_Data")

# Stop right away if the data directory is not where we expect it
assert base_data_path.is_dir(),\
  f"{base_data_path} either doesn't exist or is not a directory."

# Set output data path
output_path = base_data_path/"publication_ready"/"evaluation"
# parents=True also creates the intermediate "publication_ready" directory
# when it does not exist yet; without it mkdir raises FileNotFoundError.
output_path.mkdir(parents=True, exist_ok=True)

# Data Sets path
data_sets_path = base_data_path/"data_sets"

# Raw data path
raw_labels_path = base_data_path/"raw_labels"

Read the raw reviewer data

In [3]:
# --- Import data
# For every reviewer, read all label CSV files matching
# "indication_labels_v*-<reviewer>.csv" and stack them into one dataframe
reviewer_labels_raw = dict.fromkeys(("Reviewer_A", "Reviewer_B"))

print("Reading consensus labels:")
for reviewer in reviewer_labels_raw:
    print("---")
    print("For reviewer: ", reviewer)

    # sorted() keeps the file order (and therefore the row order) reproducible
    reviewer_files = sorted(raw_labels_path.glob(f"indication_labels_v*-{reviewer}.csv"))

    raw_label_list = []
    for file in reviewer_files:
        print(file.stem)
        # Everything is read as text; only the literal "NA" is treated as missing
        version_labels = pd.read_csv(
            file,
            dtype=str,
            keep_default_na=False,
            na_values=["NA"],
        )
        raw_label_list.append(version_labels)

    # Concatenate all label versions of this reviewer
    reviewer_labels_raw[reviewer] = pd.concat(raw_label_list, ignore_index=True)

print("---")
print("Summary:")
print({name: labels.shape for name, labels in reviewer_labels_raw.items()})

Reading consensus labels:
---
For reviewer:  Reviewer_A
indication_labels_v2-Reviewer_A
indication_labels_v3-Reviewer_A
indication_labels_v4.1-Reviewer_A
indication_labels_v4.2-Reviewer_A
---
For reviewer:  Reviewer_B
indication_labels_v2-Reviewer_B
indication_labels_v3-Reviewer_B
indication_labels_v4.1-Reviewer_B
indication_labels_v4.2-Reviewer_B
---
Summary:
{'Reviewer_A': (5648, 15), 'Reviewer_B': (5648, 15)}


In [4]:
# Read the Unlabelled Data
# Name of each data set -> CSV file with its unlabelled rows
unlabelled_data_dict = {
    "training": data_sets_path/'Oxford/Train_Set_4000.csv',
    "test_oxford": data_sets_path/'Oxford/Test_Set_2000.csv',
    "test_banbury": data_sets_path/'Banbury/Test_Set_2000.csv',
}

unlabelled_data = {}

print("Reading unlabelled data:")
for key, value in unlabelled_data_dict.items():
    print(value.stem)
    # "Indication" is forced to text, and with the default NA markers switched
    # off (and no custom ones given) no cell is parsed as NaN
    unlabelled_frame = pd.read_csv(
        value,
        dtype={"Indication": str},
        keep_default_na=False,
        na_values=None,
    )
    unlabelled_data[key] = unlabelled_frame

Reading unlabelled data:
Train_Set_4000
Test_Set_2000
Test_Set_2000


## Pre-Process the Datasets
Unify the labels, collapse some categories, convert to the same datatype.

Then apply labels to our data sets

### Unify labels

Define function to collapse labels & perform some cleaning steps

In [5]:
def collapse_labels(input_df: pd.DataFrame) -> pd.DataFrame:
    """Clean one reviewer's raw label table and collapse its label columns.

    Steps, in order:
      * blank / whitespace-only cells become missing and every column except
        "Indication" is cast to nullable integers;
      * "procedural" is merged into "prophylaxis";
      * "viral" becomes the initial "not_informative" flag;
      * "procedural", "viral" and "immunosuppression" are dropped and "other"
        is renamed to "other_specific";
      * rows without any label (ignoring "uncertainty") are flagged as
        "not_informative";
      * missing values are replaced by 0 and duplicated indications are
        reduced to their last occurrence.

    The input frame is not modified; a new frame is returned. The number of
    newly flagged rows and their indications are printed.
    """
    # -- Blank cells -> NaN, label columns -> nullable integer
    label_columns = [name for name in input_df.columns if name != "Indication"]
    labels = input_df.replace(r'^\s*$', np.nan, regex=True)
    labels = labels.astype(dict.fromkeys(label_columns, 'Int64'))

    # -- Collapse categories
    # "procedural" counts as "prophylaxis"; "viral" seeds "not_informative"
    labels["prophylaxis"] = labels[["prophylaxis", "procedural"]].any(axis=1) * 1
    labels["not_informative"] = labels[['viral']].any(axis=1) * 1

    # -- Remove the merged / unused columns and rename "other"
    labels = (
        labels
        .drop(columns=['procedural', 'viral', 'immunosuppression'])
        .rename(columns={"other": "other_specific"})
    )

    # -- Rows without any label become "not_informative"
    # "uncertainty" on its own does not count as a label
    unlabelled = ~labels.drop(columns=["Indication", "uncertainty"]).any(axis=1)
    labels["not_informative"] = labels["not_informative"] | (unlabelled * 1)

    print("Number of entries with no label converted to 'not_informative':",
        unlabelled.sum())
    print("Added \"not_informative\" indications:\n", labels[unlabelled].Indication)

    # -- Unify data types: missing -> 0, every numeric column -> Int64
    labels = labels.fillna(0)
    numeric_names = labels.select_dtypes(np.number).columns
    labels[numeric_names] = labels[numeric_names].astype('Int64')

    # -- One row per indication; the last occurrence wins
    return labels.drop_duplicates(subset=["Indication"], keep="last")

Clean and collapse the raw labels of each reviewer.

In [6]:
# Run the label clean-up once per reviewer and keep the results by reviewer name
reviewer_labels = {}

for reviewer, raw_label_df in reviewer_labels_raw.items():
    print("---")
    print("Reviewer: ", reviewer)
    collapsed_df = collapse_labels(raw_label_df)
    reviewer_labels[reviewer] = collapsed_df
    print("\n")

    # Report the size of the cleaned table
    print("Final shape:")
    print(collapsed_df.shape)

---
Reviewer:  Reviewer_A
Number of entries with no label converted to 'not_informative': 2
Added "not_informative" indications:
 2414                   n/a
5419    ?hap- pen allergic
Name: Indication, dtype: object


Final shape:
(5615, 13)
---
Reviewer:  Reviewer_B
Number of entries with no label converted to 'not_informative': 8
Added "not_informative" indications:
 1                               ?
690                           bro
1109                         cons
1319            e. cloacae in csf
1435               esbl urosepsis
2092    intrabdominal collections
2414                          n/a
3396               sepsis ? focus
Name: Indication, dtype: object


Final shape:
(5615, 13)


### Subset data
Convert back to the sizes for `Oxford Training`, `Oxford Test` and `Banbury Test`

In [7]:
def join_to_original(original_df: pd.DataFrame, label_df: pd.DataFrame) -> pd.DataFrame:
    """Attach the labels in ``label_df`` to every row of ``original_df``.

    Only the "Indication" column of ``original_df`` is kept. The labels are
    looked up by indication text with a left join, so the result has one row
    per row of ``original_df``, in the same order.

    Args:
        original_df: Data set with an "Indication" column.
        label_df: Label table with an "Indication" column and one column per label.

    Returns:
        The "Indication" column of ``original_df`` followed by the label columns.

    Raises:
        AssertionError: If any joined row contains a missing value, e.g. an
            indication of ``original_df`` has no entry in ``label_df``.
        pandas.errors.MergeError: If an indication occurs more than once in
            ``label_df`` (enforced through ``validate="many_to_one"``).
    """
    # Join the labels onto the original indications
    joined_df = original_df[["Indication"]].join(
        label_df.set_index("Indication"),
        on="Indication",
        how="left",
        validate="many_to_one",
    )

    # Verify that we don't have any missing entries
    missing_rows = joined_df[joined_df.isna().any(axis=1)]
    print("Rows with NA values:")
    print(missing_rows["Indication"])

    # Raised explicitly (instead of a bare `assert`) so the check still runs
    # under `python -O`; the exception type is unchanged. The message is kept
    # neutral because this function is used for the training and the test sets.
    if not missing_rows.empty:
        raise AssertionError(
            f"There are {len(missing_rows)} unlabelled entries in the original data."
        )

    return joined_df

#### Training

Join back with the original data, keep only the "Indication" column

In [8]:
# Attach each reviewer's labels to the unlabelled Oxford training set
oxford_training = {
    reviewer: join_to_original(unlabelled_data["training"], labels)
    for reviewer, labels in reviewer_labels.items()
}

Rows with NA values:
Series([], Name: Indication, dtype: object)
Rows with NA values:
Series([], Name: Indication, dtype: object)


#### Test
Join back with the original test data, keep only a subset of the columns needed to identify the prescription.

For publication strip the identifiable data & only keep the indication

In [9]:
# Attach each reviewer's labels to the unlabelled Oxford test set
oxford_test = {
    reviewer: join_to_original(unlabelled_data["test_oxford"], labels)
    for reviewer, labels in reviewer_labels.items()
}

# Same for the Banbury test set
banbury_test = {
    reviewer: join_to_original(unlabelled_data["test_banbury"], labels)
    for reviewer, labels in reviewer_labels.items()
}

Rows with NA values:
Series([], Name: Indication, dtype: object)
Rows with NA values:
Series([], Name: Indication, dtype: object)
Rows with NA values:
Series([], Name: Indication, dtype: object)
Rows with NA values:
Series([], Name: Indication, dtype: object)


## Calculate Kappa Score

Calculate the per-class Cohen's kappa score and its unweighted average across classes

In [15]:
# Calculate the Cohen's Kappa Score between the two reviewers for each class and an average
from sklearn.metrics import cohen_kappa_score

def calculate_kappa_score(reviewer1_df: pd.DataFrame, reviewer2_df: pd.DataFrame) -> pd.DataFrame:
    """Compute Cohen's kappa between two reviewers, per label column and on average.

    Every column of ``reviewer1_df`` except the first one is treated as a label
    column and compared with the column of the same name in ``reviewer2_df``.
    The frames produced by ``join_to_original`` carry the indication text in
    that first column. Rows are compared by position.

    Returns:
        A one-row dataframe (index "Kappa Score") with one column per label
        plus an "Average" column holding the plain mean of the per-label scores.
    """
    # Skip the first column; the remaining ones are the labels to compare
    label_columns = reviewer1_df.columns[1:]

    per_class = {
        label: cohen_kappa_score(reviewer1_df[label], reviewer2_df[label])
        for label in label_columns
    }

    # Unweighted mean over the label columns
    mean_kappa = sum(per_class.values()) / len(per_class)

    return pd.DataFrame({**per_class, "Average": mean_kappa}, index=["Kappa Score"])

# Show floats with two decimal places whenever pandas renders them
pd.set_option("display.float_format", "{:.2f}".format)

Oxford Training

In [23]:
# Oxford Training Data
# Agreement between Reviewer_A and Reviewer_B on the Oxford training set.
# The call stays as the cell's final bare expression so that the notebook
# displays the returned table.
print("Oxford Training Data:")
calculate_kappa_score(oxford_training["Reviewer_A"], oxford_training["Reviewer_B"])

Oxford Training Data:


Unnamed: 0,urinary,respiratory,abdominal,neurological,skin_soft_tissue,ent,orthopaedic,other_specific,no_specific_source,prophylaxis,uncertainty,not_informative,Average
Kappa Score,0.9,0.88,0.83,0.91,0.84,0.59,0.74,0.55,0.76,0.81,0.96,0.8,0.8


Oxford Test

In [24]:
# Oxford Test Data
# Agreement between Reviewer_A and Reviewer_B on the Oxford test set.
# The call stays as the cell's final bare expression so that the notebook
# displays the returned table.
print("Oxford Test Data:")
calculate_kappa_score(oxford_test["Reviewer_A"], oxford_test["Reviewer_B"])

Oxford Test Data:


Unnamed: 0,urinary,respiratory,abdominal,neurological,skin_soft_tissue,ent,orthopaedic,other_specific,no_specific_source,prophylaxis,uncertainty,not_informative,Average
Kappa Score,0.95,0.93,0.88,0.81,0.84,0.73,0.75,0.43,0.86,0.93,0.8,0.81,0.81


Banbury Test

In [25]:
# Banbury Test Data
# Agreement between Reviewer_A and Reviewer_B on the Banbury test set.
# The call stays as the cell's final bare expression so that the notebook
# displays the returned table.
print("Banbury Test Data:")
calculate_kappa_score(banbury_test["Reviewer_A"], banbury_test["Reviewer_B"])

Banbury Test Data:


Unnamed: 0,urinary,respiratory,abdominal,neurological,skin_soft_tissue,ent,orthopaedic,other_specific,no_specific_source,prophylaxis,uncertainty,not_informative,Average
Kappa Score,0.96,0.98,0.85,0.92,0.95,0.66,0.94,0.45,0.93,0.94,0.93,0.9,0.87
