# IAA Quick Check

Khuyen and Sandra, first 200 notes.

In [1]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import cohen_kappa_score, confusion_matrix

In [2]:
# Annotation exports for the two annotators (Khuyen = K, Sandra = S) and the
# file that was originally handed to them.
FILE_K = '../data/interim/2020-09-22-kd-200-annotations.csv'
FILE_S = '../data/interim/2020-09-28-sz-200-annotations.csv'
FILE_original = '../data/interim/caregivers_set13Jul2020.csv'

# Number of leading DataFrame rows treated as annotated; the markdown note
# right after this cell explains where this value comes from.
N_ROWS_TO_USE = 234

# The original file is loaded only for reference; it may be useful later.
df_o = pd.read_csv(FILE_original)
df_k, df_s = (pd.read_csv(path) for path in (FILE_K, FILE_S))

# `id` values of the leading rows of each export (repeated ids are possible).
ids_k = df_k['id'].iloc[:N_ROWS_TO_USE]
ids_s = df_s['id'].iloc[:N_ROWS_TO_USE]

# Ids present in both exports; expected to be the same 200 ids for both.
ids = set(ids_k) & set(ids_s)

# Sanity checks: identical id columns in both exports, and 200 distinct ids.
assert ids_k.equals(ids_s)
assert len(ids) == 200

#### Note on Clinical Regex output peculiarity
The rows in the imported DataFrame don't actually correspond to the entry numbers displayed when using Clinical Regex. 

For example, Khuyen's file loaded into Clinical Regex indicates she stopped at entry 201, which has `id=105582` (`id` here corresponds to `HADM_ID` in the original data file). But `id=105582` is at index 234 in the DataFrame (see below).

For now I will assume the set of unique values in `ids_all[:234]` (i.e. the rows before index 234) is the set which was annotated. (The size of this set does indeed turn out to be 200, so it's plausible that this is the case.)

In [3]:
# I use Khuyen's data for this, but it's easy to check it's the same case for Sandra's as well

# Full `id` column of Khuyen's export, plus its distinct values in order of
# first appearance.
ids_all = df_k['id']
ids_unique = ids_all.unique()

# The `id` at which Clinical Regex reports annotation stopped (entry 201).
id_final = 105582
id_final_index = ids_all.tolist().index(id_final)

print('The index of the final id within the imported DataFrame is {}.'.format(id_final_index))

# The final id is the 201st distinct id (position 200) ...
assert ids_unique.tolist().index(id_final) == 200
# ... and the rows before its first occurrence hold exactly 200 distinct ids.
assert len(ids_all.iloc[:id_final_index].unique()) == 200

The index of the final id within the imported DataFrame is 234.


## Minor Cleanup

- Reshape annotation data (annotations are provided in a pseudo-JSON string column)
- Resolve rows with duplicate `id`s
- Select rows that were annotated by both annotators

In [4]:
def reshape_cr_json(df):
    """Expand ClinicalRegex's 'annotationValues' JSON strings into columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'annotationValues' column whose cells are JSON strings.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` followed by one column per key found in the
        parsed JSON, with the same rows and the same index as ``df``.

    Raises
    ------
    Whatever ``json.loads`` raises when a cell is not a valid JSON string.
    """
    parsed = df['annotationValues'].map(json.loads)
    df_annotations = pd.json_normalize(parsed.tolist())

    # concat(axis=1) aligns on index labels. Normalizing a plain list yields a
    # fresh 0..n-1 index, so re-attach the caller's index explicitly; otherwise
    # a frame with a non-default index (e.g. after filtering) would come back
    # with misaligned, NaN-padded rows.
    df_annotations.index = df.index

    return pd.concat([df, df_annotations], axis=1)

def resolve_annotations(arr, binary=False):
    """Reduce the annotations of one entry (here: one HADM) to a single value.

    Rules:
    - an empty input is handed back unchanged;
    - if all values agree, that value is returned;
    - a mix of exactly 0 and 1 resolves to 1 (any positive note wins);
    - any other mix raises ``ValueError``.

    With ``binary=True`` every 9 is treated as a 1 before the rules apply.
    """
    values = [1 if v == 9 else v for v in arr] if binary else arr

    # Nothing to resolve.
    if len(values) == 0:
        return values

    distinct = set(values)

    # Unanimous annotations: return the first (and only distinct) value.
    if len(distinct) == 1:
        return next(iter(values))

    if distinct == {0, 1}:
        return 1

    raise ValueError('Resolution of annotations unclear.', values)
        
def get_annotation_values(df, binary=False):
    """Collapse the annotation columns to one row per `id`.

    Rows are grouped by 'id' and each of 'L1_annotation', 'L2_annotation'
    and 'L3_annotation' is reduced with `resolve_annotations`; `binary` is
    forwarded to it. Returns a frame with 'id' as a regular column again.
    """
    label_columns = ('L1_annotation', 'L2_annotation', 'L3_annotation')

    def resolve(values):
        return resolve_annotations(values, binary)

    per_id = df.groupby('id').agg({column: resolve for column in label_columns})
    return per_id.reset_index()

def select_annotated_rows(df, common_ids=None):
    """Keep only the rows whose 'id' is among the commonly annotated ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'id' column.
    common_ids : collection, optional
        Ids to keep. When omitted, the notebook-level `ids` set (ids annotated
        by both annotators) is used, so existing one-argument calls and
        `.pipe(select_annotated_rows)` behave as before.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` whose 'id' is in ``common_ids``.
    """
    if common_ids is None:
        # Fall back to the notebook global; looked up at call time.
        common_ids = ids

    return df[df['id'].isin(common_ids)]

In [5]:
# raw annotations
df_k_raw = df_k.pipe(reshape_cr_json)\
               .pipe(get_annotation_values)\
               .pipe(select_annotated_rows)
df_s_raw = df_s.pipe(reshape_cr_json)\
               .pipe(get_annotation_values)\
               .pipe(select_annotated_rows)

# annotations if 9 is resolved to 1
df_k_res = df_k.pipe(reshape_cr_json)\
               .pipe(get_annotation_values, binary=True)\
               .pipe(select_annotated_rows)
df_s_res = df_s.pipe(reshape_cr_json)\
               .pipe(get_annotation_values, binary=True)\
               .pipe(select_annotated_rows)

## Cohen Kappa Score

### Label 1: Child

In [6]:
# Label 1 agreement between Khuyen and Sandra on the raw annotation values.
cohen_kappa_score(y1=df_k_raw['L1_annotation'], y2=df_s_raw['L1_annotation'])

0.8299167487243756

In [7]:
# Label 1 agreement between Khuyen and Sandra with every 9 resolved to 1.
cohen_kappa_score(y1=df_k_res['L1_annotation'], y2=df_s_res['L1_annotation'])

0.8779405237461163

### Label 2: Spouse

In [8]:
# Label 2 agreement between Khuyen and Sandra on the raw annotation values.
cohen_kappa_score(y1=df_k_raw['L2_annotation'], y2=df_s_raw['L2_annotation'])

0.8181893310193793

In [9]:
# Label 2 agreement between Khuyen and Sandra with every 9 resolved to 1.
cohen_kappa_score(y1=df_k_res['L2_annotation'], y2=df_s_res['L2_annotation'])

0.8984513835998984

## Confusion Matrix / Value Counts

In [10]:
# How often Khuyen used each raw value, per label column.
df_k_raw[['L1_annotation', 'L2_annotation']].apply(lambda column: column.value_counts())

Unnamed: 0,L1_annotation,L2_annotation
1,116,105
0,66,77
9,18,18


In [11]:
# How often Sandra used each raw value, per label column.
df_s_raw[['L1_annotation', 'L2_annotation']].apply(lambda column: column.value_counts())

Unnamed: 0,L1_annotation,L2_annotation
1,111,87
0,67,80
9,20,28
5,2,5


In [12]:
# Label 1, raw values: rows follow Khuyen's annotations, columns Sandra's.
confusion_matrix(y_true=df_k_raw['L1_annotation'], y_pred=df_s_raw['L1_annotation'])

array([[ 62,   4,   0,   0],
       [  3, 105,   2,   6],
       [  0,   0,   0,   0],
       [  2,   2,   0,  14]])

In [13]:
# Label 2, raw values: rows follow Khuyen's annotations, columns Sandra's.
confusion_matrix(y_true=df_k_raw['L2_annotation'], y_pred=df_s_raw['L2_annotation'])

array([[76,  1,  0,  0],
       [ 4, 85,  5, 11],
       [ 0,  0,  0,  0],
       [ 0,  1,  0, 17]])