# IAA Quick Check

Khuyen and Sandra, first 200 notes.

In [1]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import cohen_kappa_score, confusion_matrix, precision_score, recall_score

In [2]:
# Annotation exports (K = Khuyen, S = Sandra) and the file originally given to the annotators
FILE_K = '../data/raw/2020-09-22-kd-200-annotations.csv'
FILE_S = '../data/raw/2020-09-28-sz-200-annotations.csv'
FILE_original = '../data/raw/caregivers_set13Jul2020.csv'

# names the annotation columns are given once they have been extracted
COL_CHILD = 'child'
COL_SPOUSE = 'spouse'

# number of leading DataFrame rows treated as annotated; the note further down explains the value
N_ROWS_TO_USE = 234

# original file provided to annotators, kept in case it is helpful later
df_o = pd.read_csv(FILE_original)
df_k, df_s = (pd.read_csv(path) for path in (FILE_K, FILE_S))

ids_k = df_k['id'].iloc[:N_ROWS_TO_USE]
ids_s = df_s['id'].iloc[:N_ROWS_TO_USE]

# ids annotated by both; the two asserts check the expectation that
# ids_k and ids_s are identical and contain exactly 200 distinct ids
ids = set(ids_k).intersection(ids_s)

assert ids_k.equals(ids_s)
assert len(ids) == 200

In [3]:
# Khuyen's raw ClinicalRegex export; pandas abbreviates the display to the first and last rows.
df_k

Unnamed: 0,id,text,L1,L2,L3,annotationValues,labels,settings,update
0,103165,[. . .] ...,"[{""keyword"":""daughter"",""start"":8194,""length"":8}]",[],[],"{""L1_annotation"":1,""L2_annotation"":0,""L3_annot...","{""L1_title"":""child"",""L1_keywords"":""child, chil...","{""reviewMode"":""0"",""displayMode"":""0"",""numWords""...",0
1,142313,[. . .] ...,"[{""keyword"":""daughter"",""start"":9857,""length"":8}]",[],[],"{""L1_annotation"":1,""L2_annotation"":0,""L3_annot...",,,0
2,160258,[. . .] ...,[],"[{""keyword"":""husband"",""start"":5591,""length"":7}]",[],"{""L1_annotation"":0,""L2_annotation"":1,""L3_annot...",,,0
3,134799,[. . .] ...,[],"[{""keyword"":""girlfriend"",""start"":661,""length"":...",[],"{""L1_annotation"":0,""L2_annotation"":1,""L3_annot...",,,0
4,130639,[. . .] ...,[],"[{""keyword"":""fiance"",""start"":4078,""length"":6}]",[],"{""L1_annotation"":0,""L2_annotation"":1,""L3_annot...",,,0
...,...,...,...,...,...,...,...,...,...
9179,138719,[. . .] ...,"[{""keyword"":""Son"",""start"":11537,""length"":3},{""...",[],[],"{""L1_annotation"":1,""L2_annotation"":0,""L3_annot...",,,0
9180,194618,[. . .] ...,[],"[{""keyword"":""husband"",""start"":4952,""length"":7}]",[],"{""L1_annotation"":0,""L2_annotation"":1,""L3_annot...",,,0
9181,138836,[. . .] ...,"[{""keyword"":""son"",""start"":5057,""length"":3},{""k...",[],[],"{""L1_annotation"":1,""L2_annotation"":0,""L3_annot...",,,0
9182,134768,This is a 63yoM w/h/o 4V CABG in [**4-21**] an...,[],"[{""keyword"":""wife"",""start"":198,""length"":4},{""k...",[],"{""L1_annotation"":0,""L2_annotation"":1,""L3_annot...",,,0


#### Note on Clinical Regex output peculiarity
The rows in the imported DataFrame don't actually correspond to the entry numbers displayed when using Clinical Regex. 

For example, Khuyen's file loaded into Clinical Regex indicates she stopped at entry 201, which has `id=105582` (`id` here corresponds to `HADM_ID` in the original data file). But `id=105582` is at index 234 in the DataFrame (see below).

For now I will assume the set of unique values in `ids_all[:234]` (the rows before the index of the final id) is the set which was annotated. (The size of this set here does indeed turn out to be 200, so it's plausible that this is the case.)

In [4]:
# Shown on Khuyen's data; the same check can easily be repeated on Sandra's.

id_final = 105582

ids_all = df_k['id']
ids_unique = ids_all.unique()

# position of the first row carrying the final id
id_final_index = ids_all.tolist().index(id_final)

print('The index of the final id within the imported DataFrame is {}.'.format(id_final_index))

# the final id is the 201st distinct id, and the rows before it hold exactly 200 distinct ids
assert ids_unique.tolist().index(id_final) == 200
assert len(ids_all.iloc[:id_final_index].unique()) == 200

The index of the final id within the imported DataFrame is 234.


## Minor Cleanup

- Reshape annotation data (annotations are provided in a pseudo-JSON string column)
- Resolve rows with duplicate `id`s
- Select rows that were annotated by both annotators

In [5]:
def reshape_cr_json(df):
    """Expand ClinicalRegex's 'annotationValues' JSON strings into columns.

    Every entry of ``df['annotationValues']`` is parsed with ``json.loads`` and
    flattened with ``pd.json_normalize``; the parsed columns are appended to
    the right of the original ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'annotationValues' column holding JSON strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df``, containing the
        original columns followed by one column per parsed JSON key.
    """
    parsed = df['annotationValues'].map(json.loads).tolist()
    df_annotations = pd.json_normalize(parsed)

    # json_normalize numbers its rows 0..n-1, while concat(axis=1) pairs rows by
    # index label. Reusing the caller's index keeps each parsed row next to the
    # row it came from even when df has been filtered or re-indexed.
    df_annotations.index = df.index

    return pd.concat([df, df_annotations], axis=1)

def resolve_annotations(arr):
    """Collapse the annotations of one entry (here: a HADM) into one value.

    Rules, applied in order:
      * an empty input is handed back unchanged;
      * if all notes carry the same value, that value is returned;
      * a mix of exactly 0 and 1 resolves to 1 (any note annotated 1 wins);
      * any other mix raises ValueError.
    """
    distinct = set(arr)

    if len(arr) <= 0:
        return arr

    if len(distinct) == 1:
        # all values agree, so the first one stands for the group
        return next(iter(arr))

    if distinct == {0, 1}:
        return 1

    raise ValueError('Resolution of annotations unclear/undecided.', arr)
        
def get_annotation_values(df):
    """Aggregate the frame to one row per 'id'.

    The two annotation columns are collapsed with ``resolve_annotations``;
    'L1', 'L2' and 'text' are gathered into a list per id. The 'id' group key
    is returned as a regular column.
    """
    how = {
        'L1_annotation': resolve_annotations,
        'L2_annotation': resolve_annotations,
        'L1': list,
        'L2': list,
        'text': list,
    }
    per_id = df.groupby('id').agg(how)
    return per_id.reset_index()

def rename_columns(df):
    """Return df with the L1/L2 annotation columns renamed to COL_CHILD / COL_SPOUSE."""
    new_names = {
        'L1_annotation': COL_CHILD,
        'L2_annotation': COL_SPOUSE,
    }
    return df.rename(columns=new_names)
    

def select_annotated_rows(df):
    """Keep only the rows whose 'id' is in the module-level set ``ids``
    (the annotations common to both annotators)."""
    in_common = df['id'].isin(ids)
    return df.loc[in_common]

In [6]:
# Same four steps for both annotators: parse the JSON annotations, collapse to
# one row per id, rename the label columns, keep the commonly annotated ids.
# NOTE: this overwrites the raw frames, so the load cell must be re-run before
# this cell can be executed again.
df_k = (
    df_k.pipe(reshape_cr_json)
        .pipe(get_annotation_values)
        .pipe(rename_columns)
        .pipe(select_annotated_rows)
)
df_s = (
    df_s.pipe(reshape_cr_json)
        .pipe(get_annotation_values)
        .pipe(rename_columns)
        .pipe(select_annotated_rows)
)

# The agreement metrics computed from these frames pair them row by row, so
# both must list the same ids in the same order.
assert list(df_k['id']) == list(df_s['id']), 'annotator frames are not aligned on id'

## Cohen Kappa Score

### Label 1: Child

In [7]:
# Agreement between Khuyen (df_k) and Sandra (df_s) on the child label.
# Cohen's kappa is symmetric, so the argument order does not affect the score.
cohen_kappa_score(df_k[COL_CHILD],
                  df_s[COL_CHILD])

0.8299167487243756

### Label 2: Spouse

In [8]:
# Agreement between Khuyen (df_k) and Sandra (df_s) on the spouse label.
# Cohen's kappa is symmetric, so the argument order does not affect the score.
cohen_kappa_score(df_k[COL_SPOUSE],
                  df_s[COL_SPOUSE])

0.8181893310193793

## Confusion Matrix / Value Counts

In [9]:
# Child label: rows are Khuyen's values (first argument), columns are Sandra's
# (second argument); classes appear in sorted order of the label values.
confusion_matrix(df_k[COL_CHILD],
                 df_s[COL_CHILD])

array([[ 62,   4,   0,   0],
       [  3, 105,   2,   6],
       [  0,   0,   0,   0],
       [  2,   2,   0,  14]])

In [10]:
# Spouse label: rows are Khuyen's values (first argument), columns are Sandra's
# (second argument); classes appear in sorted order of the label values.
confusion_matrix(df_k[COL_SPOUSE],
                 df_s[COL_SPOUSE])

array([[76,  1,  0,  0],
       [ 4, 85,  5, 11],
       [ 0,  0,  0,  0],
       [ 0,  1,  0, 17]])

## Negative Note Check

In [11]:
# Only the HADMIN column is read; the cells below use just the number of rows in this sheet.
df_neg = pd.read_excel("../data/raw/Dr.Cooper Anaylsis .xlsx", usecols=["HADMIN"])

In [12]:
# sklearn compatible objects

# all are true negatives, for both labels
to_add = pd.Series([0]*len(df_neg))


def _recode_and_pad(labels, nine_as):
    """Replace label value 9 with `nine_as`, then append the `to_add` zeros.

    Returns a new Series with a fresh 0..n-1 index.
    """
    recoded = labels.replace(9, nine_as).reset_index(drop=True)
    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat([recoded, to_add], ignore_index=True)


# In the *_true series a 9 becomes 0 and in the *_pred series it becomes 1,
# i.e. a 9 is counted as a false positive.
child_true = _recode_and_pad(df_k['child'], 0)
child_pred = _recode_and_pad(df_k['child'], 1)
spouse_true = _recode_and_pad(df_k['spouse'], 0)
spouse_pred = _recode_and_pad(df_k['spouse'], 1)

print(len(child_true))

222


In [13]:
def precision_recall(label):
    """Precision and recall for one label column of the module-level ``df_k``.

    Counting convention used here:
      * value 1 -> true positive
      * value 9 -> false positive
      * false negatives are fixed at 0, so recall is 1.0 whenever at least
        one true positive exists.

    True negatives do not enter either formula and are therefore not counted.

    Parameters
    ----------
    label : str
        Name of the column of ``df_k`` to evaluate (e.g. 'child', 'spouse').

    Returns
    -------
    tuple of float
        ``(precision, recall)``. A component is ``nan`` when its denominator
        is zero, i.e. when the ratio is undefined.
    """
    values = df_k[label]

    # int(...) keeps the counts (and so the returned ratios) plain Python numbers
    tp = int((values == 1).sum())
    fp = int((values == 9).sum())
    fn = 0

    def _ratio(numerator, denominator):
        # Undefined ratio (no predicted / no actual positives) -> nan instead of ZeroDivisionError
        return numerator / denominator if denominator else float('nan')

    return _ratio(tp, tp + fp), _ratio(tp, tp + fn)

In [14]:
# (precision, recall) for the child label, computed from Khuyen's annotations (df_k).
precision_recall('child')

(0.8656716417910447, 1.0)

In [15]:
# (precision, recall) for the spouse label, computed from Khuyen's annotations (df_k).
precision_recall('spouse')

(0.8536585365853658, 1.0)