## Notebook Setup

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from guidedsum.evaluation import load_summaries

- Shuffle reports
- Write tasks

```
{
   "id": "..",
   "batch": "..",
   "reference": "..",
   "candidate0": "..",
   "candidate1": "..",
   "candidate2": "..",
   "candidate3": "..",
   "candidate_order": [
       "candidate0",
       "candidate1",
       "candidate2",
       "candidate3"
   ]
}
```

### Configuration

Paths of the files that this notebook generates:

In [2]:
# JSON file receiving the shuffled reports (reference impression plus candidate
# summaries); written once by the persist cell further down.
REPORTS_PATH = Path('../error-analysis/data/reports.json')
# Excel sheet with one row per report id and the columns batch, a1, a2, a3;
# also written once by the persist cell further down.
DATABASE_PATH = Path('../error-analysis/data/assignments.xlsx')

## Prepare data

In [3]:
# Copy the oracle run's `model_step_best.txt` into the clip-threshold run directory,
# which is the directory listed for 'gsum_thresholding' in MODELS; best_step() reads
# that file name from each run directory.
# NOTE(review): presumably both runs use the same checkpoint step — confirm.
!cp ../output/mimic-official-bg-oracle/gsum-default/model_step_best.txt ../output/mimic-official-bg-bertext-default-clip-threshold/gsum-default

In [4]:
# One (display name, run output directory) pair per system under comparison.
MODELS = [
    ('bertabs', '../output/mimic-official-bg-unguided/bertabs-default/'),
    ('gsum_thresholding', '../output/mimic-official-bg-bertext-default-clip-threshold/gsum-default/'),
    ('wgsum', '../output/mimic-official-bg-wgsum/wgsum-default/'),
    ('wgsum+cl', '../output/mimic-official-bg-wgsum-cl/wgsum-cl-default//'),
]
# Display names only, in the same order as MODELS.
MODEL_NAMES = [entry[0] for entry in MODELS]

### Load reference and candidates

In [5]:
# Load `reports.test.json` and keep only the report id, the `findings+bg` text
# and the reference impression.
df_impression = pd.read_json('../data/processed/mimic-official/reports.test.json')[
    ['id', 'findings+bg', 'impression']
]
df_impression.head(2)

Unnamed: 0,id,findings+bg,impression
0,50010466,History:\n_-year-old male with cough and histo...,No evidence of acute cardiopulmonary process.
1,50014127,Indication:\n_-year-old male with HIV with sha...,"Vague bibasilar opacities, which may represent..."


In [6]:
def best_step(run_path):
    """Return the step number stored in ``<run_path>/model_step_best.txt``.

    Args:
        run_path: Run directory, given as ``str`` or ``Path``.

    Returns:
        The file content parsed as ``int`` (surrounding whitespace is ignored),
        or ``-1`` when the file does not exist.
    """
    marker_file = Path(run_path) / 'model_step_best.txt'
    try:
        content = marker_file.read_text()
    except FileNotFoundError:
        return -1
    return int(content.strip())


def postprocess_candidate(s: str):
    """Clean up a generated summary by applying fixed text substitutions.

    The ``<q>`` marker becomes a space first; afterwards the spaces in front
    of periods, around commas, possessive ``'s`` and hyphens are removed.
    The substitutions run in the listed order, which matters because later
    patterns match text produced by earlier ones.
    """
    substitutions = (
        ('<q>', ' '),
        (' .', '.'),
        (' , ', ', '),
        (" 's", "'s"),
        (" ' s", "'s"),
        (" - ", "-"),
    )
    for old, new in substitutions:
        s = s.replace(old, new)
    return s
    


# One Series of post-processed candidate summaries per model, named after the model.
runs = [
    load_summaries(run_dir, best_step(run_dir))['candidate']
    .rename(model_name)
    .apply(postprocess_candidate)
    for model_name, run_dir in MODELS
]

# Put the models side by side: one column per model.
df_runs = pd.concat(runs, axis=1)
df_runs.head(2)

Unnamed: 0,bertabs,gsum_thresholding,wgsum,wgsum+cl
0,no acute cardiopulmonary process.,no acute cardiopulmonary process.,no acute cardiopulmonary process.,no acute cardiopulmonary process.
1,vague bibasilar opacities are nonspecific but ...,vague bibasilar opacities are nonspecific but ...,"vague bibasilar opacities, which may represent...",vague bibasilar opacities are nonspecific but ...


### Generate random display order and shuffle reports

In [7]:
# Side-by-side frame: the columns of df_impression followed by one candidate column per model.
# NOTE(review): pd.concat(axis=1) aligns on the index — assumes both frames share the same row index; confirm.
df_reports = pd.concat([df_impression, df_runs], axis=1)

# Seed NumPy's global RNG so the per-row candidate orders drawn below are repeatable.
np.random.seed(42)
def arrange_candidates(s):
    """Assign the row's model outputs to ``candidate0..N`` in a random order.

    Draws a random permutation of ``MODEL_NAMES`` from NumPy's global RNG,
    stores it under ``order`` and copies ``s[model_name]`` into
    ``candidate{i}`` for the i-th name of that permutation. The row ``s`` is
    modified in place and returned.
    """
    order = np.random.choice(MODEL_NAMES, size=len(MODEL_NAMES), replace=False)
    s['order'] = order
    for i, model_name in enumerate(order):
        s[f'candidate{i}'] = s[model_name]
    return s

# randomly order candidates and put them into separate columns (candidate0, candidate1 ...)
df_reports = df_reports.apply(arrange_candidates, axis=1)
# drop obsolete columns
df_reports = df_reports[['id', 'findings+bg', 'impression', 'order'] + [f'candidate{i}' for i in range(len(MODEL_NAMES))]]
# shuffle dataset
# (sampling all rows without replacement with a fixed random_state gives a repeatable row permutation)
df_reports = df_reports.sample(len(df_reports), random_state=42)
df_reports.head(2)

Unnamed: 0,id,findings+bg,impression,order,candidate0,candidate1,candidate2,candidate3
803,55187337,Comparison:\n_.\n\nHistory:\nLow-grade fever.\...,New left lower lobe infiltrate and effusion.,"[wgsum+cl, bertabs, wgsum, gsum_thresholding]",new left lower lobe infiltrate and small left ...,new left lower lobe infiltrate and effusion.,new left lower lobe infiltrate.,new left lower lobe infiltrate and small left ...
124,50848467,"Indication:\n_-year-old woman with fever, eval...",Slight increased hazy opacities at the right l...,"[wgsum, bertabs, wgsum+cl, gsum_thresholding]",slightly increased hazy opacities at the right...,slightly increased hazy opacity at the right l...,slightly increased hazy opacities at the right...,slightly increased hazy opacities at the right...


### Persist summaries and database

In [8]:
from pathlib import Path

In [9]:
## Reports: reference impression and candidates
# Both files are written only once: an existing file is never overwritten, so
# re-running this cell cannot replace a report file or assignment sheet that
# is already on disk.
if REPORTS_PATH.is_file():
    print('Tasks file already exist, do not overwrite.')
else:
    # Create the target directory first; `to_json` does not create missing parents.
    REPORTS_PATH.parent.mkdir(parents=True, exist_ok=True)
    df_reports.to_json(REPORTS_PATH, orient='records')


## Assignment database
if DATABASE_PATH.is_file():
    print('Databse file already exist, do not overwrite.')
else:
    # One row per report id (used as index); the batch and annotator columns
    # (a1, a2, a3) start out empty.
    df_database = (
        df_reports[['id']]
        .assign(batch=None, a1=None, a2=None, a3=None)
        .set_index('id')
    )
    # Same safeguard as above: make sure the target directory exists.
    DATABASE_PATH.parent.mkdir(parents=True, exist_ok=True)
    df_database.to_excel(DATABASE_PATH)

Tasks file already exist, do not overwrite.
Databse file already exist, do not overwrite.


## Generate Assignments

Use the cell below to generate annotator assignments for a specific batch. Paste the printed lines into the Excel sheet created above and select `text to columns -> delimited -> comma`.

In [10]:
from collections import Counter
import random
from pprint import pprint

# Configuration
N_TASKS = 100
N_ANNOTATORS_PER_TASK = 3
ANNOTATORS = [
    'annotator1',
    'annotator2',
    'annotator3',
    'annotator4',
    'annotator5',
    'annotator6',
]
BATCH = 'batch2'


def generate_assignments(annotators, n_tasks, n_per_task, rng=random):
    """Assign ``n_per_task`` distinct annotators to each of ``n_tasks`` tasks.

    Annotators are dealt from a shuffled stack that is reshuffled whenever it
    runs empty, so every annotator is drawn once per round and the workload
    stays balanced. When the stack is refilled in the middle of a task,
    annotators already on that task are skipped, so a task never lists the
    same annotator twice. This matters whenever the number of annotators is
    not a multiple of ``n_per_task``.

    Args:
        annotators: Iterable of annotator names; duplicates are ignored.
        n_tasks: Number of tasks to generate.
        n_per_task: Number of annotators per task.
        rng: Object providing ``shuffle(list)``. Defaults to the ``random``
            module; pass ``random.Random(seed)`` for a reproducible result.

    Returns:
        A list with ``n_tasks`` entries, each a list of ``n_per_task`` names.

    Raises:
        ValueError: If ``n_per_task`` exceeds the number of distinct annotators.
    """
    pool = list(dict.fromkeys(annotators))  # de-duplicate, keep the given order
    if n_per_task > len(pool):
        raise ValueError(
            f'Cannot put {n_per_task} distinct annotators on a task: '
            f'only {len(pool)} available.'
        )

    stack = []
    assignments = []
    for _ in range(n_tasks):
        task = []
        for _ in range(n_per_task):
            if not stack:
                # refill
                stack = pool.copy()
                rng.shuffle(stack)
            # Take the topmost annotator who is not on this task yet. Anyone
            # skipped stays on the stack and is dealt to a later task.
            pick = next(i for i in reversed(range(len(stack))) if stack[i] not in task)
            task.append(stack.pop(pick))
        assignments.append(task)
    return assignments


# Generate assignments
tasks = generate_assignments(ANNOTATORS, N_TASKS, N_ANNOTATORS_PER_TASK)

# Print number of tasks per annotator
print('Tasks per annotator')
pprint(Counter(a for task in tasks for a in task))
print()

# One line per task: batch label followed by the assigned annotators, comma-separated.
print('Assignments:\n')
for assignment in tasks:
    print(f'{BATCH},'+ ','.join(assignment))

Tasks per annotator
Counter({'annotator5': 50,
         'annotator4': 50,
         'annotator2': 50,
         'annotator3': 50,
         'annotator1': 50,
         'annotator6': 50})

Assignments:

batch2,annotator5,annotator4,annotator2
batch2,annotator3,annotator1,annotator6
batch2,annotator5,annotator3,annotator2
batch2,annotator6,annotator4,annotator1
batch2,annotator1,annotator4,annotator3
batch2,annotator5,annotator6,annotator2
batch2,annotator5,annotator6,annotator3
batch2,annotator2,annotator1,annotator4
batch2,annotator1,annotator4,annotator3
batch2,annotator2,annotator5,annotator6
batch2,annotator3,annotator1,annotator6
batch2,annotator4,annotator2,annotator5
batch2,annotator5,annotator3,annotator4
batch2,annotator2,annotator6,annotator1
batch2,annotator2,annotator3,annotator1
batch2,annotator6,annotator4,annotator5
batch2,annotator5,annotator1,annotator2
batch2,annotator4,annotator6,annotator3
batch2,annotator2,annotator5,annotator4
batch2,annotator3,annotator1,annotator6
ba

## Load preprocessed data

In [11]:
# Read the persisted files back in; the report id column is named `study_id` from here on.
df = pd.read_json(REPORTS_PATH).rename(columns={'id': 'study_id'})
database = pd.read_excel(DATABASE_PATH).rename(columns={'id': 'study_id'})

display(df.head(2), database.head(2))

Unnamed: 0,study_id,findings+bg,impression,order,candidate0,candidate1,candidate2,candidate3
0,55187337,Comparison:\n_.\n\nHistory:\nLow-grade fever.\...,New left lower lobe infiltrate and effusion.,"[wgsum+cl, bertabs, wgsum, gsum_thresholding]",new left lower lobe infiltrate and small left ...,new left lower lobe infiltrate and effusion.,new left lower lobe infiltrate.,new left lower lobe infiltrate and small left ...
1,50848467,"Indication:\n_-year-old woman with fever, eval...",Slight increased hazy opacities at the right l...,"[wgsum, bertabs, wgsum+cl, gsum_thresholding]",slightly increased hazy opacities at the right...,slightly increased hazy opacity at the right l...,slightly increased hazy opacities at the right...,slightly increased hazy opacities at the right...


Unnamed: 0,study_id,batch,a1,a2,a3
0,55187337,batch0,all,all,
1,50848467,batch0,all,all,


In [12]:
out_path = Path('../data/interim/error-analysis-tasks/')
# Batch labels present in the assignment sheet; rows without a batch are ignored.
batches = database['batch'].dropna().unique()

# Write one task file per (batch, annotator) pair.
for batch in batches:
    in_batch = database['batch'] == batch

    for annotator in ANNOTATORS:
        # A row is selected when any of a1/a2/a3 holds this annotator's name
        # or the literal value 'all'.
        assigned = database[['a1', 'a2', 'a3']].isin([annotator, 'all']).any(axis=1)
        mask = in_batch & assigned

        print(f'{annotator:<18} {batch} tasks = {mask.sum()}')

        batch_ids = database.loc[mask, 'study_id']

        out_file = out_path / f'{batch}-{annotator}-tasks.json'
        out_file.parent.mkdir(exist_ok=True, parents=True)

        # Selected reports, with the batch label attached as an extra column.
        tasks = df.loc[df['study_id'].isin(batch_ids)].assign(batch=batch)
        tasks.to_json(out_file, orient='records')

annotator1         batch0 tasks = 10
annotator2         batch0 tasks = 10
annotator3         batch0 tasks = 10
annotator4         batch0 tasks = 10
annotator5         batch0 tasks = 10
annotator6         batch0 tasks = 10
annotator1         batch1 tasks = 10
annotator2         batch1 tasks = 10
annotator3         batch1 tasks = 10
annotator4         batch1 tasks = 10
annotator5         batch1 tasks = 10
annotator6         batch1 tasks = 10
annotator1         batch2 tasks = 50
annotator2         batch2 tasks = 50
annotator3         batch2 tasks = 50
annotator4         batch2 tasks = 50
annotator5         batch2 tasks = 50
annotator6         batch2 tasks = 50
