## Notebook Setup

### Imports

In [1]:
import re
from pathlib import Path
from collections import Counter, defaultdict
from pprint import pprint
from typing import List
import json
import dataclasses

import numpy as np
import pandas as pd

In [2]:
from guidedsum.annotation_utils import Annotation, group_spans, label_groups, apply_votings, annotator_agreement, annotator_agreement_three, expand_to_borders

### Configuration

In [3]:
# Input locations, given relative to the notebook's working directory.
ANNOTATIONS_PATH = Path('../error-analysis/annotations/')  # directory globbed for annotator*.json exports
REPORTS_PATH = Path('../error-analysis/data/reports.json')  # loaded with pd.read_json
DATABASE_PATH = Path('../error-analysis/data/assignments.xlsx')  # loaded with pd.read_excel

# Label studio ID -> anonymized names
# Applied to the numeric `annotator` column of the loaded annotations.
USERS = {
    5: "annotator1", # this user has two accounts (IDs 5 and 12); both map to the same name
    12: "annotator1", # second account of annotator1
    6: "annotator2",
    9: "annotator3",
    7: "annotator4",
    11: "annotator5",
    3: "annotator6",
}

## Load Data

Reports

In [4]:
# Load the reports and expose their `id` column under the name `study_id`.
df_reports = pd.read_json(REPORTS_PATH).rename(columns={'id': 'study_id'})
df_reports.head(2)

Unnamed: 0,study_id,findings+bg,impression,order,candidate0,candidate1,candidate2,candidate3
0,55187337,Comparison:\n_.\n\nHistory:\nLow-grade fever.\...,New left lower lobe infiltrate and effusion.,"[wgsum+cl, bertabs, wgsum, gsum_thresholding]",new left lower lobe infiltrate and small left ...,new left lower lobe infiltrate and effusion.,new left lower lobe infiltrate.,new left lower lobe infiltrate and small left ...
1,50848467,"Indication:\n_-year-old woman with fever, eval...",Slight increased hazy opacities at the right l...,"[wgsum, bertabs, wgsum+cl, gsum_thresholding]",slightly increased hazy opacities at the right...,slightly increased hazy opacity at the right l...,slightly increased hazy opacities at the right...,slightly increased hazy opacities at the right...


Assignments

In [5]:
# Load the assignment sheet, turn the per-annotator columns into rows
# (one row per study/annotator slot) and keep only the assignments of batch2.
df_assignments = (
    pd.read_excel(DATABASE_PATH)
    .loc[lambda sheet: ~sheet['batch'].isna()]  # rows without a batch are dropped
    .rename(columns={'id': 'study_id'})
    .melt(id_vars=['study_id', 'batch'], var_name='annotator_id', value_name='annotator')
    .sort_values(['study_id', 'annotator_id'])
    .loc[lambda assignments: assignments['batch'] == 'batch2']
)
df_assignments.head()

Unnamed: 0,study_id,batch,annotator_id,annotator
41,50126222,batch2,a1,annotator2
161,50126222,batch2,a2,annotator6
281,50126222,batch2,a3,annotator5
24,50178679,batch2,a1,annotator4
144,50178679,batch2,a2,annotator5


Annotations

In [6]:
# Collect the per-annotator export files (sorted for a deterministic row order).
annotation_files = sorted(ANNOTATIONS_PATH.glob('annotator*.json'))
pprint(annotation_files)

# One frame for all annotators; remember which file every row came from.
df_annotations = pd.concat(
    (pd.read_json(f).assign(annotation_file=f.name) for f in annotation_files),
    ignore_index=True,
)

# Rows without an annotator ID are reported as missing annotations and dropped.
missing_mask = df_annotations['annotator'].isna()
if missing_mask.any():
    print('WARNING: Missing annotations for following reports: ')
    display(df_annotations.loc[missing_mask, ['annotation_file', 'batch', 'study_id']])
# Work on an explicit copy so that the column assignments below never write into
# a slice of the concatenated frame (avoids pandas' SettingWithCopyWarning).
df_annotations = df_annotations.loc[~missing_mask].copy()

# Label studio user ID -> anonymized annotator name
df_annotations['annotator'] = df_annotations['annotator'].apply(lambda x: USERS[x])

# The duration is self-reported free text: keep the leading integer only.
# Values without leading digits become NaN instead of aborting the whole cell.
df_annotations['duration-minutes'] = (
    df_annotations['duration-minutes']
    .astype(str)
    .str.strip()
    .str.extract(r'^(\d+)', expand=False)
    .astype(float)
)

df_annotations = df_annotations.drop('id', axis=1) # remove label-studio internal id, we don't need it
df_annotations.head()

[PosixPath('../error-analysis/annotations/annotator1.json'),
 PosixPath('../error-analysis/annotations/annotator2.json'),
 PosixPath('../error-analysis/annotations/annotator3.json'),
 PosixPath('../error-analysis/annotations/annotator4.json'),
 PosixPath('../error-analysis/annotations/annotator5.json'),
 PosixPath('../error-analysis/annotations/annotator6.json')]


Unnamed: 0,study_id,findings+bg,impression,order,candidate0,candidate1,candidate2,candidate3,batch,candidate0-labels-omissions,...,errors-candidate3-other,candidate2-labels-additions,candidate2-errors,errors-candidate2-other,candidate1-errors,errors-candidate1-other,candidate0-errors,remarks,errors-candidate0-other,annotation_file
0,52008677,Indication:\n_-year-old female with lightheade...,Stable chest radiographs without evidence for ...,"[wgsum, wgsum+cl, gsum_thresholding, bertabs]",no radiographic evidence for acute cardiopulmo...,no radiographic evidence for acute cardiopulmo...,no radiographic evidence for acute cardiopulmo...,no radiographic evidence for acute cardiopulmo...,batch2,"[{'start': 0, 'end': 24, 'text': 'Stable chest...",...,,,,,,,,,,annotator1.json
1,53536595,Examination:\nCHEST (PORTABLE AP)\n\nIndicatio...,Increased opacity in the left upper lobe may b...,"[wgsum+cl, bertabs, wgsum, gsum_thresholding]",there is an increased opacity in the left uppe...,increased opacity in the left upper lobe likel...,increased opacity in the left upper lobe likel...,increased opacity in the left upper lobe likel...,batch2,,...,,,,,,,,,,annotator1.json
2,50753069,Indication:\n_-year-old male patient with rena...,Increased density along the left cardiac borde...,"[wgsum, bertabs, wgsum+cl, gsum_thresholding]",area of increased density projecting over the ...,no radiographic evidence of an acute cardiopul...,focal area of increased density projecting ove...,no radiographic evidence of an acute cardiopul...,batch2,"[{'start': 48, 'end': 110, 'text': 'for which ...",...,falsche Uhrzeit,"[{'start': 72, 'end': 139, 'text': 'which coul...",other,falsche Uhrzeit,,,,,,annotator1.json
3,53923012,Indication:\nEvaluation of patient with histor...,Stable small to moderal right pleural effusion...,"[gsum_thresholding, bertabs, wgsum+cl, wgsum]",persistent right lower lobe opacity consistent...,stable right lower lobe opacity consistent wit...,stable right pleural effusion. improved asymme...,stable right lower lobe opacity and small to m...,batch2,,...,,,4. Incorrect severity of finding,,,,,,,annotator1.json
4,53130454,Examination:\nCHEST (PA AND LAT)\n\nIndication...,No acute cardiopulmonary abnormality. Density ...,"[wgsum+cl, gsum_thresholding, bertabs, wgsum]",no acute cardiopulmonary abnormality. new dens...,no acute cardiopulmonary abnormality. enlargem...,no acute intrathoracic abnormality.,no acute cardiopulmonary abnormality.,batch2,"[{'start': 129, 'end': 169, 'text': 'CT is rec...",...,,,,,other,Bedeutung des Satzes: enlargement of the ante...,,,,annotator1.json


In [7]:
# Alphabetically ordered list of the distinct (anonymized) annotator names.
annotators = sorted(df_annotations['annotator'].unique())
print(annotators)

['annotator1', 'annotator2', 'annotator3', 'annotator4', 'annotator5', 'annotator6']


## Data Wrangling

### Checking Annotations for Completeness

Completed assignments

In [8]:
# Number of annotated / assigned tasks per (batch, annotator).
n_completed = df_annotations.groupby(['batch', 'annotator']).size()
n_assigned = df_assignments.groupby(['batch', 'annotator']).size()

# Align both counts on the union of their indices. A (batch, annotator) pair that
# occurs in only one of the two frames counts as 0 in the other one; otherwise an
# annotator without any completed task would show NaN instead of the real number
# of missing tasks and would silently be left out of the total.
df_progress = pd.concat(
    [n_assigned, n_completed],
    axis=1,
    keys=['assigned', 'completed'],
).fillna(0)
df_progress['missing'] = df_progress['assigned'] - df_progress['completed']
n_missing = df_progress['missing']

# Summary row with the column totals.
df_progress.loc[('batch2', 'all'), :] = df_progress.sum()
df_progress

Unnamed: 0_level_0,Unnamed: 1_level_0,assigned,completed,missing
batch,annotator,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
batch2,annotator1,50.0,50.0,0.0
batch2,annotator2,50.0,50.0,0.0
batch2,annotator3,50.0,50.0,0.0
batch2,annotator4,50.0,50.0,0.0
batch2,annotator5,50.0,50.0,0.0
batch2,annotator6,50.0,50.0,0.0
batch2,all,300.0,300.0,0.0


Reports with all annotations

In [9]:
# Count the annotations per study and keep the IDs of studies that have exactly three.
n_annotations_per_study = df_annotations.groupby('study_id').size()
has_three_annotations = n_annotations_per_study == 3
ids_completed = n_annotations_per_study[has_three_annotations].index

print('N Reports with 3 Annotations:', len(ids_completed))

N Reports with 3 Annotations: 100


Validity of span annotations

In [10]:
# All columns that hold span annotations (a list of span dicts per task).
cols = [
    'candidate0-labels-additions', 'candidate1-labels-additions', 'candidate2-labels-additions', 'candidate3-labels-additions',
    'candidate0-labels-omissions', 'candidate1-labels-omissions', 'candidate2-labels-omissions', 'candidate3-labels-omissions'
]

for c in cols:
    if c not in df_annotations.columns:
        # Nobody annotated a span of this kind, so there is nothing to validate.
        continue

    # One row per annotated span of the *current* column, indexed by study_id.
    # (Previously the first omission column was inspected on every iteration.)
    tmp = df_annotations.set_index('study_id')[c].explode().dropna()
    if tmp.empty:
        continue

    ix = tmp.index
    tmp = pd.json_normalize(tmp.tolist())
    tmp = tmp.set_index(ix)
    tmp['length'] = tmp['text'].apply(len)
    tmp = tmp[tmp['length'] <= 2] ## annotations of length 2 or less could be erroneous
    if len(tmp) > 0:
        print(f'Suspiciously short spans in {c}:')
        print(tmp)

### Flat datastructure

A "task" is a reference + 4 candidates. Each task is annotated by three annotators. The following code "flattens" the label-studio task annotations, such that we get a dataframe. One row = one candidate/annotator pair.

```
study_id, candidate, annotator, [... error annotations ...]
1, 0, 'annotator_1', ...
1, 1, 'annotator_1', ...
1, 2, 'annotator_1', ...
1, 3, 'annotator_1', ...
1, 0, 'annotator_2', ...
....
1, 3, 'annotator_3', ...
...
N, 3, 'annotator_3', ...
```

In [11]:
def convert_annotations(annotations: List[dict], annotator, doc_id) -> List[Annotation]:
    """Turn the raw span dicts of one task field into `Annotation` objects.

    Args:
        annotations: List of span dicts providing the keys `start`, `end` and
            `labels`. Any value that is not a list (e.g. a missing cell) is
            treated as "no annotations".
        annotator: Annotator name stored on every created annotation.
        doc_id: Document identifier stored on every created annotation.

    Returns:
        The annotations ordered by start offset. Only the first entry of
        `labels` is used as the label of a span.
    """
    if not isinstance(annotations, list):
        return []

    converted = [
        Annotation(raw['start'], raw['end'], raw['labels'][0], annotator=annotator, document_id=doc_id)
        for raw in annotations
    ]
    converted.sort(key=lambda a: a.start)
    return converted


# Every task shows four candidate summaries (columns candidate0 .. candidate3).
candidate_ids = [0, 1, 2, 3]

# One record per (study, annotator, candidate).
records = []
for index, row in df_annotations.iterrows():
    study_id = row['study_id']
    annotator = row['annotator']

    for i in candidate_ids:
        # `order` holds the model name behind each candidate position of this task.
        candidate_name = row['order'][i]
        record = {
            'study_id': study_id,
            'annotator': annotator,
            'candidate_id': i,
            'candidate_name': candidate_name,
            'candidate': row[f'candidate{i}'],
            'reference': row['impression'],
            'remarks': row['remarks']
        }

        # Omissions and additions (lists of spans) share the same structure.
        for kind in ('omission', 'addition'):
            anns = convert_annotations(
                row[f'candidate{i}-labels-{kind}s'],
                annotator,
                doc_id=f'{study_id}-{candidate_name}-{kind}s'
            )
            labels = [ann.label for ann in anns]
            record[f'{kind}_labels'] = labels
            record[f'{kind}_counts'] = Counter(labels)
            record[f'{kind}_standoff'] = anns

        # Binary errors (category 3, 4, other)
        errors = row[f'candidate{i}-errors']
        if isinstance(errors, dict):
            # Multiple categories assigned, errors is a dictionary {'choices': [...]}
            for error in errors['choices']:
                record[error] = 1
        elif isinstance(errors, str):
            # Single category assigned, errors is a string
            record[errors] = 1

        # Other error (str)
        record['other'] = row[f'errors-candidate{i}-other']
        records.append(record)

df_annotations_normalized = pd.DataFrame(records)

# Expand the per-label span counts into one column per label.
df_annotations_normalized = pd.concat([
    df_annotations_normalized,
    pd.json_normalize(df_annotations_normalized['omission_counts']),
    pd.json_normalize(df_annotations_normalized['addition_counts'])
], axis=1)

# Attach the source text (findings + background) of every study.
df_annotations_normalized = pd.merge(
    df_annotations_normalized,
    df_reports[['study_id', 'findings+bg']],
    how='left',
    on='study_id'
)


# Names were shortened from the annotation guidelines!
labels_to_name = {
    '1a': '1a. Omission of finding/interpretation',
    '1b': '1b. Omission of comparison',
    '1c': '1c. Omission of reference to prior report',
    '1d': '1d. Omission of recommendation',
    '2a': '2a. Additional finding/interpretation',
    '2b': '2b. Additional comparison',
    '2c': '2c. Additional reference to prior report',
    '2d': '2d. Additional recommendation',
    '2e': '2e. Additional contradicting finding',
    '3': '3. Incorrect location/position of finding',
    '4': '4. Incorrect severity of finding',

}
error_cols = list(labels_to_name.values())

df_annotations_normalized = df_annotations_normalized.rename(labels_to_name, axis=1)

# A category that no annotator ever assigned has no column yet. Create it with
# zeros so that selecting `error_cols` cannot fail with a KeyError.
for col in error_cols:
    if col not in df_annotations_normalized.columns:
        df_annotations_normalized[col] = 0
df_annotations_normalized[error_cols] = df_annotations_normalized[error_cols].fillna(0).astype(int)

print(len(df_annotations_normalized))
df_annotations_normalized.head(5)

1200


Unnamed: 0,study_id,annotator,candidate_id,candidate_name,candidate,reference,remarks,omission_labels,omission_counts,omission_standoff,...,1a. Omission of finding/interpretation,1d. Omission of recommendation,1b. Omission of comparison,1c. Omission of reference to prior report,2a. Additional finding/interpretation,2d. Additional recommendation,2b. Additional comparison,2e. Additional contradicting finding,2c. Additional reference to prior report,findings+bg
0,52008677,annotator1,0,wgsum,no radiographic evidence for acute cardiopulmo...,Stable chest radiographs without evidence for ...,,[1a],{'1a': 1},"[Annotation(start=0, end=24, label='1a', annot...",...,1,0,0,0,0,0,0,0,0,Indication:\n_-year-old female with lightheade...
1,52008677,annotator1,1,wgsum+cl,no radiographic evidence for acute cardiopulmo...,Stable chest radiographs without evidence for ...,,[1a],{'1a': 1},"[Annotation(start=0, end=24, label='1a', annot...",...,1,0,0,0,0,0,0,0,0,Indication:\n_-year-old female with lightheade...
2,52008677,annotator1,2,gsum_thresholding,no radiographic evidence for acute cardiopulmo...,Stable chest radiographs without evidence for ...,,[1a],{'1a': 1},"[Annotation(start=0, end=24, label='1a', annot...",...,1,0,0,0,0,0,0,0,0,Indication:\n_-year-old female with lightheade...
3,52008677,annotator1,3,bertabs,no radiographic evidence for acute cardiopulmo...,Stable chest radiographs without evidence for ...,,[1a],{'1a': 1},"[Annotation(start=0, end=24, label='1a', annot...",...,1,0,0,0,0,0,0,0,0,Indication:\n_-year-old female with lightheade...
4,53536595,annotator1,0,wgsum+cl,there is an increased opacity in the left uppe...,Increased opacity in the left upper lobe may b...,,[],{},[],...,0,0,0,0,0,0,0,0,0,Examination:\nCHEST (PORTABLE AP)\n\nIndicatio...


In [12]:
# One row per (study, candidate): drop the per-annotator duplicates, keep the
# candidate/reference texts and attach the source text of the report.
df_candidates = (
    df_annotations_normalized
    .drop_duplicates(subset=['study_id', 'candidate_id'])
    [['study_id', 'candidate_id', 'candidate_name', 'candidate', 'reference']]
    .merge(df_reports[['study_id', 'findings+bg']], how='left', on='study_id')
)
df_candidates.head(5)

Unnamed: 0,study_id,candidate_id,candidate_name,candidate,reference,findings+bg
0,52008677,0,wgsum,no radiographic evidence for acute cardiopulmo...,Stable chest radiographs without evidence for ...,Indication:\n_-year-old female with lightheade...
1,52008677,1,wgsum+cl,no radiographic evidence for acute cardiopulmo...,Stable chest radiographs without evidence for ...,Indication:\n_-year-old female with lightheade...
2,52008677,2,gsum_thresholding,no radiographic evidence for acute cardiopulmo...,Stable chest radiographs without evidence for ...,Indication:\n_-year-old female with lightheade...
3,52008677,3,bertabs,no radiographic evidence for acute cardiopulmo...,Stable chest radiographs without evidence for ...,Indication:\n_-year-old female with lightheade...
4,53536595,0,wgsum+cl,there is an increased opacity in the left uppe...,Increased opacity in the left upper lobe may b...,Examination:\nCHEST (PORTABLE AP)\n\nIndicatio...


### Group annotations by candidate and apply voting schemes

In [13]:
# All annotations of the same candidate summary form one group.
df_grouped = df_annotations_normalized.groupby(['study_id', 'candidate_name'])


def _vote_on_spans(standoff_column):
    """Apply the voting schemes to one standoff column, one result row per candidate."""
    by_annotator = df_grouped.apply(
        lambda group_df: dict(zip(group_df['annotator'], group_df[standoff_column]))
    )
    votings = by_annotator.rename('annotations').apply(apply_votings)
    # Only the top level of the voting result is expanded into columns.
    return pd.json_normalize(votings, max_level=0).set_index(votings.index)


votings_omissions = _vote_on_spans('omission_standoff')
votings_additions = _vote_on_spans('addition_standoff')

# Binary judgments: most frequent value for categories 3 and 4,
# number of non-empty entries for `other`.
binary_aggregations = {
    '3. Incorrect location/position of finding': pd.Series.mode,
    '4. Incorrect severity of finding': pd.Series.mode,
    'other': 'count',
}
df_binary_errors = (
    df_annotations_normalized
    .groupby(['study_id', 'candidate_name'])
    .agg(binary_aggregations)
)

def errors_by_strategy(voting_strategy):
    """Build the per-candidate error table for one voting strategy.

    Args:
        voting_strategy: Name of the column of `votings_omissions` /
            `votings_additions` whose per-label values are expanded.

    Returns:
        Integer DataFrame indexed like the voting frames. It holds the expanded
        omission and addition columns plus the columns of `df_binary_errors`,
        renamed via `labels_to_name` and sorted by column name.
    """
    span_frames = [
        pd.json_normalize(votings[voting_strategy]).set_index(votings.index)
        for votings in (votings_omissions, votings_additions)
    ]
    combined = pd.concat([*span_frames, df_binary_errors], axis=1)
    combined = combined.fillna(0).astype(int)
    combined = combined.rename(labels_to_name, axis=1)
    return combined[sorted(combined.columns)]


# One error table per voting strategy.
df_all_errors_exact = errors_by_strategy('exact_vote_total')
df_all_errors_exact_relaxed = errors_by_strategy('relaxed_exact_vote_total')
df_all_errors_majority = errors_by_strategy('span_blind_majority_vote_total')

for df_errors in (df_all_errors_exact, df_all_errors_exact_relaxed, df_all_errors_majority):
    display(df_errors)

Unnamed: 0_level_0,Unnamed: 1_level_0,1a. Omission of finding/interpretation,1b. Omission of comparison,1c. Omission of reference to prior report,1d. Omission of recommendation,2a. Additional finding/interpretation,2b. Additional comparison,2d. Additional recommendation,3. Incorrect location/position of finding,4. Incorrect severity of finding,other
study_id,candidate_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
50126222,bertabs,0,0,0,0,0,0,0,0,0,1
50126222,gsum_thresholding,0,0,0,0,0,0,0,0,0,0
50126222,wgsum,0,0,0,0,0,0,0,0,0,1
50126222,wgsum+cl,0,0,0,0,0,0,0,0,0,1
50178679,bertabs,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
59741915,wgsum+cl,0,0,0,0,0,0,0,0,0,0
59790228,bertabs,0,0,0,0,0,0,0,1,0,2
59790228,gsum_thresholding,0,0,0,0,0,0,0,1,0,2
59790228,wgsum,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,Unnamed: 1_level_0,1a. Omission of finding/interpretation,1b. Omission of comparison,1c. Omission of reference to prior report,1d. Omission of recommendation,2a. Additional finding/interpretation,2b. Additional comparison,2d. Additional recommendation,2e. Additional contradicting finding,3. Incorrect location/position of finding,4. Incorrect severity of finding,other
study_id,candidate_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
50126222,bertabs,0,0,0,0,0,0,0,0,0,0,1
50126222,gsum_thresholding,0,0,0,0,0,0,0,0,0,0,0
50126222,wgsum,0,0,0,0,0,0,0,0,0,0,1
50126222,wgsum+cl,0,0,0,0,0,0,0,0,0,0,1
50178679,bertabs,0,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
59741915,wgsum+cl,0,0,0,0,0,0,0,0,0,0,0
59790228,bertabs,1,0,0,0,1,0,1,0,1,0,2
59790228,gsum_thresholding,1,0,0,0,0,0,1,0,1,0,2
59790228,wgsum,0,0,0,0,1,0,0,0,0,0,0


Unnamed: 0_level_0,Unnamed: 1_level_0,1a. Omission of finding/interpretation,1b. Omission of comparison,1c. Omission of reference to prior report,1d. Omission of recommendation,2a. Additional finding/interpretation,2b. Additional comparison,2c. Additional reference to prior report,2d. Additional recommendation,2e. Additional contradicting finding,3. Incorrect location/position of finding,4. Incorrect severity of finding,other
study_id,candidate_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
50126222,bertabs,0,0,0,0,0,0,0,0,0,0,0,1
50126222,gsum_thresholding,0,0,0,0,0,0,0,0,0,0,0,0
50126222,wgsum,0,0,0,0,0,0,0,0,0,0,0,1
50126222,wgsum+cl,0,0,0,0,0,0,0,0,0,0,0,1
50178679,bertabs,0,0,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59741915,wgsum+cl,0,0,0,0,0,0,0,0,0,0,0,0
59790228,bertabs,1,1,0,0,1,0,0,1,0,1,0,2
59790228,gsum_thresholding,1,0,0,0,0,0,0,1,0,1,0,2
59790228,wgsum,0,2,0,0,1,0,0,0,0,0,0,0


### Export DF with all annotations

In [14]:
# Write the flattened annotations as JSON lines (one record per line).
df_annotations_normalized.to_json(
    '../error-analysis/annotations/annotations.jsonl',
    orient='records',
    lines=True,
)

### Export DF with additions

In [15]:
# One record per span group of the additions, paired with its majority vote.
df_addition_spans = votings_additions.reset_index()
records = [
    {
        'study_id': row['study_id'],
        'candidate_name': row['candidate_name'],
        'annotations': group,
        'annotation_kind': label_groups([group])[0],
        'majority_vote': vote,
    }
    for _, row in df_addition_spans.iterrows()
    for group, vote in zip(row['groups'], row['span_blind_majority_vote'])
]

dd = pd.DataFrame(records)
# Keep only the groups for which the majority vote is non-empty.
has_vote = dd['majority_vote'].apply(len) > 0
dd = dd[has_vote]

# Add the reference, candidate and source texts of the corresponding candidate.
candidate_texts = df_candidates[['study_id', 'candidate_name', 'reference', 'candidate', 'findings+bg']]
dd = dd.merge(candidate_texts, on=['study_id', 'candidate_name'], how='left')

dd.to_json('../error-analysis/annotations/additions.jsonl', lines=True, orient='records')

## Analysis

### Annotation duration

Label-studio lead time (duration on screen) is not really reliable as there may be artifacts when the screen is kept open in the background. Therefore, we rely on the self-reported measure.

In [16]:
# Descriptive statistics of the self-reported duration, extended by the totals.
durations = df_annotations['duration-minutes']
total_minutes = durations.sum()

duration_stats = durations.describe()
duration_stats.loc['total_minutes'] = total_minutes
duration_stats.loc['total_hours'] = total_minutes / 60
duration_stats.round(1).to_frame()

Unnamed: 0,duration-minutes
count,300.0
mean,4.6
std,3.3
min,1.0
25%,2.0
50%,4.0
75%,6.0
max,20.0
total_minutes,1388.0
total_hours,23.1


### Assigned errors

How often did each annotator assign the categories?

In [17]:
# Per annotator: sum of every error column, plus the number of non-empty `other` entries.
annotations_by_annotator = df_annotations_normalized.groupby('annotator')
raw_counts = annotations_by_annotator[error_cols].sum()
raw_counts['other'] = annotations_by_annotator['other'].count()
raw_counts.T

annotator,annotator1,annotator2,annotator3,annotator4,annotator5,annotator6
1a. Omission of finding/interpretation,113,150,142,135,96,138
1b. Omission of comparison,19,30,55,20,66,46
1c. Omission of reference to prior report,1,11,4,11,8,8
1d. Omission of recommendation,25,35,30,46,42,38
2a. Additional finding/interpretation,92,132,132,140,90,132
2b. Additional comparison,9,28,39,16,31,32
2c. Additional reference to prior report,0,3,2,9,2,1
2d. Additional recommendation,12,12,7,19,6,16
2e. Additional contradicting finding,7,4,3,1,14,5
3. Incorrect location/position of finding,13,33,35,2,24,4


### IAA: Span-based (Additions/Omissions)

In [18]:
def flatten(l):
    """Concatenate the elements of all sub-iterables of `l` into one flat list."""
    flat = []
    for sub in l:
        flat.extend(sub)
    return flat

In [19]:
def accumulate_annotations(annotation_column):
    """Collect the span annotations of all candidates under positional annotator IDs.

    The pairing of annotators differs between candidates. The annotators of a
    candidate are therefore renamed to `annotator0`, `annotator1`, ... in the
    order of the pivoted annotator columns that have a value for the candidate.

    Args:
        annotation_column: Column of `df_annotations_normalized` that holds the
            list of standoff annotations (e.g. `omission_standoff`).

    Returns:
        defaultdict mapping the positional annotator ID to the list of all its
        annotations, each one re-labelled with that positional ID.
    """
    # One row per candidate, one column per annotator, cells = list of standoff annotations.
    per_candidate = df_annotations_normalized.pivot(
        index=['study_id', 'candidate_name'],
        columns='annotator',
        values=annotation_column,
    )

    all_annotations = defaultdict(list)
    for _, candidate_row in per_candidate.iterrows():
        # Annotators who did not work on this candidate have no value and are dropped.
        annotations_by_annotator = candidate_row.dropna().to_dict()
        groups = group_spans(annotations_by_annotator)
        labels = label_groups(groups)

        # Real annotator name -> positional ID within this candidate.
        positional_ids = {
            name: f'annotator{position}'
            for position, name in enumerate(annotations_by_annotator.keys())
        }

        # Tolerate boundary issues: groups labelled EXTEND are expanded to common borders.
        groups_adjusted = [
            expand_to_borders(group) if label == 'EXTEND' else group
            for group, label in zip(groups, labels)
        ]

        for group in groups_adjusted:
            for ann in group:
                slot = positional_ids[ann.annotator]
                all_annotations[slot].append(dataclasses.replace(ann, annotator=slot))

    return all_annotations

In [20]:
# Span-level agreement (F1) between the three annotator slots: first over all
# labels, then separately for every label that occurs in the annotations.
for annotation_column, kind in [('omission_standoff', 'Omission'), ('addition_standoff', 'Addition')]:
    all_annotations = accumulate_annotations(annotation_column)
    labels = sorted({ann.label for anns in all_annotations.values() for ann in anns})

    annotations_per_slot = [
        all_annotations['annotator0'],
        all_annotations['annotator1'],
        all_annotations['annotator2'],
    ]
    overall = annotator_agreement_three(*annotations_per_slot)
    print(f'F1 {kind} (overall) {overall:.2f}')

    for label in labels:
        iaa = annotator_agreement_three(*annotations_per_slot, label=label)
        print(f'- {label}: {iaa:.2f}')

F1 Omission (overall) 0.61
- 1a: 0.64
- 1b: 0.33
- 1c: 0.23
- 1d: 0.83
F1 Addition (overall) 0.60
- 2a: 0.66
- 2b: 0.44
- 2c: 0.07
- 2d: 0.65
- 2e: 0.26


### IAA: Krippendorff's Alpha (Binary Judgments)

Calculate Krippendorff's alpha, which is suitable for multiple coders assigning multiple values. The distance measure is MASI (for set-based comparison) or binary. The `other` category is excluded from the IAA calculation because it is specifically meant for subjective/uncertain errors.

Based on: 
- https://www.nltk.org/_modules/nltk/metrics/agreement.html
- https://stats.stackexchange.com/a/460450

In [21]:
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics import masi_distance, binary_distance

In [22]:
# Only the binary judgments take part in this agreement analysis.
error_cols = [
    '3. Incorrect location/position of finding',
    '4. Incorrect severity of finding',
]

# (coder, item, errors): one triple per annotator/candidate pair
iaa_data = []
for index, row in df_annotations_normalized.iterrows():
    item_id = str(row['study_id']) + '-' + str(row['candidate_id'])

    # Set of binary error categories this annotator assigned to the candidate.
    errors = [error for error in error_cols if row[error] > 0]
    if len(errors) == 0:
        errors = ['No Error']

    iaa_data.append((
        # The coder must come from the current row. The global `annotator` is a
        # leftover loop variable of an earlier cell and holds a single name.
        row['annotator'],
        item_id,
        frozenset(errors)
    ))
iaa_data = pd.DataFrame(iaa_data, columns=['coder', 'item', 'errors'])

print('Overall IAA')
task = AnnotationTask(distance=masi_distance)
task.load_array(iaa_data.values)
print(f"Krippendorff's Alpha: {task.alpha():.2f}")


print('IAA by error')
# get list of unique errors (sorted, so that scores and index share a stable order)
errors = sorted(set(error for errors_item in iaa_data['errors'].values for error in errors_item))
scores = []

for error in errors:
    # Reduce the label sets to a binary judgment: was `error` assigned or not?
    tmp = iaa_data.copy()
    tmp['errors'] = tmp['errors'].apply(lambda errors: str(error in errors))
    task = AnnotationTask(distance=binary_distance)
    task.load_array(tmp.values)
    scores.append(task.alpha())

pd.Series(scores, index=errors, name='IAA (Alpha)').to_frame().sort_index()

Overall IAA
Krippendorff's Alpha: 0.33
IAA by error


Unnamed: 0,IAA (Alpha)
3. Incorrect location/position of finding,0.256074
4. Incorrect severity of finding,0.41225
No Error,0.346529


### Types of error groups

In [23]:
def span_stats(df, annotation_col):
    """Count span-group labels for one annotation column.

    For every (study_id, candidate_name) pair in `df`, the annotations in
    `annotation_col` are collected per annotator, passed through
    `group_spans` and `label_groups`, and the resulting labels are counted.

    Parameters
    ----------
    df : pd.DataFrame
        Annotation table with at least the columns 'study_id',
        'candidate_name', 'annotator' and `annotation_col`.
    annotation_col : str
        Name of the column holding the span annotations to group.

    Returns
    -------
    pd.DataFrame
        One row per group label with two columns: 'Groups' (total number of
        groups with that label) and 'Reports' (number of
        (study_id, candidate_name) pairs with at least one such group).
    """
    # Use the frame passed in by the caller; the previous version ignored
    # `df` and always read the global `df_annotations_normalized`.
    label_counts = (
        df
        .groupby(['study_id', 'candidate_name'])
        .apply(lambda group_df: dict(zip(group_df['annotator'], group_df[annotation_col])))
        .apply(group_spans)
        .apply(label_groups)
        .apply(pd.value_counts)
    )

    stats = pd.concat([
        label_counts.sum(),
        # Clip to [0, 1] so each candidate contributes at most once per label.
        label_counts.clip(0, 1).sum()
    ], axis=1, keys=['Groups', 'Reports'])
    return stats

In [24]:
# Group-label statistics for additions and omissions, side by side.
addition_span_stats = span_stats(df_annotations_normalized, 'addition_standoff')
omission_span_stats = span_stats(df_annotations_normalized, 'omission_standoff')

pd.concat(
    [addition_span_stats, omission_span_stats],
    axis=1,
    keys=['Additions', 'Omissions'],
).astype(int)

Unnamed: 0_level_0,Additions,Additions,Omissions,Omissions
Unnamed: 0_level_1,Groups,Reports,Groups,Reports
ACCEPT,80,76,107,97
EXTEND,196,158,240,179
REVIEW,33,33,56,50
SINGLE,97,83,76,69


### Span-based error counts after voting

- Relative: Compute the percentage of candidates to which this error applies (here: out of 100 reports), by clipping the occurrence count to binary ([0,1]).
- Absolute: Sum up all occurrences (errors in categories 1 and 2 can appear multiple times per candidate; for `other` there may be multiple annotations by different annotators).

#### Exact

In [25]:
# Error counts per category (rows) and model (columns) for the "exact" voting.
# `groupby(level=1).sum()` and `pd.concat` replace `sum(level=1)` and
# `DataFrame.append`, both of which were removed in pandas 2.0.

# Denominator for all percentages: number of distinct reports.
n_reports = df_annotations_normalized['study_id'].nunique()

# Absolute: total number of errors. Relative: share of reports with >= 1 error.
absolute = df_all_errors_exact.groupby(level=1).sum().T
relative = df_all_errors_exact.clip(0, 1).groupby(level=1).sum().T
relative = (relative / n_reports * 100).round(0).astype(int)

# 'No Error' row: candidates without any error in any category.
# The relative value is now a real percentage; it used to be the raw count,
# which only coincides with the percentage when there are exactly 100 reports.
no_error = (df_all_errors_exact == 0).all(axis=1).astype(int).groupby(level=1).sum().rename('No Error')
no_error_relative = (no_error / n_reports * 100).round(0).astype(int)
absolute = pd.concat([no_error.to_frame().T, absolute])
relative = pd.concat([no_error_relative.to_frame().T, relative])

for total_name, column_match in [('Total Omissions', '1'), ('Total Additions', '2')]:
    cols = [c for c in df_all_errors_exact.columns if c.startswith(column_match)]

    # Aggregate all errors of this family per model.
    total_absolute = df_all_errors_exact[cols].groupby(level=1).sum().sum(axis=1)
    # Candidates with any error of this family, as a percentage of reports.
    total_relative = (df_all_errors_exact[cols] > 0).any(axis=1).astype(int).groupby(level=1).sum()
    total_relative = (total_relative / n_reports * 100).round(0).astype(int)

    absolute.loc[total_name] = total_absolute
    relative.loc[total_name] = total_relative

formatted = absolute.astype(str) + ' (' + relative.astype(str) + '%)'
formatted

candidate_name,bertabs,gsum_thresholding,wgsum,wgsum+cl
No Error,55 (55%),60 (60%),50 (50%),53 (53%)
1a. Omission of finding/interpretation,10 (10%),3 (3%),7 (7%),7 (7%)
1b. Omission of comparison,2 (2%),1 (1%),1 (1%),2 (2%)
1c. Omission of reference to prior report,1 (1%),0 (0%),0 (0%),1 (1%)
1d. Omission of recommendation,3 (3%),2 (2%),3 (3%),2 (2%)
2a. Additional finding/interpretation,5 (5%),5 (4%),3 (3%),8 (8%)
2b. Additional comparison,4 (4%),4 (4%),5 (5%),3 (3%)
2d. Additional recommendation,1 (1%),1 (1%),1 (1%),0 (0%)
3. Incorrect location/position of finding,5 (5%),8 (8%),8 (8%),7 (7%)
4. Incorrect severity of finding,6 (6%),7 (7%),7 (7%),9 (9%)


#### Exact relaxed

In [26]:
# Error counts per category (rows) and model (columns) for the
# "exact relaxed" voting.
# `groupby(level=1).sum()` and `pd.concat` replace `sum(level=1)` and
# `DataFrame.append`, both of which were removed in pandas 2.0.

# Denominator for all percentages: number of distinct reports.
n_reports = df_annotations_normalized['study_id'].nunique()

# Absolute: total number of errors. Relative: share of reports with >= 1 error.
absolute = df_all_errors_exact_relaxed.groupby(level=1).sum().T
relative = df_all_errors_exact_relaxed.clip(0, 1).groupby(level=1).sum().T
relative = (relative / n_reports * 100).round(0).astype(int)

# 'No Error' row: candidates without any error in any category.
# The relative value is now a real percentage; it used to be the raw count,
# which only coincides with the percentage when there are exactly 100 reports.
no_error = (df_all_errors_exact_relaxed == 0).all(axis=1).astype(int).groupby(level=1).sum().rename('No Error')
no_error_relative = (no_error / n_reports * 100).round(0).astype(int)
absolute = pd.concat([no_error.to_frame().T, absolute])
relative = pd.concat([no_error_relative.to_frame().T, relative])

for total_name, column_match in [('Total Omissions', '1'), ('Total Additions', '2')]:
    cols = [c for c in df_all_errors_exact_relaxed.columns if c.startswith(column_match)]

    # Aggregate all errors of this family per model.
    total_absolute = df_all_errors_exact_relaxed[cols].groupby(level=1).sum().sum(axis=1)
    # Candidates with any error of this family, as a percentage of reports.
    total_relative = (df_all_errors_exact_relaxed[cols] > 0).any(axis=1).astype(int).groupby(level=1).sum()
    total_relative = (total_relative / n_reports * 100).round(0).astype(int)

    absolute.loc[total_name] = total_absolute
    relative.loc[total_name] = total_relative

formatted = absolute.astype(str) + ' (' + relative.astype(str) + '%)'
formatted

candidate_name,bertabs,gsum_thresholding,wgsum,wgsum+cl
No Error,31 (31%),35 (35%),21 (21%),34 (34%)
1a. Omission of finding/interpretation,40 (35%),30 (26%),36 (31%),35 (29%)
1b. Omission of comparison,5 (5%),2 (2%),2 (2%),3 (3%)
1c. Omission of reference to prior report,1 (1%),0 (0%),0 (0%),1 (1%)
1d. Omission of recommendation,13 (13%),14 (13%),14 (13%),12 (12%)
2a. Additional finding/interpretation,27 (23%),35 (28%),34 (32%),32 (28%)
2b. Additional comparison,6 (5%),4 (4%),5 (5%),4 (3%)
2d. Additional recommendation,4 (4%),4 (4%),3 (3%),2 (2%)
2e. Additional contradicting finding,0 (0%),0 (0%),1 (1%),1 (1%)
3. Incorrect location/position of finding,5 (5%),8 (8%),8 (8%),7 (7%)


#### Majority

In [27]:
# Error counts per category (rows) and model (columns) for the majority voting.
# `groupby(level=1).sum()` and `pd.concat` replace `sum(level=1)` and
# `DataFrame.append`, both of which were removed in pandas 2.0.

# Denominator for all percentages: number of distinct reports.
n_reports = df_annotations_normalized['study_id'].nunique()

# Absolute: total number of errors. Relative: share of reports with >= 1 error.
absolute = df_all_errors_majority.groupby(level=1).sum().T
relative = df_all_errors_majority.clip(0, 1).groupby(level=1).sum().T
relative = (relative / n_reports * 100).round(0).astype(int)

# 'No Error' row: candidates without any error in any category.
# The relative value is now a real percentage; it used to be the raw count,
# which only coincides with the percentage when there are exactly 100 reports.
no_error = (df_all_errors_majority == 0).all(axis=1).astype(int).groupby(level=1).sum().rename('No Error')
no_error_relative = (no_error / n_reports * 100).round(0).astype(int)
absolute = pd.concat([no_error.to_frame().T, absolute])
relative = pd.concat([no_error_relative.to_frame().T, relative])

for total_name, column_match in [('Total Omissions', '1'), ('Total Additions', '2')]:
    cols = [c for c in df_all_errors_majority.columns if c.startswith(column_match)]

    # Aggregate all errors of this family per model.
    total_absolute = df_all_errors_majority[cols].groupby(level=1).sum().sum(axis=1)
    # Candidates with any error of this family, as a percentage of reports.
    total_relative = (df_all_errors_majority[cols] > 0).any(axis=1).astype(int).groupby(level=1).sum()
    total_relative = (total_relative / n_reports * 100).round(0).astype(int)

    absolute.loc[total_name] = total_absolute
    relative.loc[total_name] = total_relative

formatted = absolute.astype(str) + ' (' + relative.astype(str) + '%)'
display(formatted)

# Model name -> anonymized column header (M1..M4) used in the paper table.
rename_models = {
    model_name: f'M{position}'
    for position, model_name in enumerate(
        ['bertabs', 'gsum_thresholding', 'wgsum', 'wgsum+cl'], start=1
    )
}

# One entry per table row: (full row label, short code, shortened display name).
# Keeping the three pieces together avoids relying on the position of entries
# in two separate containers.
_error_categories = [
    ('No Error', '0', 'No error'),
    ('1a. Omission of finding/interpretation', '1a', 'Finding/interpretation'),
    ('1b. Omission of comparison', '1b', 'Comparison'),
    ('1c. Omission of reference to prior report', '1c', 'Ref. to prior report'),
    ('1d. Omission of recommendation', '1d', 'Recommendation'),
    ('2a. Additional finding/interpretation', '2a', 'Finding/interpretation'),
    ('2b. Additional comparison', '2b', 'Comparison'),
    ('2c. Additional reference to prior report', '2c', 'Ref. to prior report'),
    ('2d. Additional recommendation', '2d', 'Recommendation'),
    ('2e. Additional contradicting finding', '2e', 'Contradicting finding'),
    ('3. Incorrect location/position of finding', '3', 'Incorrect location'),
    ('4. Incorrect severity of finding', '4', 'Incorrect severity'),
    ('other', '5', 'Other error'),
    ('Total Omissions', 'Total Omissions', 'Total Omissions'),
    ('Total Additions', 'Total Additions', 'Total Additions'),
]

# Full row label -> short code.
rename_index = {full_label: code for full_label, code, _ in _error_categories}

# Short code -> shortened display name.
error_names_shortened = pd.Series(
    [display_name for _, _, display_name in _error_categories],
    index=[code for _, code, _ in _error_categories],
)

## to latex
# Render the count tables as a LaTeX table: one absolute column and one gray
# percentage column per model. All LaTeX snippets are written as raw strings
# or with explicitly doubled backslashes; the previous literals relied on
# invalid escape sequences such as '\%' and '\centering'.
absolute = absolute.rename(rename_models, axis=1)
relative = relative.rename(rename_models, axis=1).add_suffix(r' (\%)')

df = pd.concat([absolute, relative], axis=1)
df = df.rename(rename_index)
df['Error Category'] = error_names_shortened
# Sorting places each percentage column directly after its absolute column.
df = df[sorted(df.columns)]
df.columns.name = r'\textbf{\#}'

for c in relative.columns:
    # Wrap percentages in parentheses and give every percentage column the
    # same header.
    df[c] = r'(' + df[c].astype(str) + ')'
    df = df.rename({c: r'(\%)'}, axis=1)

# Legend mapping model names to M1..M4; currently not passed to `to_latex`.
caption = ' '.join(f'{k} = {v}' for k,v in rename_models.items())

column_format = r"""
    l @{\hspace{1\tabcolsep}} l @{\hspace{1\tabcolsep}}
    r @{\hspace{0.3\tabcolsep}} >{\color{gray}}r @{\hspace{1\tabcolsep}}
    r @{\hspace{0.3\tabcolsep}} >{\color{gray}}r @{\hspace{1\tabcolsep}}
    r @{\hspace{0.3\tabcolsep}} >{\color{gray}}r @{\hspace{1\tabcolsep}}
    r @{\hspace{0.3\tabcolsep}} >{\color{gray}}r @{\hspace{1\tabcolsep}}
"""

tex = df.to_latex(
    na_rep="-",
    position='t',
    escape=False,
    index_names=True,
    column_format=column_format,
    multicolumn_format='c',
#     caption=caption
)
# Switch to the two-column-spanning environment. Only the environment names
# are rewritten, not every occurrence of the substring "table".
tex = tex.replace('\\begin{table}', '\\begin{table*}')
tex = tex.replace('\\end{table}', '\\end{table*}')
tex = tex.replace('\\centering', '\\small\n\\centering')
tex = tex.replace('\\centering', '\\centering\n\\resizebox{\\columnwidth}{!}{')
tex = tex.replace('\\end{tabular}', '\\end{tabular}}')

# Collapse runs of spaces produced by column alignment.
tex = re.sub(r' +', ' ', tex)
for c in df.columns:
    # Bold every header except the percentage columns.
    if '%' in c:
        continue
    tex = tex.replace(c, '\\textbf{' + c + '}')
print(tex)

candidate_name,bertabs,gsum_thresholding,wgsum,wgsum+cl
No Error,20 (20%),18 (18%),14 (14%),22 (22%)
1a. Omission of finding/interpretation,70 (52%),58 (43%),62 (48%),64 (47%)
1b. Omission of comparison,23 (19%),16 (15%),19 (16%),23 (19%)
1c. Omission of reference to prior report,1 (1%),3 (3%),2 (2%),2 (2%)
1d. Omission of recommendation,20 (19%),18 (16%),19 (17%),19 (17%)
2a. Additional finding/interpretation,51 (44%),72 (57%),61 (50%),54 (46%)
2b. Additional comparison,11 (8%),10 (9%),9 (9%),7 (6%)
2c. Additional reference to prior report,0 (0%),1 (1%),0 (0%),0 (0%)
2d. Additional recommendation,5 (5%),8 (6%),8 (8%),4 (3%)
2e. Additional contradicting finding,0 (0%),1 (1%),3 (3%),1 (1%)


\begin{table*}[t]
\small
\centering
\resizebox{\columnwidth}{!}{
\begin{tabular}{
 l @{\hspace{1\tabcolsep}} l @{\hspace{1\tabcolsep}}
 r @{\hspace{0.3\tabcolsep}} >{\color{gray}}r @{\hspace{1\tabcolsep}}
 r @{\hspace{0.3\tabcolsep}} >{\color{gray}}r @{\hspace{1\tabcolsep}}
 r @{\hspace{0.3\tabcolsep}} >{\color{gray}}r @{\hspace{1\tabcolsep}}
 r @{\hspace{0.3\tabcolsep}} >{\color{gray}}r @{\hspace{1\tabcolsep}}
}
\toprule
\textbf{\#} & \textbf{Error Category} & \textbf{M1} & (\%) & \textbf{M2} & (\%) & \textbf{M3} & (\%) & \textbf{M4} & (\%) \\
\midrule
0 & No error & 20 & (20) & 18 & (18) & 14 & (14) & 22 & (22) \\
1a & Finding/interpretation & 70 & (52) & 58 & (43) & 62 & (48) & 64 & (47) \\
1b & Comparison & 23 & (19) & 16 & (15) & 19 & (16) & 23 & (19) \\
1c & Ref. to prior report & 1 & (1) & 3 & (3) & 2 & (2) & 2 & (2) \\
1d & Recommendation & 20 & (19) & 18 & (16) & 19 & (17) & 19 & (17) \\
2a & Finding/interpretation & 51 & (44) & 72 & (57) & 61 & (50) & 54 & (46) \\
2b & Compar

### Examples

In [28]:
# Attach the voted omission/addition span groups and the candidate texts to
# the majority-vote error counts, indexed by (study_id, candidate_name).
omission_groups = votings_omissions['groups'].rename('groups_omissions')
addition_groups = votings_additions['groups'].rename('groups_additions')

df_examples = (
    df_all_errors_majority
    .merge(omission_groups, left_index=True, right_index=True)
    .merge(addition_groups, left_index=True, right_index=True)
    .merge(df_candidates, on=['study_id', 'candidate_name'])
    .set_index(['study_id', 'candidate_name'])
)

df_examples.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,1a. Omission of finding/interpretation,1b. Omission of comparison,1c. Omission of reference to prior report,1d. Omission of recommendation,2a. Additional finding/interpretation,2b. Additional comparison,2c. Additional reference to prior report,2d. Additional recommendation,2e. Additional contradicting finding,3. Incorrect location/position of finding,4. Incorrect severity of finding,other,groups_omissions,groups_additions,candidate_id,candidate,reference,findings+bg
study_id,candidate_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
50126222,bertabs,0,0,0,0,0,0,0,0,0,0,0,1,"[[Annotation(start=118, end=152, label='1a', a...",[],0,"mild pulmonary edema, slightly improved in the...",Slight improvement in mild pulmonary edema. Pa...,Examination:\nCHEST (PORTABLE AP)\n\nIndicatio...
50126222,gsum_thresholding,0,0,0,0,0,0,0,0,0,0,0,0,[],[],1,"mild pulmonary edema, slightly improved in the...",Slight improvement in mild pulmonary edema. Pa...,Examination:\nCHEST (PORTABLE AP)\n\nIndicatio...


Based on manually going through instances of each error, we found the following illustrative examples:

```
* = selected for paper
o = optional if space
- = not selected
```

| * | 1a | 1b | 1c | 1d | 2a | 2b | 2c | 2d | 2e | 3 | 4 | ix |
|---|----|----|----|----|----|----|----|----|----|---|---|----|
| * |  x |    |    |    |    | x |    |    |    |   |   | (55176260, 'wgsum+cl') |
| * |  x |    |  x | x  |    |   |    |    |    |   |   | (50448867, 'bertabs') |
| * |    |  x |    |    |  x |   |    | x  |  x |   |   | (54518631, 'wgsum') |
| * |  x |    |    |    |    |   |    |    |    | x | x | (59239338, 'wgsum') |
| * |    |  x |    |    |  x |   |    |    |    |   |   | (53305461, 'gsum_thresholding') |
| * |  x |    |    |    |  x |   |    |    |    |   |   | (57812270, 'wgsum') |
| o |    |    |    |    |    |   |    |  x |    |   |   | (51773416, 'wgsum') |
| o |    |    |    |    |    |   |    |    |    |   | x | (53923012, 'wgsum+cl') |
| - |  x |    |    |    |    | x |    |    |    |   | x | (55176260, 'wgsum') |
| - |  x |    |  x | x  |    |   |    |    |    |   |   | (57330459, 'bertabs') |
| - |  x |    |    | x  |    |   |    |    |    |   |   | (57665537, 'wgsum+cl') |
| - |    |  x |    |    |    |   |    |    |    |   |   | (59505688, 'wgsum') |

In [29]:
# (study_id, candidate_name) keys of the examples to print below.
examples = [
    (57812270, 'wgsum'),
    (55176260, 'wgsum+cl'),
    (50448867, 'bertabs'),
    (54518631, 'wgsum'),
    (59239338, 'wgsum'),

    # Further instances from the table above that are currently not printed:
    #     (53305461, 'gsum_thresholding'),
    #     (51773416, 'wgsum'),
    #     (53923012, 'wgsum+cl'),
    #     (55176260, 'wgsum'),
    #     (57330459, 'bertabs'),
    #     (57665537, 'wgsum+cl'),
    #     (59505688, 'wgsum'),
]

# Background color per annotation label. Omission (1x) and addition (2x)
# labels with the same letter share a color; '2e' has a color of its own.
_subcategory_colors = {
    'a': "#6BAED6",
    'b': "#FD8D3C",
    'c': "#74C476",
    'd': "#9E9AC8",
}
colors = {
    family + letter: hex_code
    for family in ('1', '2')
    for letter, hex_code in _subcategory_colors.items()
}
colors['2e'] = "#FF9F9B"


def colored(s, hex_color):
    """Print `s` on a 24-bit ANSI background color.

    `hex_color` is an 'RRGGBB' hex string; leading '#' characters are
    stripped. The background is reset after the text.
    """
    digits = hex_color.lstrip('#')
    # Split the hex string into its red, green and blue byte values.
    red, green, blue = (int(digits[offset:offset + 2], 16) for offset in range(0, 6, 2))
    background = f'\033[48;2;{red};{green};{blue}m'
    print(background + s + '\033[0m')

def _print_annotation_groups(title, groups, text):
    """Print one titled section with every annotated span of `text`.

    Each annotation is printed as '<label> - <span text>' on the background
    color assigned to its label; groups are separated by a blank line.
    'None' is printed when there are no groups.
    """
    print(title)
    print('='*37)
    if len(groups) == 0:
        print('None')
    for group in groups:
        for ann in group:
            s = ann.label + ' - ' + text[ann.start:ann.end]
            colored(s, colors[ann.label])
        print()


def display_example(x):
    """Print one example row: LaTeX-ready texts plus its annotated spans.

    Parameters
    ----------
    x : row of `df_examples`
        Must provide 'reference', 'candidate', 'groups_omissions' and
        'groups_additions'; `x.name` is a (study_id, candidate_name) pair.
    """
    _, model_name = x.name

    # Underscores are escaped so the printed lines can be pasted into LaTeX.
    # Raw strings replace the former '\_' literal, an invalid escape sequence.
    print(r'\textbf{Reference:}', x['reference'].replace('_', r'\_'), r'\\')
    print(r'\textbf{Candidate ' + model_name + ':}', x['candidate'].replace('_', r'\_'), r'\\')

    print()
    # Omission spans index into the reference, addition spans into the candidate.
    _print_annotation_groups('Annotations in reference (omissions):', x['groups_omissions'], x['reference'])
    _print_annotation_groups('Annotations in candidate (additions):', x['groups_additions'], x['candidate'])

# Print every selected example, separated by a horizontal rule.
separator = '=' * 90
for example_key in examples:
    display_example(df_examples.loc[example_key])
    print('\n')
    print(separator)
    print('\n')

\textbf{Reference:} Interval increase in vascular engorgement. No frank interstitial edema. No focal consolidations identified. \\
\textbf{Candidate wgsum:} interval increase in pulmonary vascular congestion without evidence of interstitial edema. small right-sided pleural effusion. \\

Annotations in reference (omissions):
[48;2;107;174;214m1a - No focal consolidations identified[0m
[48;2;107;174;214m1a - No focal consolidations identified.[0m
[48;2;107;174;214m1a - No focal consolidations identified[0m

Annotations in candidate (additions):
[48;2;107;174;214m2a - small right-sided pleural effusion[0m
[48;2;107;174;214m2a - small right-sided pleural effusion.[0m
[48;2;107;174;214m2a - small right-sided pleural effusion[0m





\textbf{Reference:} Right lower lobe opacity, possibly atelectasis, with associated moderate sized effusion. \\
\textbf{Candidate wgsum+cl:} persistent right lower lobe opacity with associated effusion, mildly progressed from the preceding radiograph