## Notebook Setup

magic cell that declares `\bm{}` latex macro.
$\newcommand{\bm}[1]{{\mathbf{\boldsymbol{{#1}}}}}$

In [1]:
import re
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
from guidedsum.evaluation import calculate_statistics, calculate_rouge, load_summaries
from guidedsum.guidance.extractive_guidance import greedy_selection, greedy_selection_parallel

from guidedsum.utils import display_sample

## Summary statistics

### Calculate stats for each dataset

In [3]:
def dataset_statistics(dataset_path, name):
    dataset_path = Path(dataset_path)
    df_train =  pd.read_json(dataset_path / "reports.train.json").assign(split='train')
    df_valid = pd.read_json(dataset_path / "reports.valid.json").assign(split='valid')
    df_test = pd.read_json(dataset_path / "reports.test.json").assign(split='test')

    df = pd.concat([df_train, df_valid, df_test], ignore_index=True)    
    src = df['src'].apply(lambda sents: '<q>'.join((' '.join(sent) for sent in sents)))
    tgt = df['tgt'].apply(lambda sents: '<q>'.join((' '.join(sent) for sent in sents)))

    stats_all = calculate_statistics(tqdm(src), tgt)
    stats = stats_all.mean()
    stats['n_train'] = len(df_train)
    stats['n_valid'] = len(df_valid)
    stats['n_test'] = len(df_test)
    stats['n'] = len(df)
    
    n_docs = '{:,d}/{:,d}/{:,d}'.format(
        int(stats['n_train']),
        int(stats['n_valid']),
        int(stats['n_test'])
    )
    stats['docs'] = n_docs
    stats['novelty_uni'] = stats['novelty_uni'] * 100
    stats['novelty_bi'] = stats['novelty_bi'] * 100
    stats['cmp_w'] = stats['cmp_w'] * 100
    stats['cmp_s'] = stats['cmp_s'] * 100

    
    std = stats_all.agg('std').add_suffix('_std')
    stats = stats.append(std)
    stats = stats.rename(name)
    
    stats_by_split = stats_all.groupby(df['split']).mean()
    
    return stats, stats_by_split

In [4]:
# MIMIC-OFFICIAL
stats_mimic_official, stats_mimic_official_by_split = dataset_statistics(
    dataset_path="../data/processed/mimic-official-unguided/",
    name="mimic-official",
)

# MIMIC-OFFICIAL-BG
stats_mimic_official_bg, stats_mimic_official_bg_by_split = dataset_statistics(
    dataset_path="../data/processed/mimic-official-bg-unguided/",
    name="mimic-official-bg",
)

# OpenI
stats_openi, stats_openi_by_split = dataset_statistics(
    dataset_path="../data/processed/openi-unguided/",
    name="openi",
)

# OpenI-BG
stats_openi_bg, stats_openi_bg_by_split = dataset_statistics(
    dataset_path="../data/processed/openi-bg-unguided/",
    name="openi-bg",
)

  0%|          | 0/125061 [00:00<?, ?it/s]

  0%|          | 0/125061 [00:00<?, ?it/s]

  0%|          | 0/3346 [00:00<?, ?it/s]

  0%|          | 0/3346 [00:00<?, ?it/s]

### Combine datasets

In [5]:
stats = [
    stats_mimic_official,
    stats_mimic_official_bg,
    stats_openi,
    stats_openi_bg
]

df_stats = pd.DataFrame(stats)
df_stats = df_stats.drop(['n_train', 'n_valid', 'n_test'], axis=1)
df_stats['n'] = df_stats['n'].astype(int)
df_stats

Unnamed: 0,n_words_doc,n_sents_doc,n_words_summary,n_sents_summary,cmp_w,cmp_s,novelty_uni,novelty_bi,n,docs,n_words_doc_std,n_sents_doc_std,n_words_summary_std,n_sents_summary_std,cmp_w_std,cmp_s_std,novelty_uni_std,novelty_bi_std
mimic-official,56.07027,5.479558,15.098144,1.606328,73.771076,68.674235,47.432424,73.392954,125061,"122,500/963/1,598",25.184872,1.870711,13.493695,0.893279,0.175122,0.170616,0.269221,0.268636
mimic-official-bg,87.096257,7.47552,15.098144,1.606328,83.517694,77.390005,42.482523,72.296663,125061,"122,500/963/1,598",28.744373,2.208604,13.493695,0.893279,0.118171,0.125023,0.265856,0.269975
openi,37.537657,4.642857,8.928273,1.426778,76.145868,67.877095,59.835063,86.823445,3346,"2,342/334/670",16.41052,1.641777,8.078442,0.819158,0.14691,0.153252,0.277196,0.206821
openi-bg,53.61058,5.853556,8.928273,1.426778,83.683191,74.511569,56.526555,86.654419,3346,"2,342/334/670",18.992954,2.002408,8.078442,0.819158,0.105881,0.121462,0.276696,0.207707


### Paper table

Render latex version for inclusion in paper (statistics without the background section).

In [6]:
display_names = {
    'docs': 'Reports',
    'n_words_doc': 'Avg. $|x|_{t}$',
    'n_sents_doc': 'Avg. $|x|_{s}$',
    'n_words_summary': 'Avg. $|y|_{t}$',
    'n_sents_summary': 'Avg. $|y|_{s}$',
    'novelty_bi': 'Novelty',
    'cmp_w': 'CMP',
}

display_dataset = {
    'mimic-official': 'MIMIC-CXR',
    'openi': 'OpenI'
}

df_table = df_stats.copy()
df_table['n'] = df_table['n'].astype(int).apply(lambda x: f'{x:,}')
df_table['n_words_doc'] = df_table['n_words_doc'].astype(int).astype(str) + " {\color{gray} $\pm$ " + df_table['n_words_doc_std'].round(1).astype(str) + "}"
df_table['n_sents_doc'] = df_table['n_sents_doc'].round(1).astype(str) + " {\color{gray} $\pm$ " + df_table['n_sents_doc_std'].round(1).astype(str) + "}"
df_table['n_words_summary'] = df_table['n_words_summary'].astype(int).astype(str) + " {\color{gray} $\pm$ " + df_table['n_words_summary_std'].round(1).astype(str) + "}"
df_table['n_sents_summary'] = df_table['n_sents_summary'].round(1).astype(str) + " {\color{gray} $\pm$ " + df_table['n_sents_summary_std'].round(1).astype(str) + "}"
df_table['novelty_bi'] = df_table['novelty_bi'].round(1).apply(lambda s: f'{s}%')
df_table['cmp_w'] = df_table['cmp_w'].round(1).apply(lambda s: f'{s}%')
df_table = df_table[display_names.keys()]
df_table = df_table.loc[display_dataset.keys()]
df_table = df_table.astype(object)
df_table = df_table.T
df_table = df_table.rename(display_names)
df_table = df_table.rename(display_dataset, axis=1)
display(df_table)

tex = df_table.to_latex(   
    position='t',
    column_format='lccc',
    label='tab:datasets',
    escape=False
)

tex = tex \
    .replace('{} &', r'\textbf{Aspect} &') \
    .replace('MIMIC-CXR', r'\textbf{MIMIC-CXR}') \
    .replace('OpenI', r'\textbf{OpenI}') \
    .replace(r'\centering', r'\small' + '\n' + '\centering') \
    .replace(r'%', r'\%') \
    .replace(r'|x|', r'|\bm{x}|') \
    .replace(r'|y|', r'|\bm{y}|')

tex = re.sub(r' +', ' ', tex)

print(tex)

Unnamed: 0,MIMIC-CXR,OpenI
Reports,"122,500/963/1,598","2,342/334/670"
Avg. $|x|_{t}$,56 {\color{gray} $\pm$ 25.2},37 {\color{gray} $\pm$ 16.4}
Avg. $|x|_{s}$,5.5 {\color{gray} $\pm$ 1.9},4.6 {\color{gray} $\pm$ 1.6}
Avg. $|y|_{t}$,15 {\color{gray} $\pm$ 13.5},8 {\color{gray} $\pm$ 8.1}
Avg. $|y|_{s}$,1.6 {\color{gray} $\pm$ 0.9},1.4 {\color{gray} $\pm$ 0.8}
Novelty,73.4%,86.8%
CMP,73.8%,76.1%


\begin{table}[t]
\small
\centering
\label{tab:datasets}
\begin{tabular}{lccc}
\toprule
\textbf{Aspect} & \textbf{MIMIC-CXR} & \textbf{OpenI} \\
\midrule
Reports & 122,500/963/1,598 & 2,342/334/670 \\
Avg. $|\bm{x}|_{t}$ & 56 {\color{gray} $\pm$ 25.2} & 37 {\color{gray} $\pm$ 16.4} \\
Avg. $|\bm{x}|_{s}$ & 5.5 {\color{gray} $\pm$ 1.9} & 4.6 {\color{gray} $\pm$ 1.6} \\
Avg. $|\bm{y}|_{t}$ & 15 {\color{gray} $\pm$ 13.5} & 8 {\color{gray} $\pm$ 8.1} \\
Avg. $|\bm{y}|_{s}$ & 1.6 {\color{gray} $\pm$ 0.9} & 1.4 {\color{gray} $\pm$ 0.8} \\
Novelty & 73.4\% & 86.8\% \\
CMP & 73.8\% & 76.1\% \\
\bottomrule
\end{tabular}
\end{table}



### Caluclate statistics of error analysis sample

In [8]:
sample_ids = pd.read_json('../error-analysis/data/annotations.jsonl', lines=True)['study_id'].unique()

In [9]:
df = pd.read_json("../data/processed/mimic-official-unguided/reports.test.json").assign(split='test')
src = df['src'].apply(lambda sents: '<q>'.join((' '.join(sent) for sent in sents)))
tgt = df['tgt'].apply(lambda sents: '<q>'.join((' '.join(sent) for sent in sents)))
stats = calculate_statistics(tqdm(src), tgt)
stats = stats.set_index(df['id'])

full = stats.agg('mean')
full = full.append(stats.agg('std').add_suffix('_std'))
full['n'] = len(df)
full = full.rename(f'Test Set')

sample = stats.loc[sample_ids].agg('mean')
sample = sample.append(stats.loc[sample_ids].agg('std').add_suffix('_std'))
sample['n'] = len(sample_ids)
sample = sample.rename(f'Error Analysis Sample')

df_stats = pd.concat([full, sample], axis=1).T
df_stats['novelty_uni'] = df_stats['novelty_uni'] * 100
df_stats['novelty_bi'] = df_stats['novelty_bi'] * 100
df_stats['cmp_w'] = df_stats['cmp_w'] * 100
df_stats['cmp_s'] = df_stats['cmp_s'] * 100
df_stats

  0%|          | 0/1598 [00:00<?, ?it/s]

Unnamed: 0,n_words_doc,n_sents_doc,n_words_summary,n_sents_summary,cmp_w,cmp_s,novelty_uni,novelty_bi,n_words_doc_std,n_sents_doc_std,n_words_summary_std,n_sents_summary_std,cmp_w_std,cmp_s_std,novelty_uni_std,novelty_bi_std,n
Test Set,70.981852,6.220275,19.770338,1.825407,71.925122,68.982011,42.101694,69.788209,27.419815,1.94129,15.188128,1.000403,0.179781,0.170151,0.252323,0.257535,1598.0
Error Analysis Sample,63.25,5.74,18.55,1.73,70.310152,68.242063,43.24326,69.703033,20.388834,1.586687,12.447364,0.897302,0.197809,0.181449,0.274547,0.27091,100.0


In [10]:
display_names = {
    'n': 'Reports',
    'n_words_doc': 'Avg. $|x|_{t}$',
    'n_sents_doc': 'Avg. $|x|_{s}$',
    'n_words_summary': 'Avg. $|y|_{t}$',
    'n_sents_summary': 'Avg. $|y|_{s}$',
    'novelty_bi': 'Novelty',
    'cmp_w': 'CMP',
}

df_table = df_stats.copy()
df_table['n'] = df_table['n'].astype(int).apply(lambda x: f'{x:,}')
df_table['n_words_doc'] = df_table['n_words_doc'].astype(int).astype(str) + " {\color{gray} $\pm$ " + df_table['n_words_doc_std'].round(1).astype(str) + "}"
df_table['n_sents_doc'] = df_table['n_sents_doc'].round(1).astype(str) + " {\color{gray} $\pm$ " + df_table['n_sents_doc_std'].round(1).astype(str) + "}"
df_table['n_words_summary'] = df_table['n_words_summary'].astype(int).astype(str) + " {\color{gray} $\pm$ " + df_table['n_words_summary_std'].round(1).astype(str) + "}"
df_table['n_sents_summary'] = df_table['n_sents_summary'].round(1).astype(str) + " {\color{gray} $\pm$ " + df_table['n_sents_summary_std'].round(1).astype(str) + "}"
df_table['novelty_bi'] = df_table['novelty_bi'].round(1).apply(lambda s: f'{s}%')
df_table['cmp_w'] = df_table['cmp_w'].round(1).apply(lambda s: f'{s}%')
df_table = df_table[display_names.keys()]
df_table = df_table.astype(object)
df_table = df_table.T
df_table = df_table.rename(display_names)
display(df_table)

tex = df_table.to_latex(   
    position='t',
    column_format='lccc',
    label='tab:error-analysis-sample-statistics',
    escape=False
)

tex = tex \
    .replace('{} &', r'\textbf{Aspect} &') \
    .replace('MIMIC-CXR', r'\textbf{MIMIC-CXR}') \
    .replace('OpenI', r'\textbf{OpenI}') \
    .replace(r'\centering', r'\small' + '\n' + '\centering') \
    .replace(r'%', r'\%') \
    .replace(r'|x|', r'|\bm{x}|') \
    .replace(r'|y|', r'|\bm{y}|')

tex = re.sub(r' +', ' ', tex)

print(tex)

Unnamed: 0,Test Set,Error Analysis Sample
Reports,1598,100
Avg. $|x|_{t}$,70 {\color{gray} $\pm$ 27.4},63 {\color{gray} $\pm$ 20.4}
Avg. $|x|_{s}$,6.2 {\color{gray} $\pm$ 1.9},5.7 {\color{gray} $\pm$ 1.6}
Avg. $|y|_{t}$,19 {\color{gray} $\pm$ 15.2},18 {\color{gray} $\pm$ 12.4}
Avg. $|y|_{s}$,1.8 {\color{gray} $\pm$ 1.0},1.7 {\color{gray} $\pm$ 0.9}
Novelty,69.8%,69.7%
CMP,71.9%,70.3%


\begin{table}[t]
\small
\centering
\label{tab:error-analysis-sample-statistics}
\begin{tabular}{lccc}
\toprule
\textbf{Aspect} & Test Set & Error Analysis Sample \\
\midrule
Reports & 1,598 & 100 \\
Avg. $|\bm{x}|_{t}$ & 70 {\color{gray} $\pm$ 27.4} & 63 {\color{gray} $\pm$ 20.4} \\
Avg. $|\bm{x}|_{s}$ & 6.2 {\color{gray} $\pm$ 1.9} & 5.7 {\color{gray} $\pm$ 1.6} \\
Avg. $|\bm{y}|_{t}$ & 19 {\color{gray} $\pm$ 15.2} & 18 {\color{gray} $\pm$ 12.4} \\
Avg. $|\bm{y}|_{s}$ & 1.8 {\color{gray} $\pm$ 1.0} & 1.7 {\color{gray} $\pm$ 0.9} \\
Novelty & 69.8\% & 69.7\% \\
CMP & 71.9\% & 70.3\% \\
\bottomrule
\end{tabular}
\end{table}



## Length distribution of OracleExt

Average length in sentences

In [14]:
print(pd.read_json('../data/processed/mimic-oracle/reports.train.json')['z_ids'].apply(len).mean())
print(pd.read_json('../data/processed/mimic-bg-oracle/reports.train.json')['z_ids'].apply(len).mean())
print(pd.read_json('../data/processed/mimic-official-oracle/reports.train.json')['z_ids'].apply(len).mean())
print(pd.read_json('../data/processed/mimic-official-bg-oracle/reports.train.json')['z_ids'].apply(len).mean())
print(pd.read_json('../data/processed/openi-oracle/reports.train.json')['z_ids'].apply(len).mean())
print(pd.read_json('../data/processed/openi-bg-oracle/reports.train.json')['z_ids'].apply(len).mean())

1.3824264353110507
1.4460558931712777
1.3778285714285714
1.4413061224489796
1.0742954739538855
1.1473099914602904


Too short/too long as opposed to fixed length summaries.

In [12]:
def print_oracleext_distribution(df, split='train', k=1):   
    oracle_stats = df['oracle_len'].value_counts(normalize=True)
    oracle_stats = oracle_stats * 100
    oracle_stats.index.name = 'Length (sents)'
    oracle_stats.name = 'Reports (%)'

    print(
        'Percent of reports with OracleExt length of 0/1/2/3 sentences = ' \
        + '{:.0f}'.format(oracle_stats.loc[0]) \
        + '/' \
        + '{:.0f}'.format(oracle_stats.loc[1]) \
        + '/' \
        + '{:.0f}'.format(oracle_stats.loc[2]) \
        + '/' \
        + '{:.0f}'.format(oracle_stats.loc[3]) \
        + '%'
    )

    k = 1
    too_short = oracle_stats.loc[oracle_stats.index > k].sum()
    too_long = oracle_stats.loc[oracle_stats.index < k].sum()
    print(f'Too short when k = {k}: {too_short:.0f}%')
    print(f'Too long when k = {k}: {too_long:.0f}%')
    display(oracle_stats.to_frame().round(2))

In [13]:
df = pd.read_json('../data/processed/mimic-official-oracle/reports.test.json')
df['oracle_len'] = df['z_ids'].apply(len)
print_oracleext_distribution(df)

Percent of reports with OracleExt length of 0/1/2/3 sentences = 2/52/32/14%
Too short when k = 1: 46%
Too long when k = 1: 2%


Unnamed: 0_level_0,Reports (%)
Length (sents),Unnamed: 1_level_1
1,51.56
2,32.04
3,14.08
0,2.32


In [15]:
df = pd.read_json('../data/processed/openi-oracle/reports.test.json')
df['oracle_len'] = df['z_ids'].apply(len)
print_oracleext_distribution(df)

Percent of reports with OracleExt length of 0/1/2/3 sentences = 15/67/14/3%
Too short when k = 1: 17%
Too long when k = 1: 15%


Unnamed: 0_level_0,Reports (%)
Length (sents),Unnamed: 1_level_1
1,67.16
0,15.37
2,14.18
3,3.28
