In [2]:
from platform import python_version
print(python_version())

3.6.4


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
%matplotlib inline
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import itertools
import json
import sys
sys.path.append("..")
from utils import data_proc_tools as dpt
import random
random.seed(42)
random_state=1000
import pylab
pd.set_option('display.max_colwidth', -1)

pylab.rcParams['figure.figsize'] = (8.0, 10.0)

In [5]:
dir = '/vol/medic02/users/ag6516/radiology_report_summarisation/'
data_dir = dir + 'data/'

sample_size = 'all'
aug = 'aug'

model_output_dir = dir + 'trained_models/seq2seq'

## Load reports and mesh captions

In [5]:
reports_mesh_df = pd.read_pickle(data_dir + 'raw/reports_mesh.pkl')

In [6]:
reports_mesh_df.head()

Unnamed: 0,imageid,text_report,mesh_caption
0,CXR10_IM-0002-1001,"[the cardiomediastinal silhouette is within normal limits for size and contour . the lungs are normally inflated without evidence of focal airspace disease , pleural effusion , or pneumothorax . stable calcified granuloma within the right upper lung . no acute bone abnormality . ., no acute cardiopulmonary process .]","[[calcified_granuloma, lung, upper_lobe, right]]"
1,CXR10_IM-0002-2001,"[the cardiomediastinal silhouette is within normal limits for size and contour . the lungs are normally inflated without evidence of focal airspace disease , pleural effusion , or pneumothorax . stable calcified granuloma within the right upper lung . no acute bone abnormality . ., no acute cardiopulmonary process .]","[[calcified_granuloma, lung, upper_lobe, right]]"
10,CXR1001_IM-0004-1002,"[interstitial markings are diffusely prominent throughout both lungs . heart size is normal . pulmonary xxxx normal ., diffuse fibrosis . no visible focal acute disease .]","[[fibrosis, diffuse], [markings, lung, bilateral, interstitial, diffuse, prominent]]"
100,CXR1051_IM-0039-6001,"[heart size and mediastinal contour are normal . pulmonary vascularity is normal . lungs are clear . no pleural effusions or pneumothoraces ., no acute cardiopulmonary process .]",[[normal]]
1000,CXR1499_IM-0323-2001,"[cardiomediastinal silhouettes are within normal limits . low lung volumes . lungs are clear without focal consolidation , pneumothorax , or pleural effusion . bony thorax is unremarkable ., no acute cardiopulmonary abnormalities .]","[[lung, hypoinflation]]"


In [55]:
# group by examid
reports_mesh_df['examid'] = reports_mesh_df['imageid'].apply(lambda row: '-'.join(row.split('-')[:-1]))
unique_reports_df = reports_mesh_df.groupby('examid', as_index=False)

In [56]:
# aggregate under one examid
reports_mesh_df = unique_reports_df.aggregate({'imageid':'sum', 'mesh_caption':'first', 'text_report':'first'})

In [90]:
# combine 'findings' and 'impression' into one text report
reports_mesh_df['combined_text_report'] = reports_mesh_df['text_report'].apply(lambda row: ' '.join(row))

In [98]:
# remove one-character sentences from combined reports
text_reports = list(reports_mesh_df.combined_text_report)

sentences = [s.split('.') for s in text_reports]
sentences_ = [[s.lstrip().rstrip() for s in sen if len(s.lstrip().rstrip())>1] for sen in sentences]

In [103]:
reports_mesh_df['combined_text_report'] = sentences_

In [109]:
reports_mesh_df.drop(['text_report','imageid'], axis=1, inplace=True)

In [193]:
reports_mesh_df.isnull().any()

examid                  False
mesh_caption            False
combined_text_report    False
dtype: bool

In [110]:
# save
reports_mesh_df.to_pickle(data_dir + 'raw/agg_reports_mesh.pkl')

## Text report stats

In [6]:
reports_mesh_df = pd.read_pickle(data_dir + 'raw/agg_reports_mesh.pkl')

In [7]:
reports = list(reports_mesh_df.combined_text_report)

all_words = Counter()
num_words = []
num_sentences = []
empty_reports = 0

for report in reports:
    num_sentences.append(len(report))
    joined = ' '.join(report)
    num_words.append(len(joined))
    if len(joined) < 5:
        empty_reports+=1
    all_words.update(joined.split(' '))
    
print('Total vocab length: {}'.format(len(all_words)))
print('Average number of sentences per report: {}'.format(np.mean(num_sentences)))
print('Average number of words per report: {}'.format(np.mean(num_words)))
print('STD of words per report: {}'.format(np.std(num_words)))
print('Number of empty reports: {}'.format(empty_reports))

Total vocab length: 2087
Average number of sentences per report: 5.837830617152017
Average number of words per report: 265.89740849585894
STD of words per report: 136.70481097827746
Number of empty reports: 0


## Preprocess reports
Lower-caseing, non-alpha-numeric character removal, removal of words >99th percentile, stopword removal

In [8]:
reports = list(reports_mesh_df.combined_text_report)
processed_reports = dpt.preprocess_reports(reports, data_dir+'stopwords.txt')

Avg. number of sentences per report before negation removal: 5.837830617152017
Avg. number of sentences per report after negation removal: 3.7948169917178736
Min sentence length: 0
Max sentence length: 21
Total vocab length: 2002
Vocab length of words>=2: 1397
Average number of sentences per report: 3.7993588030991186
STD number of sentences per report: 2.057178803022604

Average number of words per sentence: 6.382532873918852
STD number of words per sentence: 2.9593859267335634

Average number of words per exam report: 21.450173657493988
STD number of words per exam report: 14.267907762813904
Vocab length after stopwords removal: 1319


## Preprocess MeSH
Lower-caseing, removal of words >99th percentile

In [34]:
mesh_captions = list(reports_mesh_df.mesh_caption)
processed_mesh_captions = dpt.preprocess_mesh(mesh_captions)

Stats prior to vocab reduction
Average number of captions per exam: 2.0801496126102057
Average number of terms per caption: 2.532494220395582
Average number of terms per exam: 5.267966871493455
STD of terms per exam: 5.546351674055975
Exams with >1 MeSH annotation: 1609
Normal vs abnormal cases: normal: 1357 abnormal: 2386
Total vocab length: 177
Vocab length of words>=8: 125

Stats after to vocab reduction
Average number of captions per exam: 2.0801496126102057
Average number of terms per caption: 2.507834574877986
Average number of terms per exam: 5.216671119422923
STD of terms per exam: 5.493816010087873
Exams with >1 MeSH annotation: 1609
Normal vs abnormal cases: normal: 1357 abnormal: 2386


## Create and save new df with processed data

In [37]:
reports_mesh_df['processed_reports'] = processed_reports
reports_mesh_df['processed_mesh'] = [list(itertools.chain(*mesh)) for mesh in processed_mesh_captions]

In [39]:
reports_mesh_df['single_mesh'] = [max(x, key=len) for x in processed_mesh_captions]

In [40]:
proc_reports_mesh_df = reports_mesh_df.drop(['mesh_caption', 'combined_text_report'], axis=1)

In [41]:
proc_reports_mesh_df.rename(columns={'processed_reports':'report', 'processed_mesh':'all_mesh'}, inplace=True)
proc_reports_mesh_df.to_pickle(data_dir + 'proc/proc_reports_mesh.pkl')

## Train/Test/Val split

In [42]:
reports_mesh_df = pd.read_pickle(data_dir + 'proc/proc_reports_mesh.pkl')

In [43]:
reports_mesh_df.head()

Unnamed: 0,examid,report,all_mesh,single_mesh
0,CXR1000_IM-0003,"[increased, opacity, within, right, upper, lobe, possible, mass, associated, area, atelectasis, focal, consolidation, ., cardiac, silhouette, within, normal, limits, ., opacity, left, midlung, overlying, posterior, left, 5th, rib, may, represent, focal, airspace, disease, ., increased, opacity, right, upper, lobe, associated, atelectasis, may, represent, focal, consolidation, mass, lesion, atelectasis, ., recommend, chest, ct, evaluation, ., opacity, overlying, left, 5th, rib, may, represent, focal, airspace, disease]","[opacity, lung, lingula, opacity, lung, upper_lobe, right, pulmonary_atelectasis, upper_lobe, right]","[opacity, lung, upper_lobe, right]"
1,CXR1001_IM-0004,"[interstitial, markings, diffusely, prominent, throughout, lungs, ., heart, size, normal, ., pulmonary, normal, ., diffuse, fibrosis]","[diffuse, markings, lung, bilateral, interstitial, diffuse, prominent]","[markings, lung, bilateral, interstitial, diffuse, prominent]"
2,CXR1002_IM-0004,"[status, post, left, mastectomy, ., heart, size, normal, ., lungs, clear]",[left],[left]
3,CXR1003_IM-0005,"[heart, size, pulmonary, vascularity, appear, within, normal, limits, ., retrocardiac, soft, tissue, density, present, ., appears, air, within, suggest, represents, hiatal, hernia, ., vascular, calcification, noted, ., calcified, granuloma, seen, ., interval, development, bandlike, opacity, left, lung, base, ., may, represent, atelectasis, ., osteopenia, present, spine, ., retrocardiac, soft, tissue, density, ., appearance, suggests, hiatal, hernia, ., left, base, bandlike, opacity, ., appearance, suggests, atelectasis]","[bone_diseases_metabolic, spine, calcified_granuloma, calcinosis, blood_vessels, density, retrocardiac, opacity, lung, base, left]","[opacity, lung, base, left]"
4,CXR1004_IM-0005,"[heart, ,, pulmonary, mediastinum, within, normal, limits, ., aorta, tortuous, ectatic, ., degenerative, changes, acromioclavicular, joints, ., degenerative, changes, spine, ., ivc, identified]","[aorta, tortuous, catheters_indwelling, shoulder, bilateral, degenerative, spine, degenerative]","[shoulder, bilateral, degenerative]"


In [44]:
val_df = reports_mesh_df.sample(300, random_state=42)
remaining_df = reports_mesh_df.drop(val_df.index)
test_df = remaining_df.sample(300, random_state=42)
train_df = remaining_df.drop(test_df.index)

In [45]:
len(train_df), len(val_df), len(test_df)

(3143, 300, 300)

In [46]:
train_df.to_pickle(data_dir + 'train/train.pkl')
val_df.to_pickle(data_dir + 'val/val.pkl')
test_df.to_pickle(data_dir + 'test/test.pkl')

## Augment training dataset
Shuffle sentences in cases where mesh caption is not 'normal'

In [47]:
train_df = pd.read_pickle(data_dir + 'train/train.pkl')

In [50]:
aug_train_df = train_df
normal_count = 0
for i, exam in train_df.iterrows():
    mesh = exam.all_mesh
    if 'normal' not in mesh:
        new_text_report = dpt.shuffle_text2(exam.report)
        new_row = {'examid': exam.examid,
                   'report': new_text_report,
                   'all_mesh': exam.all_mesh,
                   'single_mesh' : exam.single_mesh}
        aug_train_df = aug_train_df.append(new_row, ignore_index=True)
    else:
        normal_count+=1

In [51]:
len(aug_train_df)

5148

In [52]:
aug_train_df.isnull().any()

examid         False
report         False
all_mesh       False
single_mesh    False
dtype: bool

In [53]:
aug_train_df.to_pickle(data_dir + 'train/aug_train.pkl')

In [54]:
aug_train_df.head()

Unnamed: 0,examid,report,all_mesh,single_mesh
0,CXR1000_IM-0003,"[increased, opacity, within, right, upper, lobe, possible, mass, associated, area, atelectasis, focal, consolidation, ., cardiac, silhouette, within, normal, limits, ., opacity, left, midlung, overlying, posterior, left, 5th, rib, may, represent, focal, airspace, disease, ., increased, opacity, right, upper, lobe, associated, atelectasis, may, represent, focal, consolidation, mass, lesion, atelectasis, ., recommend, chest, ct, evaluation, ., opacity, overlying, left, 5th, rib, may, represent, focal, airspace, disease]","[opacity, lung, lingula, opacity, lung, upper_lobe, right, pulmonary_atelectasis, upper_lobe, right]","[opacity, lung, upper_lobe, right]"
1,CXR1001_IM-0004,"[interstitial, markings, diffusely, prominent, throughout, lungs, ., heart, size, normal, ., pulmonary, normal, ., diffuse, fibrosis]","[diffuse, markings, lung, bilateral, interstitial, diffuse, prominent]","[markings, lung, bilateral, interstitial, diffuse, prominent]"
2,CXR1002_IM-0004,"[status, post, left, mastectomy, ., heart, size, normal, ., lungs, clear]",[left],[left]
3,CXR1003_IM-0005,"[heart, size, pulmonary, vascularity, appear, within, normal, limits, ., retrocardiac, soft, tissue, density, present, ., appears, air, within, suggest, represents, hiatal, hernia, ., vascular, calcification, noted, ., calcified, granuloma, seen, ., interval, development, bandlike, opacity, left, lung, base, ., may, represent, atelectasis, ., osteopenia, present, spine, ., retrocardiac, soft, tissue, density, ., appearance, suggests, hiatal, hernia, ., left, base, bandlike, opacity, ., appearance, suggests, atelectasis]","[bone_diseases_metabolic, spine, calcified_granuloma, calcinosis, blood_vessels, density, retrocardiac, opacity, lung, base, left]","[opacity, lung, base, left]"
4,CXR1004_IM-0005,"[heart, ,, pulmonary, mediastinum, within, normal, limits, ., aorta, tortuous, ectatic, ., degenerative, changes, acromioclavicular, joints, ., degenerative, changes, spine, ., ivc, identified]","[aorta, tortuous, catheters_indwelling, shoulder, bilateral, degenerative, spine, degenerative]","[shoulder, bilateral, degenerative]"
