In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project-local `utils` package imported below) are reloaded
# before each cell executes and edits to them take effect without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import itertools
import json
import sys
sys.path.append("..")
from utils import data_proc_tools as dpt
from utils import plot_tools as pt
from utils.custom_metrics import recall, precision, binary_accuracy
from utils.custom_metrics import recall_np, precision_np, binary_accuracy_np, multilabel_confusion_matrix
from utils.multi_label_text_models import Seq2Seq
import random
# Seed BOTH Python's and NumPy's global RNGs. pandas' DataFrame.sample draws
# from NumPy's global state when no random_state is passed, so seeding only
# `random` leaves the sampling cells further down non-reproducible.
random.seed(42)
np.random.seed(42)
# Fixed seed available for APIs that take an explicit `random_state` argument.
random_state=1000
import pylab

# Default size of every figure created in this notebook, as (width, height) in inches.
pylab.rcParams['figure.figsize'] = (8.0, 10.0)

Using TensorFlow backend.


In [3]:
# Project root; every other location below is derived from it.
# NOTE(review): `dir` shadows the builtin of the same name; it is kept because
# later cells read it.
dir = '/vol/medic02/users/ag6516/image_sentence_mapping/'

# Which training subset and which preprocessing variant to work with.
sample_size, data_type = 'all', 'processed_balanced'

# Data locations: dataset root -> variant folder -> saved id dictionaries.
data_dir = '{}data/chestx/'.format(dir)
data_output_dir = '{}{}/'.format(data_dir, data_type)
dicts_dir = '{}dicts_pad/'.format(data_output_dir)

# Where trained seq2seq weights / history for this configuration are stored.
model_output_dir = '{}trained_models/chestx/text_seq2seq/train_{}/{}/'.format(
    dir, sample_size, data_type)

### Create and store dictionaries from entire dataset

In [4]:
# Sentinel tokens: `start_token` is prepended to token lists, `end_token` is
# appended to them and also used as the padding token for captions.
start_token, end_token = 'start', '.'

In [None]:
# Load the pickled frame holding the entire dataset.
all_pickle_path = data_output_dir + 'all/all.pkl'
all_df = pd.read_pickle(all_pickle_path)

In [None]:
# Count how often each term occurs over all mesh captions in the dataset.
all_words = Counter(itertools.chain.from_iterable(all_df.mesh_caption))

# Keep only the terms that occur at least `min_count` times.
min_count = 10
vocab = [term for term, count in all_words.items() if count >= min_count]
print('Total vocab length: {0}\nVocab length of words>=10: {1}'.format(len(all_words), len(vocab)))

In [None]:
# Reduce every caption to its in-vocabulary terms (order preserved).
# Membership is tested against a set: `vocab` is a list, so `w in vocab`
# would rescan the whole list for every single token.
vocab_lookup = set(vocab)
all_df['proc_mesh_caption'] = all_df.mesh_caption.apply(
    lambda cap: [w for w in cap if w in vocab_lookup])
# Persist the frame, now including the `proc_mesh_caption` column.
all_df.to_pickle(data_output_dir + 'all/all_proc.pkl')

In [None]:
vectoriser = dpt.Vectoriser(data_output_dir+'all/')


def _wrap_with_sentinels(tokens):
    """Return `tokens` with the start token in front and the end token behind."""
    return [start_token] + tokens + [end_token]


# Wrap both token columns in place.
# NOTE: not idempotent - re-running this cell wraps the lists a second time.
for column in ('tok_reports_padded', 'proc_mesh_caption'):
    all_df[column] = all_df[column].apply(_wrap_with_sentinels)

tok_reports_padded = all_df.tok_reports_padded.tolist()
mesh_captions = all_df.proc_mesh_caption.tolist()

# Captions first, then reports - same call order as before.
vectoriser.entities_to_vectors(mesh_captions)
vectoriser.sentences_to_vectors(tok_reports_padded)

### Load samples, vectorise text reports and mesh captions

In [15]:
# Locations of the pickled validation split and the chosen training subset.
val_pickle_path = data_output_dir + 'val/val.pkl'
train_pickle_path = data_output_dir + 'train_{0}/train_{0}.pkl'.format(sample_size)

val_df = pd.read_pickle(val_pickle_path)
train_df = pd.read_pickle(train_pickle_path)

In [16]:
# Preview the first rows of the training frame as loaded from disk.
train_df.head()

Unnamed: 0,imageid,mesh_caption,text_report,tok_reports_padded,proc_mesh_caption
0,CXR10_IM-0002-1001,"[calcified granuloma, lung, upper lobe, right]",the cardiomediastinal silhouette is within nor...,"[cardiomediastinal, silhouette, within, normal...","[calcified granuloma, lung, upper lobe, right]"
1,CXR10_IM-0002-2001,"[calcified granuloma, lung, upper lobe, right]",the cardiomediastinal silhouette is within nor...,"[cardiomediastinal, silhouette, within, normal...","[calcified granuloma, lung, upper lobe, right]"
2,CXR1_1_IM-0001-3001,[normal],the cardiac silhouette and mediastinum size ar...,"[cardiac, silhouette, mediastinum, size, withi...",[normal]
3,CXR1_1_IM-0001-4001,[normal],the cardiac silhouette and mediastinum size ar...,"[cardiac, silhouette, mediastinum, size, withi...",[normal]
4,CXR1003_IM-0005-2002,"[bone diseases, metabolic, spine]",heart size and pulmonary vascularity appear wi...,"[heart, size, pulmonary, vascularity, appear, ...","[bone diseases, metabolic, spine]"


In [17]:
# Prepend the start token to every report and caption token list, for both
# the training and the validation frame.
# NOTE: not idempotent - re-running this cell prepends another start token.
token_columns = ('tok_reports_padded', 'mesh_caption', 'proc_mesh_caption')
for frame in (train_df, val_df):
    for column in token_columns:
        frame[column] = frame[column].apply(lambda tokens: [start_token] + tokens)

In [18]:
# Preview again: each token list should now begin with the start token.
train_df.head()

Unnamed: 0,imageid,mesh_caption,text_report,tok_reports_padded,proc_mesh_caption
0,CXR10_IM-0002-1001,"[start, calcified granuloma, lung, upper lobe,...",the cardiomediastinal silhouette is within nor...,"[start, cardiomediastinal, silhouette, within,...","[start, calcified granuloma, lung, upper lobe,..."
1,CXR10_IM-0002-2001,"[start, calcified granuloma, lung, upper lobe,...",the cardiomediastinal silhouette is within nor...,"[start, cardiomediastinal, silhouette, within,...","[start, calcified granuloma, lung, upper lobe,..."
2,CXR1_1_IM-0001-3001,"[start, normal]",the cardiac silhouette and mediastinum size ar...,"[start, cardiac, silhouette, mediastinum, size...","[start, normal]"
3,CXR1_1_IM-0001-4001,"[start, normal]",the cardiac silhouette and mediastinum size ar...,"[start, cardiac, silhouette, mediastinum, size...","[start, normal]"
4,CXR1003_IM-0005-2002,"[start, bone diseases, metabolic, spine]",heart size and pulmonary vascularity appear wi...,"[start, heart, size, pulmonary, vascularity, a...","[start, bone diseases, metabolic, spine]"


In [19]:
# Initialise one vectoriser per split. Both load the existing id dictionaries
# from `dicts_dir` (set load_dicts=True only if those dictionaries exist).
shared_vectoriser_kwargs = dict(load_dicts=True, dicts_dir=dicts_dir)
train_split_dir = data_output_dir + 'train_{}/'.format(sample_size)
val_split_dir = data_output_dir + 'val/'

train_vectoriser = dpt.Vectoriser(train_split_dir, **shared_vectoriser_kwargs)
val_vectoriser = dpt.Vectoriser(val_split_dir, **shared_vectoriser_kwargs)

In [20]:
def _pad_captions(captions, length):
    """Run `dpt.pad_sentence` on every caption, using the end token as pad token."""
    return [dpt.pad_sentence(caption, length, padtok=end_token) for caption in captions]


# Pull the token lists out of the frames.
train_tok_reports_padded = train_df.tok_reports_padded.tolist()
train_mesh_captions = train_df.proc_mesh_caption.tolist()
val_tok_reports_padded = val_df.tok_reports_padded.tolist()
val_mesh_captions = val_df.proc_mesh_caption.tolist()

# The target length is taken from the TRAINING captions only.
# NOTE(review): how dpt.pad_sentence treats a validation caption longer than
# this is not visible here - confirm.
lengths = list(map(len, train_mesh_captions))
max_caption_length = max(lengths)

train_mesh_captions_padded = _pad_captions(train_mesh_captions, max_caption_length)
val_mesh_captions_padded = _pad_captions(val_mesh_captions, max_caption_length)

In [21]:
# Vectorise the padded mesh captions of both splits (train first, then val).
caption_jobs = (
    (train_vectoriser, train_mesh_captions_padded),
    (val_vectoriser, val_mesh_captions_padded),
)
for split_vectoriser, captions in caption_jobs:
    split_vectoriser.entities_to_vectors(captions, save=True)

# Then vectorise the tokenised reports, again train before val.
report_jobs = (
    (train_vectoriser, train_tok_reports_padded),
    (val_vectoriser, val_tok_reports_padded),
)
for split_vectoriser, reports in report_jobs:
    split_vectoriser.sentences_to_vectors(reports)

Creating list of word ids from loaded dictionaries
Creating list of word ids from loaded dictionaries


In [22]:
# Lookup tables taken from the training vectoriser:
# report words <-> ids, and mesh terms <-> ids.
word_to_id, id_to_word = train_vectoriser.word_to_id, train_vectoriser.id_to_word
mesh_to_id, id_to_mesh = train_vectoriser.ent_to_id, train_vectoriser.id_to_ent

# Vocabulary sizes; used below as one-hot widths and as the model's
# input/output dimensions.
report_vocab_length, mesh_vocab_length = len(word_to_id), len(mesh_to_id)

In [None]:
# encoder_input_data = np.zeros(
#     (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
#     dtype='float32')
# decoder_input_data = np.zeros(
#     (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
#     dtype='float32')
# decoder_target_data = np.zeros(
#     (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
#     dtype='float32')

# for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
#     for t, char in enumerate(input_text):
#         encoder_input_data[i, t, input_token_index[char]] = 1.
#     for t, char in enumerate(target_text):
#         # decoder_target_data is ahead of decoder_input_data by one timestep
#         decoder_input_data[i, t, target_token_index[char]] = 1.
#         if t > 0:
#             # decoder_target_data will be ahead by one timestep
#             # and will not include the start character.
#             decoder_target_data[i, t - 1, target_token_index[char]] = 1.

In [23]:
def _shift_left(id_rows):
    """Shift every id sequence one step to the left, keeping its length.

    The first id is dropped and the last id is repeated at the end, so
    position i of the result holds the original id at position i + 1.
    """
    shifted_rows = [np.concatenate((row[1:], row[-1]), axis=None) for row in id_rows]
    return np.asarray(shifted_rows)


# Arrays of indices for the input sentences, the output entities, and the
# output entities shifted by one step.
train_token_ids_array = train_vectoriser.token_ids_array
train_mesh_ids_array = train_vectoriser.ents_ids_array
train_mesh_ids_array_shifted = _shift_left(train_mesh_ids_array)

val_token_ids_array = val_vectoriser.token_ids_array
val_mesh_ids_array = val_vectoriser.ents_ids_array
val_mesh_ids_array_shifted = _shift_left(val_mesh_ids_array)

In [24]:
# Sequence lengths: reports use the vectoriser's maximum sentence length,
# captions use the padded caption length computed from the training set.
report_seq_length, mesh_seq_length = train_vectoriser.max_sen_len, max_caption_length

## Train Seq-to-Seq Model

In [25]:
# Training hyperparameters.
epochs, batch_size, latent_dim = 10, 128, 256
optimizer = 'adam'

# Model dimensions follow from the vocabularies and sequence lengths above.
input_dim, output_dim = report_vocab_length, mesh_vocab_length
input_seq_length, output_seq_length = report_seq_length, mesh_seq_length

# Collect the constructor arguments in one place, then build the model and
# print its layer summary.
seq2seq_config = dict(
    epochs=epochs,
    metrics=['accuracy', binary_accuracy, recall, precision],
    optimizer=optimizer,
    batch_size=batch_size,
    input_dim=input_dim,
    output_dim=output_dim,
    latent_dim=latent_dim,
    input_seq_length=input_seq_length,
    output_seq_length=output_seq_length,
    verbose=True,
)
new_experiment = Seq2Seq(**seq2seq_config)
new_experiment.build_model()
new_experiment.model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 1397)   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 105)    0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 1693696     input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  370688      input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          

In [None]:
# create batch generators
# train_batch_generator = dpt.batch_generator_seq2seq(train_token_ids_array, report_vocab_length, train_mesh_ids_array, 
#                                                    train_mesh_ids_array_shifted, mesh_vocab_length, batch_size)

# val_batch_generator = dpt.batch_generator_seq2seq(val_token_ids_array, report_vocab_length, val_mesh_ids_array, 
#                                                    val_mesh_ids_array_shifted, mesh_vocab_length, batch_size)

In [26]:
def _one_hot_split(token_ids, mesh_ids, mesh_ids_shifted):
    """One-hot encode the reports, captions and shifted captions of one split."""
    return (
        dpt.one_hot_sequence(token_ids, report_vocab_length),
        dpt.one_hot_sequence(mesh_ids, mesh_vocab_length),
        dpt.one_hot_sequence(mesh_ids_shifted, mesh_vocab_length),
    )


# One-hot encode everything at once (the alternative to batch generators).
one_hot_reports_train, one_hot_mesh_train, one_hot_mesh_shifted_train = _one_hot_split(
    train_token_ids_array, train_mesh_ids_array, train_mesh_ids_array_shifted)

one_hot_reports_val, one_hot_mesh_val, one_hot_mesh_shifted_val = _one_hot_split(
    val_token_ids_array, val_mesh_ids_array, val_mesh_ids_array_shifted)

In [27]:
# Train on the one-hot encoded training arrays and validate on the validation
# arrays; each triple is (reports, captions, shifted captions).
train_arrays = (one_hot_reports_train, one_hot_mesh_train, one_hot_mesh_shifted_train)
val_arrays = (one_hot_reports_val, one_hot_mesh_val, one_hot_mesh_shifted_val)
new_experiment.run_experiment(*(train_arrays + val_arrays))

Train on 6244 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Store the trained experiment's weights and history under `model_output_dir`.
new_experiment.save_weights_history(model_output_dir)

## Load results of specific experiment

In [29]:
# Choose which stored experiment to load (same values as the training
# configuration at the top; change these to inspect another run).
sample_size, data_type = 'all', 'processed_balanced'
model_output_dir = '{}trained_models/chestx/text_seq2seq/train_{}/{}/'.format(
    dir, sample_size, data_type)

In [31]:
# Hyperparameters identifying the stored run; they are part of the file name.
epochs = 10
latent_dim = 256

param_fn = 'param_cnn_epochs_{}_latentdim_{}.json'\
.format(epochs, latent_dim)
# Read the stored constructor arguments. The context manager closes the file
# handle as soon as the JSON is parsed instead of leaving it open.
with open(model_output_dir + param_fn, 'r') as param_file:
    params = json.load(param_file)

# Rebuild the model from the stored parameters, then restore weights/history.
old_experiment = Seq2Seq(**params)
old_experiment.build_model()
old_experiment.load_weights_history(model_output_dir)

In [54]:
# Draw one validation row. The fixed `random_state` makes the same row come
# back on every run, so this cell is reproducible.
sample = val_df.sample(1, random_state=random_state)
true_mesh_caption = list(sample.proc_mesh_caption)[0]
sample_report = list(sample.tok_reports_padded)[0]
# Map every report token to its id.
sample_report_ids = np.array([word_to_id[word] for word in sample_report])
# Displayed as the cell output: a 1-D array with one id per token.
sample_report_ids.shape

(41,)

In [58]:
# Generate predictions for a handful of validation reports.
n_samples = 10
for seq_index in range(n_samples):
    # A distinct but fixed seed per draw keeps the printed examples
    # reproducible across runs while still selecting different rows.
    sample = val_df.sample(1, random_state=random_state + seq_index)
    true_mesh_caption = list(sample.proc_mesh_caption)[0]
    sample_report = list(sample.tok_reports_padded)[0]
    # Token ids as a single-row 2-D array: (1, number of tokens).
    sample_report_ids = [word_to_id[word] for word in sample_report]
    sample_report_ids = np.array(sample_report_ids).reshape(1, len(sample_report_ids))
    one_hot_sample_report = dpt.one_hot_sequence(sample_report_ids, report_vocab_length)
    input_seq = one_hot_sample_report
    decoded_sentence = old_experiment.decode_sequence(input_seq, id_to_mesh, mesh_to_id)
    print('-')
    print('Original report: ', sample_report)
    print('True mesh caption: ', true_mesh_caption)
    print('Predicted mesh caption: ', decoded_sentence)

(1, 256)
(1, 1, 105)
(1, 1, 105)
-
Original report:  ['start', 'lung', 'volumes', 'low', 'interval', 'patchy', 'infiltrate', 'developed', 'right', 'lower', 'lobe', 'heart', 'pulmonary', 'xxxx', 'normal', 'xxxx', 'xxxx', 'patchy', 'right', 'lower', 'lobe', 'infiltrate', 'consistent', 'pneumonia', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']
True mesh caption:  ['start', 'infiltrate', 'lung', 'lower lobe', 'right', 'patchy']
Predicted mesh caption:  normal.
(1, 256)
(1, 1, 105)
(1, 1, 105)
-
Original report:  ['start', 'heart', 'normal', 'size', 'contour', 'vague', 'area', 'airspace', 'disease', 'identified', 'within', 'right', 'midlung', 'pa', 'view', 'well-demonstrated', 'lateral', 'view', 'pneumothorax', 'effusion', 'vague', 'area', 'focal', 'airspace', 'disease', 'within', 'right', 'midlung', 'concern', 'pneumonia', 'recommend', 'followup', 'appropriate', 'treatment', 'document', 'complete', 'resolution', '.', '.', '.', '.']
True mesh caption: 

In [None]:
# NOTE(review): `reports_val` and `reports_train` are not defined anywhere in
# the visible notebook, so this cell presumably relies on state from another
# session - confirm which arrays are meant before running.
_pred_mesh_val = old_experiment.model.predict(reports_val)
_pred_mesh_train = old_experiment.model.predict(reports_train)

# Binarise the predictions at 0.5: 1.0 where the score exceeds the threshold,
# 0.0 elsewhere.
threshold = 0.5
pred_mesh_val = (_pred_mesh_val > threshold) * 1.0
pred_mesh_train = (_pred_mesh_train > threshold) * 1.0