In [1]:
import numpy as np
import pandas as pd
import pydicom
%matplotlib inline
import matplotlib.pyplot as plt
import keras 
#from keras.models import load_model
from skimage.transform import resize
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
import sklearn.metrics
from glob import glob
import os
from random import sample  

Using TensorFlow backend.


In [2]:
# This function reads in a .dcm file, checks the important fields for our device, and returns a numpy array
# of just the imaging data

def check_dicom(filename): 

    print('\nLoad file {} ...'.format(filename))
    ds = pydicom.dcmread(filename)
    
    # Optional: You can check some DICOM headers if needed
    # e.g., Patient Position, Body Part Examined, etc.
    if 'PatientPosition' in ds:
        print('  Patient Position:', ds.PatientPosition)
    if 'BodyPartExamined' in ds:
        print('  Body Part Examined:', ds.BodyPartExamined)
    if 'ImageType' in ds:
        print('  Image Type:', ds.ImageType)
    
    img = ds.pixel_array
    return img


# This function takes the numpy array output by check_dicom and 
# runs the appropriate pre-processing needed for our model input
def preprocess_image(img, img_mean, img_std, img_size): 
    # Normalizing using mean and std dev
    img = (img - img_mean) / img_std
    
    # Resizing
    img = resize(img, (img_size[1], img_size[2]))
    
    # Adding channel and batch dimensions
    img = np.expand_dims(img, axis=-1)
    img = np.expand_dims(img, axis=0)
    
    # If the model expects 3 channels (RGB), then duplicate the single channel 3 times
    if img_size[3] == 3:
        img = np.repeat(img, 3, axis=-1)
    
    return img


# This function loads in our trained model w/ weights and compiles it 
def load_my_model(model_path, weight_path):
    model = load_model(model_path)
    model.load_weights(weight_path)
    return model


# This function uses our device's threshold parameters to predict whether or not
# the image shows the presence of pneumonia using our trained model
def predict_image(model, img, thresh): 
    pred = model.predict(img)
    return (pred > thresh).astype(int)


In [3]:
from tensorflow.keras.models import load_model

test_dicoms = ['test1.dcm','test2.dcm','test3.dcm','test4.dcm','test5.dcm','test6.dcm']
model_path =  "full_model2.h5" #path to saved model
weight_path = "{}_my_model.best.hdf5".format('xray_class2') #path to saved best weights

IMG_SIZE = (1,224,224,3) # This might be different if you did not use vgg16
img_mean = 0.50952472 # loads the mean image value they used during training preprocessing
img_std = 0.24108991 # loads the std dev image value they used during training preprocessing

#my_model = load_model(model_path, weight_path) #loads model
my_model = load_model("full_model2.h5")
my_model.load_weights("{}_my_model.best.hdf5".format('xray_class2'))
thresh = 0.44483218 #loads the threshold they chose for model classification 

# use the .dcm files to test your prediction
for i in test_dicoms:
    
    img = np.array([])
    img = check_dicom(i)
    
    if img is None:
        continue
        
    img_proc = preprocess_image(img,img_mean,img_std,IMG_SIZE)
    pred = "Positive" if predict_image(my_model,img_proc,thresh) else "Negative"
    print("  Prediction: ", pred)


Load file test1.dcm ...
  Patient Position: PA
  Body Part Examined: CHEST
  Prediction:  Negative

Load file test2.dcm ...
  Patient Position: AP
  Body Part Examined: CHEST
  Prediction:  Negative

Load file test3.dcm ...
  Patient Position: AP
  Body Part Examined: CHEST
  Prediction:  Negative

Load file test4.dcm ...
  Patient Position: PA
  Body Part Examined: RIBCAGE
  Prediction:  Negative

Load file test5.dcm ...
  Patient Position: PA
  Body Part Examined: CHEST
  Prediction:  Negative

Load file test6.dcm ...
  Patient Position: XX
  Body Part Examined: CHEST
  Prediction:  Negative


###  Algorithm Limitation


In [4]:
## Below is some helper code to read all of your full image filepaths into a dataframe for easier manipulation
## Load the NIH data to all_xray_df
all_xray_df = pd.read_csv('/data/Data_Entry_2017.csv')
all_image_paths = {os.path.basename(x): x for x in 
                   glob(os.path.join('/data','images*', '*', '*.png'))}
print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)

# Split labels by the delimiter
comorbid_diseases = all_xray_df['Finding Labels'].str.split('|').explode().unique()
print(comorbid_diseases)

for comorbid_disease in comorbid_diseases:
    all_xray_df[comorbid_disease] = all_xray_df['Finding Labels'].str.contains(comorbid_disease).astype(int)

all_xray_df['pneumonia_class'] = all_xray_df['Finding Labels'].str.contains('Pneumonia').astype(str)

Scans found: 112120 , Total Headers 112120
['Cardiomegaly' 'Emphysema' 'Effusion' 'No Finding' 'Hernia'
 'Infiltration' 'Mass' 'Nodule' 'Atelectasis' 'Pneumothorax'
 'Pleural_Thickening' 'Pneumonia' 'Fibrosis' 'Edema' 'Consolidation']


In [5]:
def create_splits(df):
    
    ## Either build your own or use a built-in library to split your original dataframe into two sets 
    ## that can be used for training and testing your model
    ## It's important to consider here how balanced or imbalanced you want each of those sets to be
    ## for the presence of pneumonia
    
    # Todo
    train_data, val_data = train_test_split(df, test_size=0.2, stratify = df['Pneumonia'], random_state=42)
    
    return train_data, val_data

# Use the function
train_data, val_data = create_splits(all_xray_df)
p_inds = val_data[val_data.Pneumonia==1].index.tolist()
np_inds = val_data[val_data.Pneumonia==0].index.tolist()

# The following code pulls a random sample of non-pneumonia data that's 4 times as big as the pneumonia sample.
np_sample = sample(np_inds,4*len(p_inds))
val_data = val_data.loc[p_inds + np_sample]

In [6]:
from keras.preprocessing.image import ImageDataGenerator
IMG_SIZE = (224, 224)

idg_val = ImageDataGenerator(rescale=1. / 255.0)
val_gen = idg_val.flow_from_dataframe(dataframe=val_data, 
                                         directory=None, 
                                         x_col = 'path',
                                         y_col = 'pneumonia_class',
                                         class_mode = 'binary',
                                         target_size = IMG_SIZE, 
                                         batch_size = 9
                                         )

Found 1430 validated image filenames belonging to 2 classes.


In [7]:
pred_Y = my_model.predict(val_gen, verbose = True)




In [8]:
binary_predictions = np.where(pred_Y > thresh, 1, 0)

In [9]:
tn, fp, fn, tp = sklearn.metrics.confusion_matrix(val_data['Pneumonia'],
                                                  binary_predictions, labels=[0,1]).ravel()

In [10]:
sens = tp/(tp+fn)
spec = tn/(tn+fp)
print(sens, spec)

0.42657342657342656 0.5646853146853147
