## Skeleton Code

The code below provides a skeleton for the model building & training component of your project. You can add/remove/build on code however you see fit, this is meant as a starting point.

In [108]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt

##Import any other stats/DL/ML packages you may need here. E.g. Keras, scikit-learn, etc.
import tensorflow as tf
from tensorflow import keras
import sklearn.model_selection

## Do some early processing of your metadata for easier model training:

In [109]:
## Below is some helper code to read all of your full image filepaths into a dataframe for easier manipulation

all_xray_df = pd.read_csv('data/Data_Entry_2017.csv')
all_image_paths = {os.path.basename(x): x for x in 
                   glob(os.path.join('data','images*', '*', '*.png'))}
print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)
all_xray_df.sample(3)

Scans found: 112120 , Total Headers 112120


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
71416,00017606_019.png,No Finding,19,17606,56,M,AP,2048,2500,0.168,0.168,,data\images_008\images\00017606_019.png
63272,00015628_019.png,No Finding,19,15628,48,M,PA,2890,2832,0.143,0.143,,data\images_007\images\00015628_019.png
37225,00009826_004.png,No Finding,4,9826,65,F,PA,2500,2048,0.168,0.168,,data\images_005\images\00009826_004.png


In [110]:
# First to extract the data in the  "Finding Labels" column and one-hot-ecode.

diseases = ["Atelectasis", "Consolidation", "Infiltration", "Pneumothorax", "Edema", "Emphysema", "Fibrosis", "Effusion", "Pneumonia", "Pleural_Thickening", "Cardiomegaly", "Nodule", "Mass", "Hernia", "No Finding"]

for label in diseases:
    all_xray_df[label] = all_xray_df["Finding Labels"].map(lambda finding: 1 if label in finding else 0)

In [111]:
## Here we can create a new column called 'pneumonia_class' that will allow us to look at 
## images with or without pneumonia for binary classification

#all_xray_df["pneumonia_class"] = all_xray_df["Pneumonia"]
xray = all_xray_df[['path',"Pneumonia"]]
xray["Pneumonia"] = ["Yes" if i == 1 else "No" for i in xray["Pneumonia"]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Create your training and testing data:

In [86]:
def create_splits(vargs):
    
    train_data, val_data = sklearn.model_selection.train_test_split(all_xray_df, test_size = 0.2)
    
    return train_data, val_data

# Now we can begin our model-building & training

#### First suggestion: perform some image augmentation on your data

In [87]:
def my_image_augmentation():
    
    idg = keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255.0,
                                  horizontal_flip = True,
                                  vertical_flip = False,
                                  height_shift_range= 0.1,
                                  width_shift_range=0.1,
                                  rotation_range=20,
                                  shear_range = 0.1,
                                  zoom_range=0.1)
    
    return idg


def make_train_gen(idg, train_data):
    
    ## Create the actual generators using the output of my_image_augmentation for your training data
    ## Suggestion here to use the flow_from_dataframe library, e.g.:
    
    train_gen = idg.flow_from_dataframe(dataframe=train_data,
                                             directory=None,
                                             x_col = 'path',
                                             y_col = 'Pneumonia',
                                             class_mode = 'binary',
                                             #target_size = (1024,1024), 
                                             batch_size = 10)

    return train_gen


def make_val_gen(idg, val_data):
    
    val_gen = idg.flow_from_dataframe(dataframe = val_data, 
                                              directory=None, 
                                              x_col = 'path',
                                              y_col = "Pneumonia",
                                              class_mode = 'binary',
                                              #target_size = (1024,1024), 
                                              batch_size = 10) 
    return val_gen

In [88]:
val_data

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia,No Finding
16563,00004435_004.png,No Finding,4,4435,28,F,PA,2048,2500,0.171,...,0,0,0,0,0,0,0,0,0,1
18993,00005064_006.png,No Finding,6,5064,46,M,AP,2500,2048,0.168,...,0,0,0,0,0,0,0,0,0,1
83273,00020450_006.png,No Finding,6,20450,59,M,AP,3004,2544,0.139,...,0,0,0,0,0,0,0,0,0,1
34342,00009021_008.png,No Finding,8,9021,18,F,PA,2048,2500,0.168,...,0,0,0,0,0,0,0,0,0,1
100867,00026798_002.png,Infiltration,2,26798,38,M,PA,2992,2991,0.143,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84300,00020724_000.png,No Finding,0,20724,49,M,PA,2738,2809,0.143,...,0,0,0,0,0,0,0,0,0,1
74695,00018343_000.png,Infiltration|Nodule,0,18343,71,M,PA,2992,2991,0.143,...,0,0,0,0,0,0,1,0,0,0
96734,00025498_001.png,Infiltration,1,25498,86,M,AP,3056,2544,0.139,...,0,0,0,0,0,0,0,0,0,0
106232,00028606_000.png,Pneumothorax,0,28606,18,M,PA,2544,3056,0.139,...,0,0,0,0,0,0,0,0,0,0


In [89]:
## May want to pull a single large batch of random validation data for testing after each epoch:

train_data, val_data = create_splits(all_xray_df)
make_val_gen(my_image_augmentation(), val_data)
valX, valY = val_gen.next()

TypeError: If class_mode="binary", y_col="Pneumonia" column values must be strings.

In [None]:
## May want to look at some examples of our augmented training data. 
## This is helpful for understanding the extent to which data is being manipulated prior to training, 
## and can be compared with how the raw data look prior to augmentation

t_x, t_y = next(train_gen)
fig, m_axs = plt.subplots(4, 4, figsize = (16, 16))
for (c_x, c_y, c_ax) in zip(t_x, t_y, m_axs.flatten()):
    c_ax.imshow(c_x[:,:,0], cmap = 'bone')
    if c_y == 1: 
        c_ax.set_title('Pneumonia')
    else:
        c_ax.set_title('No Pneumonia')
    c_ax.axis('off')

## Build your model: 

Recommendation here to use a pre-trained network downloaded from Keras for fine-tuning

In [None]:
def load_pretrained_model(vargs):
    
    # model = VGG16(include_top=True, weights='imagenet')
    # transfer_layer = model.get_layer(lay_of_interest)
    # vgg_model = Model(inputs = model.input, outputs = transfer_layer.output)
    
    # Todo
    
    return vgg_model


In [None]:
def build_my_model(vargs):
    
    # my_model = Sequential()
    # ....add your pre-trained model, and then whatever additional layers you think you might
    # want for fine-tuning (Flatteen, Dense, Dropout, etc.)
    
    # if you want to compile your model within this function, consider which layers of your pre-trained model, 
    # you want to freeze before you compile 
    
    # also make sure you set your optimizer, loss function, and metrics to monitor
    
    # Todo
    
    return my_model



## STAND-OUT Suggestion: choose another output layer besides just the last classification layer of your modele
## to output class activation maps to aid in clinical interpretation of your model's results

In [None]:
## Below is some helper code that will allow you to add checkpoints to your model,
## This will save the 'best' version of your model by comparing it to previous epochs of training

## Note that you need to choose which metric to monitor for your model's 'best' performance if using this code. 
## The 'patience' parameter is set to 10, meaning that your model will train for ten epochs without seeing
## improvement before quitting

# Todo

# weight_path="{}_my_model.best.hdf5".format('xray_class')

# checkpoint = ModelCheckpoint(weight_path, 
#                              monitor= CHOOSE_METRIC_TO_MONITOR_FOR_PERFORMANCE, 
#                              verbose=1, 
#                              save_best_only=True, 
#                              mode= CHOOSE_MIN_OR_MAX_FOR_YOUR_METRIC, 
#                              save_weights_only = True)

# early = EarlyStopping(monitor= SAME_AS_METRIC_CHOSEN_ABOVE, 
#                       mode= CHOOSE_MIN_OR_MAX_FOR_YOUR_METRIC, 
#                       patience=10)

# callbacks_list = [checkpoint, early]

### Start training! 

In [None]:
## train your model

# Todo

# history = my_model.fit_generator(train_gen, 
#                           validation_data = (valX, valY), 
#                           epochs = , 
#                           callbacks = callbacks_list)

##### After training for some time, look at the performance of your model by plotting some performance statistics:

Note, these figures will come in handy for your FDA documentation later in the project

In [None]:
## After training, make some predictions to assess your model's overall performance
## Note that detecting pneumonia is hard even for trained expert radiologists, 
## so there is no need to make the model perfect.
my_model.load_weights(weight_path)
pred_Y = new_model.predict(valX, batch_size = 32, verbose = True)

In [None]:
def plot_auc(t_y, p_y):
    
    ## Hint: can use scikit-learn's built in functions here like roc_curve
    
    # Todo
    
    return

## what other performance statistics do you want to include here besides AUC? 


# def ... 
# Todo

# def ...
# Todo
    
#Also consider plotting the history of your model training:

def plot_history(history):
    
    # Todo
    return

In [None]:
## plot figures

# Todo

Once you feel you are done training, you'll need to decide the proper classification threshold that optimizes your model's performance for a given metric (e.g. accuracy, F1, precision, etc.  You decide) 

In [None]:
## Find the threshold that optimize your model's performance,
## and use that threshold to make binary classification. Make sure you take all your metrics into consideration.

# Todo

In [None]:
## Let's look at some examples of predicted v. true with our best model: 

# Todo

# fig, m_axs = plt.subplots(10, 10, figsize = (16, 16))
# i = 0
# for (c_x, c_y, c_ax) in zip(valX[0:100], testY[0:100], m_axs.flatten()):
#     c_ax.imshow(c_x[:,:,0], cmap = 'bone')
#     if c_y == 1: 
#         if pred_Y[i] > YOUR_THRESHOLD:
#             c_ax.set_title('1, 1')
#         else:
#             c_ax.set_title('1, 0')
#     else:
#         if pred_Y[i] > YOUR_THRESHOLD: 
#             c_ax.set_title('0, 1')
#         else:
#             c_ax.set_title('0, 0')
#     c_ax.axis('off')
#     i=i+1

In [None]:
## Just save model architecture to a .json:

model_json = my_model.to_json()
with open("my_model.json", "w") as json_file:
    json_file.write(model_json)