# Big Data Analytics Project - Chest X-Rays Image Recognition

## Importing Libraries

In [1]:
from PIL import Image
import os
import pandas as pd
import numpy as np
import tensorflow as tf

from keras import layers 
from keras import models
from keras import optimizers
from keras import metrics

#Set working directory to the downloads folder where the pictures are
os.chdir(r'C:\Users\Leonardo Luchetti\Documents\Fordham Datasets\CheXphoto')

## Creating test sets

In [2]:
#Reading in the train csv and break up path into columns
train_df = pd.read_csv(r'C:\Users\Leonardo Luchetti\Documents\Fordham Datasets\CheXphoto\CheXphoto-v1.0\train.csv')
train_df['photo_type'] = train_df['Path'].apply(lambda x : x.split('/')[2])
train_df['camera_type'] = train_df['Path'].apply(lambda x : x.split('/')[3])
train_df['patient_number'] = train_df['Path'].apply(lambda x : int(x.split('/')[4][7:]))
train_df['study_number'] = train_df['Path'].apply(lambda x : int(x.split('/')[5][5:]))
train_df['photo_name'] = train_df['Path'].apply(lambda x : x.split('/')[6])

#filling the NA's of No Finding to 0
train_df['No Finding'] = train_df['No Finding'].fillna(value=0)

#Converting to integer
train_df['No Finding'] = train_df['No Finding'].astype('int8')

In [3]:
#Reading in the validation csv and break up path into columns
valid_df = pd.read_csv(r'C:\Users\Leonardo Luchetti\Documents\Fordham Datasets\CheXphoto\CheXphoto-v1.0\valid.csv')
valid_df['photo_type'] = valid_df['Path'].apply(lambda x : x.split('/')[2])
valid_df['camera_type'] = valid_df['Path'].apply(lambda x : x.split('/')[3])
valid_df['patient_number'] = valid_df['Path'].apply(lambda x : int(x.split('/')[4][7:]))
valid_df['study_number'] = valid_df['Path'].apply(lambda x : int(x.split('/')[5][5:]))
valid_df['photo_name'] = valid_df['Path'].apply(lambda x : x.split('/')[6])

#filling the NA's of No Finding to 0
valid_df['No Finding'] = valid_df['No Finding'].fillna(value=0)

#Converting to integer
valid_df['No Finding'] = valid_df['No Finding'].astype('int8')

In [4]:
#Select only the digital photos of the train set
photographic_train_df = train_df[train_df['camera_type'] == 'digital']
photographic_train_df = photographic_train_df.reset_index()

In [5]:
#Select only the digital photos of the validation set
photographic_valid_df = valid_df[valid_df['camera_type'] == 'digital']
photographic_valid_df = photographic_valid_df.reset_index()

## Creating Deep Learning Function to Loop Model Settings for Optimal Specs

In [6]:
def run_model(md,size,eps,bs):
    #load a file list
    file_list_train = photographic_train_df['Path'][:3].tolist()
    file_list_valid = photographic_valid_df['Path'][:20].tolist()

    #load photos into numpy arrays
    photo_np_train = np.array([np.array(Image.open(fname).resize((size,size))) for fname in file_list_train])
    photo_np_valid = np.array([np.array(Image.open(fname).resize((size,size))) for fname in file_list_valid])

    #reshape and change formatting
    x_train = photo_np_train.astype('float32') / 255
    x_valid = photo_np_valid.astype('float32') / 255
    
    #correct image shape
    x_train = tf.expand_dims(x_train, axis=-1)
    x_valid = tf.expand_dims(x_valid, axis=-1)

    #separating train/validation labels
    train_labels = np.array(photographic_train_df['No Finding'][:3])
    valid_labels = np.array((photographic_valid_df['No Finding'][:20]))

    #vectorize the labels
    y_train = np.asarray(train_labels).astype('float32').reshape((-1,1))
    y_valid = np.asarray(valid_labels).astype('float32').reshape((-1,1))

    #??
    y_train = tf.expand_dims(y_train, axis=-1)
    y_valid = tf.expand_dims(y_valid, axis=-1)

    #####################################################
    
    #importing model from function
    model = md
    
    #choosing optimizer
    opt = optimizers.Adam(learning_rate=0.10)
    
    #compiling model
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['acc',metrics.Precision()])
    
    #executing model
    history = model.fit(
        x_train,
        y_train,
        batch_size=bs,
        epochs=eps,
        validation_data=(x_valid, y_valid),
        verbose=0
    )
    
    ######################################################
    
    #retrieving results
    history_dict = history.history

    val_acc_values = history_dict['val_acc']
    val_loss_values = history_dict['val_loss']
    
    #######################################################
    
    return{'image_size':size,
           'epochs':eps,
           'batch size':bs,
           'val_acc_mean':sum(val_acc_values)/len(val_acc_values),
           'val_acc_max':max(val_acc_values),
           'val_acc_min':min(val_acc_values),
           'val_loss_mean':sum(val_loss_values)/len(val_loss_values),
           'val_loss_max':max(val_loss_values),
           'val_loss_min':min(val_loss_values),
           'model':m}

## Generating Initial Results

In [7]:
model_performance_df = pd.DataFrame()
x = 1

image_size = [10,100,1000]
epochs_list = [10,50]
batch_list = [round(len(photographic_train_df)/100),round(len(photographic_train_df)/10)]
model_list = ['model_1','model_2','model_3']

for size in image_size:
    for eps in epochs_list:
        for bs in batch_list:
            for m in model_list:
                if m == 'model_1':
                    model_1 = models.Sequential()
                    model_1.add(layers.Conv2D(32, (3, 3), activation='relu',input_shape=(size, size, 1)))
                    model_1.add(layers.MaxPooling2D((2, 2)))
                    model_1.add(layers.Flatten())
                    model_1.add(layers.Dense(1, activation='sigmoid'))
                    print('current settings: '+str(size)+', '+str(eps)+', '+str(bs)+', '+str(m))      
                    model_results = run_model(model_1,size,eps,bs)
                    model_information_df = pd.DataFrame.from_dict([model_results])
                    model_performance_df = model_performance_df.append(model_results,ignore_index=True)   
                    print(str(x)+' out of '+str(len(image_size)*len(epochs_list)*len(batch_list)*len(model_list)) +' models successfully ran.')
                    x+=1
                elif m == 'model_2':
                    model_1 = models.Sequential()
                    model_1.add(layers.Conv2D(32, (3, 3), activation='relu',input_shape=(size, size, 1)))
                    model_1.add(layers.MaxPooling2D((2, 2)))
                    model_1.add(layers.Flatten())
                    model_1.add(layers.Dense(1, activation='sigmoid'))
                    print('current settings: '+str(size)+', '+str(eps)+', '+str(bs)+', '+str(m))      
                    model_results = run_model(model_1,size,eps,bs)
                    model_information_df = pd.DataFrame.from_dict([model_results])
                    model_performance_df = model_performance_df.append(model_results,ignore_index=True)   
                    print(str(x)+' out of '+str(len(image_size)*len(epochs_list)*len(batch_list)*len(model_list)) +' models successfully ran.')
                    x+=1
                elif m == 'model_3':
                    model_1 = models.Sequential()
                    model_1.add(layers.Conv2D(32, (3, 3), activation='relu',input_shape=(size, size, 1)))
                    model_1.add(layers.MaxPooling2D((2, 2)))
                    model_1.add(layers.Flatten())
                    model_1.add(layers.Dense(1, activation='sigmoid'))
                    print('current settings: '+str(size)+', '+str(eps)+', '+str(bs)+', '+str(m))      
                    model_results = run_model(model_1,size,eps,bs)
                    model_information_df = pd.DataFrame.from_dict([model_results])
                    model_performance_df = model_performance_df.append(model_results,ignore_index=True)   
                    print(str(x)+' out of '+str(len(image_size)*len(epochs_list)*len(batch_list)*len(model_list)) +' models successfully ran.')
                    x+=1
                else:
                    print('error')

current settings: 10, 10, 105, model_1
1 out of 36 models successfully ran.
current settings: 10, 10, 105, model_2
2 out of 36 models successfully ran.
current settings: 10, 10, 105, model_3
3 out of 36 models successfully ran.
current settings: 10, 10, 1051, model_1
4 out of 36 models successfully ran.
current settings: 10, 10, 1051, model_2
5 out of 36 models successfully ran.
current settings: 10, 10, 1051, model_3
6 out of 36 models successfully ran.
current settings: 10, 50, 105, model_1
7 out of 36 models successfully ran.
current settings: 10, 50, 105, model_2
8 out of 36 models successfully ran.
current settings: 10, 50, 105, model_3
9 out of 36 models successfully ran.
current settings: 10, 50, 1051, model_1
10 out of 36 models successfully ran.
current settings: 10, 50, 1051, model_2
11 out of 36 models successfully ran.
current settings: 10, 50, 1051, model_3
12 out of 36 models successfully ran.
current settings: 100, 10, 105, model_1
13 out of 36 models successfully ran.
c

In [8]:
model_performance_df

Unnamed: 0,batch size,epochs,image_size,model,val_acc_max,val_acc_mean,val_acc_min,val_loss_max,val_loss_mean,val_loss_min
0,105.0,10.0,10.0,model_1,0.8,0.305,0.25,6.982257,1.371465,0.665939
1,105.0,10.0,10.0,model_2,0.75,0.355,0.25,9.454093,1.7679,0.579304
2,105.0,10.0,10.0,model_3,0.75,0.355,0.25,8.9857,1.649786,0.691123
3,1051.0,10.0,10.0,model_1,0.75,0.34,0.25,9.632663,1.655908,0.654926
4,1051.0,10.0,10.0,model_2,0.75,0.335,0.25,7.800041,1.722002,0.661783
5,1051.0,10.0,10.0,model_3,0.75,0.395,0.25,9.768615,1.821219,0.568294
6,105.0,50.0,10.0,model_1,0.75,0.449,0.25,7.299397,2.521547,0.609814
7,105.0,50.0,10.0,model_2,0.75,0.443,0.2,6.03041,3.211858,0.572034
8,105.0,50.0,10.0,model_3,0.75,0.448,0.25,5.400099,2.977589,0.55425
9,1051.0,50.0,10.0,model_1,0.7,0.438,0.2,5.69393,2.390434,0.662179
