# Identification and Classification of Viral Pneumonia by Image-Based Deep Learning
Hamza Khokhar
Final Project
Professor Biwas

## Section 1: Preparing the data

In [1]:
# imports 
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator
import glob
import cv2
import matplotlib.pyplot as plt
from keras.applications.densenet import DenseNet121




import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Flatten, BatchNormalization, Activation, GlobalAveragePooling2D, MaxPool2D
from keras.layers import Conv2D, SeparableConv2D, MaxPool2D, LeakyReLU, Activation
from keras.constraints import maxnorm
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping



from sklearn.metrics import accuracy_score
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [2]:
# getting all the paths from corresponding images
paths_train_PNEUMONIA = glob.glob("chest_xray/train/PNEUMONIA/*.jpeg")
paths_train_NORMAL = glob.glob("chest_xray/train/NORMAL/*.jpeg")
paths_test_PNEUMONIA = glob.glob("chest_xray/test/PNEUMONIA/*.jpeg")
paths_test_NORMAL = glob.glob("chest_xray/test/NORMAL/*.jpeg")


# Total images for each folder
print('Train PNEUMONIA: '+str(len(paths_train_PNEUMONIA)))
print('Train NORMAL: '+str(len(paths_train_NORMAL)))
print('Test PNEUMONIA: '+str(len(paths_test_PNEUMONIA)))
print('Test PNEUMONIA: '+str(len(paths_test_NORMAL)))

# Total images for each Directory
print('Total Number of Train samples: '+ str(len(paths_train_PNEUMONIA)+len(paths_train_NORMAL)))
print('Total Number of Test samples: '+ str(len(paths_test_PNEUMONIA)+len(paths_test_NORMAL)))


Train PNEUMONIA: 3875
Train NORMAL: 1341
Test PNEUMONIA: 398
Test PNEUMONIA: 242
Total Number of Train samples: 5216
Total Number of Test samples: 640


In [3]:
# creating a function to return a dataframe containing paths and labels for corresponding images  
def create_dataframe(path_names,label_name):
    paths = []
    labels = []

    for path in path_names:
        paths.append(path)
        labels.append(label_name)
    df = pd.DataFrame()
    df['paths'] = paths
    df['labels'] = labels
    return df

    

In [4]:
# Calling the create dataframe function twice in the pandas concat function to get our three main data frames for training and testing purposes
train_df = pd.concat([
    create_dataframe(paths_train_PNEUMONIA,'1'), 
    create_dataframe(paths_train_NORMAL,'0')
],ignore_index=True)

test_df = pd.concat([
    create_dataframe(paths_test_PNEUMONIA,'1'), 
    create_dataframe(paths_test_NORMAL,'0')
],ignore_index=True)







In [5]:
#Checking the size and shape of the dataframes
train_df



Unnamed: 0,paths,labels
0,chest_xray/train/PNEUMONIA/person63_bacteria_3...,1
1,chest_xray/train/PNEUMONIA/person1438_bacteria...,1
2,chest_xray/train/PNEUMONIA/person755_bacteria_...,1
3,chest_xray/train/PNEUMONIA/person478_virus_975...,1
4,chest_xray/train/PNEUMONIA/person661_bacteria_...,1
...,...,...
5211,chest_xray/train/NORMAL/IM-0183-0001.jpeg,0
5212,chest_xray/train/NORMAL/IM-0460-0001.jpeg,0
5213,chest_xray/train/NORMAL/NORMAL2-IM-1011-0001.jpeg,0
5214,chest_xray/train/NORMAL/NORMAL2-IM-0826-0001.jpeg,0


In [6]:
test_df

Unnamed: 0,paths,labels
0,chest_xray/test/PNEUMONIA/person147_bacteria_7...,1
1,chest_xray/test/PNEUMONIA/person100_bacteria_4...,1
2,chest_xray/test/PNEUMONIA/person78_bacteria_38...,1
3,chest_xray/test/PNEUMONIA/person124_bacteria_5...,1
4,chest_xray/test/PNEUMONIA/person1647_virus_284...,1
...,...,...
635,chest_xray/test/NORMAL/NORMAL2-IM-0309-0001.jpeg,0
636,chest_xray/test/NORMAL/NORMAL2-IM-0246-0001-00...,0
637,chest_xray/test/NORMAL/NORMAL2-IM-0292-0001.jpeg,0
638,chest_xray/test/NORMAL/NORMAL2-IM-0221-0001.jpeg,0


In [18]:

train_datagen=ImageDataGenerator(rescale=1./255,  
        vertical_flip=True,
        featurewise_center = False,
        samplewise_center = False,
        featurewise_std_normalization = False, 
        samplewise_std_normalization = False,
        zca_whitening = False,
        horizontal_flip = False,
        #vertical_flip = False,
        rotation_range = 10,  
        zoom_range = 0.1, 
        width_shift_range = 0.1, 
        height_shift_range = 0.1)
val_datagen=ImageDataGenerator(rescale = 1./255, 
    validation_split = 0.1)




train_generator = train_datagen.flow_from_dataframe(dataframe= train_df,
                                                  x_col= 'paths',
                                                  y_col= 'labels',
                                                  target_size=(200, 200),
                                                  class_mode='binary',
                                                  batch_size=32,
                                                  shuffle=True,
                                                  subset = 'training'
                                                  )
 

    
validation_generator = val_datagen.flow_from_dataframe(dataframe= train_df,
                                                  x_col= 'paths',
                                                  y_col= 'labels',
                                                  target_size=(200,200),
                                                  class_mode= 'binary',
                                                  batch_size= 32,
                                                  shuffle=True,
                                                  subset = 'validation'
                                                  )


Found 5216 validated image filenames belonging to 2 classes.
Found 521 validated image filenames belonging to 2 classes.


In [19]:
test_images = []
test_labels = []

for path in tqdm(paths_test_PNEUMONIA): 
    image= cv2.imread(path)
    image=cv2.resize(image, (200, 200))
    image=np.array(image)
    image = image.astype('float32')
    image /= 255 
    test_images.append(image)
    test_labels.append(1)
    
for path in tqdm(paths_test_NORMAL): 
    image= cv2.imread(path)
    image=cv2.resize(image, (200, 200))
    image=np.array(image)
    image = image.astype('float32')
    image /= 255 
    test_images.append(image)
    test_labels.append(0)
    
test_images=np.array(test_images, np.float32)
test_labels=np.array(test_labels)

100%|██████████| 398/398 [00:02<00:00, 189.98it/s]
100%|██████████| 242/242 [00:03<00:00, 64.22it/s]


In [20]:
print(test_images.shape)
print(test_labels.shape)

(640, 200, 200, 3)
(640,)


In [26]:
callbacks = [ 
    EarlyStopping(monitor = 'loss', patience = 7), 
    ReduceLROnPlateau(monitor = 'loss', patience = 4), 
    ModelCheckpoint('./model.best4.hdf5', monitor='loss' , save_best_only=True) # saving the best model
]


## Section 2: Visulaizing the data

In [33]:
# model = Sequential()
# model.add(Conv2D(32,(3,3),input_shape = (300,300,3)))
# model.add(Activation("relu"))
# model.add(MaxPooling2D())

# model.add(Conv2D(32,(3,3)))
# model.add(Activation("relu"))
# model.add(MaxPooling2D())

# model.add(Conv2D(64,(3,3)))
# model.add(Activation("relu"))
# model.add(MaxPooling2D())

# model.add(Flatten())
# model.add(Dense(1024))
# model.add(Activation("relu"))
# model.add(Dropout(0.4))
# model.add(Dense(1)) # output
# model.add(Activation("sigmoid"))

# model.compile(loss = "binary_crossentropy",
#               optimizer = "rmsprop",
#               metrics = ["accuracy"])



model = Sequential([
        Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same', input_shape = (200,200,3)),
        Conv2D(16, kernel_size=(3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.2),
        
        Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same'),
        Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.2),
        
        Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'),
        Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.2),
        
        Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same'),
        Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.2),
        
        Conv2D(256, kernel_size=(3, 3), activation='relu', padding='same'),
        Conv2D(256, kernel_size=(3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.2),
        
        Flatten(),
       
        Dense(1024, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        
        Dense(1, activation = "softmax")
        
    ])

In [34]:
model.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])


In [None]:
hist = model.fit_generator(
           train_generator, steps_per_epoch=train_generator.samples // 32, 
           epochs=100, validation_data=validation_generator, 
           validation_steps=validation_generator.samples // 32,callbacks = callbacks)



# learning_history = model.fit_generator((train_gen), 
#                                epochs = 100, 
#                                steps_per_epoch = X_train.shape[0] // 32,
#                                validation_data = (X_test, y_test),
#                                callbacks = callbacks,
#                         )



Epoch 1/100
Epoch 2/100
Epoch 3/100

In [1]:
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
ax = ax.ravel()

for i, met in enumerate(['accuracy', 'loss']):
    ax[i].plot(hist.history[met])
    ax[i].plot(hist.history['val_' + met])
    ax[i].set_title('Model {}'.format(met))
    ax[i].set_xlabel('epochs')
    ax[i].set_ylabel(met)
    ax[i].legend(['train', 'val'])

NameError: name 'plt' is not defined

In [20]:

from sklearn.metrics import accuracy_score, confusion_matrix

preds = model.predict(test_images)

acc = accuracy_score(test_labels, np.round(preds))*100
cm = confusion_matrix(test_labels, np.round(preds))
tn, fp, fn, tp = cm.ravel()

print('CONFUSION MATRIX ------------------')
print(cm)

print('\nTEST METRICS ----------------------')
precision = tp/(tp+fp)*100
recall = tp/(tp+fn)*100
print('Accuracy: {}%'.format(acc))
print('Precision: {}%'.format(precision))
print('Recall: {}%'.format(recall))
print('F1-score: {}'.format(2*precision*recall/(precision+recall)))

print('\nTRAIN METRIC ----------------------')
print('Train acc: {}'.format(np.round((hist.history['accuracy'][-1])*100, 2)))

CONFUSION MATRIX ------------------
[[103 139]
 [  1 397]]

TEST METRICS ----------------------
Accuracy: 78.125%
Precision: 74.06716417910447%
Recall: 99.74874371859298%
F1-score: 85.01070663811562

TRAIN METRIC ----------------------
Train acc: 86.79
