In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import cv2
import glob

#step 1: reading input data, creating categorical output labels (softmax classification task)
#step 2: image pre-processing (standardization), visualization
#step 2(a): data statistics (how many normal, bact_pneumonia, vir_pneumonia)
#step 3: data augmentation (increase size of data)
#step 4: load pre-trained model VGG-16 for transfer learning
#step 5: construct model, train the classification layer (evaluation metric should be a confusion matrix)
#step 6: test 

# Dataset directories, laid out as <root>/<split>/<class> with the splits
# train / test / val and the classes NORMAL / PNEUMONIA.
_xray_root = '../input/chest-xray-pneumonia/chest_xray'

train_normal = f'{_xray_root}/train/NORMAL'
train_pneumonia = f'{_xray_root}/train/PNEUMONIA'

test_normal = f'{_xray_root}/test/NORMAL'
test_pneumonia = f'{_xray_root}/test/PNEUMONIA'

validate_normal = f'{_xray_root}/val/NORMAL'
validate_pneumonia = f'{_xray_root}/val/PNEUMONIA'


In [None]:
#get image dimension
sample_path = '../input/chest-xray-pneumonia/chest_xray/train/NORMAL/IM-0115-0001.jpeg'
img_sample = cv2.imread(sample_path)
# cv2.imread reports a missing/unreadable file by returning None instead of
# raising, so fail here with a clear message rather than on `.shape` below.
if img_sample is None:
    raise FileNotFoundError('Could not read sample image: ' + sample_path)
dim = img_sample.shape
print('Image dimension: ', dim)
plt.subplot(1, 2, 1)
# OpenCV decodes to BGR channel order while matplotlib expects RGB.
plt.imshow(cv2.cvtColor(img_sample, cv2.COLOR_BGR2RGB))
plt.title('Normal')

#visualize standardization
img_stand_sample = cv2.resize(img_sample, (224,224))
img_stand_sample = tf.image.per_image_standardization(img_stand_sample)
new_dim = img_stand_sample.shape
print('Resized Image Dimension: ', new_dim)
plt.subplot(1, 2, 2)
# The standardized image is zero-mean / unit-variance floats. imshow clips
# float RGB data to [0, 1], which would hide everything below the mean, so
# min-max rescale a copy purely for display (img_stand_sample is untouched).
stand_pixels = np.asarray(img_stand_sample)[..., ::-1]  # BGR -> RGB for display
pixel_min, pixel_max = stand_pixels.min(), stand_pixels.max()
if pixel_max > pixel_min:
    stand_display = (stand_pixels - pixel_min) / (pixel_max - pixel_min)
else:
    # constant image: nothing to rescale
    stand_display = np.zeros_like(stand_pixels)
plt.imshow(stand_display)
plt.title('Normal, Standardized')

In [None]:
#loading data into pandas dataframe
def _labelled_frame(directory, pattern, label):
    """Build a DataFrame of the files in `directory` matching `pattern`.

    Every row gets the same `label`. Paths are sorted because glob returns
    matches in arbitrary order, which would otherwise make any later seeded
    shuffle or split differ between machines/runs.
    """
    paths = sorted(glob.glob(directory + '/' + pattern))
    return pd.DataFrame({'path': paths, 'label': label})


def _load_split(normal_dir, pneumonia_dir):
    """Return (normal, bacterial, viral) DataFrames for one dataset split.

    The pneumonia sub-type is taken from the file name: 'person*_b*.jpeg'
    is labelled bacterial and 'person*_v*.jpeg' is labelled viral.
    """
    return (
        _labelled_frame(normal_dir, '*.jpeg', 'normal'),
        _labelled_frame(pneumonia_dir, 'person*_b*.jpeg', 'bacterial pneumonia'),
        _labelled_frame(pneumonia_dir, 'person*_v*.jpeg', 'viral pneumonia'),
    )


df_train_normal, df_train_bact_pneu, df_train_vir_pneu = _load_split(train_normal, train_pneumonia)

df_test_normal, df_test_bact_pneu, df_test_vir_pneu = _load_split(test_normal, test_pneumonia)

df_val_normal, df_val_bact_pneu, df_val_vir_pneu = _load_split(validate_normal, validate_pneumonia)


In [None]:
#data visualization
# 3x3 grid: one row per class, up to three training samples per row.
plt.figure(figsize=(15, 15))

samples_per_class = 3
class_rows = [
    (df_train_normal, "Normal"),
    (df_train_bact_pneu, "Bacterial"),
    (df_train_vir_pneu, "Viral"),
]

for row, (frame, title) in enumerate(class_rows):
    # head() is positional, so it does not depend on the frame's index labels
    # and simply yields fewer images if a class has fewer than three rows.
    for col, path in enumerate(frame['path'].head(samples_per_class)):
        img = cv2.imread(path)
        img = cv2.resize(img, (224, 224))
        # OpenCV decodes to BGR channel order while matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(len(class_rows), samples_per_class, row * samples_per_class + col + 1)
        plt.imshow(img)
        plt.title(title)

In [None]:
#data statistics
# Bar chart of the number of images per (split, class) combination.
frames_by_category = {
    'train_normal': df_train_normal,
    'train_bacteria': df_train_bact_pneu,
    'train_virus': df_train_vir_pneu,
    'test_normal': df_test_normal,
    'test_bacteria': df_test_bact_pneu,
    'test_virus': df_test_vir_pneu,
    'val_normal': df_val_normal,
    'val_bacteria': df_val_bact_pneu,
    'val_virus': df_val_vir_pneu,
}
x_cat = list(frames_by_category.keys())
y_cat = [len(frame) for frame in frames_by_category.values()]

plt.figure(figsize=(12,12))
plt.bar(x_cat, y_cat)
plt.xlabel('Data Category')
plt.ylabel('Number of Images')
plt.title('Data Distribution')

In [None]:
#merge dataset and shuffle dataframe
#note: the provided validation set is too small and non-uniform in distribution,
#so it is merged into the training set and a new validation split is carved
#out of the merged data below
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffles and the train/validation split are identical on
# every Restart & Run All.
SPLIT_SEED = 42

df_train = pd.concat([df_train_normal, df_train_bact_pneu, df_train_vir_pneu, df_val_normal, df_val_bact_pneu, df_val_vir_pneu])
df_test = pd.concat([df_test_normal, df_test_bact_pneu, df_test_vir_pneu])

df_train = df_train.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Hold out 5% for validation. Stratify on the label so the small validation
# split keeps the same class proportions as the training data and is
# guaranteed to contain every class.
df_train, df_val = train_test_split(
    df_train,
    test_size=0.05,
    random_state=SPLIT_SEED,
    stratify=df_train['label'],
)

In [None]:
# Preview the first five rows of the training split.
df_train.head(n=5)


In [None]:
# Preview the first five rows of the test set.
df_test.head(n=5)

In [None]:
# Preview the first five rows of the validation split.
df_val.head(n=5)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# All three splits use the same per-sample centering and std normalization;
# each split still gets its own generator object.
_standardize_opts = {
    'samplewise_center': True,
    'samplewise_std_normalization': True,
}

train_datagen = ImageDataGenerator(**_standardize_opts)
val_datagen = ImageDataGenerator(**_standardize_opts)
test_datagen = ImageDataGenerator(**_standardize_opts)


In [None]:
# Explicit class list (alphabetical, i.e. the same order Keras infers by
# default). Passing it to every generator guarantees that train, validation
# and test all share one label -> index mapping, even if some split happened
# to be missing a class.
class_names = ['bacterial pneumonia', 'normal', 'viral pneumonia']

# Options shared by all three generators.
flow_options = dict(
    x_col = "path",
    y_col = "label",
    target_size = (150, 150),
    class_mode = "categorical",
    classes = class_names,
    batch_size = 32,
)

# Training batches are reshuffled every epoch.
train_generator = train_datagen.flow_from_dataframe(dataframe = df_train,
                                                    shuffle = True,
                                                    **flow_options)

# Validation/test batches keep dataframe order so that predictions line up
# with generator.classes (needed for e.g. a confusion matrix); the order does
# not affect loss/accuracy.
validation_generator = val_datagen.flow_from_dataframe(dataframe = df_val,
                                                       shuffle = False,
                                                       **flow_options)

test_generator = test_datagen.flow_from_dataframe(dataframe = df_test,
                                                  shuffle = False,
                                                  **flow_options)

In [None]:
# Pre-trained VGG16 convolutional base (ImageNet weights, no classifier head).
base_model = tf.keras.applications.vgg16.VGG16(input_shape= (150, 150, 3),
                                               include_top=False,
                                               weights='imagenet')
# Freeze the pre-trained weights: the plan is to train only the new
# classification layers. Left trainable, the whole VGG16 base would be
# updated from the first step and the ImageNet features overwritten.
base_model.trainable = False
base_model.summary()

# New classification head on top of the frozen base; the final layer has one
# softmax unit per class (normal / bacterial pneumonia / viral pneumonia).
model = tf.keras.Sequential([
    base_model,
    tf.keras.layers.Flatten(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(512, activation = 'relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(3, activation = 'softmax'),
])

model.summary()

In [None]:
# Stop when validation loss has not improved by at least 0.01 for 20
# consecutive epochs, then roll back to the best weights seen.
# 'val_loss' is the Keras default monitor; it is spelled out for clarity.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                                  min_delta = 0.01,
                                                  patience = 20,
                                                  restore_best_weights = True)

# `metrics` must be a list (or tuple/dict): a bare string is rejected by
# newer Keras versions.
model.compile(optimizer = 'adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])


In [None]:
# Fit on the training generator for at most 100 epochs, validating on the
# validation generator each epoch; early_stopping may end training sooner.
fit_options = {
    'x': train_generator,
    'validation_data': validation_generator,
    'epochs': 100,
    'callbacks': [early_stopping],
}
history = model.fit(**fit_options)

In [None]:
# `tf` is only a local alias, not an importable package name, so the import
# has to go through the real package `tensorflow`.
from tensorflow.keras.models import load_model

# Round-trip the trained model through disk to check that the saved file can
# be loaded back, then evaluate the reloaded model on the test set.
model_path = 'my_model.h5'
model.save(model_path)
del model


model = load_model(model_path)

model.evaluate(test_generator)
