# Project 2 | ML | 2021-22

- João Santos, 76912
- João Carvalho, 106310

## Pneumonia Dataset

https://www.kaggle.com/datasets/artyomkolas/3-kinds-of-pneumonia

In [1]:
# Imports
import datetime
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

print(tf.__version__)

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score

from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras import Model, Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import tensorflow_addons as tfa

2022-06-25 15:36:24.388679: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/joao/.mujoco/mujoco210/bin
2022-06-25 15:36:24.388733: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


2.9.1


In [None]:
# Load image paths into a pandas DataFrame
filenames = tf.io.gfile.glob('./Curated X-Ray Dataset/*/*')
print(len(filenames))

# The class label is the name of the folder that directly contains the image.
# The frame is built in a single call: inserting one row at a time with `.loc`
# re-allocates the frame on every insert, which is quadratic in the file count.
# os.path is used instead of splitting on '/' so the label does not depend on
# the path separator.
data = pd.DataFrame({
    'filename': filenames,
    'class': [os.path.basename(os.path.dirname(image_path)) for image_path in filenames],
})

print(data['class'].value_counts(dropna=False))
data

9208


In [None]:
# Shuffle data and make class numerical
# A fixed random_state keeps the row order identical across kernel restarts.
data = shuffle(data, random_state=42).reset_index(drop=True)

# Folder name -> class id. The ids are kept as strings; they are later fed to
# flow_from_dataframe(..., class_mode='categorical') as the label column.
change = {
    'Normal': '0',
    'Pneumonia-Bacterial': '1',
    'Pneumonia-Viral': '2',
    'COVID-19': '3',
}

# Series.map turns every value missing from `change` into NaN. Fail loudly
# instead of letting NaN labels reach the split and the generators; this also
# catches an accidental second run of this cell (labels already mapped).
mapped_class = data['class'].map(change)
unknown_classes = data.loc[mapped_class.isna(), 'class'].unique().tolist()
if unknown_classes:
    raise ValueError(f"Class labels without a mapping in `change`: {unknown_classes}")

data['class'] = mapped_class
data

In [None]:
# Split data into train and validation
# Both splits are stratified on the class column and use a fixed seed so the
# held-out sets are the same images on every run (otherwise a checkpoint from
# one run could be evaluated on images it was trained on in another).
SPLIT_SEED = 42

# 10% of all images go to validation.
train_data, val_data = train_test_split(data, test_size=0.1, stratify=data['class'],
                                        random_state=SPLIT_SEED)
print(train_data['class'].value_counts(dropna=False))
print(val_data['class'].value_counts(dropna=False))

# Split train into train and test
# 10% of the remaining 90% (i.e. ~9% of all images) go to test.
train_data, test_data = train_test_split(train_data, test_size=0.1, stratify=train_data['class'],
                                         random_state=SPLIT_SEED)
print(train_data['class'].value_counts(dropna=False))
print(test_data['class'].value_counts(dropna=False))

In [None]:
# Training configuration
BATCH_SIZE = 2
EPOCHS = 30
IM_SIZE_W = 200 #300
IM_SIZE_H = 200 #400

# tf.data.AUTOTUNE is the current name of the former tf.data.experimental alias.
AUTOTUNE = tf.data.AUTOTUNE

# Seed TensorFlow's global RNG so weight initialisation and dropout masks are
# repeatable between runs.
tf.random.set_seed(10)

In [None]:
# Define ImageDataGenerator
# Training images are rescaled to [0, 1] and lightly augmented.
datagen = ImageDataGenerator(rescale = 1./255,
                            zoom_range=0.1, # 0.05
                            brightness_range=[0.9, 1.0],
                            height_shift_range=0.05, 
                            width_shift_range=0.05,
                            rotation_range=10, 
                            )

# Validation and test images are only rescaled, never augmented.
test_datagen = ImageDataGenerator(rescale = 1./255)


def _flow_from(generator, frame, shuffle_rows):
    """Return a batched grayscale image iterator over the files listed in `frame`.

    Args:
        generator: ImageDataGenerator that supplies the preprocessing.
        frame: DataFrame with a 'filename' column (image path) and a 'class'
            column (label).
        shuffle_rows: Whether the iterator shuffles the rows. Must be False for
            any set whose predictions are later compared with labels by position.

    Returns:
        The iterator produced by `generator.flow_from_dataframe`.
    """
    # `num_parallel_calls` was dropped: it is a tf.data option, not a
    # flow_from_dataframe one, and was silently swallowed by **kwargs.
    #
    # NOTE(review): Keras documents target_size as (height, width). IM_SIZE_W is
    # passed first here, matching the model's Input shape; harmless while both
    # are 200, but swap both places together before using a non-square size.
    return generator.flow_from_dataframe(frame,
                                         x_col="filename",
                                         y_col="class",
                                         target_size=(IM_SIZE_W, IM_SIZE_H),
                                         color_mode='grayscale',
                                         batch_size=BATCH_SIZE,
                                         class_mode='categorical',
                                         shuffle=shuffle_rows)


train_gen = _flow_from(datagen, train_data, shuffle_rows=True)
val_gen = _flow_from(test_datagen, val_data, shuffle_rows=False)
test_gen = _flow_from(test_datagen, test_data, shuffle_rows=False)

In [None]:
# Define CNN model
def create_model(input_shape=None, num_classes=4):
    """Build and compile the CNN image classifier.

    The network is seven Conv2D(3x3, relu, same) -> MaxPool2D(2) -> Dropout(0.1)
    blocks, global average pooling, a two-layer dense head and a softmax output.

    Args:
        input_shape: Shape of one input image without the batch dimension.
            Defaults to (IM_SIZE_W, IM_SIZE_H, 1), read when the function is called.
        num_classes: Number of output classes (softmax units). Defaults to 4.

    Returns:
        A Keras Model compiled with Adam, categorical cross-entropy and a
        macro-averaged F1 metric named 'f1_macro'.
    """
    if input_shape is None:
        input_shape = (IM_SIZE_W, IM_SIZE_H, 1)

    # Model input
    input_layer = layers.Input(shape=input_shape, name='input')

    # Convolutional blocks. Layer names are numbered from 1 (conv2d_1 ... conv2d_7).
    # Each block halves the spatial size, so a 200 px side shrinks
    # 200 -> 100 -> 50 -> 25 -> 12 -> 6 -> 3 -> 1.
    block_filters = (128, 128, 128, 256, 256, 512, 512)
    x = input_layer
    for block, filters in enumerate(block_filters, start=1):
        x = layers.Conv2D(filters=filters, kernel_size=3,
                          activation='relu', padding='same',
                          name=f'conv2d_{block}')(x)
        x = layers.MaxPool2D(pool_size=2, name=f'maxpool2d_{block}')(x)
        x = layers.Dropout(0.1, name=f'dropout_{block}')(x)

    # GlobalAveragePooling
    # Its output is already 2-D (batch, channels), so no Flatten layer is needed.
    x = layers.GlobalAveragePooling2D(name='global_average_pooling2d')(x)

    # Head
    x = layers.Dense(1024, activation='relu')(x)
    x = layers.Dropout(0.1, name='dropout_head_2')(x)
    x = layers.Dense(128, activation='relu')(x)

    # Output
    output = layers.Dense(units=num_classes,
                          activation='softmax',
                          name='output')(x)

    model = Model(input_layer, output)

    F_1_macro = tfa.metrics.f_scores.F1Score(num_classes=num_classes, average="macro", name='f1_macro')

    # `metrics` is documented as a list, so the single metric is wrapped in one.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=[F_1_macro])

    return model

model = create_model()
model.summary()

In [None]:
# Training
init_time = datetime.datetime.now()

# Number of whole batches per epoch (floor division ignores a trailing partial batch).
train_steps = train_gen.samples // BATCH_SIZE
valid_steps = val_gen.samples // BATCH_SIZE

# Stop after 8 epochs without a val_loss improvement. restore_best_weights=True
# rolls the model back to its best epoch when that happens; without it the
# weights that get evaluated and saved afterwards are those of the last
# (non-best) epoch.
early_stopping = EarlyStopping(monitor="val_loss", patience=8, mode="min",
                               restore_best_weights=True)
# Writes a new weights file, named after its val_loss, each time val_loss improves.
checkpoint = ModelCheckpoint("loss-{val_loss:.4f}.h5", monitor="val_loss", verbose=0, 
                             save_best_only=True, save_weights_only=True, mode="min")
# Multiply the learning rate by 0.1 after 4 stagnant epochs, never going below 1e-7.
learning_rate_reduction = ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=4, 
                                            min_lr=1e-7, verbose=1, mode="min")

# `batch_size` is intentionally not passed: the generators already yield
# batches of BATCH_SIZE, and Keras documents that it must not be specified
# for generator / Sequence inputs.
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=EPOCHS,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    callbacks=[
                checkpoint, 
                early_stopping, 
                learning_rate_reduction],
    verbose=1,
    )

required_time = datetime.datetime.now() - init_time
print(f'\nRequired time:  {str(required_time)}\n')

In [None]:
# Training curves: training loss vs. validation loss as recorded by model.fit
history_df = pd.DataFrame(history.history)
loss_curves = history_df[['loss', 'val_loss']]
loss_curves.plot()

best_val_loss = history_df['val_loss'].min()
print(f"Minimum Validation Loss: {best_val_loss:0.4f}")

In [None]:
# Number of whole test batches; later cells reuse it to align labels with predictions.
test_steps = test_gen.samples // BATCH_SIZE

# model.evaluate returns the loss followed by the compiled metrics. The model is
# compiled with a single metric named 'f1_macro', so the second value is the
# macro-averaged F1 score, not accuracy.
test_loss, test_f1 = model.evaluate(test_gen, steps=test_steps)
print('\nf1_macro:', test_f1, 'loss: ',test_loss)

In [None]:
# Per-class scores for every evaluated test image, then the highest-scoring class
predict = model.predict(test_gen, steps=test_steps)
y_hat = predict.argmax(axis=1)
y_hat[:20]

In [None]:
# Ground-truth labels for the test images, as integers.
# test_gen was created from test_data with shuffle=False, so row order here is
# the order in which predictions were produced.
# Only the rows actually covered by `test_steps` whole batches are kept, so
# y_test has the same length as y_hat.
n_evaluated = test_steps * BATCH_SIZE

# The class column holds the ids as strings ('0'..'3'); cast them directly
# instead of going through a second mapping dict that would overwrite the
# folder-name -> id mapping `change` defined earlier.
test_labels_df = test_data[['class']].astype({'class': 'int64'}).iloc[:n_evaluated]

y_test = test_labels_df['class'].to_numpy()
y_test[:20]

## Classification Report

In [None]:
# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_hat), '\n')

# confusion_matrix puts the true class on the rows and the predicted class on
# the columns; label the axes so the figure can be read on its own.
cm = confusion_matrix(y_test, y_hat)
ax = sns.heatmap(cm, annot=True, cmap="Blues", fmt='.0f', cbar=False)
ax.set(xlabel='Predicted class', ylabel='True class', title='Confusion matrix (test set)');

In [None]:
# Save the trained model under ./model/
path = './model'

# exist_ok=True makes the call a no-op when the folder is already there,
# replacing the separate exists-check followed by mkdir.
os.makedirs(path, exist_ok=True)

model.save(path+'/')