# Pneumonia Classification on X-ray Images



### Importing Essentials

In [None]:
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from PIL import Image
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense,BatchNormalization, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img
from sklearn.metrics import classification_report, confusion_matrix
import cv2
import warnings
warnings.filterwarnings('ignore')

### Import Datasets

In [None]:
import glob

# Root of the chest X-ray dataset; each split has NORMAL/ and PNEUMONIA/ folders.
main_path = "../input/chest-xray-pneumonia/chest_xray"


def _list_jpegs(split_dir, label):
    """Return the sorted ``*.jpeg`` paths found under ``split_dir/label``.

    ``glob.glob`` yields matches in arbitrary order, so the result is sorted
    to keep the seeded split and shuffles that consume these lists
    reproducible from run to run.
    """
    return sorted(glob.glob(os.path.join(split_dir, label, "*.jpeg")))


train_path = os.path.join(main_path, "train")
test_path = os.path.join(main_path, "test")

train_normal = _list_jpegs(train_path, "NORMAL")
train_pneumonia = _list_jpegs(train_path, "PNEUMONIA")

test_pneumonia = _list_jpegs(test_path, "PNEUMONIA")
test_normal = _list_jpegs(test_path, "NORMAL")

val_path = os.path.join(main_path, 'val')
val_normal = _list_jpegs(val_path, "NORMAL")
val_pneumonia = _list_jpegs(val_path, "PNEUMONIA")

In [None]:
def _labelled_frame(normal_paths, pneumonia_paths):
    """Return ``(paths, frame)`` for one dataset split.

    ``paths`` is the normal paths followed by the pneumonia paths; ``frame``
    is a DataFrame with a ``class`` column ('Normal' / 'Pneumonia') and an
    ``image`` column holding those paths in the same order.
    """
    paths = [*normal_paths, *pneumonia_paths]
    labels = ['Normal'] * len(normal_paths) + ['Pneumonia'] * len(pneumonia_paths)
    # Built from plain lists so both columns keep a string/object dtype even
    # when a split turns out to be empty.
    return paths, pd.DataFrame({'class': labels, 'image': paths})


# train
train_list, df_train = _labelled_frame(train_normal, train_pneumonia)

# test
test_list, df_test = _labelled_frame(test_normal, test_pneumonia)

# validation
val_list, df_val = _labelled_frame(val_normal, val_pneumonia)

In [None]:
# Show the training dataframe ('class' and 'image' columns) as the cell output.
df_train

In [None]:
# Show the test dataframe ('class' and 'image' columns) as the cell output.
df_test

In [None]:
# (rows, columns) of the train, test and validation dataframes, in that order.
df_train.shape, df_test.shape, df_val.shape

# Exploratory Data Analysis

### Training set

In [None]:
import seaborn as sns

# Bar chart of how many training samples fall in each class.
sns.set_style('whitegrid')
plt.figure(figsize=(6,4))
ax = sns.countplot(x='class', data=df_train, palette="mako")

ax.set_xlabel("Class", fontsize=12)
ax.set_ylabel("Number of Samples", fontsize=12)
ax.set_ylim(0, 5000)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Normal', 'Pneumonia'], fontsize=11)

# Print each bar's height slightly above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(bar_height, (bar.get_x() + 0.30, bar_height + 300), fontsize=13)

plt.show()

In [None]:
plt.figure(figsize=(7,5))

# Share of each class in the training set. The wedges follow the order of
# value_counts(), so the legend is read from the same index instead of being
# hard-coded; this keeps labels and wedges in step whatever the counts are.
train_class_counts = df_train['class'].value_counts()
train_class_counts.plot(kind='pie', labels=['', ''], autopct='%1.1f%%',
                        colors=['darkcyan', 'blue'], explode=[0, 0.05],
                        textprops={"fontsize": 15})

plt.legend(labels=list(train_class_counts.index))
plt.show()

#### Testing set

In [None]:
# Bar chart of how many test samples fall in each class.
plt.figure(figsize=(6,4))
ax = sns.countplot(x='class', data=df_test, palette="rocket")

ax.set_xlabel("Class", fontsize=12)
ax.set_ylabel("Number of Samples", fontsize=12)
ax.set_ylim(0, 5000)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Normal', 'Pneumonia'], fontsize=11)

# Print each bar's height slightly above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(bar_height, (bar.get_x() + 0.30, bar_height + 300), fontsize=13)

plt.show()

In [None]:
plt.figure(figsize=(7,5))

# Share of each class in the test set. The wedges follow the order of
# value_counts(), so the legend is read from the same index instead of being
# hard-coded; this keeps labels and wedges in step whatever the counts are.
test_class_counts = df_test['class'].value_counts()
test_class_counts.plot(kind='pie', labels=['', ''], autopct='%1.1f%%',
                       colors=['purple', 'orange'], explode=[0, 0.05],
                       textprops={"fontsize": 15})

plt.legend(labels=list(test_class_counts.index))
plt.show()

#### Validation set

In [None]:
# Bar chart of how many validation samples fall in each class.
plt.figure(figsize=(6,4))

ax = sns.countplot(x='class', data=df_val, palette="cubehelix")

plt.xlabel("Class", fontsize=12)
plt.ylabel("Number of Samples",fontsize=12)
plt.ylim(0,20)
plt.xticks([0,1],['Normal', 'Pneumonia'],fontsize=11)

# Anchor each label at the top of its bar and nudge it up by a few points.
# A data-space offset of +300 would put the anchor far outside the 0-20
# y-range, and matplotlib does not draw annotations whose anchor lies
# outside the axes.
for p in ax.patches:
    ax.annotate(p.get_height(),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                xytext=(0, 5), textcoords='offset points',
                ha='center', fontsize=13)

plt.show()

In [None]:
plt.figure(figsize=(7,5))

# Share of each class in the validation set. The wedges follow the order of
# value_counts(); when the two classes have equal counts that order is not
# necessarily Pneumonia-first, so the legend is read from the index rather
# than hard-coded.
val_class_counts = df_val['class'].value_counts()
val_class_counts.plot(kind='pie', labels=['', ''], autopct='%1.1f%%',
                      colors=['green', 'pink'], explode=[0, 0.05],
                      textprops={"fontsize": 15})

plt.legend(labels=list(val_class_counts.index))
plt.show()

The distributions from these datasets are a little different from each other. Both are slightly imbalanced, having more samples from the positive class (Pneumonia), with the training set being a little more imbalanced.

Before we move on to the next section, we will take a look at a few examples from each dataset.

In [None]:
import cv2

print('Train Set - Normal')

# Preview the first twelve normal training images on a 3x4 grid,
# each resized to 224x224.
for slot in range(1, 13):
    plt.subplot(3, 4, slot)
    picture = cv2.resize(cv2.imread(train_normal[slot - 1]), (224, 224))
    plt.imshow(picture)
    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
print('Train Set - Pneumonia')

# Preview the first twelve pneumonia training images on a 3x4 grid,
# each resized to 224x224.
for slot in range(1, 13):
    plt.subplot(3, 4, slot)
    picture = cv2.resize(cv2.imread(train_pneumonia[slot - 1]), (224, 224))
    plt.imshow(picture)
    plt.axis('off')

plt.tight_layout()
plt.show()

Now, we’re going to load the images from the folders and prepare them to feed our models.

We begin by defining the data generators. With Keras Image Data Generator, we can rescale the pixel values and apply random transformation techniques for data augmentation on the fly. We define two different generators. The val_datagen is used to simply rescale the validation and test sets. The train_datagen includes some transformations to augment the train set.

We apply those generators on each dataset using the flow_from_dataframe method. Apart from the transformations defined in each generator, the images are also resized based on the target_size set.

## Preparing the Data

In [None]:
# Hold out a class-stratified 20% of the training dataframe for validation.
train_df, val_df = train_test_split(
    df_train,
    test_size=0.20,
    stratify=df_train['class'],
    random_state=42,
)

In [None]:
# Show the training portion of the stratified split as the cell output.
train_df

In [None]:
# Show the held-out validation portion of the stratified split as the cell output.
val_df

In [None]:
# Data augmentation on the training images only: rescale pixels to [0, 1] and
# apply small random zooms, rotations and shifts.
# NOTE(review): Keras documents rotation_range in degrees, so 0.1 is a very
# small rotation -- confirm this is the intended amount.
train_datagen = ImageDataGenerator(rescale=1/255.,
                                   zoom_range=0.1,
                                   rotation_range=0.1,
                                   width_shift_range=0.1,
                                   height_shift_range=0.1)
# Validation and test images are only rescaled, never augmented.
val_datagen = ImageDataGenerator(rescale=1/255.)

# Arguments shared by all three generators. The dataframes already hold full
# file paths in 'image', so no `directory` argument is passed.
_flow_args = dict(x_col='image',
                  y_col='class',
                  target_size=(224, 224),
                  class_mode='binary')

# Feed the generators from the stratified split (train_df / val_df) rather
# than the unsplit df_train: otherwise the held-out 20% is also trained on
# and the split is never used.
train_imgGen = train_datagen.flow_from_dataframe(train_df,
                                                 batch_size=32,
                                                 seed=42,
                                                 **_flow_args)

val_imgGen = val_datagen.flow_from_dataframe(val_df,
                                             batch_size=32,
                                             seed=42,
                                             **_flow_args)

# One image per batch and no shuffling, so outputs keep the row order of df_test.
test_imgGen = val_datagen.flow_from_dataframe(df_test,
                                              batch_size=1,
                                              shuffle=False,
                                              **_flow_args)

## Custom CNN
Tuning is an important step here: choosing better hyperparameters helps build a model with good accuracy.

In [None]:
# Pull a single batch (images and their labels) from the training generator.
# These two arrays are what the tuner search and the tuned model's fit call
# are given further down.
img, label = next(train_imgGen)


## Creating model with manual parameters

In [None]:
# Hand-specified CNN: three convolutional stages, then a dense classifier
# head with a two-way softmax output.
model = Sequential([
    # Stage 1: single 64-filter convolution with dropout.
    Conv2D(filters=64, kernel_size=(3,3), input_shape=(224,224,3), activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    MaxPooling2D(pool_size=(2,2), padding='same'),

    # Stage 2: two 64-filter convolutions.
    Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2), padding='same'),

    # Stage 3: two 32-filter convolutions.
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2), padding='same'),

    # Classifier head.
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
model.summary()

In [None]:
# Early stopping: halt training once validation accuracy stops improving, to
# limit overfitting. Both callbacks watch 'val_accuracy', which is the name
# Keras gives the validation value of metrics=['accuracy']; it only exists
# when validation data is supplied to fit().

early_stopping = EarlyStopping(
            monitor='val_accuracy',
            min_delta=0.001,
            restore_best_weights=True,
            patience=3)

# Halve the learning rate when validation accuracy plateaus.
# - patience=2 lets the reduction fire before early stopping (patience=3);
#   the default patience of 10 could never elapse in a 10-epoch run.
# - min_lr sits well below Adam's default 0.001 starting rate; a floor equal
#   to the starting rate would block every reduction.
lr_stopping = ReduceLROnPlateau(
            monitor='val_accuracy',
            factor=0.5,
            patience=2,
            min_lr=1e-6)

In [None]:
# Train the manual CNN. Validation data is passed so that validation metrics
# are computed each epoch; the early-stopping and LR-reduction callbacks
# monitor a validation metric and have nothing to act on without it.
# `model1` holds the History object returned by fit().
model1 = model.fit(train_imgGen,
                   epochs=10,
                   validation_data=val_imgGen,
                   callbacks=[early_stopping, lr_stopping])

## Manual model performance evaluation

In [None]:
# Training loss and accuracy per epoch, side by side in ONE figure. Opening a
# second figure for the accuracy panel left each curve alone in a mostly
# empty 2x2 grid.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(model1.history['loss'], label='Loss')
loss_ax.legend()
loss_ax.set_title('Loss Evaluation')

acc_ax.plot(model1.history['accuracy'], label='Accuracy')
acc_ax.legend()
acc_ax.set_title('Accuracy Evaluation')

plt.tight_layout()
plt.show()

- From the graphs above, the manual CNN model performed quite well, but let's rebuild it with `keras-tuner` to look for better results.

In [None]:
# Custom CNN model for tuning
def build_model(hp):
    """Build and compile a three-block CNN whose sizes are drawn from ``hp``.

    Args:
        hp: Hyperparameter container exposing ``Int`` and ``Choice`` (the
            object keras-tuner hands to a model-building function).

    Returns:
        A compiled ``Sequential`` model that takes 224x224x3 inputs and emits
        two softmax class probabilities.
    """
    model = Sequential([
        # Block one
        Conv2D(
            filters=hp.Int('conv_1_filter', min_value=32, max_value=256, step=16),
            kernel_size=hp.Choice('conv_1_kernel', values=[3, 5]),
            activation='relu',
            input_shape=(224, 224, 3)
        ),
        BatchNormalization(),
        MaxPooling2D(),
        Dropout(0.2),
        # Block two
        Conv2D(
            filters=hp.Int('conv_2_filter', min_value=32, max_value=128, step=16),
            kernel_size=hp.Choice('conv_2_kernel', values=[3, 5]),
            activation='relu'
        ),
        BatchNormalization(),
        MaxPooling2D(),
        Dropout(0.2),
        # Block three
        Conv2D(
            filters=hp.Int('conv_3_filter', min_value=32, max_value=128, step=16),
            kernel_size=hp.Choice('conv_3_kernel', values=[3, 5]),
            activation='relu'
        ),
        BatchNormalization(),
        MaxPooling2D(),
        Dropout(0.2),
        Flatten(),
        Dense(
            units=hp.Int('dense_1_units', min_value=32, max_value=128, step=16),
            activation='relu'
        ),
        Dense(
            units=hp.Int('dense_2_units', min_value=32, max_value=128, step=16),
            activation='relu'
        ),
        # Two output units, matching the manual model: softmax over a single
        # unit always yields 1.0, and sparse categorical cross-entropy needs
        # one output per class for the integer labels 0 and 1.
        Dense(2, activation='softmax')
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            hp.Choice('learning_rate', values=[0.001, 0.01, 0.1])),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    return model

In [None]:
# tunning the model
from keras_tuner import RandomSearch
from keras_tuner.engine.hyperparameters import HyperParameters

In [None]:
# Random search over the build_model hyperparameter space (5 trials).
# Trials are ranked on validation accuracy, not training accuracy: the search
# call below holds out a validation split, and ranking on training accuracy
# would favour configurations that merely memorise the training samples.
tuner_search = RandomSearch(build_model,
                            objective='val_accuracy',
                            max_trials=5,
                            directory='output',
                            project_name='pneumonia_detection')

In [None]:
# label = np.asarray(label).astype('float32').reshape((-1,1))
# img = np.asarray(img).astype('float32').reshape((-1,1))

In [None]:
# Run the hyperparameter search on the `img` / `label` arrays (the single
# batch drawn from the training generator earlier), 5 epochs per trial, with
# 10% of those samples held out for validation.
tuner_search.search(img,label,epochs=5,validation_split=0.1)

In [None]:
# Keep the top-ranked model from the search.
model2 = tuner_search.get_best_models(num_models=1)[0]

In [None]:
# Print the layer-by-layer summary of the selected model.
model2.summary()

In [None]:
# Continue training the selected model on the same `img` / `label` arrays.
# NOTE(review): this rebinds `model2` to the value returned by fit(), so the
# name no longer refers to the model itself; the plotting cell below indexes
# `model2.history[...]` and depends on that rebinding.
model2 = model2.fit(img, label, epochs=10, validation_split=0.1)

## CNN after tuning.

In [None]:
# Validation loss and accuracy per epoch for the tuned model, side by side in
# ONE figure. Opening a second figure for the accuracy panel left each curve
# alone in a mostly empty 2x2 grid.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(model2.history['val_loss'], label='Val_Loss')
loss_ax.legend()
loss_ax.set_title('Loss Evaluation')

acc_ax.plot(model2.history['val_accuracy'], label='Val_Accuracy')
acc_ax.legend()
acc_ax.set_title('Accuracy Evaluation')

plt.tight_layout()
plt.show()