![Chest-Xray](https://www.lunit.io/img/products/cxr-s5-2.jpg)

## Problem Statement
 
### Pneumonia Classification in Chest X-Rays (CXRs) is organized by Segmind.   


As you maybe aware, 
<span style="background-color: #9FE2BF">
"Pneumonia killed more than 808,000 children under the age of 5 in 2017, accounting for 15% of all deaths of children under 5 years. People at-risk for pneumonia also include adults over the age of 65 and people with preexisting health problems." — WHO
</span>

While prevalent, diagnosing pneumonia in a CXR accurately is difficult. Expert radiologists are required to review the CXR and also require confirmation through clinical examinations. You are **tasked to classify CXRs with pneumonia from their normal CXR counterparts, using machine learning and computer vision techniques.**

In [1]:
# import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras 
import tensorflow as tf
import cv2
from keras.applications import DenseNet121
from keras.applications.densenet import preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
import h5py
import kerastuner as kt
from kerastuner import HyperModel
from kerastuner.tuners import Hyperband
import tensorflow_addons as tfa
import warnings
warnings.filterwarnings('ignore')

In [2]:
# dataset directory
train_dir = '../input/pn-hack/pneumonia_dataset/pneumonia_dataset/train'
# test directory
test_dir = '../input/pn-hack/pneumonia_dataset/pneumonia_dataset/test'

In [3]:
# test csv path
csv_path = '../input/pn-hack/pneumonia_dataset/pneumonia_dataset/test.csv'
test_csv = pd.read_csv(csv_path)
test_csv.head()

Unnamed: 0,filename
0,CXR_test_519.png
1,CXR_test_578.png
2,CXR_test_359.png
3,CXR_test_573.png
4,CXR_test_471.png


In [4]:
# image datagenerator instantiation
train_gen = ImageDataGenerator(preprocessing_function=preprocess_input, 
                               validation_split=0.2)

# test generator
test_gen = ImageDataGenerator(preprocessing_function= preprocess_input)

In [5]:
training_data = train_gen.flow_from_directory(directory=train_dir,
                                             target_size = (224,224),
                                             shuffle=True,
                                             batch_size= 15,
                                             subset = 'training')

validation_data = train_gen.flow_from_directory(directory=train_dir,
                                             target_size = (224, 224),
                                             shuffle=True,
                                             batch_size= 15,
                                             subset = 'validation')

test_data = test_gen.flow_from_dataframe(dataframe= test_csv,
                                        directory= test_dir,
                                        target_size = (224,224),
                                        batch_size = 15,
                                        shuffle= False,
                                        class_mode= None)


Found 1940 images belonging to 2 classes.
Found 485 images belonging to 2 classes.
Found 606 validated image filenames.


In [6]:
# path of pretrained models weights
weights_path = '../input/weights-file/brucechou1983_CheXNet_Keras_0.3.0_weights.h5'

### Optimal Model Architecture Search using Keras Tuner

In [None]:
def build_model(hp):
    base_model = DenseNet121(weights=None,
                        include_top=False,
                        input_shape=(224,224,3), pooling="avg")
    x = tf.keras.layers.Flatten()(base_model.output)
    x = tf.keras.layers.Dense(hp.Int('units', min_value = 64, 
                                     max_value=256, default=128),
                             activation = 'relu')(x)
    #x = tf.keras.layers.BatchNormalization()(x)
    #x = tf.keras.activations.relu(x)
    x = tf.keras.layers.Dropout(rate = hp.Float('dropout_rate',min_value = 0.1, 
                                                max_value=0.4,default=0.0))(x)
    #x = tf.keras.layers.Dense(hp.Int('units', min_value = 64, max_value=256, default=128),activation = 'relu')(x)
    #x = tf.keras.layers.BatchNormalization()(x)
    #x = tf.keras.activations.relu(x)
    predictions = tf.keras.layers.Dense(2, activation='softmax', name='predictions')(x)
    model = tf.keras.Model(inputs= base_model.input, outputs=predictions)
    model.load_weights(weights_path,by_name= True, skip_mismatch=True)
    model.layers.pop(),
    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value = 1e-3,default = 1e-3)
    #adamw = tfa.optimizers.AdamW(weight_decay= wd,learning_rate=learning_rate)
    optimizer = keras.optimizers.Adam(learning_rate)
    model.compile(optimizer=optimizer,
                  loss=keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=[keras.metrics.AUC(),
                           keras.metrics.Precision(),
                           keras.metrics.Recall()])
    return model

In [None]:
tuner = Hyperband(
    build_model,
    objective=kt.Objective('val_auc',direction = 'max'),
    max_epochs= 5,
    hyperband_iterations= 3,
    seed = 42,
    project_name='pneumonia_classification')

In [None]:
tuner.results_summary()

In [None]:
tuner.search(training_data, validation_data=validation_data)

### Build model using optimal parameter gets in tuning

In [7]:
def build_model():
    base_model = DenseNet121(weights=None,
                        include_top=False,
                        input_shape=(224,224,3), pooling="avg")
    base_model.load_weights(weights_path,by_name= True)
    x = tf.keras.layers.Dense(units = 256, activation = 'relu')(base_model.output)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.activations.relu(x)
    x = tf.keras.layers.Dropout(rate = 0.1)(x)
    x = tf.keras.layers.Dense(units = 128)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.activations.relu(x)
    predictions = tf.keras.layers.Dense(2, activation='softmax', name='predictions')(x)
    model = tf.keras.Model(inputs= base_model.input, outputs=predictions)
    
    learning_rate = 4.8637e-5
    #adamw = tfa.optimizers.AdamW(weight_decay= wd,learning_rate=learning_rate)
    optimizer = keras.optimizers.Adam(learning_rate)
    model.compile(optimizer=optimizer,
                  loss=keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=[keras.metrics.AUC(),
                           keras.metrics.Precision(),
                           keras.metrics.Recall()])
    return model
    

In [8]:
model = build_model()
#model.summary()

In [9]:
# Callbacks mode to min
checkpoint = ModelCheckpoint(filepath= 'Densenet_tuned.h5', save_best_only=True, save_weights_only=False)
lr_reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.99, patience=2, verbose=2, mode='min')
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, mode='min')

In [10]:
model_hist = model.fit_generator(generator=training_data, validation_data=validation_data, 
                                 epochs = 10, callbacks =[checkpoint, lr_reduce])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

Epoch 00007: ReduceLROnPlateau reducing learning rate to 4.8150630864256523e-05.
Epoch 8/10
Epoch 9/10

Epoch 00009: ReduceLROnPlateau reducing learning rate to 4.76691258882056e-05.
Epoch 10/10


In [11]:
validation_data = train_gen.flow_from_directory(directory=train_dir,
                                             target_size = (224,224),
                                             shuffle=False,
                                             batch_size= 15,
                                             subset = 'validation')

Found 485 images belonging to 2 classes.


In [None]:
best_model = keras.models.load_model("./Densenet_tuned.h5")

In [12]:
val_pred = model.predict_generator(validation_data)

In [13]:
# pnemonia probablities

val_pred = val_pred[:,1]

In [14]:
val_true = validation_data.classes

In [15]:
validation_data.class_indices

{'normal': 0, 'pneumonia': 1}

In [16]:
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score

def Find_Optimal_Cutoff(target, predicted):
    """ Find the optimal probability cutoff point for a classification model related to event rate
    Parameters
    ----------
    target : Matrix with dependent or target data, where rows are observations

    predicted : Matrix with predicted data, where rows are observations

    Returns
    -------
    list type, with optimal cutoff value

    """
    fpr, tpr, threshold = roc_curve(target, predicted)
    i = np.arange(len(tpr)) 
    roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index=i), 'threshold' : pd.Series(threshold, index=i)})
    roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]

    return list(roc_t['threshold'])


In [17]:
threshold = Find_Optimal_Cutoff(target=val_true, predicted=val_pred)
print(threshold)

[0.5868260264396667]


In [18]:
predictions = []
for item in val_pred:
    if item <=threshold:
        predictions.append(0)
    else:
        predictions.append(1)
        
f1_score(y_true=val_true, y_pred=predictions)

0.7662337662337664

In [19]:
test_pred = model.predict_generator(test_data)
test_pred = test_pred[:,1]

In [20]:
test_predictions = []
for item in test_pred:
    if item <= threshold:
        test_predictions.append('normal')
    else:
        test_predictions.append('pneumonia')
        

In [21]:
test_csv['label'] = test_predictions
test_csv.head()

Unnamed: 0,filename,label
0,CXR_test_519.png,pneumonia
1,CXR_test_578.png,pneumonia
2,CXR_test_359.png,normal
3,CXR_test_573.png,pneumonia
4,CXR_test_471.png,normal


In [22]:
test_csv.to_csv( "test_pred_val_auc_8521_f1score_76.csv", index=False)