### LIBRARIES

In [None]:
import numpy as np 
import pandas as pd
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt
from keras_preprocessing.image import ImageDataGenerator

### NIH DATASET
Since external data was allowed for training, the **NIH Chest X-rays** dataset was used. 
[Dataset](https://www.kaggle.com/nih-chest-xrays/data)

In [None]:
# Load the NIH metadata table (one row per image).
all_xray_df = pd.read_csv('../input/data/Data_Entry_2017.csv')

# Index every PNG found under the images* folders by its bare file name.
all_image_paths = dict(
    (os.path.basename(png_path), png_path)
    for png_path in glob(os.path.join('../input/data', 'images*', '*', '*.png'))
)
print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])

# Attach the on-disk location to each metadata row via its 'Image Index'.
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)
all_xray_df.sample(3)

Scans found: 112120 , Total Headers 112120


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
22222,00005895_002.png,No Finding,2,5895,53,F,AP,2500,2048,0.171,0.171,,../input/data/images_003/images/00005895_002.png
70068,00017247_008.png,No Finding,8,17247,56,F,PA,2048,2500,0.168,0.168,,../input/data/images_008/images/00017247_008.png
108852,00029543_001.png,No Finding,1,29543,59,F,PA,3056,2544,0.139,0.139,,../input/data/images_012/images/00029543_001.png


In [None]:
# Distinct individual labels, obtained by splitting every unique
# 'Finding Labels' string on the '|' separator.
findings = {
    single_label
    for combined in all_xray_df['Finding Labels'].unique()
    for single_label in combined.split('|')
}
print(f'Total number of single diagnoses: {len(findings)}')
findings

Total number of single diagnoses: 15


{'Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'No Finding',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax'}

In [None]:
# Add one 0.0/1.0 indicator column per finding.
# - Labels are matched as whole '|'-separated tokens rather than as substrings,
#   so a label that happens to be contained in another label cannot match it.
# - `findings` is a set of strings; iterating it in sorted order keeps the
#   resulting column order identical from run to run.
for finding in sorted(findings):
    all_xray_df[finding] = all_xray_df['Finding Labels'].map(
        lambda labels, target=finding: 1.0 if target in labels.split('|') else 0.0
    )

all_xray_df.head(3)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,No Finding,Mass,Emphysema,Pleural_Thickening,Fibrosis,Pneumonia,Infiltration,Pneumothorax,Nodule,Cardiomegaly
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Encoding images with a Pneumonia finding as **Y** and all other images (including those with other findings) as **N**

In [None]:
# Binary target: 'Y' where the Pneumonia indicator equals 1.0, otherwise 'N'.
all_xray_df['pneumonia_class'] = np.where(all_xray_df['Pneumonia'] == 1.0, 'Y', 'N')
all_xray_df.head(3)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Mass,Emphysema,Pleural_Thickening,Fibrosis,Pneumonia,Infiltration,Pneumothorax,Nodule,Cardiomegaly,pneumonia_class
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,N
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,N
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,N


In [None]:
# List the column names now that the indicator and class columns have been added.
all_xray_df.columns

Index(['Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID',
       'Patient Age', 'Patient Gender', 'View Position', 'OriginalImage[Width',
       'Height]', 'OriginalImagePixelSpacing[x', 'y]', 'Unnamed: 11', 'path',
       'Effusion', 'Consolidation', 'Atelectasis', 'Edema', 'Hernia',
       'No Finding', 'Mass', 'Emphysema', 'Pleural_Thickening', 'Fibrosis',
       'Pneumonia', 'Infiltration', 'Pneumothorax', 'Nodule', 'Cardiomegaly',
       'pneumonia_class'],
      dtype='object')

Since the NIH dataset contains more than 100,000 non-pneumonia images, a random subsample of 2,000 of them is taken

In [None]:
# Keep every pneumonia row; from the remaining rows draw a fixed-seed
# random sample of 2000.
pneu = all_xray_df.loc[all_xray_df["pneumonia_class"].eq("Y")]
Npneu = all_xray_df.loc[all_xray_df["pneumonia_class"].eq("N")]
NNpneu = Npneu.sample(random_state=13, n=2000)

In [None]:
# Stack the pneumonia rows and the sampled non-pneumonia rows into one frame.
data=pd.concat([pneu,NNpneu],axis=0)

This is the dataset used for training

In [None]:
# Shuffle the rows so the two classes are interleaved. A fixed random_state
# (the same seed used for the subsample above) makes the row order
# reproducible across kernel restarts.
data = data.sample(frac=1, random_state=13)
data.shape


(3431, 29)

### DataGenerator
Here the NIH dataset is used for training and the DPHI training dataset is used for validation.

In [None]:
# Image generator that only rescales pixel values by 1/255 (no augmentation options are set).
datagen=ImageDataGenerator(rescale=1./255,)

In [None]:
# NIH training batches. `directory=None` because the 'path' column already
# holds the full file location of every image.
train_generator = datagen.flow_from_dataframe(
    dataframe=data,
    directory=None,
    x_col="path",
    y_col="pneumonia_class",
    class_mode="categorical",
    target_size=(224, 224),
    batch_size=32,
    shuffle=True,
    seed=42,
)

Found 3431 validated image filenames belonging to 2 classes.


In [None]:
# Show which integer index the generator assigned to each class label.
train_generator.class_indices

{'N': 0, 'Y': 1}

In [None]:
# Walk the DPHI train directory: each sub-folder name is the class label of
# the images it contains.
image_folder = os.listdir("../input/dphipneumonia/pneumonia_dataset/train")
filename = []
label = []
for folder in image_folder:
    folder_images = os.listdir(os.path.join("../input/dphipneumonia/pneumonia_dataset/train", folder))
    # File names are stored relative to the train directory.
    filename.extend(folder + "/" + image_name for image_name in folder_images)
    label.extend([folder] * len(folder_images))

# Re-code the folder names to the same 'Y'/'N' labels used for the NIH data.
val = pd.DataFrame({'filename': filename, 'label': label}).assign(
    label=lambda frame: frame["label"].map({"pneumonia": "Y", "normal": "N"})
)
val.head()

Unnamed: 0,filename,label
0,pneumonia/CXR_train_508.png,Y
1,pneumonia/CXR_train_610.png,Y
2,pneumonia/CXR_train_1651.png,Y
3,pneumonia/CXR_train_505.png,Y
4,pneumonia/CXR_train_1259.png,Y


In [None]:
# DPHI validation batches: same 1/255 rescaling as training, file names
# resolved relative to the DPHI train directory.
valid_datagen = ImageDataGenerator(rescale=1./255.)
valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=val,
    directory="../input/dphipneumonia/pneumonia_dataset/train",
    x_col="filename",
    y_col="label",
    class_mode="categorical",
    target_size=(224, 224),
    batch_size=32,
    shuffle=True,
    seed=42,
)

Found 2425 validated image filenames belonging to 2 classes.


In [None]:
# Check that the validation generator uses the same label -> index mapping as the training generator.
valid_generator.class_indices

{'N': 0, 'Y': 1}

### MODEL ARCHITECTURE

In [None]:
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D, BatchNormalization, Dropout,AveragePooling2D
from tensorflow.keras.applications.resnet import ResNet50
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3,DenseNet201,EfficientNetB7, MobileNetV2,Xception,VGG16,NASNetMobile,DenseNet169
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.models import Model
from keras.models import Sequential
from keras.regularizers import *
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Xception backbone with ImageNet weights (the variable keeps its original
# name `resnet`). The ImageNet classifier top is left out and global average
# pooling is requested directly, which yields the same pooled feature vector
# while allowing the input shape to be declared as 224x224x3 -- the
# target_size produced by the data generators -- instead of the 299x299 input
# that the full classifier is built with.
resnet = Xception(
    weights="imagenet",
    include_top=False,
    pooling="avg",
    input_shape=(224, 224, 3),
)
x = resnet.output

# Two-way softmax head (index 0 = 'N', index 1 = 'Y') with L2 weight decay.
fc1 = Dense(
    2,
    activation='softmax',
    kernel_initializer='glorot_uniform',
    kernel_regularizer=l2(.0005),
)(x)
my_model = Model(inputs=resnet.input, outputs=fc1)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels.h5


In [None]:
# Number of whole batches per pass over each generator (a trailing partial
# batch is not counted because of the floor division).
STEP_SIZE_TRAIN, STEP_SIZE_VALID = (
    gen.n // gen.batch_size for gen in (train_generator, valid_generator)
)

### EVAL METRIC

In [None]:
import keras.backend as K

def f1(y_true, y_pred):  # taken from old keras source code
    """Batch-level F1 score computed from rounded predictions.

    Both tensors are clipped to [0, 1] and rounded before counting. The
    sums run over every element of the tensors, so when the targets have
    several columns, all columns contribute to the counts.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor.

    Returns:
        Scalar tensor: 2 * precision * recall / (precision + recall), with
        K.epsilon() added to each denominator to avoid division by zero.
    """
    def _rounded_total(tensor):
        # Clip to [0, 1], round to 0/1, then sum over all elements.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_total(y_true * y_pred)
    actual = _rounded_total(y_true)
    predicted = _rounded_total(y_pred)

    precision = hits / (predicted + K.epsilon())
    recall = hits / (actual + K.epsilon())
    return 2 * (precision * recall) / (precision + recall + K.epsilon())

In [None]:
# Adam with a small learning rate and learning-rate decay; categorical
# cross-entropy on the two-way softmax output, tracking the custom f1 metric.
my_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001, decay=0.0001),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[f1],
)

### CHECKPOINTS

In [None]:
checkpoint_path = "training_0/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
my_callbacks = [
    # Keep only the weights of the epoch with the highest validation f1.
    ModelCheckpoint(checkpoint_path, monitor='val_f1', verbose=1,
                    save_weights_only=True, save_best_only=True, mode="max"),
    # Stop training once val_loss has not improved for 5 epochs.
    EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='min'),
    # Cut the learning rate 10x when val_loss stagnates. The patience is kept
    # below EarlyStopping's: with equal patience the reduction would fire at
    # about the same epoch training stops and the lower rate would never be used.
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1,
                      mode='min', min_delta=1e-4),
]

### TRAINING

In [None]:
# Train on the NIH generator and validate on the DPHI generator.
# `my_callbacks` is already a list of callbacks, so it is passed as-is rather
# than wrapped in a second list.
my_model.fit(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    epochs=40,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    callbacks=my_callbacks,
)

Epoch 1/40

Epoch 00001: val_f1 improved from -inf to 0.53417, saving model to training_0/cp.ckpt
Epoch 2/40

Epoch 00002: val_f1 improved from 0.53417 to 0.55958, saving model to training_0/cp.ckpt
Epoch 3/40

Epoch 00003: val_f1 improved from 0.55958 to 0.62750, saving model to training_0/cp.ckpt
Epoch 4/40

Epoch 00004: val_f1 improved from 0.62750 to 0.66667, saving model to training_0/cp.ckpt
Epoch 5/40

Epoch 00005: val_f1 improved from 0.66667 to 0.70792, saving model to training_0/cp.ckpt
Epoch 6/40

Epoch 00006: val_f1 improved from 0.70792 to 0.76000, saving model to training_0/cp.ckpt
Epoch 7/40

Epoch 00007: val_f1 improved from 0.76000 to 0.82625, saving model to training_0/cp.ckpt
Epoch 8/40

Epoch 00008: val_f1 improved from 0.82625 to 0.84833, saving model to training_0/cp.ckpt
Epoch 9/40

Epoch 00009: val_f1 improved from 0.84833 to 0.86083, saving model to training_0/cp.ckpt
Epoch 10/40

Epoch 00010: val_f1 improved from 0.86083 to 0.87875, saving model to training_0/

<tensorflow.python.keras.callbacks.History at 0x7f5390d3c190>

In [None]:
# Restore the best checkpoint into the model that was trained. The notebook
# defines `my_model`; no variable named `model` exists.
my_model.load_weights(checkpoint_path)

In [None]:
# Save the model's current weights to an .h5 file in the working directory.
my_model.save_weights("Xceptionnet.h5")

Now the DPHI dataset is used for fine-tuning and prediction, 
starting from the weights trained on the NIH dataset.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the labelled DPHI frame (fixed seed); X_train feeds the
# training generator below and X_test the validation generator.
X_train, X_test = train_test_split(val, test_size=0.1, random_state=42)

In [None]:
# Rebind `train_generator` to the DPHI training split (same settings as the
# NIH generator, but file names are relative to the DPHI train directory).
train_generator = datagen.flow_from_dataframe(
    dataframe=X_train,
    directory="../input/dphipneumonia/pneumonia_dataset/train",
    x_col="filename",
    y_col="label",
    class_mode="categorical",
    target_size=(224, 224),
    batch_size=32,
    shuffle=True,
    seed=42,
)

Found 2182 validated image filenames belonging to 2 classes.


In [None]:
# Rebind `valid_generator` to the held-out DPHI split, using a fresh
# rescale-only image generator.
valid_datagen = ImageDataGenerator(rescale=1./255.)
valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=X_test,
    directory="../input/dphipneumonia/pneumonia_dataset/train",
    x_col="filename",
    y_col="label",
    class_mode="categorical",
    target_size=(224, 224),
    batch_size=32,
    shuffle=True,
    seed=42,
)

Found 243 validated image filenames belonging to 2 classes.


In [None]:
# Recompute the whole-batch step counts for the rebound DPHI generators.
STEP_SIZE_TRAIN, STEP_SIZE_VALID = (
    gen.n // gen.batch_size for gen in (train_generator, valid_generator)
)

In [None]:
# Continue training the same model on the DPHI split.
# `my_callbacks` is already a list of callbacks, so it is passed as-is rather
# than wrapped in a second list.
my_model.fit(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    epochs=10,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    callbacks=my_callbacks,
)

Epoch 1/10

Epoch 00001: val_f1 improved from -inf to 0.95089, saving model to training_0/cp.ckpt
Epoch 2/10

Epoch 00002: val_f1 improved from 0.95089 to 0.97321, saving model to training_0/cp.ckpt
Epoch 3/10

Epoch 00003: val_f1 did not improve from 0.97321
Epoch 4/10

Epoch 00004: val_f1 improved from 0.97321 to 0.97768, saving model to training_0/cp.ckpt
Epoch 5/10

Epoch 00005: val_f1 did not improve from 0.97768
Epoch 6/10

Epoch 00006: val_f1 did not improve from 0.97768
Epoch 7/10

Epoch 00007: val_f1 did not improve from 0.97768
Epoch 8/10

Epoch 00008: val_f1 improved from 0.97768 to 0.98214, saving model to training_0/cp.ckpt
Epoch 9/10

Epoch 00009: val_f1 did not improve from 0.98214
Epoch 10/10

Epoch 00010: val_f1 did not improve from 0.98214


<tensorflow.python.keras.callbacks.History at 0x7f4ef8b50910>

In [None]:
# Reload the weights saved by the ModelCheckpoint callback (best val_f1 epoch).
my_model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f532c1eb110>

In [None]:
# Evaluate on the held-out DPHI generator; the model was compiled with the f1 metric alongside the loss.
my_model.evaluate(valid_generator,verbose=1)



[0.06643582135438919, 0.9751233458518982]

In [None]:
# Unlabelled test images: no labels, no shuffling, and one image per batch so
# that STEP_SIZE_TEST steps cover every listed file once.
test = pd.read_csv("../input/dphipneumonia/pneumonia_dataset/test.csv")
test_generator = valid_datagen.flow_from_dataframe(
    dataframe=test,
    directory="../input/dphipneumonia/pneumonia_dataset/test",
    x_col="filename",
    y_col=None,
    class_mode=None,
    target_size=(224, 224),
    batch_size=1,
    shuffle=False,
    seed=42,
)
STEP_SIZE_TEST = test_generator.n // test_generator.batch_size

Found 606 validated image filenames.


In [None]:
# Predicted class index per test image: position of the larger softmax output.
pred1 = np.argmax(
    my_model.predict(
        test_generator,
        steps=STEP_SIZE_TEST,
        verbose=1,
    ),
    axis=1,
)



In [None]:
# Inspect the predicted class indices.
pred1

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,

In [None]:
# Fill the submission template with the predictions, then translate class
# indices back to the competition's label names.
# NOTE(review): this assumes the rows of sample_submission.csv are in the same
# order as test.csv -- confirm.
sub = pd.read_csv("../input/dphipneumonia/pneumonia_dataset/sample_submission.csv")
sub["label"] = pred1
sub["label"] = sub["label"].map({
    0: 'normal',
    1: 'pneumonia',
})

In [None]:
# Count how many test images were assigned to each predicted label.
sub["label"].value_counts()

normal       308
pneumonia    298
Name: label, dtype: int64

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv("sub2.csv",index=False)

### It achieved an F1 score of 97.36 on the leaderboard.