In [1]:
import pandas as pd
import numpy as np
import errno
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import binary_accuracy
import os
import cv2
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import shutil

In [2]:
NUM_AUG_IMAGES_WANTED = 1000
IMAGE_HEIGHT = 96
IMAGE_WIDTH = 96
print(len(os.listdir('ChinaSet_AllFiles/CXR_png')))
print(len(os.listdir('MontgomerySet/CXR_png')))
shen_image_list = os.listdir('ChinaSet_AllFiles/CXR_png')
mont_image_list = os.listdir('MontgomerySet/CXR_png')
df_shen = pd.DataFrame(shen_image_list, columns=['image_id'])
df_mont = pd.DataFrame(mont_image_list, columns=['image_id'])
df_shen = df_shen[df_shen['image_id'] != 'Thumbs.db']
df_mont = df_mont[df_mont['image_id'] != 'Thumbs.db']
df_shen.reset_index(inplace=True, drop=True)
df_mont.reset_index(inplace=True, drop=True)
print(df_shen.shape)
print(df_mont.shape)

663
139
(662, 1)
(138, 1)


In [3]:
df_shen.head()
df_mont.head()

Unnamed: 0,image_id
0,MCUCXR_0001_0.png
1,MCUCXR_0002_0.png
2,MCUCXR_0003_0.png
3,MCUCXR_0004_0.png
4,MCUCXR_0005_0.png


In [4]:
def extract_target(x):
    target = int(x[-5])
    if target == 0:
        return 'Normal'
    if target == 1:
        return 'Tuberculosis'

In [5]:
df_shen['target'] = df_shen['image_id'].apply(extract_target)
df_mont['target'] = df_mont['image_id'].apply(extract_target)
df_shen['target'].value_counts()
df_mont['target'].value_counts()

Normal          80
Tuberculosis    58
Name: target, dtype: int64

In [6]:
def draw_category_images(col_name,figure_cols, df, IMAGE_PATH):
    
    """
    Give a column in a dataframe,
    this function takes a sample of each class and displays that
    sample on one row. The sample size is the same as figure_cols which
    is the number of columns in the figure.
    Because this function takes a random sample, each time the function is run it
    displays different images.
    """

IMAGE_PATH = 'ChinaSet_AllFiles/CXR_png/'
draw_category_images('target',4, df_shen, IMAGE_PATH)
IMAGE_PATH = 'MontgomerySet/CXR_png/'
draw_category_images('target',4, df_mont, IMAGE_PATH)

In [7]:
def read_image_sizes(file_name):
    """
    1. Get the shape of the image
    2. Get the min and max pixel values in the image.
    Getting pixel values will tell if any pre-processing has been done.
    3. This info will be added to the original dataframe.
    """
    image = cv2.imread(IMAGE_PATH + file_name)
    max_pixel_val = image.max()
    min_pixel_val = image.min()

    # image.shape[2] represents the number of channels: (height, width, num_channels).
    # Here we are saying: If the shape does not have a value for num_channels (height, width)
    # then assign 1 to the number of channels.
    if len(image.shape) > 2: # i.e. more than two numbers in the tuple
        output = [image.shape[0], image.shape[1], image.shape[2], max_pixel_val, min_pixel_val]
    else:
        output = [image.shape[0], image.shape[1], 1, max_pixel_val, min_pixel_val]
    return output

In [8]:
IMAGE_PATH = 'ChinaSet_AllFiles/CXR_png/'
m = np.stack(df_shen['image_id'].apply(read_image_sizes))
df = pd.DataFrame(m,columns=['w','h','c','max_pixel_val','min_pixel_val'])
df_shen = pd.concat([df_shen,df],axis=1, sort=False)
df_shen.head()

Unnamed: 0,image_id,target,w,h,c,max_pixel_val,min_pixel_val
0,CHNCXR_0001_0.png,Normal,2919,3000,3,255,0
1,CHNCXR_0002_0.png,Normal,2951,3000,3,255,0
2,CHNCXR_0003_0.png,Normal,2945,2987,3,255,0
3,CHNCXR_0004_0.png,Normal,2933,3000,3,255,0
4,CHNCXR_0005_0.png,Normal,2933,3000,3,255,0


In [9]:
IMAGE_PATH = 'MontgomerySet/CXR_png/'
m = np.stack(df_mont['image_id'].apply(read_image_sizes))
df = pd.DataFrame(m,columns=['w','h','c','max_pixel_val','min_pixel_val'])
df_mont = pd.concat([df_mont,df],axis=1, sort=False)
df_mont.head()

Unnamed: 0,image_id,target,w,h,c,max_pixel_val,min_pixel_val
0,MCUCXR_0001_0.png,Normal,4020,4892,3,255,0
1,MCUCXR_0002_0.png,Normal,4020,4892,3,255,0
2,MCUCXR_0003_0.png,Normal,4892,4020,3,255,0
3,MCUCXR_0004_0.png,Normal,4892,4020,3,255,0
4,MCUCXR_0005_0.png,Normal,4892,4020,3,255,0


In [10]:
df_shen['c'].value_counts()
df_mont['c'].value_counts()
df_mont['target'].value_counts()
df_data = pd.concat([df_shen, df_mont], axis=0).reset_index(drop=True)
df_data = shuffle(df_data)
df_data.shape

(800, 7)

In [11]:
df_data['labels'] = df_data['target'].map({'Normal':0, 'Tuberculosis':1})

In [12]:
df_data.head()

Unnamed: 0,image_id,target,w,h,c,max_pixel_val,min_pixel_val,labels
374,CHNCXR_0375_1.png,Tuberculosis,2995,2990,3,255,0,1
652,CHNCXR_0653_1.png,Tuberculosis,2550,2336,3,255,0,1
337,CHNCXR_0338_1.png,Tuberculosis,2924,2397,3,255,0,1
597,CHNCXR_0598_1.png,Tuberculosis,2179,2221,3,255,0,1
439,CHNCXR_0440_1.png,Tuberculosis,2620,2435,3,255,0,1


In [13]:
y = df_data['labels']

df_train, df_val = train_test_split(df_data, test_size=0.15, random_state=101, stratify=y)

print(df_train.shape)
print(df_val.shape)

df_train['target'].value_counts()
df_val['target'].value_counts()

(680, 8)
(120, 8)


Normal          61
Tuberculosis    59
Name: target, dtype: int64

In [14]:
base_dir = 'base_dir'
try:
    os.makedirs(base_dir)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise

In [15]:
train_dir = os.path.join(base_dir, 'train_dir')

try:
    os.makedirs(train_dir)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise

In [16]:
val_dir = os.path.join(base_dir, 'val_dir')

try:
    os.makedirs(val_dir)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise
Normal = os.path.join(train_dir, 'Normal')

try:
    os.makedirs(Normal)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise

Tuberculosis = os.path.join(train_dir, 'Tuberculosis')
try:
    os.makedirs(Tuberculosis)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise

In [17]:
Normal = os.path.join(val_dir, 'Normal')
try:
    os.makedirs(Normal)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise
Tuberculosis = os.path.join(val_dir, 'Tuberculosis')
try:
    os.makedirs(Tuberculosis)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise

In [18]:
df_data.set_index('image_id', inplace=True)
folder_1 = os.listdir('ChinaSet_AllFiles/CXR_png')
folder_2 = os.listdir('MontgomerySet/CXR_png')
train_list = list(df_train['image_id'])
val_list = list(df_val['image_id'])

In [19]:
for image in train_list:

    fname = image
    label = df_data.loc[image,'target']

    if fname in folder_1:
        src = os.path.join('ChinaSet_AllFiles/CXR_png', fname)
        dst = os.path.join(train_dir, label, fname)

        image = cv2.imread(src)
        image = cv2.resize(image, (IMAGE_HEIGHT, IMAGE_WIDTH))
        cv2.imwrite(dst, image)

    if fname in folder_2:
        src = os.path.join('MontgomerySet/CXR_png', fname)
        dst = os.path.join(train_dir, label, fname)

        image = cv2.imread(src)
        image = cv2.resize(image, (IMAGE_HEIGHT, IMAGE_WIDTH))
        cv2.imwrite(dst, image)

In [20]:
for image in val_list:

    fname = image
    label = df_data.loc[image,'target']

    if fname in folder_1:
        src = os.path.join('ChinaSet_AllFiles/CXR_png', fname)
        dst = os.path.join(val_dir, label, fname)

        image = cv2.imread(src)
        image = cv2.resize(image, (IMAGE_HEIGHT, IMAGE_WIDTH))
        cv2.imwrite(dst, image)

    if fname in folder_2:
        src = os.path.join('MontgomerySet/CXR_png', fname)
        dst = os.path.join(val_dir, label, fname)

        image = cv2.imread(src)
        image = cv2.resize(image, (IMAGE_HEIGHT, IMAGE_WIDTH))
        cv2.imwrite(dst, image)

In [21]:
print(len(os.listdir('base_dir/train_dir/Normal')))
print(len(os.listdir('base_dir/train_dir/Tuberculosis')))
print(len(os.listdir('base_dir/val_dir/Normal')))
print(len(os.listdir('base_dir/val_dir/Tuberculosis')))

1095
1060
157
146


In [22]:
class_list = ['Normal','Tuberculosis']
for item in class_list:
    aug_dir = 'aug_dir'
    os.mkdir(aug_dir)
    img_dir = os.path.join(aug_dir, 'img_dir')
    os.mkdir(img_dir)
    img_class = item
    img_list = os.listdir('base_dir/train_dir/' + img_class)
    for fname in img_list:
            src = os.path.join('base_dir/train_dir/' + img_class, fname)
            dst = os.path.join(img_dir, fname)
            shutil.copyfile(src, dst)
    path = aug_dir
    save_path = 'base_dir/train_dir/' + img_class
    datagen = ImageDataGenerator(
        rotation_range=10,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        fill_mode='nearest')

    batch_size = 50

    aug_datagen = datagen.flow_from_directory(path,
                                           save_to_dir=save_path,
                                           save_format='png',
                                                    target_size=(IMAGE_HEIGHT,IMAGE_WIDTH),
                                                    batch_size=batch_size)
    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((NUM_AUG_IMAGES_WANTED-num_files)/batch_size))
    for i in range(0,num_batches):
        imgs, labels = next(aug_datagen)
    shutil.rmtree('aug_dir')

Found 1095 images belonging to 1 classes.
Found 1060 images belonging to 1 classes.


In [23]:
print(len(os.listdir('base_dir/train_dir/Normal')))
print(len(os.listdir('base_dir/train_dir/Tuberculosis')))
print(len(os.listdir('base_dir/val_dir/Normal')))
print(len(os.listdir('base_dir/val_dir/Tuberculosis')))

1095
1060
157
146


In [24]:
train_path = 'base_dir/train_dir'
valid_path = 'base_dir/val_dir'

num_train_samples = len(df_train)
num_val_samples = len(df_val)
train_batch_size = 10
val_batch_size = 10


train_steps = np.ceil(num_train_samples / train_batch_size)
val_steps = np.ceil(num_val_samples / val_batch_size)

datagen = ImageDataGenerator(rescale=1.0/255)

In [25]:
train_gen = datagen.flow_from_directory(train_path,
                                        target_size=(IMAGE_HEIGHT,IMAGE_WIDTH),
                                        batch_size=train_batch_size,
                                        class_mode='categorical')
val_gen = datagen.flow_from_directory(valid_path,
                                        target_size=(IMAGE_HEIGHT,IMAGE_WIDTH),
                                        batch_size=val_batch_size,
                                        class_mode='categorical')
test_gen = datagen.flow_from_directory(valid_path,
                                        target_size=(IMAGE_HEIGHT,IMAGE_WIDTH),
                                        batch_size=val_batch_size,
                                        class_mode='categorical',
                                        shuffle=False)

Found 2155 images belonging to 2 classes.
Found 303 images belonging to 2 classes.
Found 303 images belonging to 2 classes.


In [26]:
kernel_size = (3,3)
pool_size= (2,2)
first_filters = 32
second_filters = 64
third_filters = 128
dropout_conv = 0.3
dropout_dense = 0.3

In [27]:
model = Sequential()
model.add(Conv2D(first_filters, kernel_size, activation = 'relu',
                 input_shape = (IMAGE_HEIGHT, IMAGE_WIDTH, 3)))
model.add(Conv2D(first_filters, kernel_size, activation = 'relu'))
model.add(Conv2D(first_filters, kernel_size, activation = 'relu'))
model.add(MaxPooling2D(pool_size = pool_size))
model.add(Dropout(dropout_conv))

model.add(Conv2D(second_filters, kernel_size, activation ='relu'))
model.add(Conv2D(second_filters, kernel_size, activation ='relu'))
model.add(Conv2D(second_filters, kernel_size, activation ='relu'))
model.add(MaxPooling2D(pool_size = pool_size))
model.add(Dropout(dropout_conv))

model.add(Conv2D(third_filters, kernel_size, activation ='relu'))
model.add(Conv2D(third_filters, kernel_size, activation ='relu'))
model.add(Conv2D(third_filters, kernel_size, activation ='relu'))
model.add(MaxPooling2D(pool_size = pool_size))
model.add(Dropout(dropout_conv))

model.add(Flatten())
model.add(Dense(256, activation = "relu"))
model.add(Dropout(dropout_dense))
model.add(Dense(2, activation = "softmax"))

model.summary()

model.compile(Adam(lr=0.0001), loss='binary_crossentropy',
              metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 94, 94, 32)        896       
                                                                 
 conv2d_1 (Conv2D)           (None, 92, 92, 32)        9248      
                                                                 
 conv2d_2 (Conv2D)           (None, 90, 90, 32)        9248      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 45, 45, 32)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 45, 45, 32)        0         
                                                                 
 conv2d_3 (Conv2D)           (None, 43, 43, 64)        18496     
                                                        

  super(Adam, self).__init__(name, **kwargs)


In [28]:
filepath = "model_v2.h5"
print("saved model to project")

checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                             save_best_only=True, mode='max')

reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.5, patience=2,
                                   verbose=1, mode='max', min_lr=0.00001)
callbacks_list = [checkpoint, reduce_lr]

history = model.fit_generator(train_gen, steps_per_epoch=train_steps,
                            validation_data=val_gen,
                            validation_steps=val_steps,
                            epochs=50, verbose=2,
                           callbacks=callbacks_list)

saved model to project
Epoch 1/50


  history = model.fit_generator(train_gen, steps_per_epoch=train_steps,


68/68 - 26s - loss: 0.6932 - accuracy: 0.5274 - val_loss: 0.6930 - val_accuracy: 0.4917 - lr: 1.0000e-04 - 26s/epoch - 386ms/step
Epoch 2/50
68/68 - 24s - loss: 0.6930 - accuracy: 0.5156 - val_loss: 0.6921 - val_accuracy: 0.8083 - lr: 1.0000e-04 - 24s/epoch - 351ms/step
Epoch 3/50
68/68 - 25s - loss: 0.6931 - accuracy: 0.5426 - val_loss: 0.6898 - val_accuracy: 0.4833 - lr: 1.0000e-04 - 25s/epoch - 364ms/step
Epoch 4/50
68/68 - 24s - loss: 0.6932 - accuracy: 0.4948 - val_loss: 0.6915 - val_accuracy: 0.5500 - lr: 1.0000e-04 - 24s/epoch - 353ms/step
Epoch 5/50
68/68 - 25s - loss: 0.6907 - accuracy: 0.5309 - val_loss: 0.6861 - val_accuracy: 0.7333 - lr: 1.0000e-04 - 25s/epoch - 367ms/step
Epoch 6/50
68/68 - 27s - loss: 0.6864 - accuracy: 0.5456 - val_loss: 0.6935 - val_accuracy: 0.4250 - lr: 1.0000e-04 - 27s/epoch - 392ms/step
Epoch 7/50
68/68 - 24s - loss: 0.6929 - accuracy: 0.5338 - val_loss: 0.6907 - val_accuracy: 0.5500 - lr: 1.0000e-04 - 24s/epoch - 358ms/step
Epoch 8/50
68/68 - 24s -

In [35]:
model.save("model_v2.h5")

In [36]:
model.load_weights('model_v2.h5')
val_loss, val_acc = \
model.evaluate_generator(test_gen,
                        steps=val_steps)

print('val_loss:', val_loss)
print('val_acc:', val_acc)

  model.evaluate_generator(test_gen,


val_loss: 0.35118693113327026
val_acc: 0.8999999761581421
