In [1]:
import glob
import os
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cv2
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.metrics import categorical_crossentropy
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D,GlobalAveragePooling2D
from keras.layers import Activation, Dropout, BatchNormalization, Flatten, Dense, AvgPool2D,MaxPool2D
from keras.models import Sequential, Model
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
import torchvision
import torchvision.transforms as transforms
from keras.preprocessing.image import load_img
from keras.preprocessing.image import save_img
from keras.preprocessing.image import img_to_array
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [2]:
def train_directory(src_dir='/kaggle/input/histopathology-dataset/train/',
                    dest_root='/kaggle/working/training'):
    """Copy the training patches into one flat folder per class and index them.

    The source tree is expected to look like ``<src_dir>/<CLASS>/<subfolder>/<file>``
    for the classes LUAD, LUSC and MESO. Every file found two levels below a
    class folder is copied into ``<dest_root>/<CLASS>_TRAIN``.

    Parameters
    ----------
    src_dir : str
        Root of the input tree. Defaults to the Kaggle training-set location.
    dest_root : str
        Folder that receives the per-class sub-folders. Defaults to the Kaggle
        working directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``directory`` (path of each file now present in the class
        folder) and ``label`` (class name), with a fresh 0..n-1 index.
    """
    cancers = ["LUAD", "LUSC", "MESO"]

    records = []
    for cancer in cancers:
        class_dir = os.path.join(dest_root, cancer + "_TRAIN")
        # exist_ok makes the cell safe to re-run, and creating the very folder
        # we copy into removes the dependence on the current working directory.
        os.makedirs(class_dir, exist_ok=True)

        for sub_dir in glob.iglob(os.path.join(src_dir, cancer, "*")):
            for patch in glob.iglob(os.path.join(sub_dir, "*")):
                # NOTE(review): sub-folders are flattened, so two files with the
                # same basename would overwrite each other - confirm names are unique.
                shutil.copy(patch, class_dir)

        # Sort so the row order (and therefore any seeded split made from this
        # table) does not depend on filesystem listing order.
        for path in sorted(glob.glob(os.path.join(class_dir, "*"))):
            records.append({"directory": path, "label": cancer})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=["directory", "label"])

In [3]:
def test_directory(src_dir='/kaggle/input/histopathology-dataset/dev/',
                   dest_root='/kaggle/working/testing'):
    """Copy the dev-set patches into one flat folder per class and index them.

    The source tree is expected to look like ``<src_dir>/<CLASS>/<subfolder>/<file>``
    for the classes LUAD, LUSC and MESO. Every file found two levels below a
    class folder is copied into ``<dest_root>/<CLASS>_TEST``.

    Parameters
    ----------
    src_dir : str
        Root of the input tree. Defaults to the Kaggle ``dev`` split location.
    dest_root : str
        Folder that receives the per-class sub-folders. Defaults to the Kaggle
        working directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``directory`` (path of each file now present in the class
        folder) and ``label`` (class name), with a fresh 0..n-1 index.
    """
    cancers = ["LUAD", "LUSC", "MESO"]

    records = []
    for cancer in cancers:
        class_dir = os.path.join(dest_root, cancer + "_TEST")
        # exist_ok makes the cell safe to re-run, and creating the very folder
        # we copy into removes the dependence on the current working directory.
        os.makedirs(class_dir, exist_ok=True)

        for sub_dir in glob.iglob(os.path.join(src_dir, cancer, "*")):
            for patch in glob.iglob(os.path.join(sub_dir, "*")):
                # NOTE(review): sub-folders are flattened, so two files with the
                # same basename would overwrite each other - confirm names are unique.
                shutil.copy(patch, class_dir)

        # Sort so the row order is deterministic; evaluation code that pairs
        # predictions with labels by position benefits from a stable order.
        for path in sorted(glob.glob(os.path.join(class_dir, "*"))):
            records.append({"directory": path, "label": cancer})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=["directory", "label"])


The code above copies the images from the Kaggle input directory into the Kaggle working (output) directory, one folder per class. It also records the file path of every patch together with its label.

In [4]:
# Copy the images into per-class folders and build the (directory, label)
# tables: one from the `train` split and one from the `dev` split.
dir_label_df = train_directory()
test_label_df = test_directory()



In [None]:
# Hold out 20% of the training-split patches. The split is stratified on the
# label so that each class keeps the same proportion in both parts; without it
# a rare class can end up under-represented in the held-out part.
train, test = train_test_split(
    dir_label_df,
    test_size=0.2,
    random_state=42,
    stratify=dir_label_df["label"],
)

print(f"train set shape: {train.shape}")
print(f"test set shape: {test.shape}")

In [None]:
# Augmentation pipeline used when writing extra training images:
#   1. resize to 128x128,
#   2. with probability 0.5 run the whole rotation/flip group (each flip inside
#      the group still uses its own default probability),
#   3. convert the result to a tensor.
transform_train = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.RandomApply(
        [
            transforms.RandomRotation(10),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
        ],
        p=0.5,
    ),
    transforms.ToTensor(),
])

The function below augments the training data using the transformations listed above: each MESO patch yields 25 augmented copies and each LUAD patch yields 2.

In [None]:
def data_augmentation(train, copies_per_label=None):
    """Write augmented copies of every training image and index the new files.

    Each source image is loaded once, passed through ``transform_train`` the
    requested number of times, and every result is saved next to the source as
    ``<source name without extension>_<k>.jpg``.

    Parameters
    ----------
    train : pandas.DataFrame
        Table with a ``directory`` column (image path) and a ``label`` column.
    copies_per_label : dict, optional
        Maps a label to the number of augmented copies to write per image.
        Labels missing from the mapping get one copy. Defaults to
        ``{"LUAD": 2, "MESO": 25}``.

    Returns
    -------
    pandas.DataFrame
        Columns ``directory`` and ``label`` for the newly written files only
        (the source images are not listed), with a fresh 0..n-1 index.

    Notes
    -----
    The previous ``if LUAD / if MESO / else`` chain let every LUAD row fall
    into the ``else`` branch as well, which wrote an extra image labelled
    "LUSC". Each row now takes exactly one path and keeps its own label.
    """
    if copies_per_label is None:
        copies_per_label = {"LUAD": 2, "MESO": 25}

    records = []
    for source_path, label in zip(train["directory"], train["label"]):
        # splitext copes with extensions that are not exactly four characters.
        stem = os.path.splitext(source_path)[0]
        # Loading is independent of the copy index, so do it once per image.
        img = load_img(source_path)

        for k in range(copies_per_label.get(label, 1)):
            augmented = transform_train(img)
            target_path = stem + '_' + str(k) + '.jpg'
            # transform_train ends in ToTensor, hence channels_first.
            save_img(target_path, img_to_array(augmented), data_format='channels_first')
            records.append({"directory": target_path, "label": label})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=["directory", "label"])

In [None]:
# Replace `train` with the table returned by data_augmentation, which lists only
# the newly written augmented files. This cell is not idempotent: running it a
# second time feeds the already-augmented table back into the function.
train = data_augmentation(train)

Below is the code for an HSV color mask (converting each image to HSV and thresholding it), an approach we ultimately dropped.

In [None]:
# Disabled experiment: the whole cell is a bare string literal, so none of the
# code inside it runs. Notes for anyone re-enabling it:
#   * the plotting loop reads `labels`, which this notebook only assigns inside
#     train_directory()/test_directory(), so it must be defined at cell scope;
#   * the "only blue colors" remark precedes a mask built from
#     lower_red/upper_red, while the blue and not-white bounds are never used;
#   * NOTE(review): images are read with OpenCV and shown with plt.imshow
#     without a channel reorder - confirm the colors display as intended.
'''from PIL import Image
import cv2 as cv
import numpy as np
dir_img_rgb = dir_label_df.iloc[83]['directory']
img_rbg = cv.imread(dir_img_rgb)
plt.imshow(img_rbg)
img_hsv = cv.cvtColor(img_rbg,cv.COLOR_BGR2HSV)

lower_blue = np.array([110,50,50])
upper_blue = np.array([130,255,255])

lower_red = np.array([150,135,100])
upper_red = np.array([170,255,255])

lower_notwhite = np.array([20,0,0])
upper_notwhite = np.array([102,255,255])
# Threshold the HSV image to get only blue colors
mask = cv.inRange(img_hsv, lower_red, upper_red)
# Bitwise-AND mask and original image
res = cv.bitwise_and(img_rbg,img_rbg, mask= mask)

plt.imshow(res)

lower_red = np.array([150,135,0])
upper_red = np.array([190,255,200])

plt.figure(figsize=(12,8))
pos = 1
for i in range(3):
    for j in range(5):
        labels_i = labels[i]
        index_i = dir_label_df[dir_label_df["label"] == labels_i].index
        random = np.random.randint(index_i[0],index_i[-1])
        plt.subplot(3,5,pos)
        pos+=1
        img_rbg = cv2.imread(dir_label_df.loc[random,"directory"])
        img_hsv = cv.cvtColor(img_rbg,cv.COLOR_BGR2HSV)
        mask = cv.inRange(img_hsv, lower_red, upper_red)
        res = cv.bitwise_and(img_rbg,img_rbg, mask= mask)
        plt.imshow(res)
        plt.title(dir_label_df.loc[random, "label"], size = 15, color = "white") 
        plt.xticks([])
        plt.yticks([])

plt.show()
'''


In [None]:
# 8-bit images hold values in 0..255, so dividing by 255 maps them to [0, 1].
# (The previous factor of 1/224 used the image side length by mistake.)
PIXEL_SCALE = 1.0 / 255.0
IMAGE_SIZE = (224, 224)

# Training generator: rescaling plus on-the-fly geometric augmentation.
train_datagen = ImageDataGenerator(rescale = PIXEL_SCALE, rotation_range = 40,
                                   width_shift_range = 0.2, height_shift_range = 0.2,
                                   shear_range = 0.2, zoom_range = 0.2,
                                   horizontal_flip = True, vertical_flip = True)
# Evaluation generators only rescale; they must not augment.
test_datagen = ImageDataGenerator(rescale = PIXEL_SCALE)
validation_datagen = ImageDataGenerator(rescale = PIXEL_SCALE)

train_gen = train_datagen.flow_from_dataframe(dataframe = train,
                                              x_col = 'directory', y_col = 'label',
                                              target_size = IMAGE_SIZE, batch_size = 32,
                                              class_mode = 'categorical', shuffle = True)

# shuffle=False on both evaluation generators: the metrics code pairs the
# predictions with `generator.classes` by position, so order must be stable.
test_gen = test_datagen.flow_from_dataframe(test,
                                            x_col = 'directory', y_col = 'label',
                                            target_size = IMAGE_SIZE, batch_size = 16,
                                            class_mode = 'categorical', shuffle = False)

# Generator over the separate `dev` split table. The original call was left
# unterminated; batch size and shuffle now mirror test_gen.
validation_gen = validation_datagen.flow_from_dataframe(test_label_df,
                                                        x_col = 'directory', y_col = 'label',
                                                        target_size = IMAGE_SIZE, batch_size = 16,
                                                        class_mode = 'categorical', shuffle = False)

CNN

In [None]:
# Small CNN trained from scratch on 224x224 RGB patches.
model = Sequential()
model.add(Conv2D(32, (2, 2), activation='relu', input_shape=(224, 224, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(128, (3, 3), activation='relu'))

model.add(Conv2D(256, (2, 2), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Dropout(0.5))
model.add(BatchNormalization())

# A single Flatten is enough; the second one in the original acted on an
# already flat tensor and did nothing.
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(16, activation='relu'))

# The generators emit one-hot labels for three mutually exclusive classes, so
# the head is a softmax trained with categorical cross-entropy. Independent
# sigmoids with binary cross-entropy treat this as three yes/no problems and
# can make the reported accuracy misleading.
model.add(Dense(3, activation='softmax'))

# Keep only the best checkpoint seen so far (ModelCheckpoint's default monitor).
callbacks = [tf.keras.callbacks.ModelCheckpoint("cleaned_classififier.h5", save_best_only=True, verbose = 0)]

model.compile(loss="categorical_crossentropy",
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              metrics=['accuracy'])

model.fit(train_gen, validation_data = test_gen, use_multiprocessing=True, workers = 6, epochs=12, callbacks = callbacks, verbose = 1)

Inception v3

In [None]:
from tensorflow import keras

# Transfer learning: frozen ImageNet InceptionV3 backbone plus a small
# trainable classification head.
base_model = keras.applications.InceptionV3(
    weights="imagenet",
    input_shape=(224, 224, 3),
    include_top=False)

base_model.trainable = False
inputs = keras.Input(shape=(224, 224, 3))

# NOTE(review): the ImageNet weights were trained with the preprocessing in
# keras.applications.inception_v3.preprocess_input - confirm that the scaling
# applied by the data generators is compatible with it.
# training=False keeps the backbone's BatchNormalization layers in inference mode.
x = base_model(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
x = keras.layers.Dropout(0.2)(x)  # Regularize with dropout

# ReLU between the Dense layers: without an activation the three layers
# collapse into a single linear map and add no capacity.
x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dense(128, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(32, activation="relu")(x)
# Softmax matches the categorical cross-entropy loss for three exclusive classes.
outputs = keras.layers.Dense(3, activation="softmax")(x)
model = keras.Model(inputs, outputs)

callbacks = [EarlyStopping(monitor='val_loss',patience=2)]

model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.fit(train_gen, validation_data = test_gen,
                    use_multiprocessing=True,
                    workers=6,
                    epochs = 15,
                    callbacks = callbacks,verbose=1)

ResNet50V2

In [None]:
# Transfer learning: frozen ImageNet ResNet50V2 backbone plus a small
# trainable classification head.
base_model_resnet = keras.applications.ResNet50V2(
    weights="imagenet",
    input_shape=(224, 224, 3),
    include_top=False)

base_model_resnet.trainable = False
inputs = keras.Input(shape=(224, 224, 3))

# NOTE(review): the ImageNet weights were trained with the preprocessing in
# keras.applications.resnet_v2.preprocess_input - confirm that the scaling
# applied by the data generators is compatible with it.
# training=False keeps the backbone's BatchNormalization layers in inference mode.
x = base_model_resnet(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
x = keras.layers.Dropout(0.2)(x)  # Regularize with dropout

# ReLU between the Dense layers: without an activation the three layers
# collapse into a single linear map and add no capacity.
x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dense(128, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(32, activation="relu")(x)
# Softmax matches the categorical cross-entropy loss for three exclusive classes.
outputs = keras.layers.Dense(3, activation="softmax")(x)
model_resnet = keras.Model(inputs, outputs)

callbacks = [EarlyStopping(monitor='val_loss',patience=4)]

model_resnet.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model_resnet.fit(train_gen, validation_data = test_gen,
                    use_multiprocessing=True,
                    workers=6,
                    epochs = 15,
                    callbacks = callbacks,verbose=1)

DenseNet121

In [None]:
# Transfer learning: frozen ImageNet DenseNet121 backbone plus a small
# trainable classification head.
base_model_densenet = keras.applications.DenseNet121(
    weights="imagenet",
    input_shape=(224, 224, 3),
    include_top=False)

base_model_densenet.trainable = False
inputs = keras.Input(shape=(224, 224, 3))

# NOTE(review): the ImageNet weights were trained with the preprocessing in
# keras.applications.densenet.preprocess_input - confirm that the scaling
# applied by the data generators is compatible with it.
# training=False keeps the backbone's BatchNormalization layers in inference mode.
x = base_model_densenet(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
x = keras.layers.Dropout(0.2)(x)  # Regularize with dropout

# ReLU between the Dense layers: without an activation the three layers
# collapse into a single linear map and add no capacity.
x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dense(128, activation="relu")(x)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(32, activation="relu")(x)
# Softmax matches the categorical cross-entropy loss for three exclusive classes.
outputs = keras.layers.Dense(3, activation="softmax")(x)
model_densenet = keras.Model(inputs, outputs)

callbacks = [EarlyStopping(monitor='val_loss',patience=4)]
model_densenet.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model_densenet.fit(train_gen, validation_data = test_gen,
                    use_multiprocessing=True,
                    workers=6,
                    epochs = 15,
                    callbacks = callbacks,verbose=1)

Code for Classification Metrics

In [None]:
def _true_and_predicted_classes(generator, model):
    """Run ``model`` over ``generator`` and return ``(true, predicted)`` class ids.

    The generator must iterate in a fixed order (i.e. be created with
    ``shuffle=False``); otherwise ``generator.classes`` does not line up with
    the rows of the prediction array.
    """
    # One pass over the data: ceil(samples / batch_size) batches.
    steps = int(np.ceil(generator.samples / generator.batch_size))
    # `steps` must be passed by keyword: the second positional parameter of
    # predict() is batch_size, which is not what this number means.
    probabilities = model.predict(generator, steps=steps)
    predicted_classes = np.argmax(probabilities, axis=1)
    return generator.classes, predicted_classes


def report_matrix(generator, model):
    """Print the per-class classification report and the confusion matrix.

    Parameters
    ----------
    generator
        Data iterator exposing ``samples``, ``batch_size``, ``classes`` and
        ``class_indices``; must not shuffle.
    model
        Fitted model whose ``predict`` returns one score per class.
    """
    true_classes, predicted_classes = _true_and_predicted_classes(generator, model)
    class_labels = list(generator.class_indices.keys())

    report = classification_report(true_classes, predicted_classes, target_names=class_labels)
    print(report)

    conf_mat = confusion_matrix(true_classes, predicted_classes)
    print(conf_mat)


def model_metrics(generator, model):
    """Print accuracy and support-weighted precision, recall and F1.

    Parameters
    ----------
    generator
        Data iterator exposing ``samples``, ``batch_size`` and ``classes``;
        must not shuffle.
    model
        Fitted model whose ``predict`` returns one score per class.
    """
    true_classes, predicted_classes = _true_and_predicted_classes(generator, model)

    # scikit-learn expects (y_true, y_pred). With the arguments reversed,
    # precision and recall trade places and the weighting uses the wrong support.
    accuracy = accuracy_score(true_classes, predicted_classes)
    precision = precision_score(true_classes, predicted_classes, average='weighted')
    recall = recall_score(true_classes, predicted_classes, average='weighted')
    f1 = f1_score(true_classes, predicted_classes, average='weighted')

    print("Accuracy: {}\nPrecision: {}\nRecall: {}\nF1: {}".format(round(accuracy,4),
                                                                   round(precision,4),
                                                                   round(recall,4),
                                                                   round(f1,4)))

Metrics for models we ran

In [None]:
# Evaluate every fitted network on both evaluation generators. For each model
# the summary metrics are printed first (test_gen, then validation_gen),
# followed by the classification report / confusion matrix in the same order.
# `model` is whichever network was bound to that name most recently; in this
# notebook the InceptionV3 cell assigns it after the CNN cell.
for fitted_model in (model, model_resnet, model_densenet):
    for evaluate in (model_metrics, report_matrix):
        for data_gen in (test_gen, validation_gen):
            evaluate(data_gen, fitted_model)