In [3]:
import glob
import os
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cv2
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.metrics import categorical_crossentropy
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D,GlobalAveragePooling2D
from keras.layers import Activation, Dropout, BatchNormalization, Flatten, Dense, AvgPool2D,MaxPool2D
from keras.models import Sequential, Model
import tensorflow as tf
from sklearn.model_selection import train_test_split
import torchvision
import torchvision.transforms as transforms
from keras.preprocessing.image import load_img
from keras.preprocessing.image import save_img
from keras.preprocessing.image import img_to_array

# Lung and Bronchus Cancer Classification

## Import the Data

In [4]:
def train_directory(train_dir='/kaggle/input/histopathology-dataset/train/', output_root=None):
    """Copy the training images into one flat folder per class and index them.

    For every class in LUAD / LUSC / MESO, each file found two levels below
    ``train_dir/<class>/`` is copied into ``<output_root>/<class>_TRAIN``.

    Parameters
    ----------
    train_dir : str
        Root of the source data, containing one sub-folder per class.
    output_root : str or None
        Folder that receives the ``<class>_TRAIN`` folders. Defaults to
        ``<current working directory>/training``.

    Returns
    -------
    pandas.DataFrame
        Columns ``directory`` (path of a copied file) and ``label`` (class
        name), one row per file present in the class folders.
    """
    if output_root is None:
        output_root = os.path.join(os.getcwd(), 'training')

    cancers = ["LUAD", "LUSC", "MESO"]
    records = []
    for cancer in cancers:
        class_dir = os.path.join(output_root, cancer + '_TRAIN')
        # exist_ok keeps the function re-runnable (Restart & Run All friendly).
        os.makedirs(class_dir, exist_ok=True)

        # Source layout is <train_dir>/<class>/<sub-folder>/<image file>.
        for case_dir in glob.iglob(os.path.join(train_dir, cancer, '*')):
            for image_path in glob.iglob(os.path.join(case_dir, '*')):
                shutil.copy(image_path, class_dir)

        for filepath in glob.glob(os.path.join(class_dir, '*')):
            records.append({"directory": filepath, "label": cancer})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=["directory", "label"])

In [5]:
# Copy the training images into per-class folders and get the (directory, label) index of the copies.
dir_label_df = train_directory()

In [None]:
# Display the file index built by train_directory().
dir_label_df

In [None]:
# Absolute number of indexed images per class.
print(dir_label_df['label'].value_counts())

In [None]:
# Same counts expressed as fractions of the whole index.
dir_label_df.label.value_counts(normalize=True)

We can see that our training data is very imbalanced, with far more images for the LUSC type of cancer than for the other classes. Hence, it will require some preprocessing.

In [None]:
# Hold out 20% of the indexed images. The classes are very imbalanced, so the
# split is stratified on the label to keep every class represented in both
# parts with the same proportions.
train, test = train_test_split(
    dir_label_df,
    test_size=0.2,
    random_state=42,
    stratify=dir_label_df["label"],
)

print(f"train set shape: {train.shape}")
print(f"test set shape: {test.shape}")

## Data Preprocessing and Augmentation
We need to add more data to the under-represented classes, i.e. LUAD and MESO. We augment our data using the following transformations:
* Random Rotation of up to ±10 degrees
* Random Horizontal Flip
* Random Vertical Flip

These transformations are randomly applied to the LUSC images; for the LUAD and MESO images they are additionally used to create several new samples per image, so that the classes become more balanced.

In [None]:
# Training-time image pipeline: resize to 64x64, then with probability 0.5 run
# the augmentation chain (rotation within 10 degrees, random horizontal flip,
# random vertical flip), and finally convert the image to a tensor.
transform_train = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.RandomApply(
        [
            transforms.RandomRotation(10),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
        ],
        p=0.5,
    ),
    transforms.ToTensor(),
])

In [None]:
def _save_augmented_copy(src_path, dst_path):
    """Load ``src_path``, run it through ``transform_train`` and save it as ``dst_path``."""
    img = load_img(src_path)
    trans_img = transform_train(img)
    # transform_train ends with ToTensor, hence the channels-first layout.
    save_img(dst_path, img_to_array(trans_img), data_format='channels_first')


def data_augmentation(train, copies_per_label=None):
    """Write transformed copies of the training images and index them.

    Every row of ``train`` produces one or more new ``.jpg`` files next to the
    source image (``<stem>_<suffix>.jpg``). Labels listed in
    ``copies_per_label`` get that many copies (suffix 0..n-1); every other
    label gets exactly one copy whose suffix is the row position.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``directory`` column (image path) and a ``label`` column.
    copies_per_label : dict or None
        Number of copies to generate per label. Defaults to
        ``{"LUAD": 2, "MESO": 25}``.

    Returns
    -------
    pandas.DataFrame
        Columns ``directory`` and ``label`` describing only the newly written
        files, with a fresh 0..n-1 index.
    """
    if copies_per_label is None:
        copies_per_label = {"LUAD": 2, "MESO": 25}

    records = []
    for i in range(len(train)):
        row = train.iloc[i]
        src_path = row['directory']
        label = row['label']
        # splitext copes with extensions that are not exactly three characters.
        stem, _ = os.path.splitext(src_path)

        # Exactly one branch per row: previously a LUAD row also fell into the
        # final `else` and produced an extra image labelled "LUSC".
        if label in copies_per_label:
            suffixes = range(copies_per_label[label])
        else:
            suffixes = [i]

        for suffix in suffixes:
            new_path = stem + '_' + str(suffix) + '.jpg'
            _save_augmented_copy(src_path, new_path)
            records.append({"directory": new_path, "label": label})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=["directory", "label"])

In [None]:
# Replace the training index with the index of the augmented copies.
# Note: this rebinds `train`, so running the cell a second time feeds the augmented frame back in.
train = data_augmentation(train)

In [None]:
# Images per class after augmentation.
train.label.value_counts()

In [None]:
# Class proportions after augmentation.
train.label.value_counts(normalize=True)

In [None]:
# Stage the "dev" images as the external test set: copy them into one flat
# folder per class under <cwd>/testing and index the copies.
# Pure Python (no shell mkdir / %cd) so the cell can be re-run and does not
# change the kernel's working directory for the cells that follow.
test_dir = '/kaggle/input/histopathology-dataset/dev/'
cancers = ["LUAD", "LUSC", "MESO"]
labels = list(cancers)

testing_root = os.path.join(os.getcwd(), 'testing')
cat_test_dir = [os.path.join(testing_root, cancer + '_TEST') for cancer in cancers]
luad_test_dir, lusc_test_dir, meso_test_dir = cat_test_dir

test_records = []
for cancer, class_dir in zip(cancers, cat_test_dir):
    os.makedirs(class_dir, exist_ok=True)

    # Source layout is <test_dir>/<class>/<sub-folder>/<image file>.
    for case_dir in glob.iglob(os.path.join(test_dir, cancer, '*')):
        for image_path in glob.iglob(os.path.join(case_dir, '*')):
            shutil.copy(image_path, class_dir)

    for filepath in glob.glob(os.path.join(class_dir, '*')):
        test_records.append({"directory": filepath, "label": cancer})

# Build the frame once instead of concatenating inside the loop.
test_label_df = pd.DataFrame(test_records, columns=["directory", "label"])
test_label_df.head(),test_label_df.tail()

In [None]:
# Show five randomly chosen images per class in a 3x5 grid.
plt.figure(figsize=(12,8))
labels = ["LUAD","LUSC","MESO"]
pos = 1
for label in labels:
    class_index = dir_label_df[dir_label_df["label"] == label].index
    for _ in range(5):
        # choice() draws from the actual row labels of the class, so the last
        # row can be picked too and a class with a single image still works.
        picked = np.random.choice(class_index)
        plt.subplot(3,5,pos)
        pos += 1
        # OpenCV loads images as BGR; matplotlib expects RGB.
        image = cv2.cvtColor(cv2.imread(dir_label_df.loc[picked,"directory"]), cv2.COLOR_BGR2RGB)
        plt.imshow(image)
        # NOTE(review): white titles are not visible on a white figure background — confirm the intended theme.
        plt.title(dir_label_df.loc[picked, "label"], size = 15, color = "white")
        plt.xticks([])
        plt.yticks([])

plt.show()

## Mosaics

Now we want to build mosaics for each of the classes, keeping the class proportions similar to those of our transformed dataset.

In [None]:
# Class proportions of the staged test images.
test_label_df.label.value_counts(normalize=True)

In [None]:
# Class proportions of the current training frame, for comparison with the test proportions.
train.label.value_counts(normalize=True)

In [None]:
# One sub-frame of `train` per class, used as the image pool for that class's mosaics.
luad, lusc, meso = (
    train[train.label == class_name] for class_name in ('LUAD', 'LUSC', 'MESO')
)

In [None]:
def create_mosaic(df, ncol):
    """Build one ``ncol`` x ``ncol`` mosaic from randomly drawn images of ``df``.

    ``ncol**2`` row positions are drawn with ``np.random.choice`` (the same
    image may be drawn more than once). The images are stacked vertically in
    groups of ``ncol`` and the resulting strips are joined side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``directory`` column holds image paths.
    ncol : int
        Number of images per mosaic side.

    Returns
    -------
    numpy.ndarray
        The assembled mosaic.
    """
    # One row of `picks` per vertical strip, consumed in draw order.
    picks = np.random.choice(df.shape[0], ncol**2).reshape(ncol, ncol)

    strips = [
        np.concatenate(
            [img_to_array(load_img(df.directory.iloc[pick])) for pick in strip_picks],
            axis=0,
        )
        for strip_picks in picks
    ]

    return np.concatenate(strips, axis=1)

In [None]:
def get_mosaics(df, n, ncol):
    """Return ``n`` independent mosaics built from ``df`` as a single array.

    Each mosaic comes from one call to ``create_mosaic(df, ncol)``; the results
    are combined with ``np.asarray``.
    """
    return np.asarray([create_mosaic(df, ncol) for _ in range(n)])

In [None]:
# 2000 LUSC mosaics, each a 4x4 grid of randomly drawn LUSC images.
lusc_train = get_mosaics(lusc, 2000, 4)

In [None]:
# 2000 MESO mosaics, each a 4x4 grid of randomly drawn MESO images.
meso_train = get_mosaics(meso, 2000, 4)

In [None]:
# 3000 LUAD mosaics, each a 4x4 grid of randomly drawn LUAD images.
luad_train = get_mosaics(luad, 3000, 4)

In [None]:
# Shapes of the three per-class mosaic arrays (LUAD, LUSC, MESO).
luad_train.shape, lusc_train.shape, meso_train.shape

In [None]:
# Keras generators: the training one rescales by 1/128 and adds random
# rotations, shifts, shear, zoom and flips; the test one only rescales.
train_datagen = ImageDataGenerator(
    rescale=1./128.,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
)
test_datagen = ImageDataGenerator(rescale=1.0/128.)

In [None]:
# Stack the per-class mosaics into one training array.
# The class order used here (LUAD, LUSC, MESO) is the order any label vector has to follow.
X_train = np.concatenate([luad_train, lusc_train, meso_train], axis=0)
X_train.shape

In [None]:
# Integer class ids aligned with the concatenation order of X_train
# (luad_train, lusc_train, meso_train): 0 = LUAD, 1 = LUSC, 2 = MESO.
# Block sizes come from the arrays themselves so the labels cannot drift out
# of step with the number of mosaics generated per class.
y_train = np.concatenate(
    [
        np.zeros(len(luad_train)),
        np.ones(len(lusc_train)),
        np.full(len(meso_train), 2),
    ],
    axis=0,
)
y_train.shape

**TODO:** It is not yet clear how to use the generator code below together with the mosaics.

train_gen = train_datagen.flow_from_dataframe(dataframe = train,
                                              x_col = 'directory', y_col ='label',
                                              target_size = (128,128), batch_size = 32, 
                                              class_mode = 'categorical', shuffle = True)
test_gen = test_datagen.flow_from_dataframe(test,
                                            target_size = (128,128), x_col = 'directory', y_col ='label',
                                             class_mode = 'categorical',
                                            batch_size = 16, shuffle = False)

In [None]:
# Small CNN trained directly on the in-memory mosaics; the input shape is
# taken from X_train, so anything passed to predict() must have the same size.
model = Sequential()
model.add(Conv2D(32,(2, 2), activation = 'relu', input_shape=X_train.shape[1:]))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(64,(3,3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(128,(3,3), activation='relu'))

model.add(Conv2D(256,(2,2), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Dropout(0.5))
model.add(BatchNormalization())

model.add(Flatten())
model.add(Dense(64, activation='relu'))
# (A second Flatten here was a no-op on the already flat tensor and was dropped.)
model.add(Dense(16, activation='relu'))

# Three mutually exclusive classes: softmax over three units.
model.add(Dense(3))
model.add(Activation('softmax'))

# No validation data is passed to fit(), so the checkpoint tracks the training
# loss; the default monitor (val_loss) would never be available.
callbacks = [tf.keras.callbacks.ModelCheckpoint("cleaned_classififier.h5", monitor="loss",
                                                save_best_only=True, verbose = 0)]


# y_train holds integer class ids (0/1/2), which is what the sparse
# categorical loss expects together with a softmax output.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              metrics=['accuracy'])

model.fit(X_train, y_train,batch_size=400, epochs=12, callbacks=callbacks)

In [None]:
# Persist the trained model to an HDF5 file (relative path, so it lands in the current working directory).
model.save('model-mosaic.h5')

In [None]:
# Model outputs for everything yielded by test_gen.
# NOTE(review): test_gen is only assigned in the text above that has no cell marker — confirm it is executed before this cell.
predictions = model.predict(test_gen)

In [None]:
# Index of the highest-scoring output unit for each prediction row.
predicted_classes = np.argmax(predictions, axis=1)

In [None]:
# Which class indices were predicted at least once.
np.unique(predicted_classes)

In [None]:
# Ground-truth class ids and class names as recorded by the generator.
true_classes = test_gen.classes
class_labels = [class_name for class_name in test_gen.class_indices.keys()]

class_labels

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(true_classes, predicted_classes, target_names=class_labels)
print(report)

In [None]:
# Confusion matrix on the held-out split (true classes first, predicted classes second).
conf_mat = confusion_matrix(true_classes, predicted_classes)
print(conf_mat)

In [None]:
# Generator over the staged test images: rescale only, fixed order
# (shuffle=False) so predictions line up with `validation_gen.classes`.
validation_datagen = ImageDataGenerator(rescale=1.0/128.)

validation_gen = validation_datagen.flow_from_dataframe(
    test_label_df,
    x_col='directory',
    y_col='label',
    target_size=(128,128),
    class_mode='categorical',
    batch_size=16,
    shuffle=False,
)

In [None]:
# Number of batches needed to see every validation image once.
# np.ceil is used because the np.math alias is deprecated in recent NumPy releases.
validation_steps_per_epoch = int(np.ceil(validation_gen.samples / validation_gen.batch_size))
# predict()'s second positional parameter is batch_size, so the step count
# has to be passed by keyword.
validation_predictions = model.predict(validation_gen, steps=validation_steps_per_epoch)
validation_predicted_classes = np.argmax(validation_predictions, axis=1)

validation_true_classes = validation_gen.classes
validation_class_labels = list(validation_gen.class_indices.keys())

In [None]:
# Per-class precision / recall / F1 on the staged test images.
validation_report = classification_report(
    validation_true_classes,
    validation_predicted_classes,
    target_names=validation_class_labels,
)
print(validation_report)

In [None]:
# Confusion matrix on the staged test images (true classes first, predicted classes second).
v_conf_mat = confusion_matrix(validation_true_classes, validation_predicted_classes)
print(v_conf_mat)

In [None]:
# Overall accuracy: fraction of images whose predicted class equals the true class.
np.mean(validation_true_classes == validation_predicted_classes)