In [1]:
import pandas as pd 
import numpy as np
import os
from PIL import Image

from keras.preprocessing.image import load_img
from keras.preprocessing.image import array_to_img
from keras.preprocessing.image import img_to_array


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, BatchNormalization
from keras import * 

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import torchvision
import torchvision.transforms as transforms

# Bronchus and Lung Cancer Classification

By: Kanika Chopra

## Import Data

Each of the three cancer classes has its own directory. Let's create a DataFrame that stores the image file paths and their class labels, one for the training data and one for the testing data.

In this case, LUAD is class 0, LUSC is class 1 and MESO is class 2.

In [130]:
def get_img_fps(fp):
    """Collect the paths of every entry found one level below ``fp``.

    ``fp`` is expected to contain sub-folders; each entry inside each
    sub-folder is returned as ``<fp>/<folder>/<entry>``.

    Parameters
    ----------
    fp : str
        Directory to scan. A trailing path separator is optional.

    Returns
    -------
    list of str
        Paths sorted by folder name and then by entry name, so the result
        does not depend on the arbitrary order of ``os.listdir``.
    """
    fps = []
    for folder in sorted(os.listdir(fp)):
        folder_path = os.path.join(fp, folder)
        # Stray files next to the sub-folders (e.g. .DS_Store) cannot be
        # listed and would raise NotADirectoryError, so skip them.
        if not os.path.isdir(folder_path):
            continue
        for filename in sorted(os.listdir(folder_path)):
            fps.append(os.path.join(folder_path, filename))

    return fps

In [131]:
def get_data(fp):
    """Build a DataFrame of image file paths and integer class labels.

    Images are looked up under ``<fp>/LUAD/``, ``<fp>/LUSC/`` and
    ``<fp>/MESO/`` via ``get_img_fps`` and labelled 0, 1 and 2 respectively.

    Parameters
    ----------
    fp : str
        Root directory of one data split (no trailing slash), e.g. ``'data/train'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``filepath`` (str) and ``label`` (int), with the row at
        index 0 removed; the remaining rows keep their index (1, 2, ...).
    """
    # Class folder name -> integer label, in the order the rows are stacked.
    class_labels = (('LUAD', 0), ('LUSC', 1), ('MESO', 2))

    filepaths = []
    labels = []
    for class_name, label in class_labels:
        class_fps = get_img_fps(fp + '/' + class_name + '/')
        filepaths.extend(class_fps)
        labels.extend([label] * len(class_fps))

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0.
    df = pd.DataFrame({
        'filepath': filepaths,
        'label': np.asarray(labels, dtype=int),
    })

    # The first row has always been discarded here, and the row counts shown
    # further down depend on it, so it is kept. errors='ignore' keeps an empty
    # split from raising KeyError.
    # NOTE(review): the reason for dropping row 0 is not visible in this
    # notebook - confirm it is intentional and not silently losing an image.
    return df.drop(index=0, errors='ignore')

#### Training Data

First, we'll get the training data.

In [132]:
# Index the training images under data/train (file path + integer class label).
train_df = get_data('data/train')

In [133]:
train_df.head()

Unnamed: 0,filepath,label
1,data/train/LUAD/TCGA-49-4494-01Z-00-DX2.cac5ed...,0
2,data/train/LUAD/TCGA-49-4494-01Z-00-DX2.cac5ed...,0
3,data/train/LUAD/TCGA-49-4494-01Z-00-DX2.cac5ed...,0
4,data/train/LUAD/TCGA-49-4494-01Z-00-DX2.cac5ed...,0
5,data/train/LUAD/TCGA-49-4494-01Z-00-DX2.cac5ed...,0


In [136]:
train_df.dtypes

filepath    object
label        int64
dtype: object

In [137]:
train_df.label.value_counts()

1    16443
0     5557
2      403
Name: label, dtype: int64

In [138]:
train_df.label.value_counts(normalize=True)

1    0.733964
0    0.248047
2    0.017989
Name: label, dtype: float64

We can see that our training data is very imbalanced: roughly 73% of the images are of the LUSC type, while fewer than 2% are MESO. Hence, this will require some preprocessing. Next, let's get our testing dataset as well.

#### Testing Data

In [139]:
# Index the images under data/dev, which this notebook uses as its test set.
test_df = get_data('data/dev')

In [140]:
test_df.head()

Unnamed: 0,filepath,label
1,data/dev/LUAD/TCGA-86-7955-01Z-00-DX1.ef4f4d94...,0
2,data/dev/LUAD/TCGA-86-7955-01Z-00-DX1.ef4f4d94...,0
3,data/dev/LUAD/TCGA-86-7955-01Z-00-DX1.ef4f4d94...,0
4,data/dev/LUAD/TCGA-86-7955-01Z-00-DX1.ef4f4d94...,0
5,data/dev/LUAD/TCGA-86-7955-01Z-00-DX1.ef4f4d94...,0


In [141]:
test_df.label.value_counts()

1    4492
0    3213
2     495
Name: label, dtype: int64

In [142]:
test_df.label.value_counts(normalize=True)

1    0.547805
0    0.391829
2    0.060366
Name: label, dtype: float64

We can see that our testing set also has imbalanced data; however, the dataset is not imbalanced in the same way as our training dataset.

---

In [145]:
def get_images(df, size=(32, 32)):
    """Load every image listed in ``df`` and stack them into one array.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``filepath`` column holding image paths readable by
        ``load_img``.
    size : tuple of int, optional
        Target size passed straight to the loaded image's ``resize``.
        Defaults to ``(32, 32)``.

    Returns
    -------
    numpy.ndarray
        One ``img_to_array`` result per row, stacked along a new first axis
        in row order. An empty ``df`` gives an empty array of shape ``(0,)``.
    """
    arrays = []
    for filepath in df['filepath']:
        # Only the resized array is kept; holding on to every full-size
        # image as well would grow memory with the size of the dataset.
        img = load_img(filepath)
        arrays.append(img_to_array(img.resize(size)))

    return np.asarray(arrays)

In [144]:
# Pixel arrays and matching integer labels for the training split.
X_train = get_images(train_df)
y_train = train_df['label'].to_numpy()

In [147]:
X_train.shape, y_train.shape

(22403, 32, 32, 3)

In [146]:
# Pixel arrays and matching integer labels for the test split.
X_test = get_images(test_df)
y_test = test_df['label'].to_numpy()

In [148]:
X_test.shape, y_test.shape

(8200, 32, 32, 3)

---
## Data Preprocessing

---
## Building the Model

A learning rate of 1e-3 did not work well, so the model below is trained with 1e-4.

In [162]:
# Small CNN: convolution / pooling stages followed by a dense head,
# declared as a single layer list.
#
# NOTE(review): the labels produced by get_data take three values (0, 1, 2),
# but the head below is a single sigmoid unit trained with binary
# cross-entropy, so class 2 can never be predicted. A three-class head
# (Dense(3) + softmax with sparse_categorical_crossentropy) would also need
# the prediction cells to use argmax instead of a 0.5 threshold, so it is
# left unchanged here.
model = Sequential([
    Conv2D(32, (2, 2), activation='relu', input_shape=X_train.shape[1:]),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(128, (3, 3), activation='relu'),
    Conv2D(256, (2, 2), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Dropout(0.5),
    BatchNormalization(),

    Flatten(),
    Dense(64, activation='relu'),
    # Second Flatten receives already-flat input; kept so the layer stack is unchanged.
    Flatten(),
    Dense(16, activation='relu'),

    Dense(1),
    Activation('sigmoid'),
])

model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

# Last expression of the cell, so the returned History object is displayed.
model.fit(X_train, y_train, batch_size=400, epochs=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7ff3ef0ac5e0>

In [163]:
# Raw model outputs for the test images (the model ends in a single sigmoid unit).
y_pred = model.predict(X_test)

In [175]:
# Turn the scores into hard 0/1 predictions in place, cutting at 0.5.
# Both masks are taken from the untouched scores before anything is overwritten.
below_cut, at_or_above_cut = y_pred < 0.5, y_pred >= 0.5
y_pred[below_cut] = 0
y_pred[at_or_above_cut] = 1

In [176]:
# Test accuracy: share of test images whose thresholded prediction equals the label.
(y_pred.ravel() == y_test).sum() / len(y_test)

0.6169512195121951

In [177]:
# Save the trained baseline; it was fit on the images without any preprocessing.
model.save('no_cleaning.h5')

With no preprocessing, the model reaches about 61.7% accuracy on the test set and about 80% accuracy on the training set.