# Loading in Our Data

In [1]:
pip install pymatreader 

Note: you may need to restart the kernel to use updated packages.


In [27]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [66]:
import os
import numpy as np 
from pymatreader import read_mat
import pandas as pd
import tensorflow as tf
from scipy.signal import convolve2d
from matplotlib import pyplot as plt
from tensorflow.keras import layers, models

In [58]:
#path = "/Users/wendytran/Documents/GitHub/BrainTumorClassification/"
path = "/Users/jacobmejia/Documents/GitHub/BrainTumorClassification/"

In [59]:

folders = ["brainTumorDataPublic_15332298/", "brainTumorDataPublic_22993064/", "brainTumorDataPublic_7671532/", "brainTumorDataPublic_1766/"]

In [60]:
print(os.path.exists("/Users/jacobmejia/Documents/GitHub/BrainTumorClassification/brainTumorDataPublic_15332298/1915.mat"))

True


In [61]:
def load_mat_files(path, folders):
    """
    Loads in mat_files of brain MRI images from a list of folders as a dataframe 
    """
    # create the empty df to store data in 
    df = pd.DataFrame(columns=["label", "PID","image", "tumorBorder", "tumorMask"])
    # for each folder
    for folder in folders:
        # get the path to that particular folder
        path_to_folder = path + folder
        
        # get each file in that folder
        for filename in os.listdir(path_to_folder):
            # load in mat file 
            data = read_mat(os.path.join(folder, filename))
            # add to df
            df.loc[len(df.index)] = [data["cjdata"]["label"], data["cjdata"]["PID"], data["cjdata"]["image"],
                                     data["cjdata"]["tumorBorder"], data["cjdata"]["tumorMask"]] 
            
    return df

In [62]:
mri_df = load_mat_files(path, folders)

OSError: The file brainTumorDataPublic_15332298/1915.mat does not exist.

In [63]:
mri_df

Unnamed: 0,label,PID,image,tumorBorder,tumorMask


https://figshare.com/articles/dataset/brain_tumor_dataset/1512427

This brain tumor dataset containing 3064 T1-weighted contrast-inhanced images
from 233 patients with three kinds of brain tumor: meningioma (708 slices), 
glioma (1426 slices), and pituitary tumor (930 slices). Due to the file size
limit of repository, we split the whole dataset into 4 subsets, and achive 
them in 4 .zip files with each .zip file containing 766 slices.The 5-fold
cross-validation indices are also provided.

-----
This data is organized in matlab data format (.mat file). Each file stores a struct
containing the following fields for an image:

- `cjdata.label`: 1 for meningioma, 2 for glioma, 3 for pituitary tumor
- `cjdata.PID`: patient ID
- `cjdata.image`: image data
- `cjdata.tumorBorder`: a vector storing the coordinates of discrete points on tumor border.
    - For example, [x1, y1, x2, y2,...] in which x1, y1 are planar coordinates on tumor border. It was generated by manually delineating the tumor border. So we can use it to generate binary image of tumor mask.
- `cjdata.tumorMask`: a binary image with 1s indicating tumor region

-----
This data was used in the following paper:
1. Cheng, Jun, et al. "Enhanced Performance of Brain Tumor Classification via Tumor Region Augmentation
and Partition." PloS one 10.10 (2015).
2. Cheng, Jun, et al. "Retrieval of Brain Tumors by Adaptive Spatial Pooling and Fisher Vector 
Representation." PloS one 11.6 (2016). Matlab source codes are available on github 
https://github.com/chengjun583/brainTumorRetrieval

-----
Jun Cheng
School of Biomedical Engineering
Southern Medical University, Guangzhou, China
Email: chengjun583@qq.com

In [None]:
label_encoder = {
    1.0: "Meningioma Tumor",
    2.0: "Glioma Tumor",
    3.0: "Pituitary Tumor"
}

In [None]:
# /Users/wendytran/Documents/GitHub/BrainTumorClassification/

from matplotlib import pyplot as plt
fig, ax = plt.subplots(1, 2)
ax[0].imshow(data["image"])
ax[1].imshow(data["tumorMask"])
fig.suptitle("Label: " + label_encoder[data["label"]])

# Training Set vs Testing Set

In [None]:
from sklearn.model_selection import train_test_split

# take a random 30% of data
np.random.seed(1234)
train, test = train_test_split(mri_df, test_size = 0.3) # hold out 30% of data
train.shape, test.shape

In [None]:
train

# Exploratory Analysis

Brain MRI images are given in 3 different planes: axial, sagittal and coronal.

In [None]:
def plot(df):
    plt.figure(figsize=(8, 8))
    for i in range(9):
        plt.subplot(3,3,i+1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(df.iloc[i]["image"])
        plt.title("Label: " + str(df.iloc[0]["label"]))

train.groupby(["label"]).apply(plot)

In [None]:
plt.figure(figsize=(10,10))
for i in range(25):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train.iloc[i]["image"])

In [None]:
plt.figure(figsize=(15,15))
for i in range(25):
    plt.subplot(5,5,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train.iloc[i]["tumorMask"])
    plt.title("Label: " + label_encoder[data["label"]])

# Data Cleaning

In [None]:
from sklearn import preprocessing 

def prep_mri_data(data_df):
    df = data_df.copy()
    
    # don't need patient ID
    df = df.drop(["PID"], axis = 1) # axis=1 allows us to drop the column rather than row
    
    # split into predictor (X) and target(Y)
    X = df[["image"]]
    y = df[["label", "tumorMask"]]
    
    return (X, y)

In [None]:
X_train, y_train = prep_mri_data(train)
X_test, y_test = prep_mri_data(test)

In [None]:
X_train

In [None]:
y_train

# Building Initial Model

In [67]:
# need to change input shape

model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(3) # number of classes
])

In [68]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 30, 30, 32)        896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 15, 15, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 13, 13, 32)        9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 6, 6, 32)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 4, 4, 64)          18496     
_________________________________________________________________
flatten (Flatten)            (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 64)                6

In [69]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(X_train, 
                    y_train, 
                    epochs=10, 
                    steps_per_epoch = 100,
                    validation_data=(X_test, y_test))

NameError: name 'X_train' is not defined

In [70]:
plt.plot(history.history["accuracy"], label = "training")
plt.plot(history.history["val_accuracy"], label = "validation")
plt.gca().set(xlabel = "epoch", ylabel = "accuracy")
plt.legend()

NameError: name 'history' is not defined