In [1]:
from google.colab import drive
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# mount google drive to access data
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# define paths
# dataset: https://www.kaggle.com/navoneel/brain-mri-images-for-brain-tumor-detection
path = "/content/drive/My Drive/01 - Courses/04 - Machine Learning Engineer Nanodegree/brain_tumor_detection/data/dataset"

In [5]:
# define parameters
img_height = 128
img_width = 128
img_size = (img_height, img_width)
n_augmented_images = 12

In [6]:
# define augmentation layer (https://neptune.ai/blog/data-augmentation-in-python)
data_augmentation = tf.keras.Sequential([
     layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical"),
     layers.experimental.preprocessing.RandomRotation(0.2),
     layers.experimental.preprocessing.RandomZoom(0.1)])

In [7]:
def augment_image(image, n_augmented_images):
  '''
  Returns a list of augmented images for the given input image
  Arguments:
  image (array) - input image
  number_of_images (int) - number of augmented images to return
  Returns:
  images (list) - list of augmented images
  '''

  image = tf.expand_dims(image, 0)
  images = []

  for i in range(n_augmented_images):
    augmented_image = data_augmentation(image)
    images.append(np.array(augmented_image[0]).flatten())

  return images

In [8]:
def preprocess_data(path, img_size, n_augmented_images):
  '''
  Reads in images classified into folders, resizes and scales them. Returns 
  those processed images as features and their associated labels as well.
  Arguments:
    path (str) - path to classified image folders
    img_size (tuple) - tuple containing resized image height and width
  Returns:
    X (array) - features (brain scan images)
    y (array) - feature labels (0 - no tumor, 1 - tumor)
  '''

  unsuccessful_files = {}

  X = []
  y = []

  for folder_name in os.listdir(path):
    if folder_name == 'no':
      label = 0
    else:
      label = 1
    folder_path = os.path.join(path, folder_name)
  
    for fname in os.listdir(folder_path):
      fpath = os.path.join(folder_path, fname)
      try:
        img = cv2.imread(fpath)
        img = cv2.resize(img, img_size)
        img = img / 255.0
        X.append(img.flatten())
        y.append(label)
        X += augment_image(img, n_augmented_images)
        y += [label] * n_augmented_images

      except Exception as e:
        unsuccessful_files[fname] = e

  if unsuccessful_files:
    print(f'Error processing the following files:\n')
    for index, key in enumerate(unsuccessful_files, 1):
      print(f'{index}. {key} - {unsuccessful_files[key]}')
  else:
    print('Successfully processed all images.')

  X = np.array(X)
  y = np.array(y)

  return X, y

In [9]:
# obtain features and labels
X, y = preprocess_data(path, img_size, n_augmented_images)

Successfully processed all images.


In [10]:
print(f'After augmentation, our dataset now has {len(X)} samples.')

After augmentation, our dataset now has 3289 samples.


In [11]:
# split data into train, validation and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
from sklearn import svm

# Create a classifier: a support vector classifier
clf = svm.SVC(gamma=0.001)

# Learn the digits on the train subset
clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
# make predictions on the test set
y_pred = clf.predict(X_test)

In [14]:
# classifiation report
from sklearn.metrics import classification_report , confusion_matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.64      0.73       352
           1       0.77      0.90      0.83       471

    accuracy                           0.79       823
   macro avg       0.80      0.77      0.78       823
weighted avg       0.80      0.79      0.79       823



In [15]:
# confusion matrix
actual_labels = ['no_tumor', 'tumor']
pred_labels = ['predicted_no_tumor', 'predicted_tumor']
matrix = confusion_matrix(y_test, y_pred)
matrix_df = pd.DataFrame(matrix, index=actual_labels, columns=pred_labels)
matrix_df

Unnamed: 0,predicted_no_tumor,predicted_tumor
no_tumor,227,125
tumor,47,424
