# CODE FOR DATA AUGMENTATION 

__**Information about Data**__ <br>
The dataset contains 253 brain MRI images in two folders: `yes` and `no`. The `yes` folder contains 155 tumorous brain MRI images, and the `no` folder contains 98 non-tumorous brain MRI images. Dataset source: [Brain MRI Images for Brain Tumor Detection (Kaggle)](https://www.kaggle.com/navoneel/brain-mri-images-for-brain-tumor-detection).

Because the dataset is small, we use data augmentation to create more images. The classes are also imbalanced: about 61 percent of the images (155 of 253) are tumorous. Requesting more augmented samples per image for the minority (non-tumorous) class lets us reduce this imbalance.

**First, we import all of the required modules and define a helper that formats an elapsed time as a string.**

In [8]:
import os
import time
from os import listdir

import cv2
import imutils
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator



In [9]:
# Formating Time String
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m}:{round(s,1)}"

In [10]:
def augment_data(file_dir, n_generated_samples, save_to_dir):
    #from keras.preprocessing.image import ImageDataGenerator
    #from os import listdir
    data_gen = ImageDataGenerator(rotation_range=10, 
                                  width_shift_range=0.1, 
                                  height_shift_range=0.1, 
                                  shear_range=0.1, 
                                  brightness_range=(0.3, 1.0),
                                  horizontal_flip=True, 
                                  vertical_flip=True, 
                                  fill_mode='nearest'
                                 )

    
    for filename in listdir(file_dir):
        # load the image
        image = cv2.imread(file_dir + '\\' + filename)
        # reshape the image
        image = image.reshape((1,)+image.shape)
        # prefix of the names for the generated sampels.
        save_prefix = 'aug_' + filename[:-4]
        # generate 'n_generated_samples' sample images
        i=0
        for batch in data_gen.flow(x=image, batch_size=1, save_to_dir=save_to_dir, 
                                           save_prefix=save_prefix, save_format='jpg'):
            i += 1
            if i > n_generated_samples:
                break

**155 of the images are tumorous and 98 are non-tumorous.
Therefore, to balance the classes, we request 6 new images for each image in the 'yes' class and 9 new images for each image in the 'no' class.**

In [27]:
start_time = time.time()
augmented_data_path = 'Final Augmented Data/'
#tumurous examples
augment_data(file_dir= 'yes', n_generated_samples=6, save_to_dir=augmented_data_path+ 'yes')
#non-tumurous examples
augment_data(file_dir= 'no', n_generated_samples=9, save_to_dir=augmented_data_path+ 'no')
end_time = time.time()
execution_time = (end_time - start_time)
print(f"Elapsed time: {hms_string(execution_time)}")

Elapsed time: 0:2:14.1


In [32]:
def data_summary(main_path):
    yes_path = main_path+'yes'
    no_path = main_path+'no'
    #tumorous (positive) examples
    m_pos = len(listdir(yes_path))
    #non-tumorous (negative) examples
    m_neg = len(listdir(no_path))
    # number of all examples
    m = (m_pos+m_neg)
    pos_pr = (m_pos* 100.0)/ m
    neg_pr = (m_neg* 100.0)/ m
    print(f"Number of examples: {m}")
    print(f"Percentage of positive examples: {pos_pr}%, number of positive  examples: {m_pos}") 
    print(f"Percentage of negative examples: {neg_pr}%, number of negative examples: {m_neg}") 

In [33]:
data_summary(augmented_data_path)

Number of examples: 2065
Percentage of positive examples: 52.54237288135593%, number of positive  examples: 1085
Percentage of negative examples: 47.45762711864407%, number of negative examples: 980
