## Exploratory Data Analysis and Data Pre-processing

#### In this notebook, basic data understanding and data preparation are carried out, following the second and third phases of the CRISP-DM process.

In [None]:
# Standard library
import os
import random
import shutil

# Packages for EDA
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# For data Augmentation
from scipy import ndarray
import skimage as sk
from skimage import io  # explicit import so that sk.io.imread / sk.io.imsave are available
from skimage import transform
from skimage import util

Set the dataset path to the folder of cropped images.

In [None]:
# Root folder of the cropped cell images; its sub-folders are listed as categories below.
# NOTE(review): this is a machine-specific absolute path — adjust it before running elsewhere.
dataset_path = r"E:\Class_Notes_Sem2\ADM\Project\malaria-bounding-boxes\malaria\Cropped_images"
# Make the dataset folder the working directory: later cells use paths that are
# relative to it (e.g. "../Processed_Images").
os.chdir(dataset_path)

### BASIC EXPLORATORY DATA ANALYSIS
* Get the counts for each of the category
* Plot the barchart

In [None]:
# Summary table with one row per category folder:
#   index -> folder name with spaces replaced by underscores
#   Path  -> full path of the category folder
#   Size  -> number of entries found in that folder
df = pd.DataFrame(columns = ['Path', 'Size'])
for folders in os.listdir(dataset_path):
    category_dir = os.path.join(dataset_path, folders)
    # Only folders are categories; a stray file in the dataset root would make
    # os.listdir() fail, so skip anything that is not a directory.
    if not os.path.isdir(category_dir):
        continue
    # Count through the full path so the result does not depend on the
    # current working directory.
    df.loc[folders.replace(' ','_')] = [category_dir, len(os.listdir(category_dir))]

In [None]:
# Display the per-category summary table (folder path and number of samples).
df

In [None]:
# Bar chart of the number of samples per category.
# Title and axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(df.index.values, height=df['Size'])
ax.set_title("Number of cropped images per category")
ax.set_xlabel("Category")
ax.set_ylabel("Number of images")
plt.show()

## The dataset is heavily imbalanced

### Techniques followed to handle data imbalance:
* A threshold of **2000** images per class is used to decide whether a class is upsampled or downsampled.
* For each class to be downsampled, 2000 images are chosen at random.
* For each class to be upsampled, about 2000 augmented images are generated. The following basic data augmentation techniques:
 * Random Rotation
 * Horizontal Flip
 * Random noise
 will be applied. `scikit-image` has been used as the library to do the augmentation. [Tutorial](https://medium.com/@thimblot/data-augmentation-boost-your-image-dataset-with-few-lines-of-python-155c2dc1baec)
* There are total six classes:
 * gametocyte
 * red_blood_cell
 * ring
 * schizont
 * trophozoite
 * difficult

* **difficult** is the category of cells that could not be classified by the human annotators, as mentioned in the data source [description](https://data.broadinstitute.org/bbbc/BBBC041/); hence we will be omitting the `difficult` category from our dataset.

In [None]:
# Partition the categories around the 2000-sample threshold:
# more than 2000 samples -> downsample, 2000 or fewer -> upsample.
sample_counts = df['Size']
classes_to_downsample = df.index[(sample_counts > 2000).values].values
classes_to_upsample = df.index[(sample_counts <= 2000).values].values
summary_template = "Classes going to be downsampled {} and classes going to upsampled are {}"
print(summary_template.format(classes_to_downsample, classes_to_upsample))

In [None]:
def random_rotation(image_array: ndarray, max_degrees: float = 50):
    """Rotate an image by a randomly chosen angle.

    Parameters
    ----------
    image_array : ndarray
        Image to rotate.
    max_degrees : float, optional
        The angle is drawn uniformly from ``[-max_degrees, +max_degrees]``
        degrees. Defaults to 50, the range that used to be hard-coded.

    Returns
    -------
    The rotated image, exactly as returned by ``skimage.transform.rotate``.
    """
    random_degree = random.uniform(-max_degrees, max_degrees)
    return sk.transform.rotate(image_array, random_degree)

def random_noise(image_array: ndarray):
    """Return the result of ``skimage.util.random_noise`` applied to the image with its default settings."""
    noisy_image = sk.util.random_noise(image_array)
    return noisy_image

def horizontal_flip(image_array: ndarray):
    """Mirror the image left-to-right by reversing the order of its second axis (the columns)."""
    reversed_columns = slice(None, None, -1)
    return image_array[:, reversed_columns]

In [None]:
# Folder containing one sub-folder of cropped images per category
# (the same location as `dataset_path`, set near the top of the notebook)
cropped_images_path = dataset_path
# Number of augmented images to generate for each upsampled category
num_files_desired = 2000
# Prefix of the generated files; used to keep images written by an earlier run
# of this cell out of the pool of source images
augmented_prefix = 'augmented_image_'
# Transformations that may be applied to a source image
available_transformations = {
    'rotate': random_rotation,
    'noise': random_noise,
    'horizontal_flip': horizontal_flip
}

# For every class to upsample run the following set of operations
for categories in classes_to_upsample:
    # The "difficult" category is left out of the final dataset, so skip it
    if categories == "difficult":
        continue
    # Full path of the folder holding this category's images
    path = os.path.join(cropped_images_path, categories)
    # Source images: regular files only, excluding previously generated ones
    images = [os.path.join(path, f) for f in os.listdir(path)
              if os.path.isfile(os.path.join(path, f)) and not f.startswith(augmented_prefix)]
    # random.choice() raises IndexError on an empty list
    if not images:
        print("No source images found in {}, skipping".format(path))
        continue

    # Generate exactly `num_files_desired` images (file indexes 0 .. num_files_desired - 1)
    for num_generated_files in range(num_files_desired):
        # Pick a random source image and read it as an array of pixels
        image_path = random.choice(images)
        transformed_image = sk.io.imread(image_path)

        # A random number of transformations is applied to the image. Each one
        # is applied to the result of the previous one, so they accumulate
        # (the same transformation may be picked more than once).
        num_transformations_to_apply = random.randint(1, len(available_transformations))
        for _ in range(num_transformations_to_apply):
            key = random.choice(list(available_transformations))
            transformed_image = available_transformations[key](transformed_image)

        # Save one file per generated image. The image is converted to 8-bit
        # first because the rotate / noise helpers may hand back floating-point
        # data, which is not a suitable pixel type for a JPEG file.
        new_file_path = '%s/augmented_image_%s.jpg' % (path, num_generated_files)
        sk.io.imsave(new_file_path, sk.util.img_as_ubyte(transformed_image))

* The final set of images will be stored in a different location, the `Processed_Images` folder.

In [None]:
# Create the "Processed_Images" folder (one level above the current working
# directory) and, inside it, one folder per category. The "difficult"
# category is left out of the processed dataset.
folders_to_create = ["../Processed_Images"]
for categories in df.index.values:
    if categories == 'difficult':
        continue
    folders_to_create.append("../Processed_Images/{}".format(categories))

for folder in folders_to_create:
    try:
        # exist_ok=True keeps the cell re-runnable: a folder that already
        # exists is not an error. Other failures are still reported.
        os.makedirs(folder, exist_ok=True)
    except OSError as e:
        print(e)

#### Copy 2000 images from each category to the `Processed_Images` folder at random

In [None]:
# Number of images to keep per category in the balanced dataset
images_per_category = 2000

for categories in df.index.values:
    # The "difficult" category is left out of the processed dataset
    if categories == 'difficult':
        continue
    # Getting the file names
    source_dir = df.loc[categories, 'Path']
    files = os.listdir(source_dir)
    if not files:
        print("No images found for {}, skipping".format(categories))
        continue
    # np.random.choice(..., replace=False) raises ValueError when asked for more
    # items than are available, so never request more than the folder contains.
    sample_size = min(images_per_category, len(files))
    if sample_size < images_per_category:
        print("Only {} images available for {}".format(len(files), categories))
    # Random sampling of the file indexes, without repetition
    sample_idx = np.random.choice(len(files), size=sample_size, replace=False)
    for index in sample_idx:
        try:
            shutil.copy(os.path.join(source_dir, files[index]), "../Processed_Images/{}".format(categories))
        except OSError as e:
            # Report the failed copy and carry on with the remaining files
            print("Error occured as {}".format(e))

### Data distribution after the data augmentation and integration

In [None]:
# Count the entries of every folder found under "../Processed_Images".
# Rows are labelled with the folder name (spaces replaced by underscores).
processed_root = "../Processed_Images"
df2 = pd.DataFrame(columns = ['Size'])
for entry in os.listdir(processed_root):
    row_label = entry.replace(' ','_')
    entry_count = len(os.listdir(os.path.join(processed_root, entry)))
    df2.loc[row_label] = [entry_count]

In [None]:
# Bar chart of the number of images per category in "Processed_Images".
# Title and axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(df2.index.values, height=df2['Size'])
ax.set_title("Number of images per category after balancing")
ax.set_xlabel("Category")
ax.set_ylabel("Number of images")
plt.show()

### Data Preparation
* We will divide the data into three different parts.
 * **`train`**: This will contain 75% of the data, this data will be exposed to the model every epoch for the model to learn.
 * **`valid`**: This will contain 10% of the data, this data will be used to evaluate model's performance every epoch on unseen data. This will help us validate if the model is overfitting.
 * **`test`** : This set will contain the remaining 15% of the data, which will be used to compute the accuracy metrics of the model after it is fully trained.
* We are making these datasets inside the `Processed_Images` folder itself. 

In [None]:
# Create train, test and valid datasets
processed_root = "../Processed_Images"
# (split name, start fraction, end fraction): 75% train, next 10% valid, last 15% test
split_bounds = [('train', 0.0, 0.75), ('valid', 0.75, 0.85), ('test', 0.85, 1.0)]
split_names = [split_name for split_name, _, _ in split_bounds]

# Create one folder per category inside every split folder.
# makedirs(exist_ok=True) also creates the split folder itself and makes the
# cell re-runnable without reporting folders that already exist.
for dataset_splits in split_names:
    for categories in df2.index.values:
        try:
            os.makedirs("../Processed_Images/{}/{}".format(dataset_splits, categories), exist_ok=True)
        except OSError as e:
            print("Error occured as {}".format(e))

# Walk through each of the category and copy data to respective dataset
for folders in os.listdir(processed_root):
    category_dir = os.path.join(processed_root, folders)
    # Skip the split folders themselves and anything that is not a folder
    if folders in split_names or not os.path.isdir(category_dir):
        continue
    files = os.listdir(category_dir)
    if not files:
        continue

    # Shuffle the files so that every split is a random sample of the category
    # rather than a slice of the directory-listing order.
    # NOTE(review): augmented variants of one source image can still end up in
    # different splits — confirm whether that is acceptable for evaluation.
    sample_idx = np.random.choice(len(files), size=len(files), replace=False)
    shuffled_files = [files[index] for index in sample_idx]
    num_files = len(shuffled_files)

    for split_name, start_fraction, end_fraction in split_bounds:
        destination = "../Processed_Images/{}/{}".format(split_name, folders)
        try:
            # Without an existing destination folder shutil.copy would write
            # every image to a single file with that name.
            os.makedirs(destination, exist_ok=True)
        except OSError as e:
            print("Error occured as {}".format(e))
            continue
        first = int(num_files * start_fraction)
        last = int(num_files * end_fraction)
        for file in shuffled_files[first:last]:
            try:
                shutil.copy(os.path.join("../Processed_Images/{}".format(folders), file), destination)
            except OSError as e:
                print("Error occured as {}".format(e))


### Data distribution in train, valid and test datasets

In [None]:
# One row per split and one column per category: the number of entries in
# each "../Processed_Images/<split>/<category>" folder.
category_names = df2.index.values
df3 = pd.DataFrame(columns=category_names)
for split_name in ['train', 'test', 'valid']:
    counts = []
    for category in category_names:
        split_category_dir = os.path.join("../Processed_Images", split_name, category)
        counts.append(len(os.listdir(split_category_dir)))
    df3.loc[split_name] = counts

In [None]:
# Display the number of images per category (columns) in each split (rows).
df3