<a href="https://colab.research.google.com/github/kqu7/Cassava-Leaf-Disease-Classification/blob/main/cassava_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Packages

In [None]:
import os
import shutil
import random

from zipfile import ZipFile 
from google.colab import drive

import numpy as np
import pandas as pd

!pip install tqdm
!pip install -U albumentations

from tqdm.notebook import tqdm

import cv2
from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, ShiftScaleRotate, CenterCrop, Resize
)

Collecting albumentations
[?25l  Downloading https://files.pythonhosted.org/packages/03/58/63fb1d742dc42d9ba2800ea741de1f2bc6bb05548d8724aa84794042eaf2/albumentations-0.5.2-py3-none-any.whl (72kB)
[K     |████████████████████████████████| 81kB 3.5MB/s 
Collecting opencv-python-headless>=4.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/6d/6d/92f377bece9b0ec9c893081dbe073a65b38d7ac12ef572b8f70554d08760/opencv_python_headless-4.5.1.48-cp37-cp37m-manylinux2014_x86_64.whl (37.6MB)
[K     |████████████████████████████████| 37.6MB 1.3MB/s 
[?25hCollecting imgaug>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/66/b1/af3142c4a85cba6da9f4ebb5ff4e21e2616309552caca5e8acefe9840622/imgaug-0.4.0-py2.py3-none-any.whl (948kB)
[K     |████████████████████████████████| 952kB 47.6MB/s 
Installing collected packages: opencv-python-headless, imgaug, albumentations
  Found existing installation: imgaug 0.2.9
    Uninstalling imgaug-0.2.9:
      Successfully uninstalle

# Define Data Loading, Saving Paths, and Constants

In [None]:
# Root of the competition data on the mounted Google Drive.
cassava_dataset_dir_prefix = '/content/gdrive/MyDrive/kaggle-competition-datasets/cassava-leaf-disease-classification/'
train_img_zip_path = cassava_dataset_dir_prefix + "train_images.zip"
# Where the original training images are extracted on the Colab VM.
train_img_dir_path = '/content/train_images/'

# Where the re-sampled training images are written, and the base name of the
# archive built from that directory.
new_train_img_dir_path = '/content/new_train_images/'
output_filename = 'newly_sampled_train'

# Start every run from an empty output directory. The directory that is
# cleared/created must be the one the later cells write to; previously a
# differently named folder ('new_train_imgs') was recreated instead, leaving
# stale files in (or never creating) the real output directory.
shutil.rmtree(new_train_img_dir_path, ignore_errors=True)
os.makedirs(new_train_img_dir_path, exist_ok=True)

# Load Training Images From Drive to Colab

In [None]:
# Make the Drive contents visible under /content/gdrive.
drive.mount('/content/gdrive')

# Extract the training images only when the target directory is not there yet,
# so re-running this cell does not unzip the archive again.
train_images_already_extracted = os.path.isdir(train_img_dir_path)
if not train_images_already_extracted:
  with ZipFile(train_img_zip_path, 'r') as train_img_archive:
    train_img_archive.extractall(path='/content')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Undersampling

In [None]:
# Label of the class that gets under-sampled, and the fraction of it to keep.
MAJORITY_CLASS_LABEL = 3
UNDER_SAMPLED_RATIO = 0.5

# Load the id/label table and expose both columns as NumPy arrays.
# NOTE(review): the CSV is expected to have an 'img_id' column; the stock
# Kaggle file may name it differently -- confirm against the file on Drive.
train_img_df = pd.read_csv(f'{cassava_dataset_dir_prefix}train.csv')
train_img_id, train_img_label = (
    train_img_df[column].to_numpy() for column in ('img_id', 'label')
)

# id -> label map; also used later to detect file-name collisions.
img_label_lookup = {
    known_id: known_label
    for known_id, known_label in zip(train_img_id, train_img_label)
}
train_imgs = []

def read_img_from_path(path):
    """Read the image stored at `path` and return it in RGB channel order.

    Args:
        path: file-system path of the image to load.

    Returns:
        A new array holding the image with its last axis reversed relative to
        what cv2.imread produced (BGR -> RGB).

    Raises:
        OSError: if OpenCV could not load the file (missing or unreadable).
    """
    im_bgr = cv2.imread(path)
    # cv2.imread signals failure by returning None rather than raising; fail
    # loudly here instead of with an opaque TypeError on the slice below.
    if im_bgr is None:
        raise OSError(f"Could not read image (missing or unreadable): {path}")
    # Reverse the channel axis; .copy() materialises the reversed view as an
    # independent array.
    im_rgb = im_bgr[:, :, ::-1].copy()
    return im_rgb

# Select every sample that belongs to the majority class.
majority_sample_loc = np.where(train_img_label == MAJORITY_CLASS_LABEL)
majority_sample_id = train_img_id[majority_sample_loc]
majority_sample_label = train_img_label[majority_sample_loc]

# Keep a random UNDER_SAMPLED_RATIO fraction of them, without replacement.
num_majority_sample = len(majority_sample_id)
undersampled_indices = np.random.choice(
    np.arange(num_majority_sample),
    int(num_majority_sample * UNDER_SAMPLED_RATIO),
    replace=False,
)
under_sampled_img_id = majority_sample_id[undersampled_indices]
under_sampled_img_label = majority_sample_label[undersampled_indices]

# These must be the arrays computed just above (the previous names here were
# never defined and raised NameError).
undersampled_img_dict = {
    'img_id' : under_sampled_img_id,
    'label' : under_sampled_img_label
}

under_sampled_train_df = pd.DataFrame(undersampled_img_dict)

# The kept majority images are unchanged, so copy the files as-is. Decoding to
# RGB and handing that array to cv2.imwrite (which expects BGR) would save
# them with red and blue swapped, and re-encode the JPEG for no benefit.
os.makedirs(new_train_img_dir_path, exist_ok=True)
for img_id in undersampled_img_dict['img_id']:
  shutil.copyfile(train_img_dir_path + img_id, new_train_img_dir_path + img_id)

# Oversampling

In [None]:
# Upper bound passed to random.randint when drawing numeric file names for the
# augmented images.
RANDOM_NUMBER_UPPER_LIMIT = 1_000_000_000

# Augmentation pipeline used to synthesise extra minority-class images. Each
# step fires independently with its own probability `p`, in the order listed.
tranforms = Compose([
    CoarseDropout(max_holes=12, max_height=30, max_width=30, p=0.5),
    GridDistortion(p=0.5),
    ShiftScaleRotate(p=0.8),
    ShiftScaleRotate(scale_limit=[0.1, 0.3], p=0.6),
    VerticalFlip(p=0.7),
    HorizontalFlip(p=0.7),
    RandomBrightnessContrast(p=0.8),
    HueSaturationValue(
        hue_shift_limit=30,
        sat_shift_limit=40,
        val_shift_limit=30,
        p=0.8,
    ),
])

def oversample(minority_set, over_sample_ratio):
  """Create augmented copies of every image in `minority_set`.

  Args:
    minority_set: iterable of images to augment.
    over_sample_ratio: number of augmented copies to generate per input image.

  Returns:
    A list with `over_sample_ratio` augmented images for each input image, in
    input order. The input images themselves are not included.
  """
  over_sampled_set = []
  for img in minority_set:
    for _ in range(over_sample_ratio):
      # albumentations pipelines receive the picture under the `image` keyword
      # and return it under the 'image' key; any other name is not treated as
      # the image target.
      new_img = tranforms(image=img)['image']
      over_sampled_set.append(new_img)
  return over_sampled_set

def append_oversampled_set_to_dataset(over_sampled_set, label, undersampled_df):
  """Save augmented images to disk and append their rows to a dataframe.

  Every image receives a random '<int>.jpg' id that is not yet present in
  `img_label_lookup`, is written into `new_train_img_dir_path`, and is
  registered in `img_label_lookup`.

  Args:
    over_sampled_set: sequence of images to save. Assumed to be 3-channel RGB
      arrays, as produced by read_img_from_path/oversample -- confirm if other
      callers are added.
    label: class label shared by all images in `over_sampled_set`.
    undersampled_df: DataFrame with 'img_id' and 'label' columns to extend.

  Returns:
    A new DataFrame: `undersampled_df` followed by one row per saved image.

  Raises:
    IOError: if an image could not be written.
  """
  os.makedirs(new_train_img_dir_path, exist_ok=True)

  new_ids = []
  for img in over_sampled_set:
    # Draw ids until one is found that no existing or newly created image uses.
    new_id = str(random.randint(0, RANDOM_NUMBER_UPPER_LIMIT)) + '.jpg'
    while new_id in img_label_lookup:
      new_id = str(random.randint(0, RANDOM_NUMBER_UPPER_LIMIT)) + '.jpg'

    # cv2.imwrite interprets 3-channel arrays as BGR, so convert the RGB image
    # back first; otherwise the saved file has red and blue swapped.
    img_write_path = new_train_img_dir_path + new_id
    if not cv2.imwrite(img_write_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR)):
      raise IOError('Failed to write image: ' + img_write_path)

    new_ids.append(new_id)
    img_label_lookup[new_id] = label

  oversampled_img_dict = {
    'img_id' : new_ids,
    'label' : [label] * len(new_ids)
  }
  new_sampled_df = pd.DataFrame(oversampled_img_dict)
  # Concatenate the newly sampled rows after the existing ones. pd.concat is
  # the supported replacement for the deprecated DataFrame.append.
  return pd.concat([undersampled_df, new_sampled_df])

In [None]:
# Generate augmented copies for every image outside the majority class and add
# them to the new dataset. zip() has no length, so tqdm needs `total` to show
# real progress instead of a bare counter.
for img_id, label in tqdm(zip(train_img_id, train_img_label), total=len(train_img_id)):
  if label != MAJORITY_CLASS_LABEL:
    img = read_img_from_path(train_img_dir_path + img_id)
    # Label 0 gets 4 augmented copies per image; every other minority label 2.
    over_sampled_set = oversample([img], 4 if label == 0 else 2)
    under_sampled_train_df = append_oversampled_set_to_dataset(over_sampled_set, label, under_sampled_train_df)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
# Sanity check: count the entries now present in the new training-image directory.
!ls /content/new_train_images | wc -l

25231


# Copy New Dataset Back to Drive to Download

In [None]:
FORMAT = 'zip'
# Pack the re-sampled images; make_archive returns the path of the file it wrote.
archive_path = shutil.make_archive(output_filename, FORMAT, new_train_img_dir_path)

# Shuffle the rows so the classes are interleaved, then save the label table.
under_sampled_train_df = under_sampled_train_df.sample(frac = 1).reset_index(drop=True)
new_train_csv_path = 'new_train.csv'
under_sampled_train_df.to_csv(new_train_csv_path)

# Copy both the images and their labels to Drive. The archive alone is not a
# usable dataset, and the Colab VM's local disk is discarded with the runtime.
shutil.copy(archive_path, cassava_dataset_dir_prefix)
shutil.copy(new_train_csv_path, cassava_dataset_dir_prefix)

/content/gdrive/MyDrive/kaggle-competition-datasets/cassava-leaf-disease-classification/


'/content/newly_sampled_train.zip'