# Description
This notebook builds a small utility to counter class imbalance. Classes with too few training images are topped up with Gaussian-blurred copies of their existing images, bringing the dataset to ~38k samples overall.

In [96]:
from __future__ import print_function
import pandas as pd
from shutil import copyfile
from shutil import rmtree
import matplotlib.pyplot as plt
import os
import cv2
import numpy as np
%matplotlib inline

# Creating directory structure

In [97]:
# Source folder holding the raw training images and the root of the custom split.
src = "data/train_set/"
dst = "data/custom_split/"

# <---- creating custom destination directories
# The empty suffix creates the split root itself; the other two create its
# train/ and validation/ sub-folders. exist_ok makes the cell safe to re-run.
for split_suffix in ('', 'train', 'validation'):
    os.makedirs(dst + split_suffix, exist_ok=True)

In [98]:
# Label table for the training images (columns: img_name, label).
labels_csv = "data/train_labels.csv"
train_df = pd.read_csv(labels_csv)

In [99]:
# Quick look at the label column: number of distinct labels, the labels
# themselves, and the first rows of the table.
unique_labels = train_df['label'].unique()
print("Unique labels:", len(unique_labels))
print(unique_labels)
print(train_df.head())

Unique labels: 80
[21 29 17 50  8 42 68 10  9 39 11 44 49  5 65 61 56 43 54 40 23 64 58 33
 45  3 12 46 59 77 14 13 18 69 47 63 52 66 80 30 55 53 19 67 78 24  2 25
 37 57 36  6  1 72 27 60 31 48 74 51 15 26 75 62  4 22 41 79 34 73  7 71
 35 38 76 28 70 16 32 20]
      img_name  label
0  train_1.jpg     21
1  train_2.jpg     29
2  train_3.jpg     17
3  train_4.jpg     21
4  train_5.jpg     50


In [100]:
# Empty description frames, one per split. The train/validation frames are
# filled and written to CSV by the export cell further down.
output_train, output_validation, output_test = (pd.DataFrame() for _ in range(3))

# <------------------- edit here first n of unique labels
label_limit = 80
first_n_labels = train_df['label'].unique()[:label_limit]
print("SELECTED UNIQUE:", first_n_labels, len(first_n_labels))

SELECTED UNIQUE: [21 29 17 50  8 42 68 10  9 39 11 44 49  5 65 61 56 43 54 40 23 64 58 33
 45  3 12 46 59 77 14 13 18 69 47 63 52 66 80 30 55 53 19 67 78 24  2 25
 37 57 36  6  1 72 27 60 31 48 74 51 15 26 75 62  4 22 41 79 34 73  7 71
 35 38 76 28 70 16 32 20] 80


In [101]:
##### WARNING ######
##### CLEAN ENV #####
# Destructive: wipes the train/ and validation/ split folders and the two
# description CSVs under `dst`, then recreates the (empty) split folders.
# Every removal is guarded so the cell also works on a fresh checkout, where
# the CSVs (written by a later cell) and possibly the folders do not exist yet.
# NOTE(review): the test/ tree created by the split cell is not cleaned here,
# matching the previous behaviour.

for split_dir in (dst+'train', dst+'validation'):
    if os.path.isdir(split_dir):
        rmtree(split_dir)
    os.makedirs(split_dir, exist_ok=True)

for desc_csv in (dst+"train_desc.csv", dst+"validation_desc.csv"):
    if os.path.exists(desc_csv):
        os.remove(desc_csv)

##### WARNING ######
##### CLEAN ENV #####

# Data Manipulation
Separate the data into a PyTorch-style directory structure (one folder per label), then top up under-represented classes with blurred copies of their images.

In [102]:
# Per-label split of the raw images into dst/validation/<label>/ and
# dst/train/<label>/. For every selected label, the first VAL_PER_LABEL rows
# (in train_df order) go to validation and the next TRAIN_PER_LABEL rows go to
# train; any further rows of that label are left out. The copied file names
# and labels are collected in the four lists below for the description CSVs.
TRAIN_PER_LABEL = 450 # <--------------------------- EDIT HERE NUM OF TRAIN SAMPLES PER LABEL
VAL_PER_LABEL = 40 # <--------------------------- EDIT HERE NUM OF VALIDATION SAMPLES PER LABEL

filenames_train = []
labels_train = []

filenames_val = []
labels_val = []

# NOTE(review): the previous version carried an extra `lbl != '20'` condition
# on the validation branch. It compared an integer label with a string, so it
# was always true and class 20 was split exactly like every other class. The
# dead comparison is removed here without changing which files are copied;
# confirm whether class 20 was meant to be treated differently at this stage.

for lbl in first_n_labels:
    for split_name in ('train', 'validation', 'test'):
        os.makedirs(dst+split_name+'/'+str(lbl),exist_ok=True)

    # Select this label's rows once instead of walking the whole frame row by
    # row for every label; row order is preserved by the boolean mask.
    label_rows = train_df[train_df['label'] == lbl]
    val_rows = label_rows.iloc[:VAL_PER_LABEL]
    train_rows = label_rows.iloc[VAL_PER_LABEL:VAL_PER_LABEL + TRAIN_PER_LABEL]

    for img_name, img_lbl in zip(val_rows['img_name'], val_rows['label']):
        copyfile(src+img_name, dst+"validation/"+str(lbl)+"/"+img_name)
        filenames_val.append(img_name)
        labels_val.append(img_lbl)

    for img_name, img_lbl in zip(train_rows['img_name'], train_rows['label']):
        copyfile(src+img_name, dst+"train/"+str(lbl)+"/"+img_name)
        filenames_train.append(img_name)
        labels_train.append(img_lbl)

In [123]:
# Top up every under-filled training class with Gaussian-blurred copies of its
# own images until the class folder holds TARGET_PER_CLASS files (or its
# images run out: each existing file is blurred at most once per run).
TARGET_PER_CLASS = 450      # per-class file count to reach in train/<label>/
BLUR_KERNEL = (9, 9)        # Gaussian kernel size; sigma 0 lets OpenCV derive it
train_root = 'data/custom_split/train/'

for class_id in range(1, 81):
    #special exception case for very underrepresented class 20.
    if class_id == 20:
        continue

    class_dir = train_root + str(class_id)
    # Snapshot the file names BEFORE writing anything: iterating a live
    # directory scan while adding files to it may or may not yield the new
    # files, which made the result depend on the platform.
    existing_names = sorted(entry.name for entry in os.scandir(class_dir))
    file_num = len(existing_names)

    for name in existing_names:
        # `>=` rather than `==`, so an over-full folder is never augmented.
        if file_num >= TARGET_PER_CLASS:
            break

        target_path = class_dir + '/g_' + name
        if os.path.exists(target_path):
            # Already produced by an earlier run and already counted above.
            continue

        image = cv2.imread(class_dir + "/" + name)
        if image is None:
            # Unreadable or non-image file: skip it instead of crashing.
            continue

        # The blur is applied to each channel independently, so no colour
        # conversion is needed around it (the earlier RGB<->BGR round trip
        # cancelled itself out).
        blurred = cv2.GaussianBlur(image, BLUR_KERNEL, 0)
        if cv2.imwrite(target_path, blurred):
            file_num += 1

## Single image blur demo

In [95]:
# Build the description tables from the lists collected during the split and
# write them next to the split folders. The frames are rebuilt from scratch
# rather than filled column by column: assigning a list into a frame that
# already holds columns of a different length (e.g. after re-running the split
# with other quotas) raises a ValueError.
output_train = pd.DataFrame({'img_name': filenames_train, 'label': labels_train})
output_validation = pd.DataFrame({'img_name': filenames_val, 'label': labels_val})

# The default index column is kept, so the CSV layout is unchanged.
output_train.to_csv(dst+"train_desc.csv")
output_validation.to_csv(dst+"validation_desc.csv")

print("TRAIN:", output_train.shape)
print("TEST:", output_test.shape)
print("VALIDATION:", output_validation.shape)

TRAIN: (100, 2)
TEST: (0, 0)
VALIDATION: (40, 2)


In [75]:
# Read one training image through matplotlib for the demo.
# NOTE(review): rgb_img is not referenced by any other cell shown here.
sample_path = 'data/train_set/train_1.jpg'
rgb_img = plt.imread(sample_path)

In [74]:
# Single-image demo of the blur used for augmentation: read one training
# image, blur it with a 9x9 Gaussian kernel and save the result under
# data/demo/.
demo_src = 'data/train_set/train_2.jpg'
demo_out = 'data/demo/train_2_g.jpg'
figure_size = 9

image = cv2.imread(demo_src)
if image is None:
    # cv2.imread signals a missing/unreadable file by returning None.
    raise FileNotFoundError(demo_src)

# No colour conversion is needed: the blur treats every channel independently
# and imwrite expects the same channel order that imread produced.
new_image = cv2.GaussianBlur(image, (figure_size, figure_size),0)

# imwrite does not create missing folders; make sure the demo folder exists.
os.makedirs(os.path.dirname(demo_out), exist_ok=True)
cv2.imwrite(demo_out, new_image)

True