In [1]:
import os
import numpy as np
import pandas as pd
import random
from tqdm import tqdm

In [2]:
import cv2
import imgaug.augmenters as iaa
import imgaug as ia

In [3]:
# Notebook-wide configuration, read by the cells and functions below.
ranSeed = 22                  # seed passed to every random number generator
IMG_SIZE, ColorCh = 224, 3    # target image side length in pixels / channels per pixel
# Dataset root; each category folder is looked up directly beneath it.
dataPath = '../../dataset/Fish_Dataset/Fish_Dataset'

In [4]:
# Seed every source of randomness used in this notebook with the same value.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here probably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = str(ranSeed)
# Order kept as numpy -> imgaug -> stdlib `random`.
for _seed_rng in (np.random.seed, ia.seed, random.seed):
    _seed_rng(ranSeed)
del _seed_rng  # do not leave the loop helper in the notebook namespace

In [5]:
# Category names = entries of the dataset root whose name contains no '.'
# (this filters out anything that looks like a file with an extension).
category = [entry for entry in os.listdir(dataPath) if '.' not in entry]

print(category)

['Black Sea Sprat', 'Gilt-Head Bream', 'Hourse Mackerel', 'Red Mullet', 'Red Sea Bream', 'Sea Bass', 'Shrimp', 'Striped Red Mullet', 'Trout']


In [6]:
def imagePreProcess(dataPath):
    """Build a shuffled DataFrame of augmented, flattened images with labels.

    Every entry of ``dataPath`` whose name contains no '.' is treated as a
    category, and its images are read from ``<dataPath>/<cat>/<cat>``
    (categories without that nested folder are skipped).  Per category, at
    most 1000 file names are taken and then oversampled by three times
    appending a prefix of the freshly shuffled list (750, 500 and 250 names),
    so up to 2500 names -- with repeats -- are processed.  Each occurrence is
    loaded, resized to ``IMG_SIZE`` x ``IMG_SIZE``, run through the random
    augmenters below and converted from BGR to RGB.

    Parameters
    ----------
    dataPath : str
        Dataset root directory.

    Returns
    -------
    pandas.DataFrame
        One row per augmented image: ``IMG_SIZE * IMG_SIZE * ColorCh`` pixel
        columns followed by a ``'label'`` column holding the category index.
        Rows are shuffled with ``random_state=ranSeed``.

    Notes
    -----
    * File names are repeated on purpose, so differently augmented copies of
      the same source image occupy several rows.  A row-wise split of the
      result can therefore place copies of one source image in more than one
      subset.
    * Files that OpenCV cannot decode are skipped and reported instead of
      aborting the whole run.
    * Relies on the module-level ``IMG_SIZE``, ``ColorCh`` and ``ranSeed`` and
      on the global ``random`` state for the oversampling shuffles.
    """
    X = []
    y = []

    # Random augmenters, applied to every image in exactly this order.
    augmenters = (
        iaa.Crop(px=(0, 24)),                 # crop 0-24 px off the borders
        iaa.Affine(scale=(0.4, 1.6)),         # zoom out / in
        iaa.Affine(rotate=(-30, 30)),         # rotate by up to 30 degrees either way
        iaa.Fliplr(0.5),                      # horizontal flip, probability 0.5
        iaa.Multiply((1, 1.2)),               # brighten by a factor of 1.0-1.2
        iaa.GaussianBlur(sigma=(0, 4.0)),     # blur with sigma 0-4
    )

    # NOTE(review): label indices follow os.listdir() order, which the OS does
    # not guarantee; kept unchanged so labels keep matching any category list
    # built the same way elsewhere in the notebook.
    category = [cat for cat in os.listdir(dataPath) if '.' not in cat]

    for class_num, cat in enumerate(category):
        path = os.path.join(dataPath, cat, cat)
        if not os.path.isdir(path):
            continue

        limit1 = 1000
        # Sort before truncating/shuffling: os.listdir() order is arbitrary, so
        # without this the seeded shuffles would not be reproducible.
        img_list = sorted(os.listdir(path))[:limit1]
        random.shuffle(img_list)

        # Oversample: append the first 6/8, 4/8 and 2/8 of `limit1` names of
        # the current (already shuffled) list, reshuffling after each step.
        for eighths in (6, 4, 2):
            extra = round(limit1 * eighths / 8)
            img_list = img_list + img_list[:extra]
            random.shuffle(img_list)

        skipped = 0
        for img in tqdm(img_list):
            orig_img = cv2.imread(os.path.join(path, img), cv2.IMREAD_COLOR)
            if orig_img is None:
                # imread signals an unreadable / non-image file by returning
                # None; resizing it would raise.
                skipped += 1
                continue

            image_aug = cv2.resize(orig_img, (IMG_SIZE, IMG_SIZE),
                                   interpolation=cv2.INTER_CUBIC)
            for augment in augmenters:
                image_aug = augment(image=image_aug)

            # OpenCV loads images as BGR; store them as RGB.
            image_aug = cv2.cvtColor(image_aug, cv2.COLOR_BGR2RGB)
            X.append(image_aug)
            y.append(class_num)

        if skipped:
            print('{}: skipped {} unreadable file(s)'.format(cat, skipped))

    # Flatten every image into one row of pixel values.
    features = pd.DataFrame(np.array(X).reshape(-1, IMG_SIZE * IMG_SIZE * ColorCh))
    labels = pd.DataFrame({'label': y})

    # Shuffle rows so the categories are interleaved.
    df = pd.concat([features, labels], axis=1).sample(frac=1, random_state=ranSeed)
    return df

In [7]:
def dfSpilt(df, split_ratio):
    """Cut ``df`` into consecutive row slices sized by ``split_ratio``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are taken by position, in their current order.
    split_ratio : sequence of float
        Fraction of ``df``'s rows for each piece, e.g. ``[0.25, 0.25, 0.5]``.
        The sequence is only read, never modified, so the same list can be
        reused for several calls.

    Returns
    -------
    list of pandas.DataFrame
        One slice per entry of ``split_ratio`` (an empty list for an empty
        ``split_ratio``).  Slice boundaries are the running sum of
        ``ratio * len(df)`` truncated with ``int()``; rows beyond the last
        boundary are not returned.
    """
    total_rows = df.shape[0]
    pieces = []
    start = 0
    for ratio in split_ratio:
        end = start + ratio * total_rows
        # .iloc makes the positional intent explicit regardless of df's index.
        pieces.append(df.iloc[int(start):int(end)])
        start = end

    return pieces

In [8]:
# Run the whole load / oversample / augment pipeline over the dataset root.
# One tqdm progress bar is printed per category; see the captured output below for timings.
dataframe = imagePreProcess(dataPath)

100%|██████████| 2500/2500 [00:17<00:00, 140.85it/s]
100%|██████████| 2500/2500 [00:18<00:00, 135.51it/s]
100%|██████████| 2500/2500 [00:19<00:00, 131.19it/s]
100%|██████████| 2500/2500 [00:21<00:00, 118.21it/s]
100%|██████████| 2500/2500 [00:21<00:00, 114.27it/s]
100%|██████████| 2500/2500 [00:23<00:00, 107.63it/s]
100%|██████████| 2500/2500 [00:25<00:00, 99.29it/s] 
100%|██████████| 2500/2500 [00:26<00:00, 94.78it/s]
100%|██████████| 2500/2500 [00:27<00:00, 89.97it/s]


In [9]:
# Fractions of the shuffled dataset: set 1, set 2 and the held-out test set.
split_ratio = [0.25, 0.25, 0.5]
# Train / validation fractions applied inside each of the two sets.
validation_ratio = [0.6, 0.4]

#dataframe = pd.read_pickle('saveImage.h5')
# Copies are passed so the ratio lists above stay intact even if dfSpilt
# modifies its argument.
dfs = dfSpilt(dataframe, list(split_ratio))

df1 = dfs[0]
df2 = dfs[1]
dft = dfs[2]

dfs1 = dfSpilt(df1, list(validation_ratio))
dfs2 = dfSpilt(df2, list(validation_ratio))

train_df1 = dfs1[0]
valid_df1 = dfs1[1]
train_df2 = dfs2[0]
valid_df2 = dfs2[1]
test_df = dft

# Combined train / validation frames over both sets.  pd.concat replaces
# DataFrame.append, which no longer exists in pandas >= 2.0.
train_df = pd.concat([train_df1, train_df2], ignore_index=True)
valid_df = pd.concat([valid_df1, valid_df2], ignore_index=True)

# NOTE(review): `dataframe` holds several augmented copies of the same source
# image (see imagePreProcess), so these row-wise subsets can share source
# images -- confirm this is acceptable for evaluation.

# Make sure the output folder exists before writing.
os.makedirs('../../preprocDF', exist_ok=True)

# The files are pickles despite the '.h5' extension; read them back with
# pd.read_pickle.
train_df1.to_pickle('../../preprocDF/set1Train.h5')
valid_df1.to_pickle('../../preprocDF/set1Valid.h5')

train_df2.to_pickle('../../preprocDF/set2Train.h5')
valid_df2.to_pickle('../../preprocDF/set2Valid.h5')

train_df.to_pickle('../../preprocDF/train.h5')
valid_df.to_pickle('../../preprocDF/valid.h5')
test_df.to_pickle('../../preprocDF/test.h5')