# import 

In [1]:
import os
import os.path as osp
from glob import glob
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np

from tqdm import tqdm
from sklearn.model_selection import train_test_split

# path

In [2]:
base_dir = '../'
input_dir = osp.join(base_dir, 'input')
dataset_dir = osp.join(input_dir, 'cat-dataset')
save_dir = osp.join(input_dir, 'cat')
os.makedirs(save_dir, exist_ok=True)

# copy image to one folder

In [3]:
data_names = os.listdir(dataset_dir)
data_names

['CAT_02', 'CAT_06', 'CAT_05', 'CAT_03', 'CAT_00', 'CAT_01', 'CAT_04']

In [5]:
# 種類によってフォルダが分けられている猫の画像を一つのフォルダにまとめる
image_num = 1
for data_name in tqdm(data_names):
    data_dir = osp.join(dataset_dir, data_name)
    file_names = os.listdir(data_dir)
    file_names = [file_name for file_name in file_names if osp.splitext(file_name)[1] == '.jpg']
    file_names.sort()
    
    for file_name in file_names:
        image_path = osp.join(data_dir, file_name)
        save_image_num = '{:07}.jpg'.format(image_num)
        save_path = osp.join(save_dir, save_image_num)
        
        shutil.copy(image_path, save_path)
        image_num += 1

100%|██████████| 7/7 [00:02<00:00,  2.77it/s]


# Split and crop data

In [6]:
image_paths = glob(osp.join(save_dir, '*'))
image_paths.sort()
image_paths[0:10]

['../input/cat/0000001.jpg',
 '../input/cat/0000002.jpg',
 '../input/cat/0000003.jpg',
 '../input/cat/0000004.jpg',
 '../input/cat/0000005.jpg',
 '../input/cat/0000006.jpg',
 '../input/cat/0000007.jpg',
 '../input/cat/0000008.jpg',
 '../input/cat/0000009.jpg',
 '../input/cat/0000010.jpg']

In [7]:
train_paths, test_paths = train_test_split(image_paths, test_size=100, random_state=0)
print(len(train_paths))
print(len(test_paths))

9897
100


In [8]:
def random_crop(image, crop_size):
    h, w, _ = image.shape

    top = np.random.randint(0, h - crop_size[0])
    left = np.random.randint(0, w - crop_size[1])

    bottom = top + crop_size[0]
    right = left + crop_size[1]

    image = image[top:bottom, left:right, :]
    return image

In [13]:
def split_save(image_paths, data_type, crop_size=(128, 128), num_aug=1):
    shapes = []
    for image_path in tqdm(image_paths):
        image_name = osp.basename(image_path)
        image = cv2.imread(image_path)
        for aug_n in range(1, num_aug+1):
            image_rsz = random_crop(image, crop_size)
            save_image_name = '{}-{:3}.jpg'.format(image_name, aug_n)
            image_save_path = osp.join(data_save_dir, 'cat_{}'.format(data_type), save_image_name)
            os.makedirs(osp.dirname(image_save_path), exist_ok=True)
            cv2.imwrite(image_save_path, image_rsz)

In [20]:
def split_crop(image_paths, data_type, crop=False, crop_size=(128,128), num_aug=1):
    shapes = []
    for image_path in tqdm(image_paths):
        image_name = osp.basename(image_path)
        file_name = osp.splitext(image_name)[0]
        image = cv2.imread(image_path)
        if (image.shape[0]<=crop_size[0]) | (image.shape[1]<=crop_size[1]):
            print('size problem', image_path)
            continue
        if crop:
            for aug_n in range(1, num_aug+1):
                image_rsz = random_crop(image, crop_size)
                save_image_name = '{}{:03}.jpg'.format(file_name, aug_n)
                image_save_path = osp.join(input_dir, 'cat_{}'.format(data_type), save_image_name)
                os.makedirs(osp.dirname(image_save_path), exist_ok=True)
                cv2.imwrite(image_save_path, image_rsz)
        else:
            save_image_name = '{}.jpg'.format(file_name)
            image_save_path = osp.join(input_dir, 'cat_{}'.format(data_type), save_image_name)
            os.makedirs(osp.dirname(image_save_path), exist_ok=True)
            cv2.imwrite(image_save_path, image)

In [21]:
random_seed = 0
np.random.seed(random_seed)
split_crop(train_paths, 'train', crop=True)
split_crop(test_paths, 'test')

 40%|███▉      | 3949/9897 [00:22<00:34, 172.08it/s]

size problem ../input/cat/0004278.jpg


100%|██████████| 9897/9897 [00:57<00:00, 173.17it/s]
100%|██████████| 100/100 [00:01<00:00, 85.82it/s]
