In [1]:
from __future__ import print_function, division
import numpy as np
import cv2
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
from torchvision.datasets import ImageFolder
import time
import os
import copy
import sys

from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw

import xml.etree.ElementTree as Et
from xml.etree.ElementTree import Element, ElementTree

import random

import shutil
from shutil import copyfile

In [2]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)

    When total is 0 (nothing to process) the bar is drawn full at 100%
    instead of raising ZeroDivisionError.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Empty workload: there is nothing left to do, so report completion.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # The leading '\r' rewinds to the start of the line so each call overwrites the previous bar.
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [3]:
# Object categories handled by this notebook; a category's position in the
# list is the integer label used for its folder name.
classes = [
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor',
]

# training dataset 만들기

PASCAL VOC 2012(class 20개) 데이터 사용함.

Annotation이 존재하는 모든 이미지에서 object를 crop한 뒤,
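각 class 별로 val: 50개, test: 30개, train: 최대 920개 이미지를 가지도록 split 함 (`split_data_into_train_val_test`에서 후보 이미지를 순서대로 처음 50장 → val, 다음 30장 → test, 이후 1000번째까지 → train 으로 복사).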

### 경로 지정

#### 수정 x

In [4]:
# Sub-folder of the PASCAL VOC dataset that holds the JPEG images
IMAGE_PATH = 'JPEGImages/'
# Sub-folder of the PASCAL VOC dataset that holds the XML annotations
ANNOTATION_PATH = 'Annotations/'


# Top-level folder in which the generated dataset is stored (the `base` folder lives directly under it)
DATASET_PATH = './Dataset/'

#### 수정 가능 path

In [5]:
# Root of the original PASCAL VOC dataset, i.e. the folder that contains JPEGImages and Annotations (may be edited)
PASCAL_PATH = './VOCtrainval_11-May-2012/VOCdevkit/VOC2012/'

# Path under which the train, val and test folders live
# (original note: to be changed from the main function)
# NOTE(review): no code visible in this notebook rebinds this module-level name, so
# custom_imsave sees '' unless the value is edited here - confirm this is intended.
CUSTOM_PATH = ''

In [6]:
def InitializeNumOfImg():
    """Reset the global `num_of_img` counters: every key 0..19 is set to 0."""
    num_of_img.update((class_id, 0) for class_id in range(20))

In [7]:
# Per-class counter: custom_imsave uses num_of_img[label] as the file name of the
# next saved image and increments it afterwards.
num_of_img = {}
# Start every class counter at 0.
InitializeNumOfImg()

### 이미지를 지정된 폴더에 저장
    
- base img (전체 데이터 편집 시 사용): DATASET_PATH/base/class번호/ 폴더 내부에 이미지 저장

- train img: DATASET_PATH/train/class번호/ 폴더 내부에 이미지 저장
    
- val img: DATASET_PATH/val/class번호/ 폴더 내부에 이미지 저장
    
- test img: DATASET_PATH/test/class번호/ 폴더 내부에 이미지 저장

In [8]:
def custom_imsave(img, label, mode = 'base'):
    """
    Save one image as `<counter>.jpg` in the folder of its class and advance the counter.

    @params:
        img     - Required  : image handed unchanged to cv2.imwrite
        label   - Required  : class index; names the sub-folder and is the key into `num_of_img`
        mode    - Optional  : 'base', 'train' (alias 'trainval'), 'val' or 'test' (Str)
    Destination folder:
        'base'          -> DATASET_PATH/base/<label>/
        any other mode  -> DATASET_PATH/CUSTOM_PATH/<train|val|test>/<label>/
    @raises:
        ValueError  - if mode is not one of the values listed above
    """
    split_dirs = {'train': 'train', 'trainval': 'train', 'val': 'val', 'test': 'test'}

    if mode == 'base':
        folder = os.path.join(DATASET_PATH, 'base', str(label))
    elif mode in split_dirs:
        # os.path.join skips an empty CUSTOM_PATH and supplies the separator a
        # non-empty one needs, which plain string concatenation did not.
        folder = os.path.join(DATASET_PATH, CUSTOM_PATH, split_dirs[mode], str(label))
    else:
        # Previously an unknown mode fell through and failed later with UnboundLocalError.
        raise ValueError(f"unknown mode {mode!r}; expected 'base', 'train', 'trainval', 'val' or 'test'")

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(folder, exist_ok=True)

    cv2.imwrite(os.path.join(folder, str(num_of_img[label]) + '.jpg'), img)
    num_of_img[label] += 1

### Annotation이 존재하는 모든 이미지를 crop하여 class별로 저장

In [9]:
def make_base_dataset():
    """
    Crop every annotated object out of the PASCAL VOC images and save it per class.

    For each `*.xml` file in PASCAL_PATH/ANNOTATION_PATH the matching `.jpg` in
    PASCAL_PATH/IMAGE_PATH is loaded, every <object> whose <name> is listed in
    `classes` is cut out along its <bndbox>, and the crop is stored through
    custom_imsave(..., mode='base') under the index of its class.

    Images that cannot be read and bounding boxes with zero area are skipped
    instead of aborting the whole run.
    """
    annotation_dir = os.path.join(PASCAL_PATH, ANNOTATION_PATH)
    image_dir = os.path.join(PASCAL_PATH, IMAGE_PATH)

    # splitext keeps dots inside the file stem, unlike split('.')[0]
    img_list = [os.path.splitext(f)[0] for f in os.listdir(annotation_dir) if f.endswith('.xml')]
    print(f'total image: {len(img_list)}')

    for index, img_name in enumerate(img_list):
        printProgressBar(index, len(img_list), prefix='Progress', suffix='Complete', length=50)

        image = cv2.imread(os.path.join(image_dir, img_name + '.jpg'))
        if image is None:
            # cv2.imread signals a missing or undecodable file by returning None
            print(f'skip {img_name}: image could not be read')
            continue

        # Passing the path lets ElementTree open and close the file itself,
        # so no file handle is leaked per annotation.
        root = Et.parse(os.path.join(annotation_dir, img_name + '.xml')).getroot()

        # Extract objects according to the annotation
        for _object in root.findall("object"):
            name = _object.find("name").text
            # Only objects of a known class are cropped and saved
            if name not in classes:
                continue

            bndbox = _object.find("bndbox")
            xmin = int(float(bndbox.find("xmin").text))
            ymin = int(float(bndbox.find("ymin").text))
            xmax = int(float(bndbox.find("xmax").text))
            ymax = int(float(bndbox.find("ymax").text))

            crop = image[ymin:ymax, xmin:xmax]
            if crop.size == 0:
                # A degenerate box yields an empty array that cannot be written as an image
                continue

            custom_imsave(crop, classes.index(name), mode = 'base')

    # Draw the finished bar only when there was work; this also keeps total != 0
    if img_list:
        printProgressBar(len(img_list), len(img_list), prefix='Progress', suffix='Complete', length=50)

In [10]:
def split_data_into_train_val_test(excludedClass = None, ratio = False, ratioRange = "", size = False, sizeLimit = ""):
    """
    Copy the cropped images of DATASET_PATH/base/<class>/ into train / val / test folders.

    The output folder is DATASET_PATH/<name>/, where <name> is
        'all'                                   - no criterion enabled
        'ratio_<min>_<max>'                     - ratio criterion only
        'minSize_<sizeLimit>'                   - size criterion only
        'ratio_<min>_<max>_minSize_<sizeLimit>' - both criteria
    An existing output folder of that name is deleted first.

    @params:
        excludedClass   - Optional  : class indices whose images skip both criteria (list of Int)
        ratio           - Optional  : enable the aspect-ratio criterion (Bool)
        ratioRange      - Optional  : [minRatio, maxRatio], inclusive, of width / height (list)
        size            - Optional  : enable the size criterion (Bool)
        sizeLimit       - Optional  : the shorter image side must be strictly greater than this (Int)
    @raises:
        ValueError  - ratio is set but ratioRange is not a two-element list,
                      or size is set but sizeLimit is not an int

    Images that fail a criterion are copied to
    out_of_condition/<class>/<size|aspectRatio|both>/. The remaining candidates of
    each class are shuffled; the first 50 go to val, the next 30 to test and the
    following ones, up to the 1000th candidate, to train. Later candidates are
    counted but not copied. Files cv2 cannot read are skipped.
    """
    if excludedClass is None:
        excludedClass = []

    # Build the name of the output folder from the enabled criteria
    name_parts = []
    if ratio:
        if not isinstance(ratioRange, list):
            raise ValueError("ratioRange must be a list type")
        if not len(ratioRange) == 2:
            raise ValueError("ratioRange must have a two value: [minRatio, maxRatio]")
        name_parts.append(f'ratio_{str(ratioRange[0])}_{str(ratioRange[1])}')
    if size:
        if not isinstance(sizeLimit, int):
            raise ValueError("size must be a int type")
        name_parts.append(f'minSize_{sizeLimit}')
    out_root = os.path.join(DATASET_PATH, '_'.join(name_parts) or 'all')

    # Always start from an empty output folder
    if os.path.exists(out_root):
        shutil.rmtree(out_root)

    class_ids = range(20)
    rejected_dir = 'out_of_condition'

    # Create the folders for exactly the class ids processed below
    for class_id in class_ids:
        for split_name in ('train', 'val', 'test'):
            os.makedirs(os.path.join(out_root, split_name, str(class_id)), exist_ok=True)
        for reason in ('size', 'aspectRatio', 'both'):
            os.makedirs(os.path.join(out_root, rejected_dir, str(class_id), reason), exist_ok=True)

    for i in class_ids:

        filtered = i not in excludedClass
        if not filtered:
            print(f'class {i} is in excludedClass')

        class_path = os.path.join(DATASET_PATH, 'base', str(i))
        # A class without any crop has no base folder; treat it as empty
        img_list = os.listdir(class_path) if os.path.isdir(class_path) else []
        random.shuffle(img_list)

        candidate_img_list = []

        for img_name in img_list:
            src = os.path.join(class_path, img_name)
            img = cv2.imread(src)
            if img is None:
                # cv2.imread returns None for files it cannot decode
                print(f'skip unreadable file: {src}')
                continue

            if filtered:
                reason = _rejection_reason(img.shape[1], img.shape[0], ratio, ratioRange, size, sizeLimit)
                if reason:
                    copyfile(src, os.path.join(out_root, rejected_dir, str(i), reason, img_name))
                    continue

            # Position of this image among the accepted candidates of the class
            position = len(candidate_img_list)
            candidate_img_list.append(img_name)

            if position < 50:
                target = 'val'
            elif position < 80:
                target = 'test'
            elif position < 1000:
                target = 'train'
            else:
                target = None

            if target is not None:
                copyfile(src, os.path.join(out_root, target, str(i), img_name))

        print(f'class {i} has {len(candidate_img_list)} items (candidate)')


def _rejection_reason(width, height, ratio, ratioRange, size, sizeLimit):
    """Return '' if the image passes, else 'size', 'aspectRatio' or 'both' for the failed criteria."""
    bad_ratio = bool(ratio) and not (ratioRange[0] <= float(width) / height <= ratioRange[1])
    bad_size = bool(size) and not (sizeLimit < min(width, height))
    if bad_ratio and bad_size:
        return 'both'
    if bad_ratio:
        return 'aspectRatio'
    if bad_size:
        return 'size'
    return ''
        

### class 목록

0: aeroplane

1: bicycle

2: bird

3: boat

4: bottle

5: bus

6: car

7: cat

8: chair

9: cow

10: diningtable

11: dog

12: horse

13: motorbike

14: person

15: pottedplant

16: sheep

17: sofa

18: train

19: tvmonitor

In [11]:
if __name__ == "__main__":
    # The cropping step is skipped when DATASET_PATH/base/ is already present
    if os.path.exists(DATASET_PATH + 'base/'):
        print('base already exists, skip make_base_dataset()')
    else:
        make_base_dataset()

    # Images of these class indices are not checked against the ratio / size criteria
    excludedClass = [0, 4, 14]

    # The output folder is derived from the enabled criteria:
    ##   no ratio / size criterion : DATASET_PATH + all/
    ##   ratio criterion only      : DATASET_PATH + ratio_{str(ratioRange[0])}_{str(ratioRange[1])}
    ##   min-size criterion only   : DATASET_PATH + minSize_{sizeLimit}
    ##   both criteria             : DATASET_PATH + ratio_{str(ratioRange[0])}_{str(ratioRange[1])}_minSize_{sizeLimit}
    split_data_into_train_val_test(
        excludedClass = excludedClass,
        ratio = True,
        ratioRange = [0.5, 1.5],
        size = True,
        sizeLimit = 70,
    )

base already exists, skip make_base_dataset()
class 0 is in excludedClass
class 0 has 1002 items (candidate)
class 1 has 425 items (candidate)
class 2 has 570 items (candidate)
class 3 has 237 items (candidate)
class 4 is in excludedClass
class 4 has 1561 items (candidate)
class 5 has 308 items (candidate)
class 6 has 462 items (candidate)
class 7 has 910 items (candidate)
class 8 has 1560 items (candidate)
class 9 has 342 items (candidate)
class 10 has 264 items (candidate)
class 11 has 1149 items (candidate)
class 12 has 511 items (candidate)
class 13 has 430 items (candidate)
class 14 is in excludedClass
class 14 has 17401 items (candidate)
class 15 has 421 items (candidate)
class 16 has 417 items (candidate)
class 17 has 378 items (candidate)
class 18 has 258 items (candidate)
class 19 has 525 items (candidate)
