# DATA BUILDER
- Faster R-CNN 모델 학습을 위해 데이터 셋을 COCO type으로 변경하는 과정 <br><br>

- 결과 디렉토리 구조
   - custom_data /
       - annotations / train.json
       - annotations / eval.json
       - train / *.jpg (학습에 사용되는 이미지) (필수)
       - eval / *.jpg (모델 평가에 사용되는 이미지) (필수)
       - annotation.csv (필수)<br>
           annotation columns = ('img_path','x1','y1','x2','y2','label','category_id','is_train')
           

## 1) Installation

In [None]:
# !pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install pycocotools
# !pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install pandas
# !pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install scipy
# !pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install setproctitle

## 2) Load Packages

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np

import shutil
import cv2
from tqdm import tqdm

import json
import os 
# Restrict the CUDA devices visible to this process to index "99".
# NOTE(review): presumably no such GPU exists, so this keeps the data-prep
# notebook off the GPUs entirely — confirm that this is the intent.
os.environ["CUDA_VISIBLE_DEVICES"]="99"

## 3) COCO 데이터 셋 생성

In [None]:
# Locations used when converting the data set to COCO format.
SAVE_ANNOTATION_PATH = './custom_data/annotations/'  # where the COCO json files are written
SAVE_IMAGE_PATH = './custom_data/'  # where the images are stored

# Load the bounding-box annotation table that lives under 'custom_data'.
annotation = pd.read_csv('./custom_data/annotation.csv')

In [None]:
def data_to_coco(anno_df, destfile):
    """Convert a bounding-box annotation table into a COCO-format JSON file.

    Every distinct ``img_path`` becomes one entry in ``images`` (ids start at
    1, in order of first appearance) and every row becomes one entry in
    ``annotations`` (ids start at 1). Boxes are stored COCO-style as
    ``[x, y, width, height]``.

    Args:
        anno_df: pandas DataFrame with at least the columns ``img_path``,
            ``x1``, ``y1``, ``x2``, ``y2``, ``label`` and ``category_id``;
            one row per box. The caller's frame is left unmodified.
        destfile: Path of the JSON file to write. A missing parent directory
            is created.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot load one of the images.
    """
    # Build a new frame instead of assigning into the caller's (possibly
    # sliced) DataFrame, which would mutate it and trigger pandas'
    # SettingWithCopyWarning.
    anno_df = anno_df.assign(label=anno_df["label"].astype(str))

    # Map each label to the first category_id it appears with, keeping labels
    # in order of first appearance.
    first_rows = anno_df.drop_duplicates("label")
    label_categories = {
        label: int(category_id)
        for label, category_id in zip(first_rows["label"], first_rows["category_id"])
    }

    data_dict = {
        "images": [],
        "categories": [
            {"id": category_id, "name": label, "supercategory": label}
            for label, category_id in label_categories.items()
        ],
        "annotations": [],
    }

    inst_count = 1
    # sort=False keeps images in order of first appearance; grouping once
    # avoids re-scanning the whole table for every image.
    per_image = anno_df.groupby("img_path", sort=False)

    for image_id, (image_path, objects) in enumerate(
            tqdm(per_image, total=per_image.ngroups), start=1):
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals a missing/unreadable file by returning None.
            raise FileNotFoundError("cannot read image: {}".format(image_path))
        # Only the first two dimensions are needed, so this works whatever
        # the channel count of the decoded image is.
        img_height, img_width = img.shape[:2]

        data_dict["images"].append({
            "file_name": os.path.basename(image_path),
            "id": image_id,
            "width": img_width,
            "height": img_height,
        })

        box_columns = objects[["x1", "y1", "x2", "y2", "label"]].values
        for x1, y1, x2, y2, label in box_columns:
            box_width, box_height = x2 - x1, y2 - y1
            data_dict["annotations"].append({
                "category_id": label_categories[label],
                "bbox": [x1, y1, box_width, box_height],
                "area": box_width * box_height,
                "image_id": image_id,
                "iscrowd": 0,
                "ignore": 0,
                "id": inst_count,
            })
            inst_count += 1

    # Write only after everything was built, so a failure above does not
    # leave an empty or truncated JSON file behind.
    dest_dir = os.path.dirname(destfile)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    with open(destfile, "w") as f_out:
        json.dump(data_dict, f_out)

In [None]:
# Create the COCO json files: one for the training rows, one for the
# evaluation rows, selected by the 'is_train' column.

train_anno = annotation[annotation['is_train']==True]
val_anno = annotation[annotation['is_train']==False]

for banner, subset, json_name in (
    ("BUILD TRAINING SET", train_anno, 'train.json'),
    ("BUILD EVALUATION SET", val_anno, 'eval.json'),
):
    print (banner)
    data_to_coco(subset, SAVE_ANNOTATION_PATH+json_name)