## Make Mini Dataset
- HyperParameter Search를 위해 Dataset을 작게 sampling한다.
- MultiLabelStratified를 활용하여 Box의 개수와 class의 비율을 유지하면서 sampling한다.

In [1]:
from tqdm import tqdm
from glob import glob

import pandas as pd
import numpy as np
import json
import math
import os
import random

In [2]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7


In [3]:
# Google Drive mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
if not os.path.exists('data'):
    os.mkdir('data')
if not os.path.exists('data/Training'):
    os.mkdir('data/Training')

In [5]:
!unzip -q /content/drive/MyDrive/AI_Factory_가축/Training/[원천]돼지_bbox.zip -d /content/data/Training/pig_image
!unzip -q /content/drive/MyDrive/AI_Factory_가축/Training/[원천]소_bbox.zip -d /content/data/Training/cow_image

!unzip -q /content/drive/MyDrive/AI_Factory_가축/Training/[라벨]돼지_bbox.zip -d /content/data/Training/pig_label
!unzip -q /content/drive/MyDrive/AI_Factory_가축/Training/[라벨]소_bbox.zip -d /content/data/Training/cow_label

In [6]:
train_pig_image = glob('/content/data/Training/pig_image/*.jpg')
train_cow_image = glob('/content/data/Training/cow_image/*.jpg')
train_pig_label = glob('/content/data/Training/pig_label/*.json')
train_cow_label = glob('/content/data/Training/cow_label/*.json')

train_pig_label.sort()
train_pig_image.sort()
train_cow_label.sort()
train_cow_image.sort()

### MultiLableSplited Dataset 만들기 (ver. mini)

In [7]:
total_image = train_pig_image + train_cow_image
total_label = train_pig_label + train_cow_label

In [8]:
data = pd.DataFrame([x for x in zip(total_image, total_label)])
data.rename(columns = {0 : 'image',
                       1 : 'label'}, 
            inplace = True)

#### One-Hot Encoding 
- pig/cow
- little/medium/many

In [9]:
def box_count_pig(num):
    """
    돼지 이미지에서 가지고 있는 Box의 수를 기준으로 Pig Dataset을 분류한다.
    Args: 
        num: bounding box의 개수
    Return:
        "little" - bounding box의 개수가 8개 미만일 경우
        "medium" - bounding box의 개수가 8~13일 경우
        "many" - bounding box의 개수가 13이상일 경우
    """

    if num < 8:
        return 'little'
    elif num < 13:
        return 'medium'
    else:
        return 'many'

def box_count_cow(num):
    """
    소 이미지에서 가지고 있는 Box의 수를 기준으로 Pig Dataset을 분류한다.
    Args: 
        num: bounding box의 개수
    Return:
        "little" - bounding box의 개수가 4개 미만일 경우
        "medium" - bounding box의 개수가 4~6일 경우
        "many" - bounding box의 개수가 6이상일 경우
    """
    if num < 4:
        return 'little'
    if num < 6:
        return 'medium'
    else:
        return 'many'

In [10]:
cow = []
pig = []
many = []
medium = []
little = []
for i in range(len(data)):
    label = data['label'][i]
    with open(label, 'r') as f:
        label_data = json.load(f)

    # cow인지 pig인지 list에 담기
    if 'cow' in data['label'][i]:
        cow.append(1)
        pig.append(0)
    else:
        pig.append(1)
        cow.append(0)

    # Box의 개수에 따라 many, medium, little의 list에 담기
    box_info = label_data['label_info']['annotations']
    box_count = len(box_info)

    if 'cow' in data['label'][i]:
        count = box_count_cow(box_count)
        if count == 'little':
            little.append(1)
            many.append(0)
            medium.append(0)
        elif count == 'medium':
            little.append(0)
            many.append(1)
            medium.append(0)
        else:
            little.append(0)
            many.append(0)
            medium.append(1)
    else:
        count = box_count_pig(box_count)
        if count == 'little':
            little.append(1)
            many.append(0)
            medium.append(0)
        elif count == 'medium':
            little.append(0)
            many.append(1)
            medium.append(0)
        else:
            little.append(0)
            many.append(0)
            medium.append(1)

In [11]:
data['cow'] = cow
data['pig'] = pig
data['little'] = little
data['many'] = many
data['medium'] = medium

In [14]:
data.head()

Unnamed: 0,image,label,cow,pig,little,many,medium
0,/content/data/Training/pig_image/livestock_pig...,/content/data/Training/pig_label/livestock_pig...,0,1,0,0,1
1,/content/data/Training/pig_image/livestock_pig...,/content/data/Training/pig_label/livestock_pig...,0,1,0,1,0
2,/content/data/Training/pig_image/livestock_pig...,/content/data/Training/pig_label/livestock_pig...,0,1,0,0,1
3,/content/data/Training/pig_image/livestock_pig...,/content/data/Training/pig_label/livestock_pig...,0,1,0,0,1
4,/content/data/Training/pig_image/livestock_pig...,/content/data/Training/pig_label/livestock_pig...,0,1,0,1,0


#### Dataset 나누기

In [12]:
X = data['image'].to_numpy()
Y = data[['cow', 'pig', 'little', 'many', 'medium']].to_numpy(dtype=np.float32)

In [13]:
# 마지막 "KFold Stacking을 활용하여 YOLO학습하기"에 사용
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

mlsk = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 첫번째, 두번째 KFold에서의 Valid Dataset을 mini Train Dataset으로 활용
# 마지막 KFold에서의 Valid Dataset을 mini Valid Dataset으로 활용
for i, (train_index, valid_index) in enumerate(mlsk.split(X, Y)):
    if i == 0 :
        _, tmp_train_x1 = X[train_index], X[valid_index]
        _, tmp_train_y1 = Y[train_index], Y[valid_index]
    if i == 1:
        _, tmp_train_x2 = X[train_index], X[valid_index]
        _, tmp_train_y2 = Y[train_index], Y[valid_index]
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = Y[train_index], Y[valid_index]

In [15]:
print(len(tmp_train_x1))
print(len(tmp_train_x2))

# 겹치는 Dataset이 있는지 확인
print(len(set(tmp_train_x1) & set(tmp_train_x2) & set(X_valid)))

3291
3291
0


In [16]:
X_train = list(tmp_train_x1) + list(tmp_train_x2)
y_train = list(tmp_train_y1) + list(tmp_train_y2)
print("Mini Train Dataset의 image의 개수:", len(X_train))
print("Mini Train Dataset의 label의 개수:", len(y_train))

Mini Train Dataset의 image의 개수: 6582
Mini Train Dataset의 label의 개수: 6582


#### 나누어진 Dataset 확인

In [17]:
train_pig_count = 0
train_cow_count = 0
train_cow_many_count = 0
train_cow_medium_count = 0
train_cow_little_count = 0
train_pig_many_count = 0
train_pig_medium_count = 0
train_pig_little_count = 0

for y in y_train:
    if y[0] == 0.:
        train_pig_count += 1
        if y[2] == 1:
            train_pig_little_count += 1
        elif y[3] == 1:
            train_pig_many_count += 1
        elif y[4] == 1:
            train_pig_medium_count += 1
    elif y[0] == 1.:
        train_cow_count += 1
        if y[2] == 1:
            train_cow_little_count += 1
        elif y[3] == 1:
            train_cow_many_count += 1
        elif y[4] == 1:
            train_cow_medium_count += 1

In [18]:
print("소  : KFold 전 -> many", len(data[data['cow'] == 1][data['many'] == 1]))
print("소  : KFold 전 -> little", len(data[data['cow'] == 1][data['little'] == 1]))
print("소  : KFold 전 -> medium", len(data[data['cow'] == 1][data['medium'] == 1]))
print()
print("소  : KFold 후 -> many", train_cow_many_count)
print("소  : KFold 후 -> little", train_cow_little_count)
print("소  : KFold 후 -> medium", train_cow_medium_count)
print()
print("돼지: KFold 전 -> many", len(data[data['pig'] == 1][data['many'] == 1]))
print("돼지: KFold 전 -> little", len(data[data['pig'] == 1][data['little'] == 1]))
print("돼지: KFold 전 -> medium", len(data[data['pig'] == 1][data['medium'] == 1]))
print()
print("돼지: KFold 후 -> many", train_pig_many_count)
print("돼지: KFold 후 -> little", train_pig_little_count)
print("돼지: KFold 후 -> medium", train_pig_medium_count)

소  : KFold 전 -> many 3413
소  : KFold 전 -> little 7383
소  : KFold 전 -> medium 1356

소  : KFold 후 -> many 1388
소  : KFold 후 -> little 2936
소  : KFold 후 -> medium 537

돼지: KFold 전 -> many 2749
돼지: KFold 전 -> little 506
돼지: KFold 전 -> medium 1048

돼지: KFold 후 -> many 1076
돼지: KFold 후 -> little 220
돼지: KFold 후 -> medium 425


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


### 사용할 Dataset 만들기

In [19]:
if not os.path.exists('mini_data'):
    os.mkdir('mini_data')
if not os.path.exists('mini_data/Training'):
    os.mkdir('mini_data/Training')
if not os.path.exists('mini_data/Training/cow_image'):
    os.mkdir('mini_data/Training/cow_image')
if not os.path.exists('mini_data/Training/cow_label'):
    os.mkdir('mini_data/Training/cow_label')
if not os.path.exists('mini_data/Training/pig_image'):
    os.mkdir('mini_data/Training/pig_image')
if not os.path.exists('mini_data/Training/pig_label'):
    os.mkdir('mini_data/Training/pig_label')
if not os.path.exists('mini_data/Validation'):
    os.mkdir('mini_data/Validation')
if not os.path.exists('mini_data/Validation/cow_image'):
    os.mkdir('mini_data/Validation/cow_image')
if not os.path.exists('mini_data/Validation/cow_label'):
    os.mkdir('mini_data/Validation/cow_label')
if not os.path.exists('mini_data/Validation/pig_image'):
    os.mkdir('mini_data/Validation/pig_image')
if not os.path.exists('mini_data/Validation/pig_label'):
    os.mkdir('mini_data/Validation/pig_label')

In [20]:
import shutil

# 돼지 Data 이동
for image, label in tqdm(zip(train_pig_image, train_pig_label)):
    if image in X_train:
        image_dst = '/content/mini_data/Training/pig_image'
        label_dst = '/content/mini_data/Training/pig_label'
        shutil.copy(image, image_dst)
        shutil.copy(label, label_dst)
    if image in X_valid:
        image_dst = '/content/mini_data/Validation/pig_image'
        label_dst = '/content/mini_data/Validation/pig_label'
        shutil.copy(image, image_dst)
        shutil.copy(label, label_dst)

# 소 Data 이동
for image, label in tqdm(zip(train_cow_image, train_cow_label)):
    if image in X_train:
        image_dst = '/content/mini_data/Training/cow_image'
        label_dst = '/content/mini_data/Training/cow_label'
        shutil.copy(image, image_dst)
        shutil.copy(label, label_dst)
    if image in X_valid:
        image_dst = '/content/mini_data/Validation/cow_image'
        label_dst = '/content/mini_data/Validation/cow_label'
        shutil.copy(image, image_dst)
        shutil.copy(label, label_dst)

4303it [00:11, 381.67it/s]
12152it [01:04, 187.29it/s]


In [21]:
train_pig_image_sub = glob('/content/mini_data/Training/pig_image/*.jpg')
train_cow_image_sub = glob('/content/mini_data/Training/cow_image/*.jpg')
train_pig_label_sub = glob('/content/mini_data/Training/pig_label/*.json')
train_cow_label_sub = glob('/content/mini_data/Training/cow_label/*.json')

valid_pig_image_sub = glob('/content/mini_data/Validation/pig_image/*.jpg')
valid_cow_image_sub = glob('/content/mini_data/Validation/cow_image/*.jpg')
valid_pig_label_sub = glob('/content/mini_data/Validation/pig_label/*.json')
valid_cow_label_sub = glob('/content/mini_data/Validation/cow_label/*.json')

train_pig_label_sub.sort()
train_pig_image_sub.sort()
train_cow_label_sub.sort()
train_cow_image_sub.sort()

valid_pig_label_sub.sort()
valid_pig_image_sub.sort()
valid_cow_label_sub.sort()
valid_cow_image_sub.sort()

In [22]:
print('Train Pig Data의 개수:', len(train_pig_image_sub))
print('Train Pig Data의 개수:', len(train_pig_label_sub))

print('Valid Pig Data의 개수:', len(valid_pig_image_sub))
print('Valid Pig Data의 개수:', len(valid_pig_label_sub))

print('Train Cow Data의 개수', len(train_cow_image_sub))
print('Train Cow Data의 개수', len(train_cow_label_sub))

print('Valid Cow Data의 개수', len(valid_cow_image_sub))
print('Valid Cow Data의 개수', len(valid_cow_label_sub))

Train Pig Data의 개수: 1721
Train Pig Data의 개수: 1721
Valid Pig Data의 개수: 861
Valid Pig Data의 개수: 861
Train Cow Data의 개수 4861
Train Cow Data의 개수 4861
Valid Cow Data의 개수 2430
Valid Cow Data의 개수 2430


In [23]:
# Data를 GoogleDrive tar형태로 압축
!tar -cvf /content/drive/MyDrive/AI_Factory/data/Train_Dataset_mini.tar /content/mini_data/Training
!tar -cvf /content/drive/MyDrive/AI_Factory/data/Valid_Dataset_mini.tar /content/mini_data/Validation

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
/content/mini_data/Validation/cow_label/livestock_cow_bbox_010486.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_009198.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_013289.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_011089.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_002452.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_003887.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_012032.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_013598.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_009150.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_000011.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_010693.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_000646.json
/content/mini_data/Validation/cow_label/livestock_cow_bbox_003115.json
/content/mini_data/Validati