In [1]:
from tqdm import tqdm

import pandas as pd
import numpy as np
import json
import math
import os
import random

In [2]:
# Google Drive mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
if not os.path.exists('data'):
    os.mkdir('data')
if not os.path.exists('data/Training'):
    os.mkdir('data/Training')

In [4]:
# Dataset unzip
!unzip -q /content/drive/MyDrive/AI_Factory/data/[원천]돼지_bbox.zip -d /content/data/Training/pig_image
!unzip -q /content/drive/MyDrive/AI_Factory/data/[원천]소_bbox.zip -d /content/data/Training/cow_image

!unzip -q /content/drive/MyDrive/AI_Factory/data/[라벨]돼지_bbox.zip -d /content/data/Training/pig_label
!unzip -q /content/drive/MyDrive/AI_Factory/data/[라벨]소_bbox.zip -d /content/data/Training/cow_label

In [5]:
import os
from glob import glob

train_pig_image = glob('/content/data/Training/pig_image/*.jpg')
train_cow_image = glob('/content/data/Training/cow_image/*.jpg')
train_pig_label = glob('/content/data/Training/pig_label/*.json')
train_cow_label = glob('/content/data/Training/cow_label/*.json')

train_pig_label.sort()
train_pig_image.sort()
train_cow_label.sort()
train_cow_image.sort()

### MultiLabelSplited Dataset 만들기

In [6]:
# pig와 cow를 하나로 통일
total_image = train_pig_image + train_cow_image
total_label = train_pig_label + train_cow_label

In [7]:
# 전체 Dataset을 Pandas DataFrame형태로 변형 (MultilabelStratifiedKFold를 적용하기 위해)
data = pd.DataFrame([x for x in zip(total_image, total_label)])
data.rename(columns = {0 : 'image',
                       1 : 'label'}, 
            inplace = True)

#### One-Hot Encoding 
- pig/cow
- little/medium/many

In [8]:
def box_count_pig(num):
    """
    돼지 이미지에서 가지고 있는 Box의 수를 기준으로 Pig Dataset을 분류한다.
    Args: 
        num: bounding box의 개수
    Return:
        "little" - bounding box의 개수가 8개 미만일 경우
        "medium" - bounding box의 개수가 8~13일 경우
        "many" - bounding box의 개수가 13이상일 경우
    """

    if num < 8:
        return 'little'
    elif num < 13:
        return 'medium'
    else:
        return 'many'

def box_count_cow(num):
    """
    소 이미지에서 가지고 있는 Box의 수를 기준으로 Pig Dataset을 분류한다.
    Args: 
        num: bounding box의 개수
    Return:
        "little" - bounding box의 개수가 4개 미만일 경우
        "medium" - bounding box의 개수가 4~6일 경우
        "many" - bounding box의 개수가 6이상일 경우
    """
    if num < 4:
        return 'little'
    if num < 6:
        return 'medium'
    else:
        return 'many'

In [9]:
cow = []
pig = []
many = []
medium = []
little = []
for i in range(len(data)):
    label = data['label'][i]
    with open(label, 'r') as f:
        label_data = json.load(f)

    # cow인지 pig인지 list에 담기
    if 'cow' in data['label'][i]:
        cow.append(1)
        pig.append(0)
    else:
        pig.append(1)
        cow.append(0)

    # Box의 개수에 따라 many, medium, little의 list에 담기
    box_info = label_data['label_info']['annotations']
    box_count = len(box_info)

    if 'cow' in data['label'][i]:
        count = box_count_cow(box_count)
        if count == 'little':
            little.append(1)
            many.append(0)
            medium.append(0)
        elif count == 'medium':
            little.append(0)
            many.append(1)
            medium.append(0)
        else:
            little.append(0)
            many.append(0)
            medium.append(1)
    else:
        count = box_count_pig(box_count)
        if count == 'little':
            little.append(1)
            many.append(0)
            medium.append(0)
        elif count == 'medium':
            little.append(0)
            many.append(1)
            medium.append(0)
        else:
            little.append(0)
            many.append(0)
            medium.append(1)

In [10]:
data['cow'] = cow
data['pig'] = pig
data['little'] = little
data['many'] = many
data['medium'] = medium

In [11]:
data.head()

Unnamed: 0,image,label,cow,pig,little,many,medium
0,/content/data/Training/pig_image/livestock_pig...,/content/data/Training/pig_label/livestock_pig...,0,1,0,0,1
1,/content/data/Training/pig_image/livestock_pig...,/content/data/Training/pig_label/livestock_pig...,0,1,0,1,0
2,/content/data/Training/pig_image/livestock_pig...,/content/data/Training/pig_label/livestock_pig...,0,1,0,0,1
3,/content/data/Training/pig_image/livestock_pig...,/content/data/Training/pig_label/livestock_pig...,0,1,0,0,1
4,/content/data/Training/pig_image/livestock_pig...,/content/data/Training/pig_label/livestock_pig...,0,1,0,1,0


#### Dataset 나누기

In [12]:
X = data['image'].to_numpy()
Y = data[['cow', 'pig', 'little', 'many', 'medium']].to_numpy(dtype=np.float32)

In [13]:
# MultilabelStratifiedKFold를 사용하기 위하여 library 다운로드
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7


In [14]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Train 50: Valid 50
random.seed(42) # shuffle에서 random성을 제어하기 위하여
mlsk = MultilabelStratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# 하나의 Train으로 합치기
for i, (train_index, valid_index) in enumerate(mlsk.split(X,Y)):
    if i == 0:
        X_train_1, X_valid_1 = X[train_index], X[valid_index]
        y_train_1, y_valid_1 = Y[train_index], Y[valid_index]
    else:
        X_train_2, X_valid_2 = X[train_index], X[valid_index]
        y_train_2, y_valid_2 = Y[train_index], Y[valid_index]
X_train = list(X_train_1) + list(X_train_2)
y_train = list(y_train_1) + list(y_train_2)

print("TrainSet의 image의 개수:", len(X_train))
print("TrainSet의 label의 개수:", len(y_train))

TrainSet의 image의 개수: 16455
TrainSet의 label의 개수: 16455


#### 나누어진 Dataset 확인

In [15]:
train_pig_count = 0
train_cow_count = 0
train_cow_many_count = 0
train_cow_medium_count = 0
train_cow_little_count = 0
train_pig_many_count = 0
train_pig_medium_count = 0
train_pig_little_count = 0

for y in y_train:
    if y[0] == 0.:
        train_pig_count += 1
        if y[2] == 1:
            train_pig_little_count += 1
        elif y[3] == 1:
            train_pig_many_count += 1
        elif y[4] == 1:
            train_pig_medium_count += 1
    elif y[0] == 1.:
        train_cow_count += 1
        if y[2] == 1:
            train_cow_little_count += 1
        elif y[3] == 1:
            train_cow_many_count += 1
        elif y[4] == 1:
            train_cow_medium_count += 1

In [16]:
print("소  : Dataset 나누기 전 -> many", len(data[data['cow'] == 1][data['many'] == 1]))
print("소  : Dataset 나누기 전 -> little", len(data[data['cow'] == 1][data['little'] == 1]))
print("소  : Dataset 나누기 전 -> medium", len(data[data['cow'] == 1][data['medium'] == 1]))
print()
print("소  : Dataset 나누기 후 -> many", train_cow_many_count)
print("소  : Dataset 나누기 후 -> little", train_cow_little_count)
print("소  : Dataset 나누기 후 -> medium", train_cow_medium_count)
print()
print("돼지: Dataset 나누기 전 -> many", len(data[data['pig'] == 1][data['many'] == 1]))
print("돼지: Dataset 나누기 전 -> little", len(data[data['pig'] == 1][data['little'] == 1]))
print("돼지: Dataset 나누기 전 -> medium", len(data[data['pig'] == 1][data['medium'] == 1]))
print()
print("돼지: Dataset 나누기 후 -> many", train_pig_many_count)
print("돼지: Dataset 나누기 후 -> little", train_pig_little_count)
print("돼지: Dataset 나누기 후 -> medium", train_pig_medium_count)

소  : Dataset 나누기 전 -> many 3413
소  : Dataset 나누기 전 -> little 7383
소  : Dataset 나누기 전 -> medium 1356

소  : Dataset 나누기 후 -> many 3413
소  : Dataset 나누기 후 -> little 7383
소  : Dataset 나누기 후 -> medium 1356

돼지: Dataset 나누기 전 -> many 2749
돼지: Dataset 나누기 전 -> little 506
돼지: Dataset 나누기 전 -> medium 1048

돼지: Dataset 나누기 후 -> many 2749
돼지: Dataset 나누기 후 -> little 506
돼지: Dataset 나누기 후 -> medium 1048


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


### 사용할 Dataset 만들기

In [17]:
if not os.path.exists('s_data'):
    os.mkdir('s_data')
if not os.path.exists('s_data/Training'):
    os.mkdir('s_data/Training')
if not os.path.exists('s_data/Training/cow_image'):
    os.mkdir('s_data/Training/cow_image')
if not os.path.exists('s_data/Training/cow_label'):
    os.mkdir('s_data/Training/cow_label')
if not os.path.exists('s_data/Training/pig_image'):
    os.mkdir('s_data/Training/pig_image')
if not os.path.exists('s_data/Training/pig_label'):
    os.mkdir('s_data/Training/pig_label')

In [18]:
import shutil

# 돼지 Data 이동
for image, label in tqdm(zip(train_pig_image, train_pig_label)):
    if image in X_train:
        image_dst = '/content/s_data/Training/pig_image'
        label_dst = '/content/s_data/Training/pig_label'
        shutil.copy(image, image_dst)
        shutil.copy(label, label_dst)
    else:
        image_dst = '/content/s_data/Validation/pig_image'
        label_dst = '/content/s_data/Validation/pig_label'
        shutil.copy(image, image_dst)
        shutil.copy(label, label_dst)

# 소 Data 이동
for image, label in tqdm(zip(train_cow_image, train_cow_label)):
    if image in X_train:
        image_dst = '/content/s_data/Training/cow_image'
        label_dst = '/content/s_data/Training/cow_label'
        shutil.copy(image, image_dst)
        shutil.copy(label, label_dst)
    else:
        image_dst = '/content/s_data/Validation/cow_image'
        label_dst = '/content/s_data/Validation/cow_label'
        shutil.copy(image, image_dst)
        shutil.copy(label, label_dst)

4303it [00:33, 130.13it/s]
12152it [01:50, 109.65it/s]


In [19]:
train_pig_image_sub = glob('/content/s_data/Training/pig_image/*.jpg')
train_cow_image_sub = glob('/content/s_data/Training/cow_image/*.jpg')
train_pig_label_sub = glob('/content/s_data/Training/pig_label/*.json')
train_cow_label_sub = glob('/content/s_data/Training/cow_label/*.json')

train_pig_label_sub.sort()
train_pig_image_sub.sort()
train_cow_label_sub.sort()
train_cow_image_sub.sort()

In [20]:
print('Train Pig Data의 개수:', len(train_pig_image_sub))
print('Train Pig Data의 개수:', len(train_pig_label_sub))

print('Train Cow Data의 개수', len(train_cow_image_sub))
print('Train Cow Data의 개수', len(train_cow_label_sub))

Train Pig Data의 개수: 4303
Train Pig Data의 개수: 4303
Train Cow Data의 개수 12152
Train Cow Data의 개수 12152


In [21]:
# Data를 GoogleDrive tar형태로 압축
!tar -cvf /content/drive/MyDrive/AI_Factory/data/Train_Dataset_Full.tar /content/s_data/Training

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
/content/s_data/Training/pig_label/livestock_pig_bbox_004312.json
/content/s_data/Training/pig_label/livestock_pig_bbox_004325.json
/content/s_data/Training/pig_label/livestock_pig_bbox_000538.json
/content/s_data/Training/pig_label/livestock_pig_bbox_003263.json
/content/s_data/Training/pig_label/livestock_pig_bbox_001438.json
/content/s_data/Training/pig_label/livestock_pig_bbox_001910.json
/content/s_data/Training/pig_label/livestock_pig_bbox_003496.json
/content/s_data/Training/pig_label/livestock_pig_bbox_003672.json
/content/s_data/Training/pig_label/livestock_pig_bbox_001454.json
/content/s_data/Training/pig_label/livestock_pig_bbox_001563.json
/content/s_data/Training/pig_label/livestock_pig_bbox_004770.json
/content/s_data/Training/pig_label/livestock_pig_bbox_004667.json
/content/s_data/Training/pig_label/livestock_pig_bbox_001441.json
/content/s_data/Training/pig_label/livestock_pig_bbox_003098.json
/content/s_data/Training/p