# Remake Dataset
지금까지는 카테고리별 obj 파일을 생성했다.<br/>
학습의 용이성을 위해 카테고리를 합친 형태의 파일을 생성할 것이다.

---

### Split 비율
train : val : test = 0.8 : 0.15 : 0.05 <br/>
(train.py의 코드를 참고할 것이다.)

### 사용할 카테고리
1. Serif (182개)
2. Display (283개)
3. Handwriting (140개) <br/>
→ 각 비율에 맞춰 train.obj, val.obj, test.obj 생성

### 기타
각 obj 파일은 카테고리 순으로 저장된다(단, 카테고리 내 font는 random).

---

In [8]:
import os
import glob
import numpy as np
import pickle as pickle
import random

from tqdm import tqdm
from common.dataset import FontDataset
from common.dataset import NewFontDataset
from common.dataset import PickledImageProvider

import torch
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

In [5]:
# Fractions of each category reserved for the validation and test sets;
# the remaining 0.8 is used for training.
validation_split = .15
test_split       = .05
# NOTE(review): shuffle_dataset and random_seed are not referenced by any
# other cell visible in this notebook - presumably carried over from
# train.py; confirm before relying on them.
shuffle_dataset  = True
random_seed      = 42

- 카테고리마다 이미지를 비율에 맞춰 split한다.
- Random하게 추출한다.

In [16]:
def extract_filename(path):
    """Return the base names of every ``*.png`` file directly inside *path*.

    Args:
        path: Directory to scan, with or without a trailing path separator.

    Returns:
        A list of file names with the directory part removed, in the order
        ``glob`` yields them (not guaranteed to be sorted).
    """
    # os.path.basename strips the directory whatever its length; the former
    # fixed ``[17:]`` slice was only correct for 17-character prefixes such
    # as 'collection/img/0/'.
    pattern = os.path.join(path, '*.png')
    return [os.path.basename(match) for match in glob.iglob(pattern)]

In [18]:
# Original image file names, one list per category directory
# (0 -> serif, 2 -> display, 3 -> handwriting, going by the variable names).
original_serif       = extract_filename('collection/img/0/')
original_display     = extract_filename('collection/img/2/')
original_handwriting = extract_filename('collection/img/3/')

In [20]:
# Number of image files per category
size_serif       = len(original_serif)       # 182 * 52 = 9464
size_display     = len(original_display)     # 283 * 52 = 14716
size_handwriting = len(original_handwriting) # 140 * 52 = 7280

### split 개수 확인

In [48]:
def split_each_font(size, splt_test, splt_val):
    """Compute the cumulative test and test+validation boundaries for one set.

    Despite the name, the notebook calls this once per category.

    Args:
        size: Total number of samples.
        splt_test: Fraction of samples assigned to the test split.
        splt_val: Fraction of samples assigned to the validation split.

    Returns:
        ``(test, val)`` where ``test`` is the number of test samples and
        ``val`` is the cumulative count of test plus validation samples,
        both rounded down to ints.
    """
    cumulative_fractions = (splt_test, splt_test + splt_val)
    test_end, val_end = (
        int(np.floor(fraction * size)) for fraction in cumulative_fractions
    )
    return test_end, val_end

In [51]:
# serif split
split_test_serif, split_valid_serif = split_each_font(size_serif, test_split, validation_split)

# display split
split_test_display, split_valid_display = split_each_font(size_display, test_split, validation_split)

# handwriting split
split_test_handwriting, split_valid_handwriting = split_each_font(size_handwriting, test_split, validation_split)

In [78]:
# Show the boundaries per category: "test" is the test-sample count and
# "valid+test" is the cumulative end index of the validation range.
print('---serif---')
print('test:  ', split_test_serif)
print('valid+test: ', split_valid_serif)

print('---display---')
print('test:  ', split_test_display)
print('valid+test: ', split_valid_display)

print('---handwriting---')
print('test:  ', split_test_handwriting)
print('valid+test: ', split_valid_handwriting)

---serif---
test:   473
valid+test:  1892
---display---
test:   735
valid+test:  2943
---handwriting---
test:   364
valid+test:  1456


### 카테고리별로 split하여 파일 개수 확인

In [61]:
def get_idx_sampler(size, splt_test, splt_val):
    """Partition the indices ``0 .. size-1`` into contiguous split ranges.

    The test range comes first (``[0, splt_test)``), the validation range
    next (``[splt_test, splt_val)``), and everything from ``splt_val``
    onward is training. No shuffling is applied here.

    Args:
        size: Total number of samples.
        splt_test: End index (exclusive) of the test range.
        splt_val: End index (exclusive) of the validation range.

    Returns:
        ``(train_idxs, val_idxs, test_idxs)`` as lists of ints.
    """
    positions = range(size)
    test_idxs = list(positions[:splt_test])
    val_idxs = list(positions[splt_test:splt_val])
    train_idxs = list(positions[splt_val:])
    return train_idxs, val_idxs, test_idxs

In [70]:
# Build the train / val / test index lists for each category from the
# boundaries computed above.
train_i_serif, val_i_serif, test_i_serif = get_idx_sampler(size_serif, split_test_serif, split_valid_serif)
train_i_disp, val_i_disp, test_i_disp    = get_idx_sampler(size_display, split_test_display, split_valid_display)
train_i_hand, val_i_hand, test_i_hand    = get_idx_sampler(size_handwriting, split_test_handwriting, split_valid_handwriting)

In [77]:
# Tabulate how many samples each category contributes to every split.
print('              train val test')
print('serif:       ', len(train_i_serif), len(val_i_serif), len(test_i_serif))
print('display:     ', len(train_i_disp), len(val_i_disp), len(test_i_disp))
print('handwriting: ', len(train_i_hand), len(val_i_hand), len(test_i_hand))

              train val test
serif:        7572 1419 473
display:      11773 2208 735
handwriting:  5824 1092 364


---
기존 train.obj 파일에서는 폰트들이 랜덤하게 저장되어 있음을 확인했다. <br/>
그렇다면 그냥 일정 개수만큼의 pickle을 불러온 후 따로 저장하면 되지 않을까?

In [79]:
# Directory containing the existing per-category pickle files (train_<id>.obj).
data_dir = './dataset/integrated/'

In [83]:
# Load one dataset per category; the numeric file suffix is the category id
# (0 serif, 2 display, 3 handwriting, going by the variable names).
dataset_serif = FontDataset(PickledImageProvider(data_dir+'train_0.obj'))
dataset_disp  = FontDataset(PickledImageProvider(data_dir+'train_2.obj'))
dataset_hand  = FontDataset(PickledImageProvider(data_dir+'train_3.obj'))

processed 1000 examples
processed 2000 examples
processed 3000 examples
processed 4000 examples
processed 5000 examples
processed 6000 examples
processed 7000 examples
processed 8000 examples
processed 9000 examples
processed 10000 examples
processed 11000 examples
processed 12000 examples
processed 13000 examples
processed 14000 examples
processed 15000 examples
processed 16000 examples
processed 17000 examples
processed 18000 examples
processed 19000 examples
processed 20000 examples
processed 21000 examples
processed 22000 examples
processed 23000 examples
processed 24000 examples
processed 25000 examples
processed 26000 examples
processed 27000 examples
processed 28000 examples
processed 29000 examples
processed 30000 examples
processed 31000 examples
processed 32000 examples
processed 33000 examples
processed 34000 examples
processed 35000 examples
processed 36000 examples
processed 37000 examples
processed 38000 examples
processed 39000 examples
processed 40000 examples
processed

In [92]:
# Output directory for the merged train / val / test pickle files.
save_dir = './dataset/allfonts/'

# For each output file: its name and how many samples every category
# contributes to it.
dict_train = {
    'filename': 'train.obj',
    'serif': len(train_i_serif),
    'display': len(train_i_disp),
    'handwriting': len(train_i_hand)
}

dict_val = {
    'filename': 'val.obj',
    'serif': len(val_i_serif),
    'display': len(val_i_disp),
    'handwriting': len(val_i_hand)
}

dict_test = {
    'filename': 'test.obj',
    'serif': len(test_i_serif),
    'display': len(test_i_disp),
    'handwriting': len(test_i_hand)
}

# Category datasets in the order they are written to every output file.
dsets = [dataset_serif, dataset_disp, dataset_hand]

### train.obj

In [106]:
# Build train.obj: the leading dict_train[...] samples of every category
# dataset, written back-to-back in the order of `dsets`.
train_path = os.path.join(save_dir, dict_train['filename'])

with open(train_path, 'wb') as ft:
    count = 0
    for dset in dsets:  # serif, display, handwriting
        # One-hot category vector of the first sample -> integer category id.
        category = np.argmax(dset[0][0]['category_vector'], axis=0)

        # Number of leading samples to keep. Stays at -1 (nothing is
        # written) when the category id is not one of 0, 2 or 3.
        limit = -1
        for code, key in ((0, 'serif'), (2, 'display'), (3, 'handwriting')):
            if category == code:
                limit = dict_train[key]
                break

        for position, sample in enumerate(dset):
            if position >= limit:
                break  # everything past the train range belongs to val/test
            torch.save(sample, ft)
            count += 1
    print('{} pickles saved in train.obj'.format(count))

25169 pickles saved in train.obj


### val.obj

In [112]:
# Build val.obj: for every category dataset, skip the samples already used
# for training and write the next dict_val[...] samples.
val_path = os.path.join(save_dir, dict_val['filename'])

with open(val_path, 'wb') as ft:
    count = 0
    for dset in dsets:  # serif, display, handwriting
        # One-hot category vector of the first sample -> integer category id.
        category = np.argmax(dset[0][0]['category_vector'], axis=0)

        # `start` = number of leading train samples to skip, `length` =
        # number of validation samples. Both stay at -1 (nothing is
        # written) when the category id is not one of 0, 2 or 3.
        start, length = -1, -1
        for code, key in ((0, 'serif'), (2, 'display'), (3, 'handwriting')):
            if category == code:
                start = dict_train[key]
                length = dict_val[key]
                break
        stop = start + length

        for position, sample in enumerate(dset):
            if position < start:
                continue  # still inside the train range
            if position >= stop:
                break  # reached the test range
            torch.save(sample, ft)
            count += 1
    print('{} pickles saved in val.obj'.format(count))

4719 pickles saved in val.obj


### test.obj

In [113]:
# Build test.obj: for every category dataset, write whatever remains after
# the train and validation ranges.
test_path = os.path.join(save_dir, dict_test['filename'])

with open(test_path, 'wb') as ft:
    count = 0
    for dset in dsets:  # serif, display, handwriting
        # One-hot category vector of the first sample -> integer category id.
        category = np.argmax(dset[0][0]['category_vector'], axis=0)

        # Index of the first test sample. Stays at -1 when the category id
        # is not one of 0, 2 or 3, in which case every sample is written.
        start = -1
        for code, key in ((0, 'serif'), (2, 'display'), (3, 'handwriting')):
            if category == code:
                start = dict_train[key] + dict_val[key]
                break

        for position, sample in enumerate(dset):
            if position < start:
                continue  # train / validation sample, not part of test
            torch.save(sample, ft)
            count += 1

    print('{} pickles saved in test.obj'.format(count))

1572 pickles saved in test.obj


In [2]:
# Directory of the newly written merged files (same path as save_dir).
new_dir = './dataset/allfonts/'

In [14]:
# Reload the merged files to verify what was written.
new_train = NewFontDataset(PickledImageProvider(new_dir+'train.obj'))
new_val   = NewFontDataset(PickledImageProvider(new_dir+'val.obj'))
new_test  = NewFontDataset(PickledImageProvider(new_dir+'test.obj'))

processed 1000 examples
processed 2000 examples
processed 3000 examples
processed 4000 examples
processed 5000 examples
processed 6000 examples
processed 7000 examples
processed 8000 examples
processed 9000 examples
processed 10000 examples
processed 11000 examples
processed 12000 examples
processed 13000 examples
processed 14000 examples
processed 15000 examples
processed 16000 examples
processed 17000 examples
processed 18000 examples
processed 19000 examples
processed 20000 examples
processed 21000 examples
processed 22000 examples
processed 23000 examples
processed 24000 examples
processed 25000 examples
processed 26000 examples
processed 27000 examples
processed 28000 examples
processed 29000 examples
processed 30000 examples
processed 31000 examples
processed 32000 examples
processed 33000 examples
processed 34000 examples
processed 35000 examples
processed 36000 examples
processed 37000 examples
processed 38000 examples
processed 39000 examples
processed 40000 examples
processed

In [16]:
# Sample counts should match the "pickles saved" totals printed by the writer cells.
len(new_train), len(new_val), len(new_test)

(25169, 4719, 1572)

---
### Test 카테고리 순서 확인

#### 1. index 0 ~ 472 → **SERIF**

In [17]:
# Index 472: expected to be the last serif sample in test.obj.
new_test[472] 

({'category_vector': array([1, 0, 0, 0, 0]),
  'font': 26,
  'alphabet_vector': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0])},
 array([[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 {'category_vector': 5, 'alphabet_vector': 52, 'font_vector': 16384})

In [18]:
# Index 473: expected to be the first display sample in test.obj.
new_test[473]

({'category_vector': array([0, 0, 1, 0, 0]),
  'font': 248,
  'alphabet_vector': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0])},
 array([[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 {'category_vector': 5, 'alphabet_vector': 52, 'font_vector': 16384})

#### 2. index 473 ~ 1207 → **DISPLAY**

In [20]:
# Index 1207: expected to be the last display sample in test.obj.
new_test[1207]

({'category_vector': array([0, 0, 1, 0, 0]),
  'font': 277,
  'alphabet_vector': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0])},
 array([[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 {'category_vector': 5, 'alphabet_vector': 52, 'font_vector': 16384})

In [21]:
# Index 1208: expected to be the first handwriting sample in test.obj.
new_test[1208]

({'category_vector': array([0, 0, 0, 1, 0]),
  'font': 0,
  'alphabet_vector': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 0, 0, 0, 0, 0])},
 array([[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 {'category_vector': 5, 'alphabet_vector': 52, 'font_vector': 16384})

#### 3. index 1208 ~ 1571 → **HANDWRITING**

In [23]:
# Index 1571: the final sample of test.obj (length 1572), expected to be handwriting.
new_test[1571]

({'category_vector': array([0, 0, 0, 1, 0]),
  'font': 63,
  'alphabet_vector': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0])},
 array([[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 {'category_vector': 5, 'alphabet_vector': 52, 'font_vector': 16384})

---
# 정리
train / val / test == 0.8 / 0.15 / 0.05 <br/>
새로운 데이터 경로: dataset/allfonts/

## train.obj
- serif: 7572 개
- display: 11773 개
- handwriting: 5824 개

## val.obj
- serif: 1419 개
- display: 2208 개
- handwriting: 1092 개

## test.obj
- serif: 473 개
- display: 735 개
- handwriting: 364 개


### 참고
- train/val/test 모든 파일은 카테고리별로 indexing이 되어 있다.
- 파일의 용량이 커서 github에는 압축 파일만 올린다.