In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
REF_PATH = '/content/drive/MyDrive/Github/10_도배하자유형분류'
os.chdir(REF_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from lib.base import mkdir, label_encoder, label_decoder

# Setting

<br>

## Import

In [3]:
import gc
gc.collect()

35

In [4]:
import pandas as pd
import numpy as np
import glob
import cv2

import torch

import albumentations as A

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm, trange

from PIL import Image
import torchvision.utils as vutils
from joblib import Parallel, delayed, parallel_backend

<br>

## Hyperparameter Setting

In [5]:
CFG = {
    'IMG_SIZE':224, #224,320,384
    'EPOCHS':128,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':16,
    'SEED':0,
}

<br></br>

# Data Load

<br>

## Data Pre-processing

In [6]:
# !unzip\
#   /content/drive/MyDrive/Github/10_도배하자유형분류/data/open.zip\
#   -d\
#   /content/drive/MyDrive/Github/10_도배하자유형분류/data/

In [7]:
all_img_list = glob.glob(REF_PATH+'/data/train/*/*')

In [8]:
df = pd.DataFrame(columns=['img_path', 'label'])
df['img_path'] = all_img_list
df['label'] = df['img_path'].apply(lambda x : x.split('/')[-2].replace('.png',''))

In [9]:
train_data, val_data, _, _ = train_test_split(df, df['label'], test_size=0.3, stratify=df['label'], random_state=CFG['SEED'])

<br>

<br>

## Label-Encoding

In [10]:
le = preprocessing.LabelEncoder()
train_data['label'] = le.fit_transform(train_data['label'])
val_data  ['label'] = le.transform(val_data['label'])

<br></br>

# Augmentation을 통한 이미지 생성

In [11]:
# 이미지 생성을 위한 augmentations 정의
augmentations_essential = A.Compose([
    A.HorizontalFlip(p=1),            # 이미지를 좌우로 뒤집는 함수
])

augmentations_selective = A.SomeOf([
    A.RandomBrightnessContrast(p=1),  # 이미지의 밝기와 대조를 무작위로 조정하는 함수
    A.OneOf([
        A.MotionBlur(),               # 이미지에 블러 효과를 적용하는 함수
        A.MedianBlur(),               # 이미지에 미디언 필터를 적용하는 함수
        A.Blur(),                     # 이미지에 가우시안 필터를 적용하는 함수
    ], p=1),
    A.OneOf([
        A.IAAAdditiveGaussianNoise(), # 이미지에 가우시안 노이즈를 추가하는 함수
        A.GaussNoise(),               # 이미지에 가우시안 노이즈를 추가하는 함수
    ], p=1),
    A.ColorJitter(                    # 이미지의 채도, 명도, 대조, 색조 등을 랜덤하게 조정하는 함수
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.2,
        p=1,
    ),
    A.ElasticTransform(               # 이미지를 탄성 변환시키는 함수
        alpha=1,
        sigma=50,
        alpha_affine=50,
        p=1,
    ),
    A.CLAHE(                          # 이미지의 대비를 향상시키는 함수
        p=1,
        clip_limit=2.0,
        tile_grid_size=(8, 8),
    ),
    A.RandomGamma(                    # 이미지의 채도를 향상시키는 함수
        p=1,
        gamma_limit=(150, 255),
    ),
    # A.Flip(p=1),                      # 이미지를 상하/좌우로 뒤집는 함수
    # A.Transpose(p=p),                 # 이미지를 대각선을 기준으로 뒤집는 함수
    # A.RandomRotate90(p=p),            # 90도 단위로 회전하는 함수
    # A.ShiftScaleRotate(               # 이미지를 이동, 확대/축소, 회전하는 함수
    #     shift_limit=0.0625,
    #     scale_limit=0.2,
    #     rotate_limit=45,
    #     p=0.2),
    A.OneOf([
        A.RandomSnow(),                 # 이미지에 눈 모양의 노이즈를 추가하는 함수
        A.RandomRain(),                 # 이미지에 비 모양의 노이즈를 추가하는 함수
    ])
],n=4)

augmentations = {
    'essential' : augmentations_essential,
    'selective'  : augmentations_selective,
}



In [54]:
# augmentation의 조합의 수
from math import comb
maximum = comb(len(augmentations['selective']),augmentations['selective'].n)
print('maximum :',maximum)

maximum : 70


In [55]:
CFG['a'] = 30  # 20
CFG['b'] = 0.2 # 0.2

In [56]:
def n_generate_fn(freq,a,b):
    percentage = freq / freq.sum()
    return CFG['a'] * (percentage**-CFG['b']) / freq

In [57]:
from math import comb
def get_n_generate(label_list,ref_percentage,n_generate_fn,augmentations,check=False):

    # 비중이 ref_percentage보다 작은 대상에 대해서, 증강시킬 개수를 np.sqrt(1/비중)으로 계산
    freq = pd.Series(label_list).value_counts()
    percentage = freq / freq.sum()
    augmentation_info = pd.Series([p if p<ref_percentage else 999999 for p in percentage],index=percentage.index)
    augmentation_info = n_generate_fn(freq,CFG['a'],CFG['b'])
    augmentation_label = augmentation_info.index.tolist()
    augmentation_n = augmentation_info.values
    augmentation_n = [int(n) for n in augmentation_n] # float -> int

    # 최대 생성개수 제한 넣기
    # (OneOf가 있어서 조합의 수가 더 커지긴 하지만, 적당히 주기위해서 더이상 계산하지않음)
    max_n_augmentation = comb(len(augmentations['selective']),augmentations['selective'].n)
    augmentation_n = [min(n,max_n_augmentation) for n in augmentation_n]
    print('> a={}, b={}, augmentation maximum={}'.format(CFG['a'],CFG['b'],maximum))
    
    # 최종 생성개수 확인
    n_df = pd.Series({l:n for f,n,l in zip(freq,augmentation_n,augmentation_label)})\
        .to_frame().reset_index().rename(columns={'index':'label',0:'n_each_generate'})
    asis_df = pd.Series({l:f for f,n,l in zip(freq,augmentation_n,augmentation_label)})\
        .to_frame().reset_index().rename(columns={'index':'label',0:'freq_asis'})
    tobe_df = pd.Series({l:f*(n+1) for f,n,l in zip(freq,augmentation_n,augmentation_label)})\
        .to_frame().reset_index().rename(columns={'index':'label',0:'freq_tobe'})

    total_df = pd.merge(asis_df,tobe_df,how='outer',on='label')
    total_df = pd.merge(total_df,n_df,how='outer',on='label')
    total_df['n_generated'] = total_df['freq_tobe']-total_df['freq_asis']
    total_df['p_asis'] = 100 * total_df['freq_asis'] / total_df['freq_asis'].sum()
    total_df['p_tobe'] = 100 * total_df['freq_tobe'] / total_df['freq_tobe'].sum()
    total_df = total_df.sort_values('freq_asis').reset_index(drop=True)
    
    print('> Asis: {:,}, Tobe: {:,}, Generated: {:,}\n'\
        .format(total_df.freq_asis.sum(), total_df.freq_tobe.sum(), total_df.freq_tobe.sum()-total_df.freq_asis.sum()))
    
    if not check:
        return np.array(augmentation_label),np.array(augmentation_n),total_df.freq_tobe.sum()-total_df.freq_asis.sum()
    else:
        display(total_df.round(2))

In [58]:
get_n_generate(train_data.label,0.01,n_generate_fn,augmentations,True)

> a=30, b=0.2, augmentation maximum=70
> Asis: 2,419, Tobe: 3,168, Generated: 749



Unnamed: 0,label,freq_asis,freq_tobe,n_each_generate,n_generated,p_asis,p_tobe
0,8,2,126,62,124,0.08,3.98
1,16,4,112,27,108,0.17,3.54
2,0,8,96,11,88,0.33,3.03
3,4,10,90,8,80,0.41,2.84
4,13,12,96,7,84,0.5,3.03
5,12,15,90,5,75,0.62,2.84
6,14,19,95,4,76,0.79,3.0
7,17,36,72,1,36,1.49,2.27
8,5,38,76,1,38,1.57,2.4
9,9,40,80,1,40,1.65,2.53


In [59]:
class ImageGenerateByAugmentation:
    def __init__(self, img_path_list, label_list, augmentations, ref_percentage=0.05):
        self.img_path_list = np.array(img_path_list)
        self.label_list = np.array(label_list)
        self.augmentations = augmentations
        self.ref_percentage = ref_percentage
        
        # 조합확인
        self.augmentation_label, self.augmentation_n, self.n_image_total = get_n_generate(
            self.label_list,self.ref_percentage,n_generate_fn,self.augmentations,False)
        self.generate_augmentation_label = [label for n,label in zip(self.augmentation_n,self.augmentation_label) if n>0]
        
        # 상위폴더 생성
        mkdir('./out/augmentation_images')

        # augmentation으로 생성되는 이미지를 저장하기위한 라벨 폴더생성
        for label in self.generate_augmentation_label:
            mkdir('./out/augmentation_images/{}'.format(label))

        self.generate_img_path   = [path for path,label in zip(self.img_path_list,self.label_list)
                                    if label in self.generate_augmentation_label]
        self.generate_label_list = [label for path,label in zip(self.img_path_list,self.label_list)
                                    if label in self.generate_augmentation_label]

    def generate(self,n_jobs=1):
        self.generated_path = []
        self.n_image = 0
        self.pbar = trange(len(self.generate_img_path),desc='[Augmentation] Generate',leave=False,position=0)

        if n_jobs==1:
            for i in self.pbar:
                self._generate_iteration(i)
        else:
            with parallel_backend('threading', n_jobs=n_jobs):
                Parallel()(
                    delayed(self._generate_iteration)(i)
                    for i in self.pbar
                )
    
    def _generate_iteration(self,i):

        # (1) raw image
        label = self.generate_label_list[i]
    
        # (2) augmentationed image
        image = cv2.imread(self.generate_img_path[i])
        augmentation_idx = np.where(self.augmentation_label==label)[0][0]
        n_augmentation = self.augmentation_n[augmentation_idx]
        for _ in range(n_augmentation):
            _augmentations = A.Compose([
                self.augmentations['essential'],
                self.augmentations['selective'],
            ])
            new_image = _augmentations(image=image)['image']
            Image.fromarray(new_image).save('./out/augmentation_images/{}/{}.png'.format(label,self.n_image))
            self.n_image+=1

            progress = '[Augmentation] Generate: {}/{}, Label: {}'.format(self.n_image,self.n_image_total,label)
            self.pbar.set_description(progress)

            self.generated_path.append('./out/augmentation_images/{}/{}.png'.format(label,self.n_image))

In [60]:
# 기존파일 삭제
import shutil
shutil.rmtree(f'./out/augmentation_images')

In [61]:
image_generator = ImageGenerateByAugmentation(
    img_path_list=train_data.img_path,
    label_list=train_data.label,
    augmentations=augmentations,
    ref_percentage=0.05,
)

image_generator.generate(n_jobs=1)

> a=30, b=0.2, augmentation maximum=70
> Asis: 2,419, Tobe: 3,168, Generated: 749

folder created: ./out/augmentation_images
folder created: ./out/augmentation_images/9
folder created: ./out/augmentation_images/5
folder created: ./out/augmentation_images/17
folder created: ./out/augmentation_images/14
folder created: ./out/augmentation_images/12
folder created: ./out/augmentation_images/13
folder created: ./out/augmentation_images/4
folder created: ./out/augmentation_images/0
folder created: ./out/augmentation_images/16
folder created: ./out/augmentation_images/8




In [62]:
pd.Series(image_generator.generated_path).value_counts()

./out/augmentation_images/9/1.png       1
./out/augmentation_images/16/504.png    1
./out/augmentation_images/8/495.png     1
./out/augmentation_images/8/496.png     1
./out/augmentation_images/8/497.png     1
                                       ..
./out/augmentation_images/4/253.png     1
./out/augmentation_images/4/254.png     1
./out/augmentation_images/4/255.png     1
./out/augmentation_images/4/256.png     1
./out/augmentation_images/5/749.png     1
Length: 749, dtype: int64

In [63]:
import glob

generated_image_path = []
for label_path in glob.glob('./out/augmentation_images/*'):
    for img_path in glob.glob(label_path+'/*'):
        generated_image_path.append(img_path)

In [64]:
len(generated_image_path)

749