In [None]:
import random
import pandas as pd
import numpy as np
import os
import re
import glob
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler, SubsetRandomSampler

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from tqdm.auto import tqdm


from torchvision.transforms.functional import to_pil_image

import warnings
warnings.filterwarnings(action='ignore') 

# 기본 설정

In [None]:
# Mount Google Drive (Colab only) so that files under /content/gdrive become readable.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

device(type='cuda')

In [None]:
# Hyper-parameters shared by the cells of this notebook.
CFG = dict(
    IMG_SIZE=224,
    EPOCHS=10,
    LEARNING_RATE=3e-4,
    BATCH_SIZE=32,
    SEED=41,
)

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning (benchmark=True) may pick different convolution algorithms
    # from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the seed for all RNGs

# 데이터 로드

In [None]:
# Dataset root on the mounted Drive; the glob patterns built from it look for
# `train/` and `aug/` sub-folders. The trailing slash is required because the
# patterns are formed by plain string concatenation.
base_dir = '/content/gdrive/MyDrive/open (2)/'

In [None]:
# Entries directly under train/ (one per class folder, judging by the label parsing below).
train_folder = glob.glob(f'{base_dir}train/*')

In [None]:
# glob returns paths in filesystem-dependent order; sort them so that the row
# order of the resulting DataFrame (and any seeded split of it) is reproducible.
train_img_list = sorted(glob.glob(base_dir + 'train/*/*'))

In [None]:
# glob returns paths in filesystem-dependent order; sort them so that the row
# order of the resulting DataFrame is reproducible across runs and machines.
aug_img_list = sorted(glob.glob(base_dir + 'aug/*/*'))

In [None]:
# Original images: one row per file, labelled with the name of its parent folder.
train = pd.DataFrame({
    'img_path': train_img_list,                                          # image path
    'label': [str(path).split('/')[-2] for path in train_img_list],     # parent folder name
})

In [None]:
# Additional images produced with WeightedRandomSampler and augmentation;
# one row per file, labelled with the name of its parent folder.
aug = pd.DataFrame({
    'img_path': aug_img_list,
    'label': [str(path).split('/')[-2] for path in aug_img_list],
})

In [None]:
# Replace the class-folder names of the original images with integer ids.
# NOTE(review): labels in `aug` are taken from its folder names and cast with
# astype(int) later on; presumably those numbers match the ids assigned by this
# encoder - confirm against how the aug folders were generated.
le = preprocessing.LabelEncoder()
train['label'] = le.fit_transform(train['label'])

In [None]:
# Training is planned on the original and the additional images combined.
# Each frame keeps its own row index here, so index values repeat in `df`.
df = pd.concat([train,aug])

In [None]:
# Labels from `aug` are folder-name strings while those from `train` are encoded
# integers; cast the combined column to int so every label has the same type.
df['label'] = df['label'].astype(int)

In [None]:
train

Unnamed: 0,img_path,label
0,/content/gdrive/MyDrive/open (2)/train/가구수ᄌ...,0
1,/content/gdrive/MyDrive/open (2)/train/가구수ᄌ...,0
2,/content/gdrive/MyDrive/open (2)/train/가구수ᄌ...,0
3,/content/gdrive/MyDrive/open (2)/train/가구수ᄌ...,0
4,/content/gdrive/MyDrive/open (2)/train/가구수ᄌ...,0
...,...,...
3452,/content/gdrive/MyDrive/open (2)/train/틈새과...,16
3453,/content/gdrive/MyDrive/open (2)/train/틈새과...,16
3454,/content/gdrive/MyDrive/open (2)/train/틈새과...,16
3455,/content/gdrive/MyDrive/open (2)/train/틈새과...,16


In [None]:
aug

Unnamed: 0,img_path,label
0,/content/gdrive/MyDrive/open (2)/aug/0/20.jpg,0
1,/content/gdrive/MyDrive/open (2)/aug/0/37.jpg,0
2,/content/gdrive/MyDrive/open (2)/aug/0/63.jpg,0
3,/content/gdrive/MyDrive/open (2)/aug/0/64.jpg,0
4,/content/gdrive/MyDrive/open (2)/aug/0/71.jpg,0
...,...,...
10366,/content/gdrive/MyDrive/open (2)/aug/18/10319.jpg,18
10367,/content/gdrive/MyDrive/open (2)/aug/18/10321.jpg,18
10368,/content/gdrive/MyDrive/open (2)/aug/18/10335.jpg,18
10369,/content/gdrive/MyDrive/open (2)/aug/18/10344.jpg,18


In [None]:
df

Unnamed: 0,img_path,label
0,/content/gdrive/MyDrive/open (2)/train/가구수ᄌ...,0
1,/content/gdrive/MyDrive/open (2)/train/가구수ᄌ...,0
2,/content/gdrive/MyDrive/open (2)/train/가구수ᄌ...,0
3,/content/gdrive/MyDrive/open (2)/train/가구수ᄌ...,0
4,/content/gdrive/MyDrive/open (2)/train/가구수ᄌ...,0
...,...,...
10366,/content/gdrive/MyDrive/open (2)/aug/18/10319.jpg,18
10367,/content/gdrive/MyDrive/open (2)/aug/18/10321.jpg,18
10368,/content/gdrive/MyDrive/open (2)/aug/18/10335.jpg,18
10369,/content/gdrive/MyDrive/open (2)/aug/18/10344.jpg,18


# Valid데이터 분리

In [None]:
# Hold out a stratified 30% of the original images to use for validation.
train, val = train_test_split(
    train,
    test_size=0.3,
    stratify=train['label'],
    random_state=CFG['SEED'],
)

In [None]:
val

Unnamed: 0,img_path,label
133,/content/gdrive/MyDrive/open (2)/train/걸레바...,1
3119,/content/gdrive/MyDrive/open (2)/train/훼손/1...,18
12,/content/gdrive/MyDrive/open (2)/train/걸레바...,1
1567,/content/gdrive/MyDrive/open (2)/train/오염/4...,10
3321,/content/gdrive/MyDrive/open (2)/train/훼손/1...,18
...,...,...
1949,/content/gdrive/MyDrive/open (2)/train/터짐/9...,15
1367,/content/gdrive/MyDrive/open (2)/train/오염/2...,10
2286,/content/gdrive/MyDrive/open (2)/train/훼손/3...,18
486,/content/gdrive/MyDrive/open (2)/train/꼬임/5...,3


# 커스텀 데이터셋, DataLoader 설정

In [None]:
class CustomDataset(Dataset):
    """Dataset that loads images from disk with OpenCV.

    Args:
        img_path_list: Indexable sequence of image file paths.
        label: Indexable sequence of labels aligned with ``img_path_list``,
            or ``None`` for unlabeled data.
        transforms: Optional callable invoked as ``transforms(image=image)``
            whose result is indexed with ``'image'``.
    """

    def __init__(self, img_path_list, label, transforms=None):
        self.img_path_list = img_path_list
        self.label = label
        self.transforms = transforms

    def __getitem__(self, index):
        """Return ``(image, label)``, or only ``image`` when no labels were given.

        Raises:
            FileNotFoundError: If OpenCV cannot read the file at the stored path.
        """
        img_path = self.img_path_list[index]

        image = cv2.imread(img_path)
        # cv2.imread signals a missing or unreadable file by returning None
        # instead of raising; fail here with the offending path.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        # OpenCV decodes to BGR, while the normalisation statistics used by the
        # transforms in this notebook are given in RGB order.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transforms is not None:
            image = self.transforms(image=image)['image']

        if self.label is None:
            return image
        return image, self.label[index]

    def __len__(self):
        """Number of image paths held by the dataset."""
        return len(self.img_path_list)

In [None]:
def _resize_normalize_to_tensor():
    """Build a fresh resize -> normalize -> tensor pipeline."""
    side = CFG['IMG_SIZE']
    steps = [
        A.Resize(side, side),
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
        ToTensorV2(),
    ]
    return A.Compose(steps)


# Training and evaluation use the same steps (no random augmentation here),
# but each gets its own pipeline instance.
train_transform = _resize_normalize_to_tensor()
test_transform = _resize_normalize_to_tensor()

In [None]:
# NOTE(review): `df` was assembled from all original images before the
# validation split, so every row of `val` is also part of this training set and
# validation scores will be optimistic - confirm that this is intended.
train_set = CustomDataset(df['img_path'].values, df['label'].values, train_transform)
# Shuffle the training data: `df` is a plain concatenation of frames listed
# folder by folder, so unshuffled batches would consist of a single class.
train_loader = DataLoader(train_set, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=8,pin_memory=True)

valid_set = CustomDataset(val['img_path'].values, val['label'].values, test_transform)
valid_loader = DataLoader(valid_set, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=8,pin_memory=True)
valid_loader = DataLoader(valid_set, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=8,pin_memory=True)


In [None]:
from time import time
import multiprocessing as mp

# Measure how long two passes over the validation images take for several
# worker counts, to pick a num_workers value that speeds up data loading.
# The benchmark uses its own names (`bench_set`, `bench_loader`): rebinding
# `train_loader` here would leave it pointing at the validation images with a
# batch size of 64 for every cell that runs afterwards.
# NOTE(review): the first configuration measured presumably also pays for cold
# file reads, so its timing may not be comparable - consider a warm-up pass.
bench_set = CustomDataset(val['img_path'].values, val['label'].values, test_transform)
for num_workers in range(2, mp.cpu_count(), 2):
    bench_loader = DataLoader(bench_set, shuffle=False, num_workers=num_workers, batch_size=64, pin_memory=True)
    start = time()
    for _epoch in tqdm(range(1, 3)):
        for _batch in tqdm(bench_loader):
            pass
    end = time()
    print("Finish with:{} second, num_workers={}".format(end - start, num_workers))

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Finish with:191.38342189788818 second, num_workers=2


  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Finish with:8.425860404968262 second, num_workers=4


  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Finish with:7.0235395431518555 second, num_workers=6


  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Finish with:5.930846214294434 second, num_workers=8


  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Finish with:5.8551552295684814 second, num_workers=10
