# Setting

In [1]:
# function ClickConnect(){
# console.log("Working"); 
# document.querySelector("colab-toolbar-button#connect").click() 
# }setInterval(ClickConnect, 1800000)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
REF_PATH = '/content/drive/MyDrive/Github/10_도배하자유형분류'
os.chdir(REF_PATH)

Mounted at /content/drive


<br>

## Import

In [3]:
import gc
gc.collect()

import os,sys
import random
import time

import joblib
import pandas as pd
import numpy as np
import glob
import cv2
import itertools

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from torchvision import datasets, transforms

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from tqdm.auto import tqdm, trange

from joblib import Parallel, delayed, parallel_backend

import warnings
warnings.filterwarnings(action='ignore') 

import matplotlib.pyplot as plt

In [4]:
from lib.base import mkdir, label_encoder, label_decoder

<br>

## Hyperparameter Setting

In [5]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [26]:
CFG = {
    'IMG_SIZE':224, #224,320,384
    'EPOCHS':50,
    'LEARNING_RATE':5e-4,
    'BATCH_SIZE':16,
    'SEED':0,
    'APPLY_SAMPLER':True,
}

<br>

## Fixed RandomSeed

In [7]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

<br></br>

# User Defined Functions

<br>

## Duplicated Image
- 반점은 2개밖에 없기 때문에, train_test_split에서 1개씩밖에 못가져감
- 이 부분을 해결하기 위해서, 기준빈도 20개 이하인 label에 대해서는 중복해서 데이터를 추가하도록 하였음

In [8]:
def make_dup_data(data):
    df = data.copy()

    # 기준빈도 설정
    # -> 기준빈도만큼 중복추출
    ref_freq = int(df.label.value_counts().mean())
    #print('ref_freq={}'.format(ref_freq))

    # 기준빈도 이하인 라벨 및 생성 할 빈도 확인
    vc = df.label.value_counts()
    vc = vc[vc<ref_freq]

    dup_label_list = vc.index.tolist()
    dup_freq_list  = ref_freq - vc.values

    # 기준빈도 이하인 라벨을 기준빈도만큼 생성
    dup_data = []
    for i in range(len(dup_label_list)):
        labels = df[df.label==dup_label_list[i]].img_path.values.tolist()

        for j in range(dup_freq_list[i]):
            random.seed(j)
            dup_label = random.sample(labels, k=1)
            dup_data.append([dup_label[0],dup_label_list[i]])

    dup_data = pd.DataFrame(dup_data,columns=df.columns)

    return dup_data

<br>

## Custom Dataset

In [9]:
#  이미지 변환
transform = A.Compose([
    A.Resize(CFG['IMG_SIZE'],CFG['IMG_SIZE']),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
    ToTensorV2(),
])

In [10]:
# brainAI 이희원님 공유내용
# (참조) https://dacon.io/competitions/official/236082/codeshare/7891?page=1&dtype=recent
class CustomDataset(Dataset):
    def __init__(self, img_path_list, label_list=None, transforms=None, n_jobs=1, desc=''):
        self.img_path_list = img_path_list
        self.label_list = label_list
        self.transforms = transforms
        self.n_jobs = n_jobs

        gc.collect()
        self.pbar = trange(len(self.img_path_list),desc=desc)#,leave=False,position=0)
        if self.n_jobs==1:
            self.features = []
            for i in self.pbar:
                image = self._load_iteration(i)
                self.features.append(image)
        else:
            with parallel_backend('threading', n_jobs=n_jobs):
                self.features = Parallel()(
                    delayed(self._load_iteration)(i)
                    for i in self.pbar
                )

    def _load_iteration(self,i):
        # (1) raw image
        image = cv2.imread(self.img_path_list[i])
        # (2) transform
        if self.transforms is not None:
            image = self.transforms(image=image)['image']
        return image

    def __getitem__(self, index):
        if self.label_list is not None:
            return self.features[index], self.label_list[index]
        else:
            return self.features[index]
        
    def __len__(self):
        return len(self.features)

<br>

## Model Define

In [11]:
class BaseModel(nn.Module):
    def __init__(self, num_classes, dropout, activation):
        super(BaseModel, self).__init__()
        # self.backbone = models.efficientnet_b0(pretrained=True)
        self.backbone = models.efficientnet_b7(pretrained=True)
        # self.backbone = models.vit_b_16(pretrained=True)
        # self.backbone = timm.create_model('swin_base_patch4_window7_224', pretrained=True)
        # self.backbone = timm.models.swin_base_patch4_window7_224()
        # self.backbone.load_state_dict(torch.load('./pretrained/swin_base_patch4_window7_224.pt'))
        
        self.dropout = nn.Dropout(dropout)
        self.bn = nn.BatchNorm1d(1000)
        self.activation = activation
        self.classifier = nn.Linear(1000, num_classes)
        
    def forward(self, x):
        x = self.backbone(x)
        x = self.dropout(x)
        x = self.bn(x)
        x = self.activation(x)
        x = self.classifier(x)
        return x

<br>

## Model Train & Validation

In [12]:
def highlight_diag(df):
    a = np.full(df.shape, '', dtype='<U24')
    np.fill_diagonal(a, 'background-color: green')
    return pd.DataFrame(a, index=df.index, columns=df.columns)

In [13]:
# https://aimaster.tistory.com/82
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, logits=False, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.logits = logits
        self.reduce = reduce

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss()(inputs, targets)

        pt = torch.exp(-ce_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * ce_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

In [14]:
def train(model, num_epochs, optimizer, train_loader, val_loader, scheduler, device, mc_path, crosstable):
    model.to(device)
    #criterion = nn.CrossEntropyLoss().to(device)
    criterion = FocalLoss().to(device)
    
    best_score = 0
    best_model = None
    best_iter  = 0
    
    preds, true_labels = [],[]
    total_s = time.time()
    for epoch in range(1, num_epochs+1):
        gc.collect()
        epoch_s = time.time()
        model.train()
        train_loss = []
        pbar = tqdm(iter(train_loader))
        for imgs, labels in pbar:
            imgs = imgs.float().to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            
            output = model(imgs)            
            loss = criterion(output, labels)
            
            preds += output.argmax(1).detach().cpu().numpy().tolist()
            true_labels += labels.detach().cpu().numpy().tolist()
            train_score = f1_score(true_labels, preds, average='weighted')
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
            pbar.set_description('Train Dataset (score: {:.4f}),{}'.format(train_score,len(pd.unique(preds))))
                    
        mean_val_loss, val_score = validation(model, criterion, val_loader, device, crosstable)
        mean_train_loss = np.mean(train_loss)
       
        if scheduler is not None:
            scheduler.step(val_score)
            
        if best_score < val_score:
            best_score = val_score
            best_model = model
            torch.save(model.state_dict(), mc_path)
            best_iter=epoch
            is_best=1
        else:
            is_best=0
        
        mark = '*' if is_best==1 else ' '
        epoch_e = time.time()
        epoch_r = epoch_e-epoch_s
        total_r = epoch_e-total_s
        remain_r = (num_epochs-epoch)*epoch_r
        epoch_str = str(epoch).zfill(len(str(num_epochs)))
        progress = '{}[{}/{}], loss: {:.4f}, val_loss: {:.4f}, val_score: {:.4f}, best: {:.4f}({}), elapsed: {:.1f}s, total: {:.1f}s, remaining: {:.1f}s\n'\
            .format(mark,epoch_str,num_epochs,mean_train_loss,mean_val_loss,val_score,best_score,best_iter,epoch_r,total_r,remain_r)
        print(progress)
    
    return best_model

In [15]:
def validation(model, criterion, val_loader, device, crosstable):
    model.eval()
    val_loss = []
    preds, true_labels = [], []

    with torch.no_grad():
        pbar = tqdm(iter(val_loader))
        for imgs, labels in pbar:
            imgs = imgs.float().to(device)
            labels = labels.to(device)
            
            pred = model(imgs)
            loss = criterion(pred, labels)
            
            preds += pred.argmax(1).detach().cpu().numpy().tolist()
            true_labels += labels.detach().cpu().numpy().tolist()
            val_score = f1_score(true_labels, preds, average='weighted')
            
            val_loss.append(loss.item())
            pbar.set_description('Valid Dataset (score: {:.4f})'.format(val_score))
        
        mean_val_loss = np.mean(val_loss)
        val_score = f1_score(true_labels, preds, average='weighted')
        
        if crosstable:
            ct = pd.crosstab(
                    le.inverse_transform(true_labels), le.inverse_transform(preds),
                    rownames=['Actual'], colnames=['Prediction'], margins=True,
                )
            display(ct.style.apply(highlight_diag, axis=None))
    return mean_val_loss, val_score

<br></br>

# Data Load

In [16]:
# !unzip\
#   /content/drive/MyDrive/Github/10_도배하자유형분류/data/open.zip\
#   -d\
#   /content/drive/MyDrive/Github/10_도배하자유형분류/data/

In [17]:
all_img_list = glob.glob(REF_PATH+'/data/train/*/*')

In [18]:
df = pd.DataFrame(columns=['img_path', 'label'])
df['img_path'] = all_img_list
df['label'] = df['img_path'].apply(lambda x : x.split('/')[-2].replace('.png',''))

In [19]:
df.label.value_counts()

훼손                 1405
오염                  595
걸레받이수정        307
꼬임                  210
터짐                  162
곰팡이               145
오타공                142
몰딩수정            130
면불량               99
석고수정              57
들뜸                  54
피스                    51
창틀,문틀수정      27
울음                  22
이음부불량           17
녹오염                14
가구수정               12
틈새과다                5
반점                   3
Name: label, dtype: int64

<br></br>

# Modeling
- 19: 중복데이터추가(x), GAN(x), Augmentation(o) - 0.5844344949
- 20: 중복데이터추가(x), GAN(o), Augmentation(o) - 0.5918361101

In [20]:
# class가 적은 라벨을 위한 중복데이터 추가
df_new = pd.concat([
    df               .assign(is_dup=0),
    #make_dup_data(df).assign(is_dup=1),
],axis=0)

# (2) label encoding
le = preprocessing.LabelEncoder()
df_new['label'] = le.fit_transform(df_new['label'])

# (3) add the generated image
# (3-1) GAN
gan_label_paths = glob.glob('./out/gan_images/generate/*')
gan_data = []
for i in range(len(gan_label_paths)):
    d = pd.DataFrame(glob.glob(gan_label_paths[0]+'/*'),columns=['img_path'])
    d['label'] = int(gan_label_paths[i].split('/')[-1])
    d = d[['img_path','label']]
    gan_data.append(d)
gan_data = pd.concat(gan_data,axis=0)

# (3-2) Augmentation
augmentation_label_paths = glob.glob('./out/augmentation_images/*')
augmentation_data = []
for i in range(len(augmentation_label_paths)):
    d = pd.DataFrame(glob.glob(augmentation_label_paths[i]+'/*'),columns=['img_path'])
    d['label'] = int(augmentation_label_paths[i].split('/')[-1])
    d = d[['img_path','label']]
    augmentation_data.append(d)
augmentation_data = pd.concat(augmentation_data,axis=0)

# Concatenate
df_new2 = pd.concat([
    df_new           .assign(generate_type='None'),
    #gan_data         .assign(generate_type='GAN'         ,is_dup=0),
    augmentation_data.assign(generate_type='Augmentation',is_dup=0),
],axis=0).reset_index(drop=True)

In [21]:
df_new2.generate_type.value_counts()

None            3457
Augmentation     749
Name: generate_type, dtype: int64

In [22]:
df_new2.label.value_counts()

18    1405
10     595
1      307
3      210
15     162
2      145
11     142
7      130
8      127
16     113
14     103
13     101
0      100
6       99
12      97
9       97
4       94
5       92
17      87
Name: label, dtype: int64

In [23]:
# generate_type도 stratify로 적용
stratify = []
for label,gen_type in df_new2[['label','generate_type']].values:
    stratify.append(str(label)+'_'+gen_type)
# pd.Series(stratify).value_counts().sort_index()

In [27]:
n_splits = 5
kfold_iter = 0
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=CFG['SEED'])
for train_idx, valid_idx in skf.split(df_new2,stratify):
    kfold_iter+=1

    print('-'*80)
    print('> kfold ({}/{})'.format(kfold_iter,n_splits))
    print('-'*80)

    # (1) stratified k-fold
    train_data = df_new2.iloc[train_idx]
    val_data   = df_new2.iloc[valid_idx]

    # (2) dataset : train(augmentation 적용O), validation(augmentation 적용X)
    train_dataset = CustomDataset(
        train_data['img_path'].values, train_data['label'].values, transform, n_jobs=-1, desc='Train Data Load')
    val_dataset   = CustomDataset(
        val_data['img_path'].values, val_data['label'].values, transform, n_jobs=-1, desc='Validation Data Load')
    
    # (3) Setting Data Loader
    if CFG['APPLY_SAMPLER']:
        print('Apply the Weighted Random Sampler')
        # get sampler weights
        train_label = np.array([y for x,y in train_dataset])
        class_sample_count = [len(np.where(train_label==t)[0]) for t in np.unique(train_label)]
        weights = 1 / np.array(class_sample_count)
        train_samples_weights = weights[train_label]

        # sampler
        # -> validation에는 적용하지 않음
        sampler = WeightedRandomSampler(weights=train_samples_weights, num_samples=len(train_samples_weights), replacement=True)

        # loader
        train_loader  = DataLoader(
            train_dataset, batch_size=CFG['BATCH_SIZE'], num_workers=4, sampler=sampler, pin_memory=True)
    else:
        train_loader  = DataLoader(
            train_dataset, batch_size=CFG['BATCH_SIZE'], num_workers=4, shuffle=True, pin_memory=True)
        
    val_loader = DataLoader(
        val_dataset, batch_size=CFG['BATCH_SIZE'], num_workers=4, shuffle=True, pin_memory=True)

    # (4) Modeling
    gc.collect()
    seed_everything(CFG['SEED'])

    model = BaseModel(
        num_classes=len(le.classes_),
        dropout=0.5,
        activation=nn.ReLU(),
    )
    model.eval()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=CFG['LEARNING_RATE'], weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=2, threshold_mode='abs', min_lr=1e-8, verbose=True)

    infer_model = train(
        model=model,
        num_epochs=CFG['EPOCHS'],
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        scheduler=scheduler,
        device=device,
        mc_path='./mc/efficientnet_b7/best_model_kfold_{}.pt'.format(kfold_iter),
        crosstable=False,
    )

    del train_dataset, val_dataset, train_loader, val_loader

--------------------------------------------------------------------------------
> kfold (1/5)
--------------------------------------------------------------------------------


Train Data Load:   0%|          | 0/3364 [00:00<?, ?it/s]

Validation Data Load:   0%|          | 0/842 [00:00<?, ?it/s]

Apply the Weighted Random Sampler


OutOfMemoryError: ignored

<br>

<br></br>

# Inference

In [None]:
def inference(model, loader, device):
    model.eval()
    trues = []
    preds = []
    with torch.no_grad():
        for imgs,labels in tqdm(iter(loader)):
            imgs = imgs.float().to(device)
            
            pred = model(imgs)

            trues += labels.numpy().tolist()
            preds += pred.argmax(1).detach().cpu().numpy().tolist()

    return trues, preds

In [None]:
def inference_test(model, test_loader, device):
    model.eval()
    preds = []
    with torch.no_grad():
        for imgs in tqdm(iter(test_loader)):
            imgs = imgs.float().to(device)
            
            pred = model(imgs)
            
            preds += pred.argmax(1).detach().cpu().numpy().tolist()

    # for i,label in enumerate(le.classes_):
    #   print(i,label)
    
    # preds = le.inverse_transform(preds)
    
    return preds

In [None]:
def label_decoder(label):
    new_label = [
        '가구수정' if l==0 else
        '걸레받이수정' if l==1 else
        '곰팡이' if l==2 else
        '꼬임' if l==3 else
        '녹오염' if l==4 else
        '들뜸' if l==5 else
        '면불량' if l==6 else
        '몰딩수정' if l==7 else
        '반점' if l==8 else
        '석고수정' if l==9 else
        '오염' if l==10 else
        '오타공' if l==11 else
        '울음' if l==12 else
        '이음부불량' if l==13 else
        '창틀,문틀수정' if l==14 else
        '터짐' if l==15 else
        '틈새과다' if l==16 else
        '피스' if l==17 else
        '훼손' if l==18 else
        'NaN' for l in label
    ]
    return new_label

<br>

## Train Dataset

In [None]:
dataset     = CustomDataset(df_new2['img_path'].values, df_new2['label'].values, transform, n_jobs=-1)
data_loader = DataLoader(dataset, batch_size=CFG['BATCH_SIZE'], num_workers=4, shuffle=False, pin_memory=True)

  0%|          | 0/4206 [00:00<?, ?it/s]

In [None]:
trues_list, preds_list = [],[]

for k in range(5):
    infer_model = BaseModel(
        num_classes=19,
        dropout=0.5,
        activation=nn.ReLU(),
    )
    infer_model.to(device)
    infer_model.load_state_dict(torch.load(f'./mc/efficientnet_b0/best_model_kfold_{k+1}.pt'))

    trues, preds = inference(infer_model, data_loader, device)

    trues_list.append(trues)
    preds_list.append(preds)

  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/33 [00:00<?, ?it/s]

In [None]:
# kfold로부터 얻어진 예측값을 hard voting 방법으로 집계
kfold_preds_list = []
for i in trange(len(preds_list[0])):
    preds = np.array(preds_list)[:,i]
    preds = pd.Series(preds).value_counts().sort_values(ascending=False).index[0]
    kfold_preds_list.append(preds)

# true에 있는 라벨은 한글이 utf-8로 되어있지 않아서 에러가발생함
# -> LabelEncoder를 통해서 decoder dictionary 생성해서 우회
le = preprocessing.LabelEncoder()
le.fit_transform(trues_list[0])

decoder_dict = {}
for i,label in enumerate(le.classes_):
    decoder_dict[i] = label

kfold_preds_list = [decoder_dict[preds] for preds in kfold_preds_list]

# score 산출
score = f1_score(trues_list[0], kfold_preds_list, average='weighted')
print('Weighted F1 Score: {:.4f}'.format(score))

# crosstable 확인
pd.crosstab(
    trues_list[0], kfold_preds_list,
    rownames=['Actual'], colnames=['Prediction'], margins=True,
)

  0%|          | 0/4206 [00:00<?, ?it/s]

Weighted F1 Score: 0.9758


Prediction,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100
1,0,307,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,307
2,0,0,145,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,145
3,0,0,0,210,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,210
4,0,0,0,0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,94
5,0,0,0,0,0,92,0,0,0,0,0,0,0,0,0,0,0,0,0,92
6,0,0,0,0,0,0,99,0,0,0,0,0,0,0,0,0,0,0,0,99
7,0,0,0,0,0,0,0,130,0,0,0,0,0,0,0,0,0,0,0,130
8,0,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,127
9,0,0,0,0,0,0,0,0,0,97,0,0,0,0,0,0,0,0,0,97


In [None]:
print('(1) Raw Image')
# score 산출
score = f1_score(
    np.array(trues_list[0])[df_new2.generate_type=='None'],
    np.array(kfold_preds_list)[df_new2.generate_type=='None'],
    average='weighted'
)
print('Weighted F1 Score: {:.4f}'.format(score))

# crosstable 확인
display(pd.crosstab(
    np.array(trues_list[0])[df_new2.generate_type=='None'],
    np.array(kfold_preds_list)[df_new2.generate_type=='None'],
    rownames=['Actual'], colnames=['Prediction'], margins=True,
))

print('(2) Augmentation Image')
# score 산출
score = f1_score(
    np.array(trues_list[0])[df_new2.generate_type=='Augmentation'],
    np.array(kfold_preds_list)[df_new2.generate_type=='Augmentation'],
    average='weighted'
)
print('Weighted F1 Score: {:.4f}'.format(score))

# crosstable 확인
display(pd.crosstab(
    np.array(trues_list[0])[df_new2.generate_type=='Augmentation'],
    np.array(kfold_preds_list)[df_new2.generate_type=='Augmentation'],
    rownames=['Actual'], colnames=['Prediction'], margins=True,
))

(1) Raw Image
Weighted F1 Score: 0.9705


Prediction,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12
1,0,307,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,307
2,0,0,145,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,145
3,0,0,0,210,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,210
4,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14
5,0,0,0,0,0,54,0,0,0,0,0,0,0,0,0,0,0,0,0,54
6,0,0,0,0,0,0,99,0,0,0,0,0,0,0,0,0,0,0,0,99
7,0,0,0,0,0,0,0,130,0,0,0,0,0,0,0,0,0,0,0,130
8,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,3
9,0,0,0,0,0,0,0,0,0,57,0,0,0,0,0,0,0,0,0,57


(2) Augmentation Image
Weighted F1 Score: 1.0000


Prediction,0,4,5,8,9,12,13,14,16,17,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,88,0,0,0,0,0,0,0,0,0,88
4,0,80,0,0,0,0,0,0,0,0,80
5,0,0,38,0,0,0,0,0,0,0,38
8,0,0,0,124,0,0,0,0,0,0,124
9,0,0,0,0,40,0,0,0,0,0,40
12,0,0,0,0,0,75,0,0,0,0,75
13,0,0,0,0,0,0,84,0,0,0,84
14,0,0,0,0,0,0,0,76,0,0,76
16,0,0,0,0,0,0,0,0,108,0,108
17,0,0,0,0,0,0,0,0,0,36,36


<br>

## Test Data

In [None]:
test = pd.read_csv(REF_PATH+'/data/test.csv')
test.img_path = test.img_path.apply(lambda x: x.replace('./','./data/'))

test_dataset = CustomDataset(test['img_path'].values, None, transform, n_jobs=-1, desc='Test Data Load')
test_loader  = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

Test Data Load:   0%|          | 0/792 [00:00<?, ?it/s]

In [None]:
preds_list = []

for k in range(5):
    infer_model = BaseModel(
        num_classes=19,
        dropout=0.5,
        activation=nn.ReLU(),
    )
    infer_model.to(device)
    infer_model.load_state_dict(torch.load(f'./mc/efficientnet_b0/best_model_kfold_{k+1}.pt'))

    preds = inference_test(infer_model, test_loader, device)
    preds_list.append(preds)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# kfold로부터 얻어진 예측값을 hard voting 방법으로 집계
kfold_preds_list = []
for i in trange(len(preds_list[0])):
    preds = np.array(preds_list)[:,i]
    preds = pd.Series(preds).value_counts().sort_values(ascending=False).index[0]
    kfold_preds_list.append(preds)

kfold_preds_list = label_decoder(kfold_preds_list)

  0%|          | 0/792 [00:00<?, ?it/s]

In [None]:
print(len(pd.unique(kfold_preds_list)))
pd.Series(kfold_preds_list).value_counts()

18


훼손         333
오염         232
꼬임          35
터짐          32
오타공         26
면불량         24
몰딩수정        20
석고수정        18
곰팡이         13
창틀,문틀수정     12
걸레받이수정      10
피스          10
가구수정         7
들뜸           7
이음부불량        5
울음           4
녹오염          3
반점           1
dtype: int64

In [None]:
# 저장 후, 해당파일 열어서 UTF-8 선택필요
submit = pd.read_csv('./data/sample_submission.csv')
submit['label'] = kfold_preds_list
submit.to_csv('./out/baseline_submit_22.csv', index=False, encoding='utf-8')

<br>

## Compare with other submission

In [None]:
submit_1 = pd.read_csv('./out/baseline_submit_21.csv')
submit_2 = pd.read_csv('./out/baseline_submit_20.csv')
print(submit_1.label.nunique(), submit_2.label.nunique())

f1=f1_score(submit_1['label'],submit_2['label'],average='weighted')
print('f1_score:',f1)

ct = pd.crosstab(submit_1['label'],submit_2['label'],margins=True)
ct.style.apply(highlight_diag, axis=None)

17 17
f1_score: 0.729971708296774


label,가구수정,걸레받이수정,곰팡이,꼬임,녹오염,들뜸,면불량,몰딩수정,석고수정,오염,오타공,울음,이음부불량,"창틀,문틀수정",터짐,피스,훼손,All
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
가구수정,2,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,2,7
걸레받이수정,0,8,0,0,0,0,2,0,1,1,0,0,0,0,2,0,2,16
곰팡이,0,0,15,0,0,0,1,0,1,4,0,0,0,0,3,0,2,26
꼬임,0,0,0,26,0,1,1,0,0,2,0,0,0,1,0,0,3,34
녹오염,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,3
들뜸,0,0,0,2,0,5,0,0,0,0,0,0,0,0,0,0,1,8
면불량,0,0,0,0,0,0,8,0,0,4,0,0,0,1,0,0,7,20
몰딩수정,1,0,0,0,0,1,0,16,0,2,0,0,0,0,1,0,7,28
석고수정,1,0,0,0,0,0,0,0,12,3,2,0,0,0,0,0,28,46
오염,0,0,0,0,0,1,3,0,0,179,0,0,0,1,1,0,28,213


In [None]:
submit_1['label'].nunique(),submit_2['label'].nunique()

In [None]:
list(set(pd.unique(le.inverse_transform(val_data.label)))-set(submit.label.unique()))

<br></br>