In [1]:
import copy
import datetime as dt
import itertools
import os
import random
import time
import warnings

import albumentations as A
import albumentations.pytorch
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import timm
import torch
import torch.optim as optim
import torchvision
import torchvision.models as models
import wandb
from PIL import Image
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn import functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm.auto import tqdm

In [None]:
# Experiment configuration dictionary.
# NOTE(review): none of the cells visible in this notebook read CFG; the
# training cell uses its own hard-coded values (model, batch size, lr, ...).
CFG = dict(
    fold_num=5,
    seed=42,
    model='inception_resnet_v2',
    img_size=260,
    epochs=200,
    train_bs=128,
    valid_bs=32,
    T_0=10,
    lr=1e-4,
    min_lr=1e-6,
    num_workers=8,
    # Number of batches to accumulate gradients over before stepping, giving
    # an effectively larger batch size.
    accum_iter=2,
    verbose_step=1,
    patience=5,
    device='cuda:0',
    freezing=False,
    model_path='./models',
)

In [2]:
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Timestamp of notebook start; `run_id` (YYYYmmddHHMMSS) names the output
# folders, checkpoints and submission files written by later cells.
time_now = dt.datetime.now()
run_id = f'{time_now:%Y%m%d%H%M%S}'


def seed_everything(seed: int=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Python / NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current CUDA device, then all CUDA devices.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: deterministic kernels, no autotuning, and cuDNN disabled entirely.
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_everything(42)


def imshow(img):
    """Display a channel-first image tensor with matplotlib.

    The tensor is rescaled with ``img / 2 + 0.5``, converted to a NumPy array
    and transposed from (C, H, W) to (H, W, C) before plotting.

    NOTE(review): ``/ 2 + 0.5`` would invert a mean=0.5 / std=0.5
    normalization; confirm it matches the normalization applied to ``img``.

    Args:
        img: Tensor exposing ``.numpy()``, laid out as (C, H, W).
    """
    rescaled = img / 2 + 0.5
    channels_last = np.transpose(rescaled.numpy(), (1, 2, 0))
    plt.imshow(channels_last)
    plt.show()


def plot_confusion_matrix(
                        cm, classes, runid, epoch, 
                        f1, normalize=False, 
                        title='Confusion matrix',
                        cmap=plt.cm.Blues):
    """Render a confusion matrix and save it to ``./cms/cm-{runid}.jpg``.

    Args:
        cm: Square confusion matrix (rows = true labels, columns = predictions).
        classes: Iterable of tick labels, one per row/column of ``cm``.
        runid: Identifier used in the title and the output file name.
        epoch: Epoch number, shown in the title only.
        f1: F1 score, shown in the title with four decimals.
        normalize: When True, each row is divided by its sum so that both the
            heat map and the cell text show per-class ratios.
        title: Title prefix.
        cmap: Matplotlib colormap for the heat map.

    The figure is drawn on the current pyplot figure, saved, then cleared
    with ``plt.clf()``. The ``./cms/`` directory must already exist.
    """
    cm = np.asarray(cm)
    classes = list(classes)  # accept dict views / generators as tick labels

    # Normalize BEFORE drawing so the heat map and the cell text agree.
    if normalize:
        row_sums = cm.sum(axis=1, keepdims=True)
        # Rows without any true sample would divide by zero; leave them at 0.
        cm = cm.astype('float') / np.where(row_sums == 0, 1, row_sums)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(f'{title}-{runid}-{epoch}-{f1:.4f}')
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Ratios are printed with two decimals; raw counts are printed as-is.
        cell_text = f'{cm[i, j]:.2f}' if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(f'./cms/cm-{runid}.jpg', dpi=400)
    plt.clf()

### 학습 전 준비

- 테스트 데이터셋은 대표적인 화가 50명에 대한 예술 작품(이미지)의 일부분(약 1/4)만 제공 --> RandomResizedCrop으로 원본의 24%~26%를 자른 후 Noisy Student B7 모델에 맞게 600 x 600으로 resize
- 여러 augmentation들과 TTA를 시도해봤지만 public 점수에 악영향을 줘서 HorizontalFlip 하나만 p=0.5로 사용

In [3]:
# Constants shared by the train and test pipelines.
_IMG_SIDE = 600
_NORM_MEAN = (0.548, 0.504, 0.479)
_NORM_STD = (0.237, 0.247, 0.246)

# Training pipeline: random crop covering 24-26% of the image area with a
# near-square aspect ratio, resized to 600x600, then a 50% horizontal flip,
# normalization and conversion to a tensor.
_train_steps = [
    A.RandomResizedCrop(
        height=_IMG_SIDE,
        width=_IMG_SIDE,
        scale=(0.24, 0.26),
        ratio=(0.90, 1.10),
        always_apply=True
    ),
    A.HorizontalFlip(p=0.5),
    A.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    A.pytorch.transforms.ToTensorV2(),
]
transform_train = A.Compose(_train_steps)

# Test pipeline: plain resize to 600x600 (no cropping or flipping), then the
# same normalization and tensor conversion.
_test_steps = [
    A.Resize(_IMG_SIDE, _IMG_SIDE),
    A.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    A.pytorch.transforms.ToTensorV2(),
]
transform_test = A.Compose(_test_steps)

- custom dataset

In [4]:
class ARTDataset(Dataset):
    """Image dataset backed by a DataFrame of image paths (and labels).

    Rows are ordered by the ``id`` column. For every phase other than
    ``'test'`` an integer label is read from the ``artist`` column.

    Args:
        phase: Split name; ``'test'`` means samples carry no label.
        root: Directory that the ``img_path`` entries are relative to.
        csv: DataFrame with ``id`` and ``img_path`` columns, plus ``artist``
            (convertible to ``int``) when ``phase != 'test'``.
        transform: Optional callable invoked as ``transform(image=ndarray)``
            and returning a mapping with an ``'image'`` entry; falsy disables.
    """

    def __init__(self, phase, root, csv, transform) -> None:
        super().__init__()
        df = csv.sort_values(by=['id'])
        self.phase = phase
        self.root = root
        self.transform = transform
        self.images = df['img_path']

        if self.phase != 'test':
            self.label = df['artist']

    def __getitem__(self, index):
        """Return ``(image, label)``, or just ``image`` for the test phase."""
        relative_path = self.images.iloc[index]
        # Drop a leading "./" only when it is really there; blindly slicing
        # two characters would corrupt paths stored without that prefix.
        if relative_path.startswith('./'):
            relative_path = relative_path[2:]
        image_path = os.path.join(self.root, relative_path)

        # The context manager closes the underlying file even if decoding or
        # conversion raises.
        with Image.open(image_path) as opened:
            image = np.array(opened.convert("RGB"))

        if self.transform:
            image = self.transform(image=image)['image']

        if self.phase != 'test':
            return image, int(self.label.iloc[index])
        return image

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.images)


- 10 epoch동안 F1 score가 갱신되지 않으면 조기 종료

In [5]:
class EarlyStopping:
    """Signal when a score that should increase has stopped improving.

    Call the instance once per epoch with the monitored score (higher is
    better). A score below ``best_score + delta`` counts as "no improvement";
    after ``patience`` consecutive such calls the instance returns ``True``.

    Args:
        patience: Consecutive non-improving calls tolerated before stopping.
        verbose: Stored on the instance; the progress messages are currently
            printed regardless of this flag.
        delta: Minimum increase over ``best_score`` that counts as improvement.
    """

    def __init__(self, patience=10, verbose=False, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.Inf` was removed in NumPy 2.0; `np.inf` works on every version.
        self.val_loss_min = np.inf
        self.delta = delta

    def __call__(self, score):
        """Record ``score`` and return ``True`` once training should stop."""
        if self.best_score is None:
            # First call: just remember the score.
            self.best_score = score
        elif score < self.best_score + self.delta:
            # No (sufficient) improvement this epoch.
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            print(f'Best F1 score from now: {self.best_score}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Improvement: store the new best and restart the patience window.
            self.best_score = score
            self.counter = 0

        return self.early_stop

- **trainset**을 불러와 LabelEncoder에 fit_transform

In [6]:
# Load the training metadata and replace the artist names with integer class
# ids. `le` is kept so predictions can be mapped back to names later.
le = preprocessing.LabelEncoder()
train_csv = pd.read_csv('../Data/data/train.csv')
artist_ids = le.fit_transform(train_csv['artist'].to_numpy())
train_csv['artist'] = artist_ids

# The model head built in the training cell has 50 outputs.
assert len(le.classes_) == 50

### 학습

- 일반 KFold를 사용하면 가끔 특정 클래스가 포함되지 않은 폴드가 생겨 StratifiedKFold를 사용
- Fold마다 훈련 세트와 검증 세트를 나누고, trainset의 class 별 이미지 수에 반비례하는 weight를 설정 
- 클래스 별 가중치를 구하고, 학습 세트 속 이미지에 weight를 대응시키기 위해 shuffle은 False가 되어야 함

In [None]:
# Stratified 10-fold cross-validation over the encoded artist labels.
# shuffle=False keeps the fold assignment deterministic.
skf = sklearn.model_selection.StratifiedKFold(n_splits=10, shuffle=False)
t = train_csv.artist

# Encoded class ids in sorted order. Used for both the confusion-matrix rows
# and its tick labels so the two can never get out of sync.
class_ids = np.arange(len(le.classes_))

for fold, (train_index, test_index) in enumerate(skf.split(np.zeros(len(t)), t)):
    early_stopping = EarlyStopping(patience=10, verbose=True)
    # skf.split yields positional indices, so rows are selected by position.
    data_train = train_csv.iloc[train_index]
    data_validation = train_csv.iloc[test_index]

    class_counts = data_train['artist'].value_counts(sort=False).to_dict()
    num_samples = sum(class_counts.values())
    print(f'cls_cnts: {len(class_counts)}\nnum_samples:{num_samples}')

    # Class weight = number of training samples / number of samples of that class.
    class_weights = {l:round(num_samples/class_counts[l], 2) for l in class_counts.keys()}

    # Same weights keyed by artist name. Built for a weighted-vote experiment;
    # the weighted-vote cell further down reads `class_weights2`.
    class_weights_keys = le.inverse_transform(list(class_weights.keys()))
    class_weights_values = class_weights.values()
    class_weights2 = dict(zip(class_weights_keys, class_weights_values))
    print(class_weights2)

    # NOTE(review): validation also uses the random-crop training transform;
    # presumably to mimic the partial-image test set - confirm this is intended.
    train_dataset = ARTDataset('train', '../Data/data/', data_train, transform=transform_train)
    validation_dataset = ARTDataset('validation', '../Data/data/', data_validation, transform=transform_train)

    # Per-sample weights for the sampler. The labels are read from the dataset
    # itself (which sorts rows by id) so weight i always belongs to sample i.
    labels = train_dataset.label.to_list()
    weights = [class_weights[label] for label in labels]
    sampler = torch.utils.data.WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))

    # NOTE: an earlier comment mentioned batch_size=288 (32 per GPU x 9 GPUs);
    # the loaders below use batch_size=4.
    train_loader = DataLoader(
        train_dataset, 
        batch_size=4,
        sampler=sampler,  # class-balanced sampling for the training set
        shuffle=False,
        num_workers=6,
        pin_memory=True
        )
    validation_loader = DataLoader(
        validation_dataset, 
        batch_size=4,
        shuffle=False,
        num_workers=6)

    ss = pd.read_csv('../Data/data/test.csv')
    test_dataset = ARTDataset('test', '../Data/data/', ss, transform=transform_test)

    test_loader = DataLoader(
        test_dataset, 
        batch_size=4,
        shuffle=False, 
        num_workers=6,
        pin_memory=True)

    dataloaders = {
        'train': train_loader,
        'val': validation_loader,
        'test': test_loader
    }

    dataset_sizes = {
        'train': len(train_dataset),
        'val': len(validation_dataset),
        'test': len(test_dataset)
    }

    # Model comes from timm; wrapped in DataParallel over GPUs 0 and 1.
    device =  torch.device("cuda")
    model = timm.create_model('tf_efficientnet_b7_ns', pretrained=True, num_classes=50)
    model.to(device)
    model = nn.DataParallel(model, device_ids=[0, 1])

    epochs = 300  # upper bound; training usually stops around epoch 30-40
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

    os.makedirs(f'./runs/{run_id}', exist_ok=True)
    os.makedirs(f'./cms/', exist_ok=True)
    
    since = time.time()
    best_f1 = 0.0
    scaler = torch.cuda.amp.GradScaler()

    os.environ['WANDB_START_METHOD'] = 'thread'
    fold_run_id = f'{run_id}_fold{str(fold)}'
    wandb.init(project="temp_test", entity="hojunking", name=fold_run_id)
    wandb.watch(model)
    
    # Training loop: one train pass followed by one validation pass per epoch.
    for epoch in range(epochs):
        print('-'*50)
        print(f'Fold: {fold}')
        print('Epoch {}/{}'.format(epoch, epochs - 1))
        train_loss = 0.0

        for phase in ['train', 'val']:
            running_loss = 0.0
            model_preds = []
            model_labels = []

            if phase == 'train':
                model.train()
            else:
                model.eval()
            
            for x, y in tqdm(iter(dataloaders[phase])):
                x = x.to(device)
                y = y.to(device)
                optimizer.zero_grad()

                # Gradients are tracked only in the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    with torch.cuda.amp.autocast(enabled=True):
                        y_hat = model(x)
                        loss = criterion(y_hat, y)
                    _, preds = torch.max(y_hat, 1)

                    if phase == 'train':
                        scaler.scale(loss).backward()
                        scaler.step(optimizer)
                        scaler.update()

                # Weight the batch-mean loss by the batch size so the epoch
                # loss is a per-sample mean.
                running_loss += loss.item() * x.size(0)
                
                model_labels += y.detach().cpu().numpy().tolist()
                model_preds += preds.detach().cpu().numpy().tolist()

            if phase == 'train' and scheduler is not None:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_f1 = f1_score(
                        model_labels, 
                        model_preds, 
                        average='macro')
            print(f'[{phase}] Loss: {epoch_loss:.4f} Macro F1: {epoch_f1:.4f}')

            # Checkpoint: keep the model with the best validation macro F1.
            if phase == 'val':
                if epoch_f1 > best_f1:
                    best_f1 = epoch_f1
                    torch.save(model, f'./runs/{run_id}/best_model_fold{fold}.pt')
                    # Fix the label set so the matrix is always
                    # n_classes x n_classes and row i is class id i.
                    confusion_mtx = confusion_matrix(model_labels, model_preds, labels=class_ids)
                    plot_confusion_matrix(confusion_mtx, classes=class_ids, runid=fold_run_id, epoch=epoch, f1=best_f1)
            
            # Logging: train metrics are stashed and logged with the val metrics.
            if phase == 'val':
                wandb.log({"val_loss": epoch_loss, "val_f1": epoch_f1, "train_loss": train_loss, "train_f1": train_f1})
            else:
                train_loss = epoch_loss
                train_f1 = epoch_f1
            
        # Early stopping on the validation macro F1 (the last phase of the epoch).
        stop = early_stopping(epoch_f1)
        if stop:
            print("called")   
            break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val_F1: {:4f}'.format(best_f1))

    # Reload this fold's best checkpoint and predict on the test set.
    # NOTE(review): the checkpoint is a fully pickled model; recent PyTorch
    # releases may need an explicit `weights_only=False` here - confirm
    # against the installed version.
    device =  torch.device("cuda")
    checkpoint = f'./runs/{run_id}/best_model_fold{fold}.pt'
    print(f'CHECKPOINT LOADED: {checkpoint}')
    model = torch.load(checkpoint)
    model.to(device)
    model.eval()

    test_preds = []

    with torch.no_grad():
        for x in tqdm(iter(dataloaders['test'])):
            # Move the batch to the GPU explicitly, as the training loop does.
            x = x.to(device)
            batch_pred = model(x)
            _, pred = torch.max(batch_pred, 1)
            pred = pred.detach().cpu().numpy().tolist()
            test_preds.extend(pred)

    # Map class ids back to artist names with the LabelEncoder fitted on the
    # training set.
    test_preds = le.inverse_transform(test_preds)

    sample_submission = pd.read_csv('../Data/data/sample_submission.csv')
    sample_submission['artist'] = test_preds
    os.makedirs('./output/', exist_ok=True)
    sample_submission.to_csv(f'./output/{run_id}_fold{fold}.csv', index=False)


cls_cnts: 50
num_samples:5319
{'Jackson Pollock': 279.95, 'Eugene Delacroix': 221.62, 'Andrei Rublev': 80.59, 'Peter Paul Rubens': 61.14, 'Mikhail Vrubel': 50.18, 'El Greco': 90.15, 'Claude Monet': 100.36, 'Pieter Bruegel': 69.99, 'Pierre-Auguste Renoir': 25.33, 'Andy Warhol': 44.7, 'Marc Chagall': 34.1, 'Georges Seurat': 197.0, 'Henri Rousseau': 113.17, 'Kazimir Malevich': 64.87, 'Paul Klee': 41.55, 'Jan van Eyck': 93.32, 'Rembrandt': 32.63, 'Leonardo da Vinci': 58.45, 'Albrecht Du rer': 26.86, 'Pablo Picasso': 19.56, 'Alfred Sisley': 36.18, 'Vincent van Gogh': 9.38, 'Frida Kahlo': 69.99, 'Michelangelo': 171.58, 'Edgar Degas': 12.06, 'Diego Velazquez': 73.88, 'Raphael': 80.59, 'Paul Gauguin': 26.86, 'Edvard Munch': 132.97, 'Joan Miro': 78.22, 'Vasiliy Kandinskiy': 98.5, 'Edouard Manet': 94.98, 'Gustave Courbet': 139.97, 'Diego Rivera': 118.2, 'Amedeo Modigliani': 44.7, 'Titian': 34.1, 'Hieronymus Bosch': 51.64, 'Rene Magritte': 43.24, 'Camille Pissarro': 93.32, 'Sandro Botticelli': 49

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhojunking[0m. Use [1m`wandb login --relogin`[0m to force relogin


--------------------------------------------------
Fold: 0
Epoch 0/299


  0%|          | 0/1330 [00:00<?, ?it/s]

[train] Loss: 3.7319 Macro F1: 0.0569


  0%|          | 0/148 [00:00<?, ?it/s]

[val] Loss: 3.5571 Macro F1: 0.0652
--------------------------------------------------
Fold: 0
Epoch 1/299


  0%|          | 0/1330 [00:00<?, ?it/s]

[train] Loss: 3.4305 Macro F1: 0.0994


  0%|          | 0/148 [00:00<?, ?it/s]

[val] Loss: 4.0162 Macro F1: 0.0826
--------------------------------------------------
Fold: 0
Epoch 2/299


  0%|          | 0/1330 [00:00<?, ?it/s]

[train] Loss: 3.2100 Macro F1: 0.1315


  0%|          | 0/148 [00:00<?, ?it/s]

[val] Loss: 4.6150 Macro F1: 0.1334
--------------------------------------------------
Fold: 0
Epoch 3/299


  0%|          | 0/1330 [00:00<?, ?it/s]

### 투표

- 경로를 수정해서 사용하세요

In [None]:
import pandas as pd
from collections import Counter
from glob import glob


# Per-fold submission files to ensemble; edit the patterns (run_id*.csv) to
# match your own runs. The matches are sorted because glob returns files in
# arbitrary order and Counter breaks ties by first occurrence - without
# sorting, tied votes could resolve differently from one machine to the next.
csvs = sorted(glob('./output/20221010220426*.csv'))
csvs2 = sorted(glob('./output/20221030183609*.csv'))
csvs.extend(csvs2)
print(len(csvs))

# Fail early with a clear message instead of a length mismatch further down.
if not csvs:
    raise FileNotFoundError('no prediction CSVs matched the ./output/ glob patterns')

# One list of predicted artist names per submission file.
preds = [pd.read_csv(path)['artist'].tolist() for path in csvs]

# Transpose to one tuple of votes per test sample and keep the most common.
cols = list(zip(*preds))
out = [Counter(votes).most_common(1)[0][0] for votes in cols]

print(out[:20])
ss = pd.read_csv('../Data/data/sample_submission.csv')
ss['artist'] = out
ss.to_csv('vote1234.csv', index=False)  # rename so each ensemble stays distinguishable

### 기타
- Class Imbalance가 심하기 때문에 Private에서 점수가 크게 흔들릴 가능성이 있다고 판단(실제로도 크게 흔들렸음)
- 가장 많은 표를 받은 class가 2개 이상일 경우 weight가 높은 class를 선택하도록 함
- Public에서 0.001점 정도 감소
- 하지만 Private 점수의 변동을 줄여준다는 보장이 없기 때문에 최종 제출로 선택하지는 않았음
- 낮은 weight를 선택하는 것 보다는 높은 weight를 선택했을 때 Public 점수가 0.002점 정도 높았음


In [None]:
import pandas as pd
from collections import Counter
from glob import glob


# class_weights2 = {'Jackson Pollock': 280.75, 'Peter Paul Rubens': 61.03, 'Andrei Rublev': 80.21, 'Eugene Delacroix': 224.6, 'Rene Magritte': 43.19, 'Claude Monet': 100.27, 'Edouard Manet': 95.17, 'Edvard Munch': 133.69, 'Rembrandt': 32.65, 'Marc Chagall': 34.03, 'El Greco': 90.56, 'Leonardo da Vinci': 58.49, 'Albrecht Du rer': 26.87, 'Edgar Degas': 12.08, 'Pieter Bruegel': 69.32, 'Diego Velazquez': 73.88, 'Pablo Picasso': 19.56, 'Andy Warhol': 44.56, 'Titian': 34.03, 'Jan van Eyck': 93.58, 'Paul Klee': 41.59, 'Vincent van Gogh': 9.39, 'Henri Rousseau': 114.59, 'Alfred Sisley': 35.76, 'Mikhail Vrubel': 50.13, 'Pierre-Auguste Renoir': 25.41, 'Amedeo Modigliani': 44.56, 'Vasiliy Kandinskiy': 98.51, 'Kazimir Malevich': 65.29, 'Paul Gauguin': 26.87, 'Georges Seurat': 193.62, 'Gustav Klimt': 86.38, 'Hieronymus Bosch': 51.51, 'Giotto di Bondone': 81.38, 'Michelangelo': 170.15, 'Raphael': 81.38, 'Frida Kahlo': 70.19, 'Francisco Goya': 28.94, 'Sandro Botticelli': 49.25, 'Salvador Dali': 59.73, 'Paul Cezanne': 181.13, 'Diego Rivera': 119.47, 'Gustave Courbet': 140.38, 'Camille Pissarro': 92.05, 'Joan Miro': 77.99, 'Henri Matisse': 48.83, 'Caravaggio': 187.17, 'William Turner': 136.95, 'Piet Mondrian': 100.27, 'Henri de Toulouse-Lautrec': 96.81}

# NOTE: `class_weights2` (artist name -> class weight) is NOT defined in this
# cell. It is whatever the training cell left in the kernel (the last fold it
# processed); on a fresh kernel, define it first, e.g. from the commented-out
# reference dict just above.

# Per-fold submission files to ensemble. Sorted so that the file order, and
# with it the order of tied candidates, does not depend on the filesystem.
csvs = sorted(glob('./output/20221010220426*.csv'))
csvs2 = sorted(glob('./output/20221030183609*.csv'))
csvs.extend(csvs2)
print(len(csvs))

# Fail early with a clear message instead of a length mismatch further down.
if not csvs:
    raise FileNotFoundError('no prediction CSVs matched the ./output/ glob patterns')

# One list of predicted artist names per submission file.
preds = [pd.read_csv(path)['artist'].tolist() for path in csvs]
called = 0  # number of samples whose top vote was tied

out = []
cols = list(zip(*preds))

for c in cols:
    # (artist, votes) pairs, most voted first.
    ranked = Counter(c).most_common()
    top_votes = ranked[0][1]
    # Every artist that shares the highest vote count.
    vote_keys = [artist for artist, votes in ranked if votes == top_votes]

    if len(vote_keys) != 1:
        # Tie: pick the tied artist with the largest class weight.
        called += 1
        vote_weights = [class_weights2[k] for k in vote_keys]
        idx = vote_weights.index(max(vote_weights))
        most = vote_keys[idx]

        print(f'weighted vote called\n{vote_keys}: {vote_weights}')
        print(f'selected: {most}\n')
    else:
        most = vote_keys[0]
    
    out.append(most)

print(f'called: {called} times')
print(out[:20])

ss = pd.read_csv('../Data/data/sample_submission.csv')
ss['artist'] = out
ss.to_csv('max_weightd_vote_20221030183609+20221010220426.csv', index=False)  # edit the output path as needed

선택되지 못한 모델들
- Noisy Student B6 (10fold): 0.8783410572
- convnext_xlarge_384_in22ft1k (10fold): 0.8589929047
- EfficientNetV2 XL (10fold): 0.8424065127
- CaiT: 수렴이 너무 느려 포기