# 작물 병해 분류 AI 경진대회

## Overview
- Training dataset이 250장으로 매우 적었습니다.
- Train:200, Val:50 장으로 학습시켜서 5Fold로 제출해도 0.991이라는 Public Score가 나왔으나 조금 더 안정적인 학습을 위해 Pseudo Labeling 방법을 사용하였습니다.
- RegNetY-064를 기반으로 Pseudo Labeling을 반복적으로 수행하였고,
- RegNetY-064로 생성한 Pseudo label set으로 학습한 5Fold 앙상블 모델 2개(RegNetY-064, EfficientNet-B3)를 제출했습니다.
- Private에서 최종적으로 선택된 모델은 5Fold EfficientNet-B3 앙상블 모델입니다.

## 배운점
- 평가 Measure가 F1-Score였는데 Class의 Imbalance에는 크게 신경을 쓰지 않았습니다.
- 1등, 2등 하신 분들의 Solution을 보면 Pseudo labeling 기법은 동일하게 사용하였지만 Class별 Sampling or Loss 가중치를 다르게 준 것을 확인했습니다.
- 그래서, Public에서 1등이었으나 Private에서는 아쉽게 3등이 되지 않았나 생각합니다.
- F1-Score가 Metric일 경우 Class Imbalance를 고려하여 모델링을 해야겠다는 생각을 합니다.

## 0. Prerequisites

## 0-1. Requirements
Ubuntu 18.04, Cuda 11.1

- opencv-python  
- numpy  
- pandas
- timm
- torch==1.8.0, torchvision==0.9.0 (CUDA 11.1)
- natsort
- scikit-learn
- pillow
- torch_optimizer
- tqdm
- easydict
- ptflops

## 0-2. Directory 구조

- data
    - train.csv
    - test.csv
    - sample_submission.csv
    - train_imgs
        - 10000.jpg
        - 10001.jpg
        - ...
    - test_imgs
        - 20000.jpg
        - 20001.jpg
        - ...
- notebook
    - 3rd place solution.ipynb
    - results

## 0-3. Import Library

In [3]:
import os
import cv2
import time
import random
import logging
import easydict
import numpy as np
import pandas as pd
from tqdm import tqdm
from os.path import join as opj
from ptflops import get_model_complexity_info
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from PIL import Image

import timm
import torch
import torch.nn as nn
import torch_optimizer as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, grad_scaler
from torchvision import transforms

import warnings
warnings.filterwarnings('ignore')


## 0-4. Config

모델 및 학습의 Hyper-parameter를 정의합니다.

In [4]:
# Global hyper-parameter container shared by the training and inference cells.
# NOTE: `fold`, `exp_num`, `encoder_name` and an extra `phase` attribute are
# overwritten/added by the driver loop at the bottom of the file before each run.
args = easydict.EasyDict(
    {'exp_num':'0',           # zero-padded to 3 chars and used as the result sub-directory name
     'experiment':'Base',     # not read by the code visible in this file
     'tag':'Default',         # written to the log at the start of training

     # Path settings
     'data_path':'../data',   # directory holding train.csv and the pseudo-label CSVs
     'fold':0,                # index of the validation fold
     'Kfold':5,               # number of StratifiedKFold splits
     'model_path':'results/', # root directory for logs and checkpoints

     # Model parameter settings
     'encoder_name':'regnety_064',  # timm model name ('regnet*' or 'efficient*' heads are handled)
     'drop_path_rate':0.2,          # forwarded to timm.create_model at training time
     
     # Training parameter settings
     ## Base Parameter
     'img_size':352,
     'batch_size':16,
     'epochs':60,
     'optimizer':'Lamb',      # informational only: the Trainer instantiates Lamb directly
     'initial_lr':5e-6,       # learning rate passed to the optimizer constructor
     'weight_decay':1e-3,

     ## Augmentation
     'aug_ver':2,             # 1 = resize/normalize only, 2 = flips + affine + rotation

     ## Scheduler
     'scheduler':'cycle',     # 'cycle' steps OneCycleLR per iteration after warm-up; 'cos' steps it per epoch
     'warm_epoch':5,          # number of epochs driven by the WarmUpLR scheduler
     ### OnecycleLR
     'max_lr':1e-3,

     ## etc.
     'patience':15,           # epochs without validation-loss improvement before early stopping
     'clipping':None,         # max gradient norm; None disables clipping in the AMP path

     # Hardware settings
     'amp':True,              # use torch.cuda.amp autocast + GradScaler
     'multi_gpu':False,       # wrap the model in nn.DataParallel
     'logging':False,         # not read by the code visible in this file
     'num_workers':4,
     'seed':42
    })

## 1. Make dataset

Dataset

In [5]:
# Load the training metadata and display it as the notebook cell output.
df = pd.read_csv('../data/train.csv')
df

Unnamed: 0,uid,img_path,disease,disease_code
0,10000,train_imgs/10000.jpg,시설포도노균병,1
1,10001,train_imgs/10001.jpg,시설포도노균병,1
2,10002,train_imgs/10002.jpg,시설포도노균병반응,2
3,10003,train_imgs/10003.jpg,축과병,4
4,10004,train_imgs/10004.jpg,시설포도노균병,1
...,...,...,...,...
245,10245,train_imgs/10245.jpg,시설포도노균병반응,2
246,10246,train_imgs/10246.jpg,시설포도탄저병반응,6
247,10247,train_imgs/10247.jpg,시설포도노균병,1
248,10248,train_imgs/10248.jpg,시설포도노균병반응,2


In [4]:
# The original images are high-resolution and decoding them is a loading
# bottleneck, so they are resized once and cached on disk.
# Train and test images are written to a single folder up front so that
# pseudo-labelled test images can be referenced without duplicating files.

path = '../data/total_imgs_1024'


def _resize_and_save(csv_file, out_dir, data_root='../data/', size=(1024, 1024)):
    """Resize every image listed in ``csv_file`` and write it into ``out_dir``.

    Args:
        csv_file: CSV with an ``img_path`` column (paths relative to ``data_root``).
        out_dir: Destination directory; files keep their original base name.
        data_root: Directory the ``img_path`` entries are relative to.
        size: Target ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        The DataFrame read from ``csv_file``.

    Raises:
        FileNotFoundError: If an image cannot be read (``cv2.imread`` returned None).
        IOError: If ``cv2.imwrite`` reports a failed write.
    """
    frame = pd.read_csv(csv_file)
    for rel_path in frame['img_path']:
        src = opj(data_root, rel_path)
        image = cv2.imread(src)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {src}')
        image = cv2.resize(image, dsize=size)
        dst = opj(out_dir, os.path.basename(rel_path))
        # cv2.imwrite returns a success flag rather than raising.
        if not cv2.imwrite(dst, image):
            raise IOError(f'Could not write image: {dst}')
    return frame


os.makedirs(path, exist_ok=True)

# Make Train
df = _resize_and_save('../data/train.csv', path)

# Make Test
df = _resize_and_save('../data/test.csv', path)

## Dataset & Loader

In [6]:
class Train_Dataset(Dataset):
    """Labelled image dataset backed by a DataFrame.

    Args:
        df: DataFrame with an ``img_path`` column (paths relative to
            ``data_root``) and a ``disease_code`` column (class labels).
        transform: Optional callable applied to the loaded PIL image. When
            ``None`` the RGB PIL image is returned unchanged.
        data_root: Directory the ``img_path`` entries are relative to.
    """

    def __init__(self, df, transform=None, data_root='../data/'):
        self.img_path = df['img_path'].values
        self.target = df['disease_code'].values
        self.transform = transform
        self.data_root = data_root

        print(f'Dataset size:{len(self.img_path)}')

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the sample at position ``idx``."""
        # Use a context manager so the underlying file handle is closed as
        # soon as the pixel data has been converted into a new RGB image.
        with Image.open(opj(self.data_root, self.img_path[idx])) as raw:
            image = raw.convert('RGB')
        # transform defaults to None, so it must not be called unconditionally.
        if self.transform is not None:
            image = self.transform(image)
        target = self.target[idx]

        return image, target

    def __len__(self):
        return len(self.img_path)

class Test_dataset(Dataset):
    """Unlabelled image dataset backed by a DataFrame.

    Args:
        df: DataFrame with an ``img_path`` column (paths relative to
            ``data_root``).
        transform: Optional callable applied to the loaded PIL image. When
            ``None`` the RGB PIL image is returned unchanged.
        data_root: Directory the ``img_path`` entries are relative to.
    """

    def __init__(self, df, transform=None, data_root='../data/'):
        self.img_path = df['img_path'].values
        self.transform = transform
        self.data_root = data_root

        print(f'Test Dataset size:{len(self.img_path)}')

    def __getitem__(self, idx):
        """Return the (optionally transformed) image at position ``idx``."""
        # Close the file handle once the RGB copy has been created.
        with Image.open(opj(self.data_root, self.img_path[idx])) as raw:
            image = raw.convert('RGB')
        # transform defaults to None, so it must not be called unconditionally.
        if self.transform is not None:
            image = self.transform(image)
        return image

    def __len__(self):
        return len(self.img_path)

def get_loader(df, phase: str, batch_size, shuffle,
               num_workers, transform):
    """Build a DataLoader over ``df`` for the requested phase.

    ``phase == 'test'`` yields images only (``Test_dataset``); any other value
    yields ``(image, target)`` pairs (``Train_Dataset``). Memory pinning is
    always enabled and incomplete final batches are kept.
    """
    dataset_cls = Test_dataset if phase == 'test' else Train_Dataset
    return DataLoader(
        dataset_cls(df, transform),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=False,
    )

def get_train_augmentation(img_size, ver):
    """Return the torchvision transform pipeline for the given version.

    Args:
        img_size: Side length of the square output image.
        ver: ``1`` for the deterministic pipeline used for validation/test
            (to-tensor, resize, normalize); ``2`` for the training pipeline
            that additionally applies random flips, a random affine in
            ``(-20, 20)`` degrees and a random rotation of up to 90 degrees.

    Returns:
        A ``transforms.Compose`` instance.

    Raises:
        ValueError: If ``ver`` is not one of the supported versions.
    """
    # Validate first: an unknown version used to fall through to the return
    # statement and fail with an opaque UnboundLocalError.
    if ver not in (1, 2):
        raise ValueError(f'Unsupported augmentation version: {ver!r} (expected 1 or 2)')

    if ver == 1:  # for valid, test
        transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Resize((img_size, img_size)),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225]),
                ])
    else:  # ver == 2, training augmentation
        transform = transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomVerticalFlip(),
                transforms.RandomAffine((-20, 20)),
                transforms.RandomRotation(90),
                transforms.ToTensor(),
                transforms.Resize((img_size, img_size)),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225]),
            ])

    return transform

## Network
pytorch image models(timm) 라이브러리를 활용하여 Generalization Performance에 강점을 가지는 RegNet을 Base 모델로 사용하였습니다.

In [7]:
class Network(nn.Module):
    """Training-time classifier: a pretrained timm encoder with a 7-class head.

    Args:
        args: Config object providing ``encoder_name`` (timm model name
            containing ``'regnet'`` or ``'efficient'``) and ``drop_path_rate``.

    Raises:
        ValueError: If ``encoder_name`` belongs to neither supported family.
            Previously such a model silently kept its pretrained head.
    """

    def __init__(self, args):
        super().__init__()
        encoder_name = args.encoder_name
        is_regnet = 'regnet' in encoder_name
        is_efficient = 'efficient' in encoder_name
        # Fail before downloading/building a model whose head we cannot replace.
        if not (is_regnet or is_efficient):
            raise ValueError(
                f"Unsupported encoder '{encoder_name}': expected a name containing "
                "'regnet' or 'efficient'")

        self.encoder = timm.create_model(encoder_name, pretrained=True, drop_path_rate=args.drop_path_rate)

        # The two families expose their final linear layer under different attributes.
        if is_regnet:
            num_head = self.encoder.head.fc.in_features
            self.encoder.head.fc = nn.Linear(num_head, 7)
        else:
            num_head = self.encoder.classifier.in_features
            self.encoder.classifier = nn.Linear(num_head, 7)

    def forward(self, x):
        """Return the encoder output (class logits) for the batch ``x``."""
        x = self.encoder(x)
        return x

class Network_test(nn.Module):   # Without drop_path_rate
    """Inference-time classifier: a timm encoder (no pretrained download) with a 7-class head.

    Args:
        encoder_name: timm model name containing ``'regnet'`` or ``'efficient'``.

    Raises:
        ValueError: If ``encoder_name`` belongs to neither supported family.
            Previously such a model silently kept its default head.
    """

    def __init__(self, encoder_name):
        super().__init__()
        is_regnet = 'regnet' in encoder_name
        is_efficient = 'efficient' in encoder_name
        # Fail before building a model whose head we cannot replace.
        if not (is_regnet or is_efficient):
            raise ValueError(
                f"Unsupported encoder '{encoder_name}': expected a name containing "
                "'regnet' or 'efficient'")

        self.encoder = timm.create_model(encoder_name, pretrained=False)

        # The two families expose their final linear layer under different attributes.
        if is_regnet:
            num_head = self.encoder.head.fc.in_features
            self.encoder.head.fc = nn.Linear(num_head, 7)
        else:
            num_head = self.encoder.classifier.in_features
            self.encoder.classifier = nn.Linear(num_head, 7)

    def forward(self, x):
        """Return the encoder output (class logits) for the batch ``x``."""
        x = self.encoder(x)
        return x

## Utils for training and Logging
Logging과 AvgMeter를 통해 실험 기록을 log파일로 남도록 저장하였습니다.  
추가로 실험마다 비교를 쉽게 하기위해 Neptune을 활용하였는데 코드에서는 제거하였습니다.

In [8]:
# Warmup Learning rate scheduler
from torch.optim.lr_scheduler import _LRScheduler
class WarmUpLR(_LRScheduler):
    """Linear learning-rate warm-up scheduler.

    Args:
        optimizer: wrapped optimizer (e.g. SGD)
        total_iters: number of scheduler steps the warm-up phase lasts
        last_epoch: index of the last step, forwarded to the base class
    """

    def __init__(self, optimizer, total_iters, last_epoch=-1):
        # Stored before delegating to the base class, matching the original
        # ordering (presumably because the base class may call get_lr()
        # during construction).
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Scale each base learning rate by ``last_epoch / total_iters``.

        A tiny epsilon is added to the denominator so that
        ``total_iters == 0`` does not divide by zero.
        """
        denominator = self.total_iters + 1e-8
        warmed_up = []
        for base_lr in self.base_lrs:
            warmed_up.append(base_lr * self.last_epoch / denominator)
        return warmed_up

# Logging
def get_root_logger(logger_name='basicsr',
                    log_level=logging.INFO,
                    log_file=None):
    """Return the named logger, optionally writing to ``log_file``.

    The previous implementation returned early whenever
    ``logger.hasHandlers()`` was true. That check also inspects ancestor
    loggers, and the early return meant a later call with a *different*
    ``log_file`` never attached it, so every run after the first kept
    writing into the first run's file.

    Args:
        logger_name: Name passed to ``logging.getLogger``.
        log_level: Level applied to the logger and to the file handler.
        log_file: Optional path of a log file (opened in ``'w'`` mode). File
            handlers pointing at other files are detached and closed so that
            each run logs only to its own file.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(logger_name)

    format_str = '%(asctime)s %(levelname)s: %(message)s'
    # No-op when the root logger already has handlers, so safe to call repeatedly.
    logging.basicConfig(format=format_str, level=log_level)
    # Set the level on the logger itself so it does not depend on whether
    # basicConfig above was able to configure the root logger.
    logger.setLevel(log_level)

    if log_file is None:
        return logger

    target = os.path.abspath(log_file)
    already_attached = False
    for handler in list(logger.handlers):
        if not isinstance(handler, logging.FileHandler):
            continue
        if handler.baseFilename == target:
            already_attached = True
        else:
            # Stale handler from a previous run: stop writing to that file.
            logger.removeHandler(handler)
            handler.close()

    if not already_attached:
        file_handler = logging.FileHandler(log_file, 'w')
        file_handler.setFormatter(logging.Formatter(format_str))
        file_handler.setLevel(log_level)
        logger.addHandler(file_handler)

    return logger

class AvgMeter(object):
    """Track the latest value, weighted sum, sample count and running mean.

    ``losses`` additionally keeps every raw value passed to ``update``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics and clear the value history."""
        for field in ('val', 'avg', 'sum', 'count'):
            setattr(self, field, 0)
        self.losses = []

    def update(self, val, n=1):
        """Record ``val`` as observed over ``n`` samples and refresh the mean."""
        self.val = val
        weighted = val * n
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count
        self.losses.append(val)


## Trainer
모델의 학습(training function)과 검증(Validation)을 위한 Class입니다.  

In [9]:
class Trainer():
    """Train and validate one fold of the classifier.

    All work happens inside ``__init__``: the fold split is built, the
    loaders, model, optimizer and schedulers are created, and the epoch loop
    runs until ``args.epochs`` is reached or early stopping triggers. The
    checkpoint with the lowest validation loss is written to
    ``<save_path>/best_model.pth``.
    """

    def __init__(self, args, save_path):
        '''
        args: arguments (hyper-parameter container; ``args.phase`` defaults to 0 when absent)
        save_path: directory where the log file and the model weights are saved
        '''
        super(Trainer, self).__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Logging
        log_file = os.path.join(save_path, 'log.log')
        self.logger = get_root_logger(logger_name='IR', log_level=logging.INFO, log_file=log_file)
        self.logger.info(args)
        self.logger.info(args.tag)

        # Train, Valid Set load
        ############################################################################

        train_csv = opj(args.data_path, 'train.csv')
        # `phase` is injected by the pseudo-labelling driver; plain training is phase 0.
        phase = getattr(args, 'phase', 0)
        if phase == 0:
            df_train = pd.read_csv(train_csv)
            original_train_length = len(df_train)
        else:
            df_train = pd.read_csv(opj(args.data_path, f'train_pseudo{phase - 1}.csv'))
            # The pseudo CSV starts with the original training rows, so their
            # count is read from train.csv instead of being hard-coded.
            original_train_length = len(pd.read_csv(train_csv))

        df_train['fold'] = -1
        kf = StratifiedKFold(n_splits=args.Kfold, shuffle=True, random_state=args.seed)

        # Only the original training rows are split into train/validation
        # folds. Pseudo-labelled rows keep fold == -1 and therefore always
        # land in the training split, never in validation.
        original_labels = df_train['disease_code'].values[:original_train_length]
        for fold, (_, val_idx) in enumerate(kf.split(range(original_train_length), y=original_labels)):
            df_train.loc[val_idx, 'fold'] = fold

        df_val = df_train[df_train['fold'] == args.fold].reset_index(drop=True)
        df_train = df_train[df_train['fold'] != args.fold].reset_index(drop=True)

        # Augmentation
        self.train_transform = get_train_augmentation(img_size=args.img_size, ver=args.aug_ver)
        self.test_transform = get_train_augmentation(img_size=args.img_size, ver=1)

        # TrainLoader
        self.train_loader = get_loader(df_train, phase='train', batch_size=args.batch_size, shuffle=True,
                                       num_workers=args.num_workers, transform=self.train_transform)
        self.val_loader = get_loader(df_val, phase='train', batch_size=args.batch_size, shuffle=False,
                                       num_workers=args.num_workers, transform=self.test_transform)

        # Network
        self.model = Network(args).to(self.device)
        macs, params = get_model_complexity_info(self.model, (3, args.img_size, args.img_size), as_strings=True,
                                                 print_per_layer_stat=False, verbose=False)
        self.logger.info('{:<30}  {:<8}'.format('Computational complexity: ', macs))
        self.logger.info('{:<30}  {:<8}'.format('Number of parameters: ', params))

        # Loss
        self.criterion = nn.CrossEntropyLoss()

        # Optimizer & Scheduler
        self.optimizer = optim.Lamb(self.model.parameters(), lr=args.initial_lr, weight_decay=args.weight_decay)

        iter_per_epoch = len(self.train_loader)
        # NOTE(review): both schedulers wrap the same optimizer. The warm-up
        # scheduler drives the first `warm_epoch` epochs and OneCycleLR takes
        # over afterwards; how the two interact at construction time depends
        # on torch internals - confirm the resulting LR curve if it matters.
        self.warmup_scheduler = WarmUpLR(self.optimizer, iter_per_epoch * args.warm_epoch)
        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer, max_lr=args.max_lr, steps_per_epoch=iter_per_epoch, epochs=args.epochs)

        # One GradScaler for the whole run so its loss-scale state carries
        # across epochs instead of being reset at the start of each one.
        self.scaler = grad_scaler.GradScaler()

        if args.multi_gpu:
            self.model = nn.DataParallel(self.model).to(self.device)

        # Train / Validate
        best_loss = np.inf
        best_epoch = 0
        # Initialised so the final summary cannot hit an unbound name when
        # the validation loss never improves (e.g. it is NaN from the start).
        best_acc = 0.0
        best_f1 = 0.0
        early_stopping = 0
        start = time.time()
        for epoch in range(1, args.epochs+1):
            self.epoch = epoch

            if args.scheduler == 'cos':
                if epoch > args.warm_epoch:
                    self.scheduler.step()

            # Training
            train_loss, train_acc, train_f1 = self.training(args)

            # Validation
            val_loss, val_acc, val_f1 = self.validate()

            # Save models
            if val_loss < best_loss:
                # Model weight in Multi_GPU or Single GPU
                state_dict = self.model.module.state_dict() if args.multi_gpu else self.model.state_dict()

                early_stopping = 0
                best_epoch = epoch
                best_loss = val_loss
                best_acc = val_acc
                best_f1 = val_f1

                torch.save({'epoch': epoch,
                            'state_dict': state_dict,
                            'optimizer': self.optimizer.state_dict(),
                            'scheduler': self.scheduler.state_dict(),
                    }, os.path.join(save_path, 'best_model.pth'))
                self.logger.info(f'-----------------SAVE:{best_epoch}epoch----------------')
            else:
                early_stopping += 1

            # Early Stopping
            if early_stopping == args.patience:
                break

        self.logger.info(f'\nBest Val Epoch:{best_epoch} | Val Loss:{best_loss:.4f} | Val Acc:{best_acc:.4f} | Val F1:{best_f1:.4f}')
        end = time.time()
        self.logger.info(f'Total Process time:{(end - start) / 60:.3f}Minute')

    # Training
    def training(self, args):
        """Run one training epoch.

        Returns:
            Tuple ``(mean loss, accuracy, macro F1)`` over the training set.
        """
        self.model.train()
        train_loss = AvgMeter()
        train_acc = 0
        preds_list = []
        targets_list = []

        for images, targets in tqdm(self.train_loader):
            # `.to` moves/casts the batch without the extra copy-construct
            # that torch.tensor(<tensor>) performs.
            images = images.to(self.device, dtype=torch.float32)
            targets = targets.to(self.device, dtype=torch.long)

            if self.epoch <= args.warm_epoch:
                self.warmup_scheduler.step()

            self.model.zero_grad(set_to_none=True)
            if args.amp:
                with autocast():
                    preds = self.model(images)
                    loss = self.criterion(preds, targets)
                self.scaler.scale(loss).backward()

                # Gradient Clipping (gradients must be unscaled first)
                if args.clipping is not None:
                    self.scaler.unscale_(self.optimizer)
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.clipping)

                self.scaler.step(self.optimizer)
                self.scaler.update()

            else:
                preds = self.model(images)
                loss = self.criterion(preds, targets)
                loss.backward()
                # Clipping is optional; calling clip_grad_norm_ with None fails.
                if args.clipping is not None:
                    nn.utils.clip_grad_norm_(self.model.parameters(), args.clipping)
                self.optimizer.step()

            if args.scheduler == 'cycle':
                if self.epoch > args.warm_epoch:
                    self.scheduler.step()

            # Metric
            pred_labels = preds.argmax(dim=1)
            train_acc += (pred_labels == targets).sum().item()
            preds_list.extend(pred_labels.cpu().detach().numpy())
            targets_list.extend(targets.cpu().detach().numpy())
            # log
            train_loss.update(loss.item(), n=images.size(0))

        train_acc /= len(self.train_loader.dataset)
        train_f1 = f1_score(np.array(targets_list), np.array(preds_list), average='macro')

        self.logger.info(f'Epoch:[{self.epoch:03d}/{args.epochs:03d}]')
        self.logger.info(f'Train Loss:{train_loss.avg:.3f} | Acc:{train_acc:.4f} | F1:{train_f1:.4f}')
        return train_loss.avg, train_acc, train_f1

    # Validation or Dev
    def validate(self):
        """Evaluate the model on the validation loader without gradients.

        Returns:
            Tuple ``(mean loss, accuracy, macro F1)`` over the validation set.
        """
        self.model.eval()
        with torch.no_grad():
            val_loss = AvgMeter()
            val_acc = 0
            preds_list = []
            targets_list = []

            for images, targets in self.val_loader:
                images = images.to(self.device, dtype=torch.float32)
                targets = targets.to(self.device, dtype=torch.long)

                preds = self.model(images)
                loss = self.criterion(preds, targets)

                # Metric
                pred_labels = preds.argmax(dim=1)
                val_acc += (pred_labels == targets).sum().item()
                preds_list.extend(pred_labels.cpu().detach().numpy())
                targets_list.extend(targets.cpu().detach().numpy())

                # log
                val_loss.update(loss.item(), n=images.size(0))
            val_acc /= len(self.val_loader.dataset)
            val_f1 = f1_score(np.array(targets_list), np.array(preds_list), average='macro')

            self.logger.info(f'Valid Loss:{val_loss.avg:.3f} | Acc:{val_acc:.4f} | F1:{val_f1:.4f}')
        return val_loss.avg, val_acc, val_f1

## Main Function

In [10]:
def main(args):
    """Seed every RNG from ``args.seed``, create the run directory and train.

    The run directory is ``args.model_path`` joined with ``args.exp_num``
    zero-padded to three characters. Constructing ``Trainer`` runs the
    training.
    """
    # Random Seed
    seed = args.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.benchmark = True

    # Create model directory
    run_dir_name = (args.exp_num).zfill(3)
    save_path = os.path.join(args.model_path, run_dir_name)
    os.makedirs(save_path, exist_ok=True)
    Trainer(args, save_path)

In [15]:
# Run inference with one or more models (averaging them as an ensemble)
def predict(model_list, test_loader, device):
    """Return the ensemble-averaged class probabilities for every test image.

    Args:
        model_list: Non-empty sequence of models, each mapping an image batch
            to class logits.
        test_loader: Iterable yielding image batches.
        device: Device the batches are moved to before inference.

    Returns:
        ``np.ndarray`` with one row of averaged softmax probabilities per image.

    Raises:
        ValueError: If ``model_list`` is empty (the average would divide by zero).
    """
    if len(model_list) == 0:
        raise ValueError('model_list must contain at least one model')

    preds_list = []
    with torch.no_grad():
        for images in tqdm(test_loader):
            images = torch.as_tensor(images, device=device, dtype=torch.float32)

            ensemble = None
            for model in model_list:
                probs = torch.softmax(model(images), dim=1)
                probs = probs.detach().cpu().numpy()
                if ensemble is None:
                    # Size the accumulator from the model output rather than
                    # assuming a fixed number of classes.
                    ensemble = np.zeros(probs.shape, dtype=np.float32)
                ensemble += probs
            preds = ensemble / len(model_list)
            preds_list.extend(preds.tolist())
    return np.array(preds_list)

# Load trained weights and return the model in eval mode
def load_model(encoder_name, model_path, device=None):
    """Build ``Network_test`` and load ``./results/<model_path>/best_model.pth``.

    Args:
        encoder_name: timm encoder name forwarded to ``Network_test``.
        model_path: Sub-directory of ``./results/`` containing ``best_model.pth``.
        device: Target device. When ``None`` the module-level ``device`` is
            used if one is defined (the previous behaviour), otherwise CUDA
            is chosen when available and CPU as a fallback.

    Returns:
        The model with weights loaded, in evaluation mode.
    """
    if device is None:
        # Keep honouring a module-level `device` for existing callers, but do
        # not crash with a NameError when none has been defined.
        device = globals().get('device')
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = Network_test(encoder_name).to(device)
    # map_location lets checkpoints saved on a GPU be loaded on CPU-only hosts.
    checkpoint = torch.load(opj('./results/', model_path, 'best_model.pth'), map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

# Build the DataFrame used to train with the additional pseudo labels
def generate_df_pseudo(df_train, df_test, ensemble_pred, threshold, phase, data_path='../data'):
    """Append confidently predicted test images to the training table and save it.

    A test image is added when its highest ensemble probability is strictly
    greater than ``threshold``; its label is the arg-max class. The result is
    written to ``<data_path>/train_pseudo<phase>.csv``.

    Args:
        df_train: Training table with ``uid``, ``img_path``, ``disease`` and
            ``disease_code`` columns (``uid`` and ``disease`` are dropped).
        df_test: Test table with an ``img_path`` column, row-aligned with
            ``ensemble_pred``.
        ensemble_pred: 2-D array of per-class probabilities, one row per test image.
        threshold: Confidence threshold for accepting a pseudo label.
        phase: Pseudo-labelling round, used in the output file name.
        data_path: Directory the CSV is written to.

    Returns:
        The combined DataFrame that was written to disk.

    Raises:
        ValueError: If ``ensemble_pred`` and ``df_test`` have different lengths.
    """
    if len(ensemble_pred) != len(df_test):
        raise ValueError(
            f'ensemble_pred has {len(ensemble_pred)} rows but df_test has {len(df_test)}')

    max_proba = ensemble_pred.max(axis=1)
    pseudo_idx = np.where(max_proba > threshold)[0]
    print('Add Number of images', len(pseudo_idx))

    # Make pseudo dataframe
    df_pseudo = df_train.drop(['uid', 'disease'], axis=1)
    if len(pseudo_idx) > 0:
        # Compute labels once and build all new rows in one go; appending row
        # by row is quadratic and DataFrame.append no longer exists in pandas 2.x.
        pseudo_labels = ensemble_pred.argmax(axis=1)[pseudo_idx]
        test_paths = df_test['img_path'].values
        new_rows = pd.DataFrame({
            'img_path': [opj('total_imgs_1024', os.path.basename(test_paths[i])) for i in pseudo_idx],
            'disease_code': pseudo_labels,
        })
        df_pseudo = pd.concat([df_pseudo, new_rows], ignore_index=True)
    df_pseudo.to_csv(opj(data_path, f'train_pseudo{phase}.csv'))
    return df_pseudo


In [None]:
if __name__ == '__main__':
    # Driver: repeat {train K folds -> ensemble-predict the test set ->
    # write a pseudo-labelled training CSV} and submit the last ensemble.

    # Test dataset
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    submission = pd.read_csv('../data/sample_submission.csv')
    df_train = pd.read_csv('../data/train.csv')
    df_test = pd.read_csv('../data/test.csv')

    # Point the train/test image paths at the shared, pre-resized image folder
    df_train['img_path'] = df_train['img_path'].apply(lambda x:x.replace('train_imgs', 'total_imgs_1024'))
    df_test['img_path'] = df_test['img_path'].apply(lambda x:x.replace('test_imgs', 'total_imgs_1024'))

    test_transform = get_train_augmentation(img_size=args.img_size, ver=1)
    test_dataset = Test_dataset(df_test, test_transform)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

    num_folds = args.Kfold
    # The confidence threshold is lowered over the rounds so that more test
    # samples are used as pseudo labels.
    threshold = [0.95, 0.9, 0.9, 0.85, 0.85]
    num_phases = len(threshold)
    for phase in range(num_phases):  # repeat the pseudo-labelling procedure
        args.phase = phase
        exp_num_list = []

        # The final round trains EfficientNet-B3 on the last pseudo-label set.
        if phase == num_phases - 1:
            args.encoder_name = 'efficientnet_b3'

        # One encoder name per fold. A shorter list would make the zip below
        # silently load fewer models than were trained (previously the final
        # round loaded only fold 0).
        encoder_list = [args.encoder_name] * num_folds

        for i in range(num_folds):   # Train the K folds
            args.fold = i
            args.exp_num = str(i).zfill(3)
            main(args)
            exp_num_list.append(args.exp_num)

        # K-fold ensemble prediction
        model_list = [load_model(enc, path) for enc, path in zip(encoder_list, exp_num_list)]
        ensemble_pred = predict(model_list, test_loader, device)
        generate_df_pseudo(df_train, df_test, ensemble_pred, threshold[phase], phase)

    submission.iloc[:, 1] = ensemble_pred.argmax(axis=1)
    submission.to_csv('final_submission.csv', index=False)