# 0. Intro

환경은 구글 Colab을 사용하였습니다.
메인이 되는 트레이닝 방식은 Semi-supervised 입니다.

## Flow

제가 대회에 참여하면서 생각하고 진행했던 내용 입니다.

- CNN 모델링을 하면서 성능이 제일 잘 나오는 모델을 먼저 찾아보았습니다.
- CNN은 Skip Connection 과 CBAM(발음주의)이라는 CNN Attention 기법을 섞어서 만들었습니다. 
    - Skip Connection을 가장 먼저 시도하였고 그 후 CBAM을 연결하였습니다.
- 0.88 정도가 나오는 단일 모델을 찾은 후, Semi-supervised를 진행하였습니다.
    - 진행순서는 다음과 같습니다.
    - [1] 가장 validation이 잘 나오는 모델로 test를 예측합니다.
    - [2] 예측한 test 중 softmax 확률이 0.95를 넘는 샘플만 골라, 예측된 클래스를 해당 test 샘플의 label(pseudo-label)로 넣어줍니다.
    - [3] train과 test를 합쳐서 다시 훈련합니다. (validation set은 test와 합치기 전 맨처음 split해놓은 상태로 오로지 validation만 합니다.)
    - [4] 1~3번을 10번 반복하여 나온 결과를 저장합니다.
- train data(전체) 와 예측된 test를 가지고 5fold 훈련을 하고 정답을 예측합니다.

# 1. 라이브러리 및 데이터 로드

In [None]:
# Base directory (under the Colab-mounted Drive path) that every CSV read/written in this notebook is relative to.
path = "/content/drive/My Drive/data/dacon mnist/"

In [None]:
import pandas as pd
import numpy as np
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
from torch import cuda

# Use the GPU when at least one CUDA device is visible, otherwise stay on the CPU.
device = 'cuda' if cuda.device_count() > 0 else 'cpu'
print(device)

In [None]:
# Load the labelled training table and the unlabelled test table from `path`.
data = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + 'test.csv')

In [None]:
# Give the test table an (empty) 'digit' column and move it to the third
# position, so that the pixel columns start at column index 3.
test['digit'] = np.nan
column_order = list(test.columns[:2]) + [test.columns[-1]] + list(test.columns[2:-1])
test = test.reindex(columns=column_order)

In [None]:
from string import ascii_uppercase

# Encode the 'letter' column as the 0-based position of the letter in the
# uppercase alphabet ('A' -> 0, ..., 'Z' -> 25); any other value maps to NaN.
chars = dict(zip(ascii_uppercase, range(len(ascii_uppercase))))
data['letter'] = data['letter'].map(chars)
test['letter'] = test['letter'].map(chars)

In [None]:
from torchvision import models
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset

# 2. 모델링

## CNN Attention(CBAM) 
위에서 언급했던 CNN의 Attention 기법입니다. 

In [None]:
# https://github.com/Jongchan/attention-module
class BasicConv(nn.Module):
    """2-D convolution optionally followed by batch normalisation and ReLU.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels (also stored as ``out_channels``).
        kernel_size, stride, padding, dilation, groups, bias: Forwarded
            unchanged to ``nn.Conv2d``.
        relu: When true, a ``nn.ReLU`` is applied last.
        bn: When true, a ``nn.BatchNorm2d`` is applied after the convolution.
    """

    def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False):
        super().__init__()
        self.out_channels = out_planes
        self.conv = nn.Conv2d(
            in_planes,
            out_planes,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        # Disabled stages are stored as None and skipped in forward().
        if bn:
            self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True)
        else:
            self.bn = None
        if relu:
            self.relu = nn.ReLU()
        else:
            self.relu = None

    def forward(self, x):
        """Apply conv -> (batch norm) -> (ReLU) and return the result."""
        out = self.conv(x)
        for stage in (self.bn, self.relu):
            if stage is not None:
                out = stage(out)
        return out

class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        """Return ``x`` viewed as ``(x.size(0), -1)``."""
        leading = x.shape[0]
        return x.view(leading, -1)

class ChannelGate(nn.Module):
    """Channel-attention gate.

    Every configured global pooling of the input is flattened and passed
    through one shared two-layer MLP. The MLP outputs are summed, squashed
    with a sigmoid and used to rescale the input channel by channel.

    Args:
        gate_channels: Channel count of the input (MLP input/output width).
        reduction_ratio: The hidden layer has
            ``gate_channels // reduction_ratio`` units.
        pool_types: Pooling operators to combine; each must be one of
            ``'avg'``, ``'max'``, ``'lp'`` or ``'lse'``.

    Raises:
        ValueError: If ``pool_types`` is empty or names an unsupported pooling.
    """

    _SUPPORTED_POOLS = ('avg', 'max', 'lp', 'lse')

    def __init__(self, gate_channels, reduction_ratio=16, pool_types=('avg', 'max')):
        super(ChannelGate, self).__init__()
        # Copy so the gate never aliases a caller-owned (or default) sequence.
        pool_types = list(pool_types)
        if not pool_types:
            raise ValueError("pool_types must contain at least one pooling type")
        unknown = [p for p in pool_types if p not in self._SUPPORTED_POOLS]
        if unknown:
            raise ValueError(
                f"unsupported pool type(s) {unknown}; expected a subset of {self._SUPPORTED_POOLS}"
            )
        self.gate_channels = gate_channels
        self.mlp = nn.Sequential(
            Flatten(),
            nn.Linear(gate_channels, gate_channels // reduction_ratio),
            nn.ReLU(),
            nn.Linear(gate_channels // reduction_ratio, gate_channels)
            )
        self.pool_types = pool_types

    def _pool(self, x, pool_type):
        """Reduce the two trailing (spatial) dimensions of ``x`` with the named pooling."""
        spatial = (x.size(2), x.size(3))
        if pool_type == 'avg':
            return F.avg_pool2d(x, spatial, stride=spatial)
        if pool_type == 'max':
            return F.max_pool2d(x, spatial, stride=spatial)
        if pool_type == 'lp':
            return F.lp_pool2d(x, 2, spatial, stride=spatial)
        if pool_type == 'lse':
            # LSE pool only
            return logsumexp_2d(x)
        # Previously an unknown name either crashed with an unrelated error or
        # silently re-added the previous branch's MLP output.
        raise ValueError(f"unsupported pool type {pool_type!r}")

    def forward(self, x):
        """Return ``x`` scaled per channel by the summed-and-sigmoided MLP outputs."""
        channel_att_sum = None
        for pool_type in self.pool_types:
            channel_att_raw = self.mlp(self._pool(x, pool_type))
            if channel_att_sum is None:
                channel_att_sum = channel_att_raw
            else:
                channel_att_sum = channel_att_sum + channel_att_raw

        scale = torch.sigmoid( channel_att_sum ).unsqueeze(2).unsqueeze(3).expand_as(x)
        return x * scale

def logsumexp_2d(tensor):
    """Log-sum-exp over everything after the first two dimensions.

    The per-(dim0, dim1) maximum is subtracted before exponentiating and added
    back afterwards. The result has shape ``(tensor.size(0), tensor.size(1), 1)``.
    """
    flat = tensor.view(*tensor.shape[:2], -1)
    peak = flat.max(dim=2, keepdim=True).values
    shifted_sum = torch.exp(flat - peak).sum(dim=2, keepdim=True)
    return peak + torch.log(shifted_sum)

class ChannelPool(nn.Module):
    """Reduce dimension 1 to two maps: its maximum (first) and its mean (second)."""

    def forward(self, x):
        """Return the max and the mean over dimension 1, concatenated along dimension 1."""
        peak = x.max(dim=1, keepdim=True).values
        average = x.mean(dim=1, keepdim=True)
        return torch.cat((peak, average), dim=1)

class SpatialGate(nn.Module):
    """Spatial-attention gate.

    The input is compressed to two maps by ``ChannelPool``, turned into a
    single map by a 7x7 ``BasicConv`` (no ReLU) and, after a sigmoid,
    multiplied back onto the input.
    """

    def __init__(self):
        super().__init__()
        kernel_size = 7
        # Padding of (k - 1) // 2 keeps the spatial size unchanged at stride 1.
        same_padding = (kernel_size - 1) // 2
        self.compress = ChannelPool()
        self.spatial = BasicConv(2, 1, kernel_size, stride=1, padding=same_padding, relu=False)

    def forward(self, x):
        """Return ``x`` multiplied by its sigmoid attention map."""
        attention = torch.sigmoid(self.spatial(self.compress(x)))
        # The single-channel map broadcasts across the channels of x.
        return x * attention

class CBAM(nn.Module):
    """Attention block: a channel gate followed by an optional spatial gate.

    Args:
        gate_channels: Channel count of the feature map being gated.
        reduction_ratio: Forwarded to ``ChannelGate``.
        pool_types: Pooling operators forwarded to ``ChannelGate``. The default
            is an immutable tuple so it cannot be shared and mutated between
            instances.
        no_spatial: When true, the spatial gate is neither created nor applied.
    """

    def __init__(self, gate_channels, reduction_ratio=16, pool_types=('avg', 'max'), no_spatial=False):
        super(CBAM, self).__init__()
        self.ChannelGate = ChannelGate(gate_channels, reduction_ratio, pool_types)
        self.no_spatial = no_spatial
        if not no_spatial:
            self.SpatialGate = SpatialGate()

    def forward(self, x):
        """Apply channel attention and then, unless disabled, spatial attention."""
        x_out = self.ChannelGate(x)
        if not self.no_spatial:
            x_out = self.SpatialGate(x_out)
        return x_out

## Skip Connection을 사용한 CNN 및 CBAM적용

In [None]:
#  # 0.8878048780487805
class ConvNet(nn.Module):
    """CNN classifier with two CBAM-gated residual stages and 10 output logits.

    A stem (two convolutions and a 2x max-pool) is followed by two stages. In
    each stage a two-convolution main branch and a 1x1-convolution skip branch
    are each passed through their own CBAM, added, ReLU-ed and 2x max-pooled.
    The classifier head expects 8192 flattened features, which matches a
    1x32x32 input (512 channels x 4 x 4 after the three 2x poolings).
    """

    def __init__(self):
        super().__init__()
        stem_layers = [
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]
        self.first = nn.Sequential(*stem_layers)

        # Main and skip branches of both stages.
        self.conv1 = self.main_block(128, 256)
        self.skip1 = self.skip_block(128, 256)
        self.conv2 = self.main_block(256, 512)
        self.skip2 = self.skip_block(256, 512)

        # One attention block per branch per stage.
        self.conv_cbam1 = CBAM(256, reduction_ratio=16)
        self.skip_cbam1 = CBAM(256, reduction_ratio=16)
        self.conv_cbam2 = CBAM(512, reduction_ratio=16)
        self.skip_cbam2 = CBAM(512, reduction_ratio=16)

        head_layers = [
            nn.BatchNorm1d(8192),
            nn.Linear(8192, 1024),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.BatchNorm1d(1024),
            nn.Linear(1024, 10),
        ]
        self.fc = nn.Sequential(*head_layers)

    def main_block(self, in_feature, out_feature):
        """Build the main branch: BN -> 3x3 conv -> ReLU -> BN -> 3x3 conv (widening)."""
        layers = [
            nn.BatchNorm2d(in_feature),
            nn.Conv2d(in_feature, in_feature, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(in_feature),
            nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1),
        ]
        return nn.Sequential(*layers)

    def skip_block(self, in_feature, out_feature):
        """Build the skip branch: BN -> 1x1 conv that matches the main branch's width."""
        layers = [
            nn.BatchNorm2d(in_feature),
            nn.Conv2d(in_feature, out_feature, kernel_size=1, stride=1),
        ]
        return nn.Sequential(*layers)

    def _gated_residual_stage(self, x, main, main_attention, skip, skip_attention):
        """Run one stage: gated main + gated skip, then ReLU and a 2x max-pool."""
        main_out = main_attention(main(x))
        skip_out = skip_attention(skip(x))
        return F.max_pool2d(F.relu(main_out + skip_out), kernel_size=2)

    def forward(self, x):
        """Return the 10 class logits for a batch of images."""
        batch_size = x.size(0)
        features = self.first(x)
        features = self._gated_residual_stage(
            features, self.conv1, self.conv_cbam1, self.skip1, self.skip_cbam1
        )
        features = self._gated_residual_stage(
            features, self.conv2, self.conv_cbam2, self.skip2, self.skip_cbam2
        )
        return self.fc(features.view(batch_size, -1))

In [None]:
from torchsummary import summary
# Build the network on the selected device and request its summary for a 1x32x32 input.
# NOTE: this `model` instance is the one the semi-supervised loop further down keeps training.
model = ConvNet()
model.to(device)
summary(model, (1, 32, 32), batch_size=1024)

# 3. Preprocessing

In [None]:
class ImageData(Dataset):
    """Labelled image dataset backed by a DataFrame.

    Columns from index 3 onwards are read as the 784 pixel values of a 28x28
    image, and the ``'digit'`` column is read as the label.

    Args:
        df: Table with a ``'digit'`` column and the pixel columns at index 3+.
        transform: Callable applied to each ``(28, 28, 1)`` uint8 image.
    """

    def __init__(self, df, transform):
        super().__init__()
        self.df = df
        self.image = np.expand_dims(df.iloc[:, 3:].values.reshape(-1, 28, 28), axis=3)
        self.labels = df['digit'].values
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(transformed image, integer label)`` for row ``index``."""
        # `np.int` was removed from NumPy (1.24); cast with a concrete integer
        # type. This also handles labels stored as floats.
        label = np.int64(self.labels[index])
        image = self.transform(self.image[index].astype(np.uint8))
        return image, label

class TestImageData(Dataset):
    """Unlabelled image dataset backed by a DataFrame.

    Columns from index 3 onwards are read as the 784 pixel values of a 28x28
    image; items are the transformed images only.
    """

    def __init__(self, df, transform):
        super().__init__()
        self.df = df
        pixels = df.iloc[:, 3:].values
        # Shape (rows, 28, 28, 1): a trailing single-channel axis is appended.
        self.image = pixels.reshape(-1, 28, 28)[..., np.newaxis]
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the transformed uint8 image of row ``index``."""
        raw = self.image[index].astype(np.uint8)
        return self.transform(raw)

In [None]:
# Hold out 10% of the labelled data for validation; the fixed seed keeps the split reproducible.
train, valid = train_test_split(data, test_size=0.1, random_state=777)

## Augmentation

augmentation은 

1. 이미지 패딩을 5를 준 상태에서 Random Crop을 하여 

   최대한 원본 이미지의 손상이 없는 상태에서 이미지의 이동이 일어나게 하였습니다.

2. Affine 변환

3. Random Erasing

3가지를 랜덤하게 적용하도록 하였습니다.

In [None]:
# Alternative normalisation kept for reference: transforms.Normalize(0.1430, 0.2538)

# Train-time pipeline: upscale to 32x32, then a padded random crop (a shift),
# a random affine of up to 30 degrees and random erasing, each applied at
# random with torchvision's default probability.
randomApply = transforms.RandomApply([transforms.RandomAffine(30)])
data_transf = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((32, 32)),
    transforms.RandomApply([transforms.RandomCrop(size=(32, 32), padding=5)]),
    randomApply,
    transforms.ToTensor(),
    transforms.Normalize(0.5, 0.5),
    transforms.RandomErasing(),
])

# Evaluation-time pipeline: the same resize and normalisation, no augmentation.
test_transf = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(0.5, 0.5),
])

# Only the training loader is shuffled.
train_data = ImageData(df=train, transform=data_transf)
valid_data = ImageData(df=valid, transform=test_transf)
test_data = TestImageData(df=test, transform=test_transf)

train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=512)
test_loader = DataLoader(dataset=test_data, batch_size=512)

# 4. Training

In [None]:
def train_model(model, train_loader, valid_loader):
    """Train ``model`` and return a snapshot of its best-validating weights.

    Runs a fixed 100 epochs of AdamW + cross-entropy training and steps a
    ``ReduceLROnPlateau`` scheduler on the mean validation loss after every
    epoch. Losses and accuracy are printed each epoch, and the validation
    confusion matrix every tenth epoch.

    Args:
        model: Network to train; it is moved to the global ``device`` and
            trained in place.
        train_loader: Iterable of ``(image, label)`` training batches.
        valid_loader: Iterable of ``(image, label)`` validation batches.

    Returns:
        A deep copy of ``model.state_dict()`` taken at the last epoch whose
        validation loss beat the best so far without its accuracy falling
        below the best so far. If no epoch qualified, the weights after the
        final epoch are returned instead of ``None``.
    """
    import copy  # local import: only needed to snapshot the best weights

    EPOCH = 100
    best_model = None
    best_loss = 9876543210
    best_acc = 0

    # Move the model first so the optimizer is built on the parameters that
    # are actually trained.
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters())
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(EPOCH):
        print('='*20 + f' EPOCH - {epoch} ' + '='*20)
        model.train()
        train_loss = 0
        for image, label in train_loader:
            image = image.to(device)
            label = label.to(device)
            optimizer.zero_grad()
            pred = model(image)
            loss = criterion(pred, label)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        print(f'train_loss : {train_loss / len(train_loader)}')

        # validation
        model.eval()
        valid_loss = 0
        preds = []
        labels = []
        with torch.no_grad():
            for image, label in valid_loader:
                image = image.to(device)
                label = label.to(device)
                pred = model(image)
                valid_loss += criterion(pred, label).item()
                # argmax of the logits equals argmax of their softmax.
                preds.extend(pred.argmax(dim=1).tolist())
                labels.extend(label.tolist())
        acc = accuracy_score(labels, preds)
        valid_loss /= len(valid_loader)
        scheduler.step(valid_loss)

        print(f'valid_loss: {valid_loss}, valid_acc: {acc}')
        if epoch % 10 == 0:
            print(confusion_matrix(labels, preds))
        if best_loss > valid_loss and best_acc <= acc:
            best_loss = valid_loss
            best_acc = acc
            # state_dict() hands back references to the live tensors, which
            # later epochs keep updating; copy them to freeze this epoch.
            best_model = copy.deepcopy(model.state_dict())
            print(f'model_saved, best_acc : {acc}')

    if best_model is None:
        # No epoch qualified (e.g. non-finite losses); return usable weights.
        best_model = copy.deepcopy(model.state_dict())
    return best_model

In [None]:
def infer_newdata(model, test_data, test_transf, threshold=0.95):
    """Pseudo-label the rows of ``test_data`` that the model is confident about.

    Every row is classified. Where one class's softmax probability exceeds
    ``threshold``, that class index is written into the row's ``'digit'``
    cell. Other rows keep whatever ``'digit'`` value they already had.

    Args:
        model: Trained network; it is switched to eval mode and must already
            be on the global ``device``.
        test_data: DataFrame in the layout ``TestImageData`` expects. It is
            modified in place.
        test_transf: Transform applied to every image.
        threshold: Probability a class must exceed to be accepted as a label.

    Returns:
        The same ``test_data`` object, with ``'digit'`` updated.
    """
    if 'digit' not in test_data.columns:
        test_data['digit'] = np.nan
    if len(test_data) == 0:
        # Nothing to predict; np.concatenate would fail on an empty list.
        return test_data

    test_dataset = TestImageData(test_data, test_transf)
    test_loader = DataLoader(test_dataset, batch_size=512)
    model.eval()
    preds = []
    with torch.no_grad():
        for image in test_loader:
            pred = model(image.to(device))
            preds.append(F.softmax(pred.to('cpu'), dim=1).numpy())
    preds = np.concatenate(preds)

    # Each row of idx is (row position, class index) of a confident prediction.
    idx = np.argwhere(preds > threshold)
    # The row numbers are positions, not index labels, so write positionally;
    # label-based .loc is only equivalent for a default RangeIndex.
    digit_col = test_data.columns.get_loc('digit')
    test_data.iloc[idx[:, 0], digit_col] = idx[:, 1]
    return test_data

In [None]:
# Ten self-training rounds: train, pseudo-label the test rows the model is
# confident about, then rebuild the training loader from the original train
# split plus every test row that has a label so far.
# NOTE: the same `model` instance keeps training across rounds; it is not re-created.
for n_semi in range(10):
    best_model = train_model(model, train_loader, valid_loader)
    model.load_state_dict(best_model)
    model.to(device)
    model.eval()

    test = infer_newdata(model, test, test_transf)
    pseudo_labeled = test.dropna()
    new_data = pd.concat([train, pseudo_labeled], axis=0)
    new_dataset = ImageData(new_data, data_transf)
    train_loader = DataLoader(new_dataset, batch_size=64, shuffle=True)

    banner = '=' * 20
    print(banner + f' SEMI SUPERVISED START - {n_semi} ' + banner)
    print(f'new train dataset - train: {len(train)} + new: {len(pseudo_labeled)} ')


## RE-training 및 Inference
training 시간이 길어 중간 결과를 저장한 후 이를 이용하여 재훈련시켰습니다.

In [None]:
# Persist the pseudo-labelled test table so the retraining below can start from this file.
test.to_csv(path+'10semi.csv', index=False)

In [None]:
# Reload the saved pseudo-labelled test table and append its rows without
# missing values (i.e. those that received a label) to the full training data.
test = pd.read_csv(path + '10semi.csv')
new_data = pd.concat([data, test.dropna()], axis=0, ignore_index=True)
# Seeded, shuffled 5-fold splitter for the final retraining.
kfold = KFold(n_splits=5, shuffle=True, random_state=777)

In [None]:
# Train one freshly initialised network per fold and keep each fold's best weights.
# NOTE: `models` rebinds the name imported by `from torchvision import models`,
# and `train` / `valid` rebind the earlier hold-out split.
models = []
for train_idx, valid_idx in kfold.split(new_data):
    model = ConvNet()
    train = new_data.iloc[train_idx]
    valid = new_data.iloc[valid_idx]
    train_data = ImageData(df = train, transform = data_transf)
    # Shuffle the training batches every epoch, like the other training
    # loaders in this notebook; this one was created without shuffling.
    train_loader = DataLoader(dataset = train_data, batch_size = 64, shuffle=True)
    valid_data = ImageData(df = valid, transform = test_transf)
    valid_loader = DataLoader(dataset = valid_data, batch_size = 512)
    best = train_model(model, train_loader, valid_loader)
    models.append(best)

In [None]:
# inference 5fold
# For every fold, rebuild the network from its saved weights and collect the
# softmax probabilities of all test images (one list of rows per fold).
total = []
for state in models:
    model = ConvNet()
    model.load_state_dict(state)
    model.to(device)
    model.eval()

    preds = []
    with torch.no_grad():
        for image in test_loader:
            image = image.to(device)
            pred = model(image)
            probabilities = F.softmax(pred.to('cpu'), dim=1)
            preds.extend(probabilities.tolist())
    total.append(preds)


In [None]:
# Average the per-fold probabilities and take the most probable class per test image.
total = np.asarray(total).mean(axis=0).argmax(axis=1)

# 5. 제출

In [None]:
# Load the submission template and fill its 'digit' column with the ensembled predictions.
submission = pd.read_csv(path + 'submission.csv')

submission['digit'] = total

In [None]:
# Write the final submission file next to the input data.
submission.to_csv(path+'20200903_5fold_semi.csv', index=False)

# 6. etc.

뒤늦게 참가해서 Toy프로젝트 하는 느낌으로 참가했는데 우연찮게 올라오게 되었습니다.

이럴 줄 알았으면 코드를 좀 깔끔하게 하는건데. 다음부터는 항상 예쁘게 짜야겠다는 생각이 듭니다.

위 코드에는 제가 의도하지않은 몇가지 오류가 있는데요, 수정하여 코드를 작성한것도 있으나

당시 제출에 사용했던것과 가장 근접한 코드가 남은것이 이 코드라 그대로 올리게 되었습니다.

1. Semi training 시 모델을 새로 생성하지 않고 기존것을 이어서 훈련하고있습니다.
2. 한번 저장한 것을 load하면서 재훈련 하다보니 코드가 수정되어 조금 다른 결과가 나올수 있습니다.
3. seed설정을 안한 부분이 있는데, 결과가 크게 달라지지 않을 것이라 생각하여 중요하게 생각하진 않았습니다.

마지막에 TPU를 사용하는 것으로 변환하면서 개선하다 보니 이전 코드에는 반영이 안 되어 아쉽네요.

개선시킨다면, Semi-training 시작부터 10fold 훈련-예측을 사용하고
이후에 inference 부분에서도 10fold 및 validation을 기존 train 데이터에서만 뽑아 적용하는 것이 더 낫지 않았나 싶습니다.