In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim

from torchvision import transforms
from sklearn.metrics import accuracy_score

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from tqdm.notebook import tqdm
import time
import pandas as pd
import numpy as np
import random
import math
import os
import cv2
from PIL import Image

from sklearn.model_selection import train_test_split

In [2]:
# class cfg: 
#     height = 32
#     width = 100
#     valid_size = 0.1
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     seed = 1212
#     pad_idx = 0
#     d_model = 512
#     n_head = 8
#     ffn_hidden = 2048
#     enc_n_layers = 12
#     dec_n_layers = 6
#     drop_prob = 0.1
#     epochs = 20
#     learning_rate = 0.0001
#     batch_size = 64
#     num_workers = 4 # 본인의 GPU, CPU 환경에 맞게 설정

class cfg: 
    height = 64
    width = 128
    valid_size = 0.2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    seed = 1212
    pad_idx = 0
    d_model = 512
    n_head = 8
    ffn_hidden = 2048
    enc_n_layers = 
    dec_n_layers = 2
    max_len = 8
    drop_prob = 0.1
    epochs = 20
    learning_rate = 0.0001
    batch_size = 64
    num_workers = 4 # 본인의 GPU, CPU 환경에 맞게 설정
    print_step = 50
    pt_path = 'model_best_64_128_4epoch.pt'
    
# # small satrn
# class cfg: 
#     height = 64
#     width = 128
#     valid_size = 0.2
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     seed = 1212
#     pad_idx = 0
#     d_model = 256
#     n_head = 8
#     ffn_hidden = 1024
#     enc_n_layers = 9
#     dec_n_layers = 3
#     drop_prob = 0.1
#     epochs = 30
#     learning_rate = 0.0001
#     batch_size = 32
#     print_step = 100
#     pt_path = None

In [3]:
# 시드 고정 
def seed_everything(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)
    
seed_everything(cfg.seed)

In [4]:
# batch norm relu 둘다 적용dropout은 없음

class ShallowCNN(nn.Module):
    def __init__(self,input_channels=3,hidden_size=512):
        super().__init__()
        
        self.conv1 = nn.Conv2d(
            in_channels = input_channels,
            out_channels = hidden_size//2,
            kernel_size = 3,
            padding = 1,
            bias=False
        )
        self.bn1 = nn.BatchNorm2d(hidden_size//2)
        
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=2,stride=2)
        
        self.conv2 = nn.Conv2d(
            in_channels = hidden_size//2,
            out_channels = hidden_size,
            kernel_size = 3,
            padding = 1,
            bias=False
        )
        self.bn2 = nn.BatchNorm2d(hidden_size)
    
    def forward(self,input):
        out = self.conv1(input)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.maxpool(out)
        
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.maxpool(out)
        
        return out

In [5]:
# a = torch.Tensor(1,3,128,128)
# model = ShallowCNN()
# print(model(a).size())

In [6]:
class Adaptive2DPositionEncoder(nn.Module):
    def __init__(self, in_channels, max_h=200, max_w=200, dropout=0.1):
        super(Adaptive2DPositionEncoder, self).__init__()

        h_position_encoder = self.generate_encoder(in_channels, max_h)
        h_position_encoder = h_position_encoder.transpose(0, 1).view(1, in_channels, max_h, 1)

        w_position_encoder = self.generate_encoder(in_channels, max_w)
        w_position_encoder = w_position_encoder.transpose(0, 1).view(1, in_channels, 1, max_w)

        self.register_buffer('h_position_encoder', h_position_encoder)
        self.register_buffer('w_position_encoder', w_position_encoder)

        self.h_scale = self.scale_factor_generate(in_channels)
        self.w_scale = self.scale_factor_generate(in_channels)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(p=dropout)
        
    def generate_encoder(self,in_channels, max_len):
        pos = torch.arange(max_len).float().unsqueeze(1)

        i = torch.arange(in_channels).float().unsqueeze(0)
        angle_rates = 1 / torch.pow(10000, (2 * (i//2)) / in_channels)

        position_encoder = pos * angle_rates
        position_encoder[:, 0::2] = torch.sin(position_encoder[:, 0::2])
        position_encoder[:, 1::2] = torch.cos(position_encoder[:, 1::2])

        return position_encoder

    def scale_factor_generate(self, in_channels):
        scale_factor = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, in_channels, kernel_size=1),
            nn.Sigmoid()
        )

        return scale_factor

    def forward(self, x):
        b, c, h, w = x.size()

        avg_pool = self.pool(x)

        h_pos_encoding = self.h_scale(avg_pool) * self.h_position_encoder[:, :, :h, :]
        w_pos_encoding = self.w_scale(avg_pool) * self.w_position_encoder[:, :, :, :w]

        out = x + h_pos_encoding + w_pos_encoding

        out = self.dropout(out)

        return out

In [7]:
class EncoderEmbedding(nn.Module):
    def __init__(self,d_model,drop_prob,device):
        super(EncoderEmbedding,self).__init__()
        self.device = device
        self.drop_out = nn.Dropout(p=drop_prob)
        
        self.shallow_cnn = ShallowCNN(hidden_size=d_model)
        self.pos_emb = Adaptive2DPositionEncoder(d_model, max_h=100, max_w=150, dropout=drop_prob)
        
        
    def forward(self,input): # [Batch,channel,h,w]
        cnn_out = self.shallow_cnn(input) # [Batch,d_model,h//4,w//4]
        pos_out = self.pos_emb(cnn_out)
        return pos_out

In [8]:
# a = torch.Tensor(10,3,112,112).to(cfg.device)
# model = EncoderEmbedding(512,0.1,cfg.device).to(cfg.device)
# print(model(a).size())

In [9]:
class ScaleDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, inf_value=1e12): 
        # q,k,v -> [batch_size, head, length, d_tensor]  (d_tensor = d_model // n_head)
        # mask -> [batch_size , 1 , len_query , len_key]

        batch_size, head, length, d_tensor = k.size()

        k_t = k.transpose(2, 3)  # transpose # (B,n_head,d,L_k)
        score = (q @ k_t) / math.sqrt(d_tensor) # (B,n_head,L_q,L_k)

        if mask is not None: # (B, 1, L, L)
            score = score.masked_fill(mask == False, (-1)*inf_value) 
            # softmax 적용시 e^(-inf) = 0이 되므로 0대신 -inf를 넣어줌 0을 넣으면 e^0 = 1 로 1이 나오게됨

        score = self.softmax(score) # (B,n_head,L_q,L_k)
 
        v = score @ v # (B,n_head,L_q,d_tensor)
        # @ = matmul , mul은 원소별 곱셈 

        return v, score  # v(attention output) -> (B,n_head,L_q,d_tensor) , score -> (B,n_head,L,L)

In [10]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None): 
        # q,k,v -> [Batch,Length,d_model] , mask -> [batch_size , 1 , len_query , len_key]
        
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v) # [batch_size, length, d_model]
        # 합쳐서 연산하나 나눠서 연산하나 Linear 적용하는 부분은 똑같기때문에 한번에 연산함.
        
        q, k, v = self.split(q), self.split(k), self.split(v) 
        # [batch_size, head, length, d_tensor]  (d_tensor = d_model // self.n_head)

        out, attention = self.attention(q, k, v, mask=mask) 
        # v(attention output) -> (B,n_head,L_q,d_tensor) , score -> (B,n_head,L,L)
        # v-> out , score -> attention

        out = self.concat(out) # (batch_size, length, d_model)
        out = self.w_concat(out) # (batch_size, length, d_model)

        return out # (batch_size, length, d_model)

    def split(self, tensor): # tensor -> [batch_size, length, d_model]

        batch_size, length, d_model = tensor.size()

        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)

        return tensor # tensor -> [batch_size, head, length, d_tensor]

    def concat(self, tensor): # tensor-> [batch_size, head, length, d_tensor]

        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor

        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor # tensor -> [batch_size, length, d_model]

In [11]:
class ConvolutionFeedForward(nn.Module):
    def __init__(self, d_model, ffn_hidden, drop_prob=0.1):
        super(ConvolutionFeedForward, self).__init__()
        
        self.conv1 = nn.Conv2d(
            in_channels = d_model,
            out_channels = ffn_hidden,
            kernel_size = 1,
            padding = 0,
            bias=False
        )
        self.bn1 = nn.BatchNorm2d(ffn_hidden)
        
        self.depthwise = nn.Conv2d(
            in_channels = ffn_hidden,
            out_channels = ffn_hidden,
            kernel_size = 3,
            padding = 1,
            bias=False,
            groups = ffn_hidden
        )
        
        self.conv2 = nn.Conv2d(
            in_channels = ffn_hidden,
            out_channels = d_model,
            kernel_size = 1,
            padding = 0,
            bias=False
        )
        
        self.bn2 = nn.BatchNorm2d(d_model)
                
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x): 
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.depthwise(x)
        x = self.bn1(x)
        x = self.relu(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        
        return x

In [12]:
# a = torch.Tensor(10,3,112,112)
# model = ConvolutionFeedForward(3,12,0.1)
# print(model(a).size())

In [13]:
class EncoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.cff = ConvolutionFeedForward(d_model=d_model, ffn_hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x): # x : [b, h x w, c] 
        # 1. compute self attention
        _x = x
        x = self.attention(q=x, k=x, v=x)
        
        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)
        
        
        # 3. positionwise feed forward network
        _x = x # _x : [b, h x w, c] 
        
        b,l,c = x.size()
        
        x = x.transpose(1,2).reshape(b, c, cfg.height//4, cfg.width//4) # [Batch, channel(d_model),h,w]
        x = self.cff(x)
      
        # 4. add and norm
        x = self.dropout2(x)
        
        b, c, h, w = x.size()
        x = x.reshape(b, c, h * w).transpose(1, 2)  # [b, h x w, c]
        
        x = self.norm2(x + _x)
        return x

In [14]:
class Encoder(nn.Module):
    
    def __init__(self, d_model, ffn_hidden, n_head, n_layers, drop_prob,device):
        super().__init__()
        self.emb = EncoderEmbedding(d_model=d_model,
                                    drop_prob=drop_prob,
                                    device=device
                                    )

        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

    def forward(self, x): # x: (batch,chnnel,h,w)
        x = self.emb(x) # x: (batch,d_model,h,w)
        
        b, c, h, w = x.size()
        x = x.reshape(b, c, h * w).transpose(1, 2)  # [b, h x w, c]
        
        for layer in self.layers:
            x = layer(x)

        return x

In [15]:
# a = torch.Tensor(10,3,112,112).to(cfg.device)
# model = Encoder(512,4,4,0.1,cfg.device).to(cfg.device)
# print(model(a).size())

In [16]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, device): # ex. d_model = 512, max_len = 100
        # d_model : embedding dimension
        # max_len : 전체 데이터 문장에 대한 최대길이
        
        super(PositionalEncoding, self).__init__()

        self.encoding = torch.zeros(max_len, d_model, device=device) # self.encoding -> (max_len , d_model)
        self.encoding.requires_grad = False  # we don't need to compute gradient (학습할 필요가 없는 값)

        pos = torch.arange(0, max_len, device=device) # pos -> (max_len) # ex. pos = [0,1,2,3,...,99]
        pos = pos.float().unsqueeze(dim=1) # pos -> (max_len, 1) 

        _2i = torch.arange(0, d_model, step=2, device=device).float() # _2i -> (d_model//2) # ex. _2i = [0,2,4,...,510]

        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
        # self.encoding[i,j] -> j가 짝수 : torch.sin(i/(10000)**(j/512))
        #                    -> j가 홀수 : torch.cos(i/(10000)**((j-1)/512))

        # self.encoding 은 i번째 단어에 대해 i번째 단어라는것을 구분짓기 위한 encoding 값을 제공함
        
    def forward(self, x): # x -> (Batch,Length)

        batch_size, seq_len = x.size() # seq_len != max_len (seq_len : 이번 배치에서의 seq_len)

        # seq_len이 배치내의 문장 최대 길이이므로 seq_len까지 단어 순서를 구분해주기 위한 encoding 값을 가져감
        return self.encoding[:seq_len, :] 

In [17]:
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, drop_prob,device): # max_len은 전체 데이터에 대한 max_len
        super(TransformerEmbedding, self).__init__()
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, max_len,device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x): # x -> (Batch,Length) (Length : Batch내의 최대 문장 길이)
        tok_emb = self.tok_emb(x) # tok_emb -> (Batch,Length,d_model)
        pos_emb = self.pos_emb(x) # pos_emb -> (Length,d_model)
        return self.drop_out(tok_emb + pos_emb) # (Batch,Length,d_model)  # pos_emb가 broadcasting 됨

In [18]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [19]:
class DecoderLayer(nn.Module):

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_mask):    
        # 1. compute self attention
        _x = dec
        x = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
        
        # 2. add and norm
        x = self.dropout1(x)
        x = self.norm1(x + _x)

        if enc is not None:
            # 3. compute encoder - decoder attention
            _x = x
            x = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
            
            # 4. add and norm
            x = self.dropout2(x)
            x = self.norm2(x + _x)

        # 5. positionwise feed forward network
        _x = x
        x = self.ffn(x)
        
        # 6. add and norm
        x = self.dropout3(x)
        x = self.norm3(x + _x)
        return x

In [20]:
class Decoder(nn.Module):
    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(d_model=d_model,
                                        drop_prob=drop_prob,
                                        max_len=max_len,
                                        vocab_size=dec_voc_size,
                                        device=device
                                        )

        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model,
                                                  ffn_hidden=ffn_hidden,
                                                  n_head=n_head,
                                                  drop_prob=drop_prob)
                                     for _ in range(n_layers)])

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, src, trg_mask, src_mask):
        trg = self.emb(trg)

        for layer in self.layers:
            trg = layer(trg, src, trg_mask, src_mask)

        # pass to LM head
        output = self.linear(trg)
        return output

In [21]:
class SATRN(nn.Module):

    def __init__(self, pad_idx, dec_voc_size, d_model, n_head, max_len,
                ffn_hidden, enc_n_layers, dec_n_layers , drop_prob,device):
        super().__init__()
        self.pad_idx = pad_idx # 길이 맞춰주기 위한 패딩 보통 1 사용
        self.device = device
        self.encoder = Encoder(d_model=d_model,
                            ffn_hidden=ffn_hidden,
                            n_head=n_head,
                            n_layers=enc_n_layers,
                            drop_prob=drop_prob,
                            device=device
                            )
        self.decoder = Decoder(dec_voc_size=dec_voc_size,
                            max_len=max_len,
                            d_model=d_model,
                            ffn_hidden=ffn_hidden,
                            n_head=n_head,
                            n_layers=dec_n_layers,
                            drop_prob=drop_prob,
                            device=device
                            )
    def forward(self, src, trg): # src -> [Batch,channel,h,w] ,trg -> [Batch,Length]        
        src_trg_mask = self.make_src_trg_mask(trg, src) # [batch_size , 1 , len_trg , len_src] 
        
        trg_mask = self.make_pad_mask(trg, trg) * self.make_no_peak_mask(trg, trg) # [batch_size , 1 , len_trg , len_trg]
        # make_pad_mask(trg, trg) -> [batch_size , 1 , len_trg , len_trg]
        # make_no_peak_mask(trg, trg) -> [len_trg , len_trg]  (broadcasting 적용)
        
        enc_src = self.encoder(src) # enc_src -> (batch_size, length, d_model)
        output = self.decoder(trg, enc_src, trg_mask, src_trg_mask)
        return output
    
    def make_src_trg_mask(self,q,k): # src -> [Batch,channel,h,w] ,trg -> [Batch,Length]
        len_q = q.size(1)
        len_k = cfg.height//4 * cfg.width//4
        
        # batch_size x 1 x len_q x 1
        q = q.ne(self.pad_idx).unsqueeze(1).unsqueeze(3)
        # batch_size x 1 x len_q x len_k
        q = q.repeat(1, 1, 1, len_k)
        
        # k 이미지는 모두 True일것이므로 생략
        return q

    def make_pad_mask(self, q, k): # q,k -> [Batch,Length]
        len_q, len_k = q.size(1), k.size(1)

        # batch_size x 1 x 1 x len_k (unsqueeze는 강제로 그 차원에 1차원을 넣어줌)
        k = k.ne(self.pad_idx).unsqueeze(1).unsqueeze(2)
        # batch_size x 1 x len_q x len_k
        k = k.repeat(1, 1, len_q, 1) # bx1 , 1x1 , 1xlen_1 , len_k x 1 차원이 되는것임 (repeat)

        # batch_size x 1 x len_q x 1
        q = q.ne(self.pad_idx).unsqueeze(1).unsqueeze(3)
        # batch_size x 1 x len_q x len_k
        q = q.repeat(1, 1, 1, len_k)

        mask = k & q # 둘다 True일경우 True 반환 나머지는 모두 False
        return mask

    def make_no_peak_mask(self, q, k): # q,k -> [Batch,Length]
        len_q, len_k = q.size(1), k.size(1)

        # tril 은 대각선 윗부분을 0으로 만들어주는것
        # len_q x len_k
        mask = torch.tril(torch.ones(len_q, len_k)).type(torch.BoolTensor).to(self.device)

        return mask

In [22]:
# a = torch.Tensor(1,3,112,112).to(cfg.device)
# b = torch.LongTensor([[2,4,5,6,1]]).to(cfg.device)
# model = SATRN(1,100,512,4,10,512,4,0.1,cfg.device).to(cfg.device)
# # pad_idx, dec_voc_size, d_model, n_head, max_len, ffn_hidden, n_layers, drop_prob,device
# result = model(a,b)

In [23]:
# print(result.size()) # dec_vocab size 가 100이므로 100 dimension으로 나옴

# 데이터 load

In [24]:
df = pd.read_csv('./train.csv')

In [25]:
df['label_split']=df['label'].apply(lambda x : list(x))

In [26]:
df

Unnamed: 0,id,img_path,label,label_split
0,TRAIN_00000,./train/TRAIN_00000.png,빨간색,"[빨, 간, 색]"
1,TRAIN_00001,./train/TRAIN_00001.png,머,[머]
2,TRAIN_00002,./train/TRAIN_00002.png,차차,"[차, 차]"
3,TRAIN_00003,./train/TRAIN_00003.png,써,[써]
4,TRAIN_00004,./train/TRAIN_00004.png,놓치다,"[놓, 치, 다]"
...,...,...,...,...
76883,TRAIN_76883,./train/TRAIN_76883.png,회,[회]
76884,TRAIN_76884,./train/TRAIN_76884.png,겪다,"[겪, 다]"
76885,TRAIN_76885,./train/TRAIN_76885.png,벨트,"[벨, 트]"
76886,TRAIN_76886,./train/TRAIN_76886.png,톼,[톼]


In [27]:
# 제공된 학습데이터 중 1글자 샘플들의 단어사전이 학습/테스트 데이터의 모든 글자를 담고 있으므로 학습 데이터로 우선 배치
df['len'] = df['label'].str.len()
train_v1 = df[df['len']==1]

In [28]:
cfg.max_len = max(df['len'])+2 # 앞뒤 패드
print(cfg.max_len)

8


In [29]:
# 제공된 학습데이터 중 2글자 이상의 샘플들에 대해서 단어길이를 고려하여 Train (80%) / Validation (20%) 분할
df = df[df['len']>1]
train_v2, val, _, _ = train_test_split(df, df['len'], test_size=cfg.valid_size, random_state=cfg.seed)

In [30]:
# 학습 데이터로 우선 배치한 1글자 샘플들과 분할된 2글자 이상의 학습 샘플을 concat하여 최종 학습 데이터로 사용
train = pd.concat([train_v1, train_v2])
print(len(train), len(val))

66251 10637


# Get Vocabulary

In [31]:
# 학습 데이터로부터 단어 사전(Vocabulary) 구축
train_gt = [gt for gt in train['label']]
train_gt = "".join(train_gt)
letters = sorted(list(set(list(train_gt))))
print(len(letters))

2349


In [32]:
print(letters[:10])

['가', '각', '간', '갇', '갈', '갉', '갊', '감', '갑', '값']


In [33]:
vocabulary = ['pad']+["start"]+['end'] + letters
print(len(vocabulary))
idx2char = {k:v for k,v in enumerate(vocabulary, start=0)}
char2idx = {v:k for k,v in idx2char.items()}

cfg.dec_voc_size = len(vocabulary)


2352


In [34]:
print(idx2char[0])
print(idx2char[1])
print(idx2char[2])
print(idx2char[3])

pad
start
end
가


In [35]:
train['label_data']=train['label_split'].apply(lambda x :[1]+[char2idx[item] for item in x]+[2])
val['label_data']=val['label_split'].apply(lambda x :[1]+[char2idx[item] for item in x]+[2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val['label_data']=val['label_split'].apply(lambda x :[1]+[char2idx[item] for item in x]+[2])


In [36]:
train

Unnamed: 0,id,img_path,label,label_split,len,label_data
1,TRAIN_00001,./train/TRAIN_00001.png,머,[머],1,"[1, 804, 2]"
3,TRAIN_00003,./train/TRAIN_00003.png,써,[써],1,"[1, 1288, 2]"
7,TRAIN_00007,./train/TRAIN_00007.png,빈,[빈],1,"[1, 1024, 2]"
10,TRAIN_00010,./train/TRAIN_00010.png,윷,[윷],1,"[1, 1528, 2]"
27,TRAIN_00027,./train/TRAIN_00027.png,훵,[훵],1,"[1, 2305, 2]"
...,...,...,...,...,...,...
25282,TRAIN_25282,./train/TRAIN_25282.png,감사하다,"[감, 사, 하, 다]",4,"[1, 10, 1106, 2210, 435, 2]"
27512,TRAIN_27512,./train/TRAIN_27512.png,채우다,"[채, 우, 다]",3,"[1, 1791, 1487, 435, 2]"
34748,TRAIN_34748,./train/TRAIN_34748.png,상식,"[상, 식]",2,"[1, 1118, 1259, 2]"
49860,TRAIN_49860,./train/TRAIN_49860.png,나뭇가지,"[나, 뭇, 가, 지]",4,"[1, 294, 868, 3, 1685, 2]"


In [37]:
val

Unnamed: 0,id,img_path,label,label_split,len,label_data
33672,TRAIN_33672,./train/TRAIN_33672.png,기본적,"[기, 본, 적]",3,"[1, 162, 968, 1596, 2]"
14674,TRAIN_14674,./train/TRAIN_14674.png,다루다,"[다, 루, 다]",3,"[1, 435, 730, 435, 2]"
8767,TRAIN_08767,./train/TRAIN_08767.png,크다,"[크, 다]",2,"[1, 1984, 435, 2]"
22866,TRAIN_22866,./train/TRAIN_22866.png,전기밥솥,"[전, 기, 밥, 솥]",4,"[1, 1597, 162, 915, 1189, 2]"
34909,TRAIN_34909,./train/TRAIN_34909.png,간호사,"[간, 호, 사]",3,"[1, 5, 2260, 1106, 2]"
...,...,...,...,...,...,...
41187,TRAIN_41187,./train/TRAIN_41187.png,테이블,"[테, 이, 블]",3,"[1, 2030, 1549, 1018, 2]"
34061,TRAIN_34061,./train/TRAIN_34061.png,간편하다,"[간, 편, 하, 다]",4,"[1, 5, 2146, 2210, 435, 2]"
34531,TRAIN_34531,./train/TRAIN_34531.png,조미료,"[조, 미, 료]",3,"[1, 1621, 891, 724, 2]"
55234,TRAIN_55234,./train/TRAIN_55234.png,아니,"[아, 니]",2,"[1, 1355, 425, 2]"


# CustomDataset

In [38]:
class CustomDataset(Dataset):
    def __init__(self, img_path_list, label_list, train_mode=True):
        self.img_path_list = img_path_list
        self.label_list = label_list
        self.train_mode = train_mode
        
    def __len__(self):
        return len(self.img_path_list)
    
    def __getitem__(self, index):
        image = Image.open(self.img_path_list[index]).convert('RGB')
        
        if self.train_mode:
            image = self.train_transform(image)
        else:
            image = self.test_transform(image)
            
        if self.label_list is not None:
            text = self.label_list[index]
            return image, torch.LongTensor(text)
        else:
            return image
    
    # Image Augmentation
    def train_transform(self, image):
        transform_ops = A.Compose([
            A.Resize(cfg.height,cfg.width,p=1.0),
            A.RandomBrightnessContrast(brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), p=0.5),
            A.Rotate(limit=(-15,15), p=0.5, border_mode=cv2.BORDER_REPLICATE),
            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225),max_pixel_value=255.0, p=1.0),
            ToTensorV2(p=1.0)
        ], p=1.0)
        return transform_ops(image=np.array(image))['image']
    
    def test_transform(self, image):
        transform_ops = A.Compose([
            A.Resize(cfg.height,cfg.width,p=1.0),
            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225),max_pixel_value=255.0, p=1.0),
            ToTensorV2(p=1.0)
        ], p=1.0)
        return transform_ops(image=np.array(image))['image']

In [39]:
def collate_fn(batch):
    img_batch, tgt_batch = [], []
    for img, tgt_sample in batch:
        img_batch.append(img)
        tgt_batch.append(tgt_sample)

    tgt_batch = pad_sequence(tgt_batch,batch_first=True, padding_value=0)
    return torch.stack(img_batch), tgt_batch

In [40]:
train_dataset = CustomDataset(train['img_path'].values, train['label_data'].values, True)
train_loader = DataLoader(train_dataset, batch_size = cfg.batch_size, shuffle=True, num_workers=cfg.num_workers,collate_fn=collate_fn)

val_dataset = CustomDataset(val['img_path'].values, val['label_data'].values, False)
val_loader = DataLoader(val_dataset, batch_size = cfg.batch_size, shuffle=True, num_workers=cfg.num_workers,collate_fn=collate_fn)

val_test_dataset = CustomDataset(val['img_path'].values, None, False)
val_test_loader = DataLoader(val_test_dataset, batch_size = cfg.batch_size, shuffle=False, num_workers=cfg.num_workers)

In [41]:
image_batch, text_batch = next(iter(train_loader))
print(image_batch, text_batch)


tensor([[[[2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          ...,
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489],
          [2.2489, 2.2489, 2.2489,  ..., 2.2489, 2.2489, 2.2489]],

         [[2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          ...,
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286],
          [2.4286, 2.4286, 2.4286,  ..., 2.4286, 2.4286, 2.4286]],

         [[2.6400, 2.6400, 2.6400,  ..., 2.6400, 2.6400, 2.6400],
          [2.6400, 2.6400, 2.6400,  ..., 2.6400, 2.6400, 2.6400],
          [2.6400, 2.6400, 2.6400,  ..., 2

In [42]:
print(image_batch.size(), text_batch.size())

torch.Size([64, 3, 64, 128]) torch.Size([64, 7])


In [43]:
model = SATRN(
    cfg.pad_idx,
    cfg.dec_voc_size,
    cfg.d_model,
    cfg.n_head,
    cfg.max_len,
    cfg.ffn_hidden,
    cfg.enc_n_layers,
    cfg.dec_n_layers,
    cfg.drop_prob,
    cfg.device
).to(cfg.device)

# Adam optimizer로 학습 최적화
optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)

# 뒷 부분의 패딩(padding)에 대해서는 값 무시
criterion = nn.CrossEntropyLoss(ignore_index = cfg.pad_idx)

  angle_rates = 1 / torch.pow(10000, (2 * (i//2)) / in_channels)


In [44]:
# def initialize_weights(m):
#     # convolution kernel의 weight를 He initialization을 적용한다.
#     if isinstance(m, nn.Conv2d):
#         nn.init.kaiming_uniform_(m.weight.data,nonlinearity='relu')

#         # bias는 상수 0으로 초기화 한다.
#         if m.bias is not None:
#             nn.init.constant_(m.bias, 0)

#     elif isinstance(m, nn.BatchNorm2d):
#         nn.init.constant_(m.weight, 1)
#         nn.init.constant_(m.bias, 0)

#     elif isinstance(m, nn.Linear):
#         nn.init.kaiming_uniform_(m.weight.data,nonlinearity='relu')

#         # bias는 상수 0으로 초기화 한다.
#         if m.bias is not None:
#             nn.init.constant_(m.bias, 0)

# model.apply(initialize_weights)

In [45]:
if cfg.pt_path is not None:
    model.load_state_dict(torch.load(cfg.pt_path))

In [46]:
def train(model, iterator, optimizer, criterion, clip,device):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(tqdm(iterator)):
        image = batch[0].to(device) # Batch,channel,height,width
        text = batch[1].to(device) # Batch,Lenght

        optimizer.zero_grad()
        
        # 출력 단어의 마지막 인덱스는 제외 
        output = model(image, text[:, :-1]) # output -> [Batch, trg_len - 1, output_dim]
        output_reshape = output.contiguous().view(-1, output.shape[-1]) # output_reshape -> [Batch*(trg_len-1), output_dim]
        text = text[:, 1:].contiguous().view(-1) # trg -> [Batch*(trg_len-1)]

        loss = criterion(output_reshape, text)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        if i%cfg.print_step==0:
            print(f'step : {i/cfg.print_step+1}, loss : {epoch_loss/(i+1)}')
    return epoch_loss / len(iterator)

In [47]:
def evaluate(model, iterator, criterion,device):
    model.eval() # 평가 모드
    epoch_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(tqdm(iterator)):
            image = batch[0].to(device) # Batch,channel,height,width
            text = batch[1].to(device) # Batch,Lenght

            output = model(image, text[:,:-1]) # output -> [Batch, trg_len - 1, output_dim]

            output_reshape = output.contiguous().view(-1, output.shape[-1]) # output_reshape -> [Batch*(trg_len-1), output_dim]
            
            # start token 제외
            text = text[:,1:].contiguous().view(-1) # text -> [Batch*(trg_len-1)]

            # 모델의 출력 결과와 타겟 문장을 비교하여 손실 계산
            loss = criterion(output_reshape, text)

            # 전체 손실 값 계산
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [61]:
def test(model, iterator,device):
    model.eval() # 평가 모드

    with torch.no_grad():
        answer_list=[]
        for i, batch in enumerate(tqdm(iterator)):
            image = batch.to(device) # Batch,channel,height,width
            enc_src = model.encoder(image)
            
            trg_tensor = torch.ones(image.size()[0], 1).type(torch.LongTensor).to(device) # Batch,1
            for i in range(cfg.max_len): # 출력하고 싶은 문장의 최대길이
                
                src_trg_mask = model.make_src_trg_mask(trg_tensor, image) # [batch_size , 1 , len_trg , len_src] 
                trg_mask = model.make_pad_mask(trg_tensor, trg_tensor) * model.make_no_peak_mask(trg_tensor, trg_tensor) 
                
                output = model.decoder(trg_tensor, enc_src, trg_mask, src_trg_mask)
            
                # 출력 문장에서 가장 마지막 단어만 사용 # trg가 한단어라면 한단어가 결과로 나옴. 4단어라면 4단어 결과가나옴.
                pred_token = output.argmax(-1)[:,-1].unsqueeze(1) # output -> [Batch, trg_len - 1, output_dim] -> [Batch, trg_len - 1]
                trg_tensor = torch.cat([trg_tensor,pred_token],dim=-1)
                
            
            trg_tensor = trg_tensor.tolist() # [Batch,max_len]
            # 각 출력 단어 인덱스를 실제 단어로 변환
            word_list=[]
            for word in trg_tensor:
                spel_list=[]
                for idx in word[1:]:
                    if idx ==0 or idx == 1 or idx ==2:
                        break
                    char = idx2char[idx]
                    spel_list.append(char)
                word_list.append("".join(spel_list))
            
            answer_list+=word_list
    return answer_list

In [49]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [50]:
cfg.clip = 5
best_accuracy = 0
best_valid_loss = float('inf')

for epoch in range(cfg.epochs):
    start_time = time.time() # 시작 시간 기록

    train_loss = train(model, train_loader, optimizer, criterion, cfg.clip, cfg.device)
    valid_loss = evaluate(model, val_loader, criterion, cfg.device)

    answer_list = test(model, val_test_loader, cfg.device)
    accuracy = accuracy_score(val['label'],answer_list)

    if best_accuracy < accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'model_best_accuracy.pt')
    
    end_time = time.time() # 종료 시간 기록
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model_best_loss.pt')

    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f}')
    print(f'\tAccuracy : {accuracy:.5f}')

  0%|          | 0/1036 [00:00<?, ?it/s]

step : 1.0, loss : 0.33630409836769104
step : 2.0, loss : 0.22560976065841376
step : 3.0, loss : 0.22768862206156892
step : 4.0, loss : 0.22327701105976736
step : 5.0, loss : 0.22354589087601326
step : 6.0, loss : 0.22409131464017815
step : 7.0, loss : 0.22371877880884564
step : 8.0, loss : 0.2242299911600572
step : 9.0, loss : 0.22352343816263717
step : 10.0, loss : 0.2225654732824165
step : 11.0, loss : 0.22335139220346234
step : 12.0, loss : 0.22216152229402114
step : 13.0, loss : 0.22164207590300708
step : 14.0, loss : 0.2210340757500924
step : 15.0, loss : 0.2184112077092648
step : 16.0, loss : 0.2170760625807169
step : 17.0, loss : 0.2150473641731617
step : 18.0, loss : 0.21350311538692646
step : 19.0, loss : 0.21204380980474702
step : 20.0, loss : 0.2103267096993675
step : 21.0, loss : 0.20838723956913383


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

Epoch: 01 | Time: 14m 19s
	Train Loss: 0.207
	Validation Loss: 0.164
	Accuracy : 0.87797


  0%|          | 0/1036 [00:00<?, ?it/s]

step : 1.0, loss : 0.1395053118467331
step : 2.0, loss : 0.16415541049312143
step : 3.0, loss : 0.15296407545557117
step : 4.0, loss : 0.14816731530309513
step : 5.0, loss : 0.14933749117809741
step : 6.0, loss : 0.14803126688853677
step : 7.0, loss : 0.1478775703530771
step : 8.0, loss : 0.14696268410424562
step : 9.0, loss : 0.14734440198414345
step : 10.0, loss : 0.14649086569471
step : 11.0, loss : 0.1456904297029068
step : 12.0, loss : 0.14641548185431805
step : 13.0, loss : 0.14545475244026215
step : 14.0, loss : 0.14418171733594892
step : 15.0, loss : 0.14323623174968866
step : 16.0, loss : 0.1422472814576683
step : 17.0, loss : 0.14089160105132134
step : 18.0, loss : 0.13984216195470997
step : 19.0, loss : 0.1383902626143113
step : 20.0, loss : 0.13769766346372264
step : 21.0, loss : 0.13741978742814923


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

Epoch: 02 | Time: 13m 28s
	Train Loss: 0.137
	Validation Loss: 0.087
	Accuracy : 0.92771


  0%|          | 0/1036 [00:00<?, ?it/s]

step : 1.0, loss : 0.06668488681316376
step : 2.0, loss : 0.10283480277832817
step : 3.0, loss : 0.09912256092423259
step : 4.0, loss : 0.0987492243185738
step : 5.0, loss : 0.0996848159094355
step : 6.0, loss : 0.09998396318034822
step : 7.0, loss : 0.09820254352500668
step : 8.0, loss : 0.09566729485012188
step : 9.0, loss : 0.09558554627279986
step : 10.0, loss : 0.09445826638233899
step : 11.0, loss : 0.09399773407273902
step : 12.0, loss : 0.09400071967021308
step : 13.0, loss : 0.09400712424724947
step : 14.0, loss : 0.09352857653858475
step : 15.0, loss : 0.09310678277860865
step : 16.0, loss : 0.09230203824130181
step : 17.0, loss : 0.09128141452496716
step : 18.0, loss : 0.0907470141105205
step : 19.0, loss : 0.09037951876788702
step : 20.0, loss : 0.08995923183889733
step : 21.0, loss : 0.08923604565446491


  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

Epoch: 03 | Time: 13m 36s
	Train Loss: 0.089
	Validation Loss: 0.116
	Accuracy : 0.90627


  0%|          | 0/1036 [00:00<?, ?it/s]

step : 1.0, loss : 0.08555316179990768


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), 'model_last_epoch.pt')

In [62]:
model.load_state_dict(torch.load('./model_best_accuracy.pt'))

test_df= pd.read_csv('./test.csv')

test_dataset = CustomDataset(test_df['img_path'].values, None, False)
test_loader = DataLoader(test_dataset, batch_size = cfg.batch_size, shuffle=False, num_workers=cfg.num_workers)

In [63]:
answer_list = test(model, test_loader, cfg.device)

  0%|          | 0/1159 [00:00<?, ?it/s]

In [64]:
print(answer_list)

['남망', '상당', '받아들이다', '바구니', '살', '빼놓다', '인식하다', '센티', '소풍', '광주', '나나', '무험', '도도', '술순', '괴로워하다', '카', '합치다', '다정하다', '혼자', '가능하다', '호주', '발전되다', '피우다', '스웨터', '시작되다', '겨울', '예선', '한국말', '세워지다', '비방방', '좋다', '남대문장', '보수적', '사진기', '내리다', '평평', '앉어다', '특별히', '우선', '대답', '학생', '여섯', '본질', '현대', '속하다', '지저런하다', '불다', '아래', '걸어오다', '선원', '호주', '일', '골프장', '가위', '기차', '저곳', '기침', '위쪽', '불안', '바람하다', '작아지다', '학비', '양말', '피곤', '말씀', '아무튼', '경계', '이모', '역사상', '공개하다', '영화간', '피자', '무컴', '예매', '있다', '사십', '선물', '아직', '결심하다', '블라우스', '이자', '리다', '지우개', '소망', '예술하다', '울산', '끌다', '마마', '깨끗해지다', '소규모', '나아지다', '밀리다', '예', '눈', '내나다', '경우', '마지만', '위', '소리', '인사', '않다', '왜', '올래', '불가능하다', '체정하다', '땅딱', '돌아오다', '아래쪽', '말씀하다', '한순간', '도주', '왜', '빨다', '항공', '생각하다', '남북', '반반', '논문', '부르다', '저축', '둘러싸다', '내려가다', '공식적', '사과', '허용하다', '서울', '아무런', '돌보다', '믿바닥', '약', '실은', '흐려지다', '대선', '글프', '처별', '한국말', '서다', '나다', '그간', '일요일', '환자', '갈다', '복잡하다', '보업', '감', '분석', '용', '관습', '배', '반기다', '칠계', '달', '분리하다

In [65]:
submit = pd.read_csv('./sample_submission.csv')
submit['label'] = answer_list

In [66]:
submit

Unnamed: 0,id,label
0,TEST_00000,남망
1,TEST_00001,상당
2,TEST_00002,받아들이다
3,TEST_00003,바구니
4,TEST_00004,살
...,...,...
74116,TEST_74116,캐나다
74117,TEST_74117,사무
74118,TEST_74118,친절하다
74119,TEST_74119,쪽


In [67]:
submit.to_csv('./submission_6epoch.csv', index=False)