In [2]:
import random
import pandas as pd
import numpy as np
import os
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchvision.models import resnet18
from torchvision import transforms

from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action='ignore') 
import cv2
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2




In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [47]:
CFG = {
    'IMG_HEIGHT_SIZE':64,
    'IMG_WIDTH_SIZE':224,
    'EPOCHS':50,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':256,
    'NUM_WORKERS':4, # 본인의 GPU, CPU 환경에 맞게 설정
    'SEED':41
}

In [48]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [49]:
os.getcwd()

'/home/gsc/dacon_KYOWON/Dacon_KYOWON'

In [50]:
#window
# base_dir = 'D:/Dacon_KYOWON/Dacon_KYOWON'
# data_dir = "D:/Dacon_KYOWON/open"
#ubuntu
base_dir = '/home/gsc/dacon_KYOWON/Dacon_KYOWON'
data_dir = '/home/gsc/dacon_KYOWON/open'


In [51]:
df = pd.read_csv(f'{data_dir}/train.csv')

In [52]:
# 제공된 학습데이터 중 1글자 샘플들의 단어사전이 학습/테스트 데이터의 모든 글자를 담고 있으므로 학습 데이터로 우선 배치
df['len'] = df['label'].str.len()
train_v1 = df[df['len']==1]

In [53]:
# 제공된 학습데이터 중 2글자 이상의 샘플들에 대해서 단어길이를 고려하여 Train (80%) / Validation (20%) 분할
df = df[df['len']>1]
train_v2, val, _, _ = train_test_split(df, df['len'], test_size=0.2, random_state=CFG['SEED'])

데이터 길이를 1,2,3,4로 나눠보는 것은 어떨까

In [54]:
# 학습 데이터로 우선 배치한 1글자 샘플들과 분할된 2글자 이상의 학습 샘플을 concat하여 최종 학습 데이터로 사용
train = pd.concat([train_v1, train_v2])
print(len(train), len(val))

66251 10637


In [55]:
# 학습 데이터로부터 단어 사전(Vocabulary) 구축
train_gt = [gt for gt in train['label']]
train_gt = "".join(train_gt)
letters = sorted(list(set(list(train_gt))))
print(len(letters))

2349


In [56]:
vocabulary = ["-"] + letters
print(len(vocabulary))
idx2char = {k:v for k,v in enumerate(vocabulary, start=0)}
char2idx = {v:k for k,v in idx2char.items()}

2350


In [57]:
class CustomDataset(Dataset):
    def __init__(self, img_path_list, label_list,transforms=None, train_mode=True):
        self.img_path_list = img_path_list
        self.label_list = label_list
        self.transforms = transforms
        self.train_mode = train_mode
        
    def __len__(self):
        return len(self.img_path_list)
    
    def __getitem__(self, index):
        img_path = self.img_path_list[index]
        image = cv2.imread(f"{data_dir}/{img_path.split('/')[-2]}/{img_path.split('/')[-1]}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transforms is not None:
            image = self.transforms(image=image)['image']
           
        if self.label_list is not None:
            text = self.label_list[index]
            return image, text
        else:
            return image
  

In [58]:
def resize_transform(height,width, state='train'):
    if state == 'train':
        transform = A.Compose([
                                #A.HorizontalFlip(p=0.2),
                                #A.VerticalFlip(p=0.2),
                                A.Rotate(limit=[-45,45], p=1),
                                #A.RandomRotate90(p=0.2),
                                A.Resize(height,width),
                                #A.RandomResizedCrop(height=height, width=width, scale=(0.3, 1.0)),
                                A.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
                                ToTensorV2(),
                                ])
    else:
        transform = A.Compose([
                            A.Resize(height,width),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            #A.RandomResizedCrop(height=CFG['IMG_SIZE'], width=CFG['IMG_SIZE'], scale=(0.3, 1.0)),               
                            ToTensorV2()
                            ])

    return transform

In [59]:
train_transform = resize_transform(CFG['IMG_HEIGHT_SIZE'],CFG['IMG_WIDTH_SIZE'])
test_transform = resize_transform(CFG['IMG_HEIGHT_SIZE'],CFG['IMG_WIDTH_SIZE'],'test')

In [60]:
img_path = train['label'].values
print(img_path)
#print(f"{data_dir}/{img_path.split('/')[-2]}/{img_path.split('/')[-1]}")

['머' '써' '빈' ... '계속' '단계' '손수']


In [61]:
train_dataset = CustomDataset(train['img_path'].values, train['label'].values, train_transform)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=CFG['NUM_WORKERS'])

val_dataset = CustomDataset(val['img_path'].values, val['label'].values,test_transform)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=CFG['NUM_WORKERS'])

In [62]:
image_batch, text_batch = iter(train_loader).next()
print(image_batch.size(), text_batch)

torch.Size([256, 3, 64, 224]) ('신비', '그나마', '씻기다', '쉴', '일반', '셋', '수입되다', '세', '달리다', '뜁', '선원', '추가', '대량', '예매하다', '콩', '쯩', '못', '사모님', '뱀', '호주머니', '젤', '벌', '불', '베개', '수도권', '회복되다', '날씨', '유산', '견해', '체조', '얘', '튀김', '삼계탕', '어려움', '세', '발자국', '터', '함께', '스타', '샛', '참여하다', '븐', '학생증', '신인', '신청', '양복', '추진하다', '집중하다', '구분되다', '기타', '지구', '꽐', '명예', '대륙', '천장', '신', '칡', '창조', '걋', '나무', '향', '여행사', '강', '흄', '출발', '전철', '꿈', '캠페인', '유산', '자극', '품', '대출', '관광버스', '내외', '법', '딱', '알리다', '먹이다', '왼발', '이제', '씁', '뒷골목', '만들다', '특정하다', '학교생활', '잘나다', '륵', '우유', '찜', '가늘다', '낮다', '적다', '척', '도', '이', '맛', '바탕', '끌다', '애쓰다', '이거', '어느덧', '주사', '겁', '늰', '빵', '곡', '바탕', '상대편', '옆', '헤', '중단', '회', '악', '농민', '긴급', '실은', '활용', '수십', '사흘', '실리다', '확대되다', '읍', '택하다', '화', '꼬마', '이해하다', '불편하다', '전문직', '멀리', '죽', '가다', '많아지다', '암', '일자', '이념', '수입되다', '팀', '돌다', '악몽', '간단하다', '불리다', '발휘하다', '미술관', '과학자', '정확하다', '턱', '큽', '텍스트', '신', '이곳저곳', '무용가', '무', '자동', '참석', '기술', '뛸', '뱅', '만일', '소풍', 

In [83]:
!pip install torchsummary

Collecting torchsummary
  Using cached torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [84]:
from torchsummary import summary as summary_

In [98]:
resnet_test = resnet18(pretrained=True)
resnet_test.cuda()
summary_(resnet_test,(3,64,224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 64, 32, 112]           9,408
       BatchNorm2d-2          [-1, 64, 32, 112]             128
              ReLU-3          [-1, 64, 32, 112]               0
         MaxPool2d-4           [-1, 64, 16, 56]               0
            Conv2d-5           [-1, 64, 16, 56]          36,864
       BatchNorm2d-6           [-1, 64, 16, 56]             128
              ReLU-7           [-1, 64, 16, 56]               0
            Conv2d-8           [-1, 64, 16, 56]          36,864
       BatchNorm2d-9           [-1, 64, 16, 56]             128
             ReLU-10           [-1, 64, 16, 56]               0
       BasicBlock-11           [-1, 64, 16, 56]               0
           Conv2d-12           [-1, 64, 16, 56]          36,864
      BatchNorm2d-13           [-1, 64, 16, 56]             128
             ReLU-14           [-1, 64,

In [104]:
import torchvision.models as models
eff_test = models.efficientnet_b5(pretrained=True)
eff_test.cuda()
summary_(eff_test,(3,64,224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 48, 32, 112]           1,296
       BatchNorm2d-2          [-1, 48, 32, 112]              96
              SiLU-3          [-1, 48, 32, 112]               0
            Conv2d-4          [-1, 48, 32, 112]             432
       BatchNorm2d-5          [-1, 48, 32, 112]              96
              SiLU-6          [-1, 48, 32, 112]               0
 AdaptiveAvgPool2d-7             [-1, 48, 1, 1]               0
            Conv2d-8             [-1, 12, 1, 1]             588
              SiLU-9             [-1, 12, 1, 1]               0
           Conv2d-10             [-1, 48, 1, 1]             624
          Sigmoid-11             [-1, 48, 1, 1]               0
SqueezeExcitation-12          [-1, 48, 32, 112]               0
           Conv2d-13          [-1, 24, 32, 112]           1,152
      BatchNorm2d-14          [-1, 24, 

In [138]:
class RecognitionModel_eff(nn.Module):
    def __init__(self, num_chars=len(char2idx), rnn_hidden_size=256):
        super(RecognitionModel_eff, self).__init__()
        self.num_chars = num_chars
        self.rnn_hidden_size = rnn_hidden_size
        
        # CNN Backbone = 사전학습된 resnet18 활용
        # https://arxiv.org/abs/1512.03385
        effnet = models.efficientnet_b5(pretrained=True)
        # CNN Feature Extract
        effnet_modules = list(effnet.features)[:-3]
        self.feature_extract = nn.Sequential(
            *effnet_modules,
            nn.Conv2d(176, 256, kernel_size=(3,6), stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True)
        )

        self.linear1 = nn.Linear(1024, rnn_hidden_size)
        
        # RNN
        self.rnn = nn.RNN(input_size=rnn_hidden_size, 
                            hidden_size=rnn_hidden_size,
                            bidirectional=True, 
                            batch_first=True)
        self.linear2 = nn.Linear(self.rnn_hidden_size*2, num_chars)
        
        
    def forward(self, x):
        # CNN
        x = self.feature_extract(x) # [batch_size, channels, height, width]
        x = x.permute(0, 3, 1, 2) # [batch_size, width, channels, height]
         
        batch_size = x.size(0)
        T = x.size(1)
        x = x.view(batch_size, T, -1) # [batch_size, T==width, num_features==channels*height]
        x = self.linear1(x)
        
        # RNN
        x, hidden = self.rnn(x)
        
        output = self.linear2(x)
        output = output.permute(1, 0, 2) # [T==10, batch_size, num_classes==num_features]
        
        return output

In [140]:
RecognitionModel_eff_test = RecognitionModel_eff()
RecognitionModel_eff_test.cuda()
summary_(RecognitionModel_eff_test,(3,64,224))
#RecognitionModel_eff_test.state_dict().keys()

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 48, 32, 112]           1,296
       BatchNorm2d-2          [-1, 48, 32, 112]              96
              SiLU-3          [-1, 48, 32, 112]               0
            Conv2d-4          [-1, 48, 32, 112]             432
       BatchNorm2d-5          [-1, 48, 32, 112]              96
              SiLU-6          [-1, 48, 32, 112]               0
 AdaptiveAvgPool2d-7             [-1, 48, 1, 1]               0
            Conv2d-8             [-1, 12, 1, 1]             588
              SiLU-9             [-1, 12, 1, 1]               0
           Conv2d-10             [-1, 48, 1, 1]             624
          Sigmoid-11             [-1, 48, 1, 1]               0
SqueezeExcitation-12          [-1, 48, 32, 112]               0
           Conv2d-13          [-1, 24, 32, 112]           1,152
      BatchNorm2d-14          [-1, 24, 

In [None]:
RecognitionModel_test = RecognitionModel()
RecognitionModel_test.cuda()
summary_(RecognitionModel_test,(3,64,224))

In [63]:
class RecognitionModel(nn.Module):
    def __init__(self, num_chars=len(char2idx), rnn_hidden_size=256):
        super(RecognitionModel, self).__init__()
        self.num_chars = num_chars
        self.rnn_hidden_size = rnn_hidden_size
        
        # CNN Backbone = 사전학습된 resnet18 활용
        # https://arxiv.org/abs/1512.03385
        resnet = resnet18(pretrained=True)
        # CNN Feature Extract
        resnet_modules = list(resnet.children())[:-3]
        self.feature_extract = nn.Sequential(
            *resnet_modules,
            nn.Conv2d(256, 256, kernel_size=(3,6), stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True)
        )

        self.linear1 = nn.Linear(1024, rnn_hidden_size)
        
        # RNN
        self.rnn = nn.RNN(input_size=rnn_hidden_size, 
                            hidden_size=rnn_hidden_size,
                            bidirectional=True, 
                            batch_first=True)
        self.linear2 = nn.Linear(self.rnn_hidden_size*2, num_chars)
        
        
    def forward(self, x):
        # CNN
        x = self.feature_extract(x) # [batch_size, channels, height, width]
        x = x.permute(0, 3, 1, 2) # [batch_size, width, channels, height]
         
        batch_size = x.size(0)
        T = x.size(1)
        x = x.view(batch_size, T, -1) # [batch_size, T==width, num_features==channels*height]
        x = self.linear1(x)
        
        # RNN
        x, hidden = self.rnn(x)
        
        output = self.linear2(x)
        output = output.permute(1, 0, 2) # [T==10, batch_size, num_classes==num_features]
        
        return output

In [141]:
criterion = nn.CTCLoss(blank=0) # idx 0 : '-'

In [142]:
def encode_text_batch(text_batch):
    text_batch_targets_lens = [len(text) for text in text_batch]
    text_batch_targets_lens = torch.IntTensor(text_batch_targets_lens)
    
    text_batch_concat = "".join(text_batch)
    text_batch_targets = [char2idx[c] for c in text_batch_concat]
    text_batch_targets = torch.IntTensor(text_batch_targets)
    
    return text_batch_targets, text_batch_targets_lens

In [143]:
def compute_loss(text_batch, text_batch_logits):
    """
    text_batch: list of strings of length equal to batch size
    text_batch_logits: Tensor of size([T, batch_size, num_classes])
    """
    text_batch_logps = F.log_softmax(text_batch_logits, 2) # [T, batch_size, num_classes]  
    text_batch_logps_lens = torch.full(size=(text_batch_logps.size(1),), 
                                       fill_value=text_batch_logps.size(0), 
                                       dtype=torch.int32).to(device) # [batch_size] 

    text_batch_targets, text_batch_targets_lens = encode_text_batch(text_batch)
    loss = criterion(text_batch_logps, text_batch_targets, text_batch_logps_lens, text_batch_targets_lens)

    return loss

In [144]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    
    best_loss = 999999
    best_model = None
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for image_batch, text_batch in tqdm(iter(train_loader)):
            image_batch = image_batch.to(device)
            
            optimizer.zero_grad()
            
            text_batch_logits = model(image_batch)
            loss = compute_loss(text_batch, text_batch_logits)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
        
        _train_loss = np.mean(train_loss)
        
        _val_loss = validation(model, val_loader, device)
        print(f'Epoch : [{epoch}] Train CTC Loss : [{_train_loss:.5f}] Val CTC Loss : [{_val_loss:.5f}]')
        
        if scheduler is not None:
            scheduler.step(_val_loss)
        
        if best_loss > _val_loss:
            best_loss = _val_loss
            best_model = model
    
    return best_model

In [145]:
def validation(model, val_loader, device):
    model.eval()
    val_loss = []
    with torch.no_grad():
        for image_batch, text_batch in tqdm(iter(val_loader)):
            image_batch = image_batch.to(device)
            
            text_batch_logits = model(image_batch)
            loss = compute_loss(text_batch, text_batch_logits)
            
            val_loss.append(loss.item())
    
    _val_loss = np.mean(val_loss)
    return _val_loss

In [146]:
model = RecognitionModel_eff()
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [1] Train CTC Loss : [6.99589] Val CTC Loss : [5.44485]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [2] Train CTC Loss : [5.37781] Val CTC Loss : [3.82829]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [3] Train CTC Loss : [3.95382] Val CTC Loss : [2.27937]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [4] Train CTC Loss : [2.43728] Val CTC Loss : [0.99604]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [5] Train CTC Loss : [1.40943] Val CTC Loss : [0.55840]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [6] Train CTC Loss : [0.88524] Val CTC Loss : [0.34672]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [7] Train CTC Loss : [0.60441] Val CTC Loss : [0.24706]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [8] Train CTC Loss : [0.44584] Val CTC Loss : [0.18568]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [9] Train CTC Loss : [0.33099] Val CTC Loss : [0.13634]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [10] Train CTC Loss : [0.26823] Val CTC Loss : [0.12483]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [11] Train CTC Loss : [0.23975] Val CTC Loss : [0.12610]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [12] Train CTC Loss : [0.19910] Val CTC Loss : [0.09331]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [13] Train CTC Loss : [0.18590] Val CTC Loss : [0.12012]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [14] Train CTC Loss : [0.17289] Val CTC Loss : [0.11683]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [15] Train CTC Loss : [0.15176] Val CTC Loss : [0.08838]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [16] Train CTC Loss : [0.14396] Val CTC Loss : [0.09557]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [17] Train CTC Loss : [0.13442] Val CTC Loss : [0.08138]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [18] Train CTC Loss : [0.13386] Val CTC Loss : [0.08766]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [19] Train CTC Loss : [0.12978] Val CTC Loss : [0.07741]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [20] Train CTC Loss : [0.14530] Val CTC Loss : [0.08432]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [21] Train CTC Loss : [0.12238] Val CTC Loss : [0.09344]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [22] Train CTC Loss : [0.10283] Val CTC Loss : [0.10659]
Epoch 00022: reducing learning rate of group 0 to 5.0000e-04.


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [23] Train CTC Loss : [0.05394] Val CTC Loss : [0.04491]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [24] Train CTC Loss : [0.03716] Val CTC Loss : [0.04250]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [25] Train CTC Loss : [0.03416] Val CTC Loss : [0.03815]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [26] Train CTC Loss : [0.02683] Val CTC Loss : [0.03915]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [27] Train CTC Loss : [0.03523] Val CTC Loss : [0.03685]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [28] Train CTC Loss : [0.02740] Val CTC Loss : [0.03860]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [29] Train CTC Loss : [0.03065] Val CTC Loss : [0.03729]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [30] Train CTC Loss : [0.03404] Val CTC Loss : [0.05937]
Epoch 00030: reducing learning rate of group 0 to 2.5000e-04.


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [31] Train CTC Loss : [0.02349] Val CTC Loss : [0.03552]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [32] Train CTC Loss : [0.01526] Val CTC Loss : [0.02854]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [33] Train CTC Loss : [0.01460] Val CTC Loss : [0.02886]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [34] Train CTC Loss : [0.01513] Val CTC Loss : [0.03101]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [35] Train CTC Loss : [0.01499] Val CTC Loss : [0.03127]
Epoch 00035: reducing learning rate of group 0 to 1.2500e-04.


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [36] Train CTC Loss : [0.01079] Val CTC Loss : [0.02711]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [37] Train CTC Loss : [0.00938] Val CTC Loss : [0.02600]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [38] Train CTC Loss : [0.00859] Val CTC Loss : [0.02777]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [39] Train CTC Loss : [0.00796] Val CTC Loss : [0.02737]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [40] Train CTC Loss : [0.00758] Val CTC Loss : [0.02848]
Epoch 00040: reducing learning rate of group 0 to 6.2500e-05.


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [41] Train CTC Loss : [0.00677] Val CTC Loss : [0.02592]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [42] Train CTC Loss : [0.00620] Val CTC Loss : [0.02529]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [43] Train CTC Loss : [0.00603] Val CTC Loss : [0.02555]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [44] Train CTC Loss : [0.00576] Val CTC Loss : [0.02578]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [45] Train CTC Loss : [0.00558] Val CTC Loss : [0.02613]
Epoch 00045: reducing learning rate of group 0 to 3.1250e-05.


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [46] Train CTC Loss : [0.00526] Val CTC Loss : [0.02476]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [47] Train CTC Loss : [0.00476] Val CTC Loss : [0.02498]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [48] Train CTC Loss : [0.00507] Val CTC Loss : [0.02508]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [49] Train CTC Loss : [0.00490] Val CTC Loss : [0.02539]
Epoch 00049: reducing learning rate of group 0 to 1.5625e-05.


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [50] Train CTC Loss : [0.00467] Val CTC Loss : [0.02522]


In [69]:
model = RecognitionModel()
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [1] Train CTC Loss : [6.94179] Val CTC Loss : [5.62186]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [2] Train CTC Loss : [5.46225] Val CTC Loss : [4.10313]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [3] Train CTC Loss : [4.28611] Val CTC Loss : [2.71521]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [4] Train CTC Loss : [2.92163] Val CTC Loss : [1.63375]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [5] Train CTC Loss : [1.95564] Val CTC Loss : [1.02609]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [6] Train CTC Loss : [1.38198] Val CTC Loss : [0.74690]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [7] Train CTC Loss : [1.02963] Val CTC Loss : [0.53161]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [8] Train CTC Loss : [0.80307] Val CTC Loss : [0.42908]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [9] Train CTC Loss : [0.64523] Val CTC Loss : [0.34102]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [10] Train CTC Loss : [0.54701] Val CTC Loss : [0.33079]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [11] Train CTC Loss : [0.45835] Val CTC Loss : [0.31476]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [12] Train CTC Loss : [0.39662] Val CTC Loss : [0.29276]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [13] Train CTC Loss : [0.36063] Val CTC Loss : [0.25891]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [14] Train CTC Loss : [0.32050] Val CTC Loss : [0.23994]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [15] Train CTC Loss : [0.28850] Val CTC Loss : [0.23162]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [16] Train CTC Loss : [0.27748] Val CTC Loss : [0.20388]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [17] Train CTC Loss : [0.25522] Val CTC Loss : [0.25127]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [18] Train CTC Loss : [0.23620] Val CTC Loss : [0.19825]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [19] Train CTC Loss : [0.21695] Val CTC Loss : [0.19250]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [20] Train CTC Loss : [0.21128] Val CTC Loss : [0.24411]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [21] Train CTC Loss : [0.21032] Val CTC Loss : [0.21581]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [22] Train CTC Loss : [0.19316] Val CTC Loss : [0.17051]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [23] Train CTC Loss : [0.18656] Val CTC Loss : [0.18182]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [24] Train CTC Loss : [0.17500] Val CTC Loss : [0.18472]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [25] Train CTC Loss : [0.18392] Val CTC Loss : [0.27264]
Epoch 00025: reducing learning rate of group 0 to 5.0000e-04.


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [26] Train CTC Loss : [0.08830] Val CTC Loss : [0.10236]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [27] Train CTC Loss : [0.06096] Val CTC Loss : [0.09611]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [28] Train CTC Loss : [0.05566] Val CTC Loss : [0.09595]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [29] Train CTC Loss : [0.05389] Val CTC Loss : [0.09269]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [30] Train CTC Loss : [0.05242] Val CTC Loss : [0.10853]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [31] Train CTC Loss : [0.05205] Val CTC Loss : [0.11287]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [32] Train CTC Loss : [0.05070] Val CTC Loss : [0.09886]
Epoch 00032: reducing learning rate of group 0 to 2.5000e-04.


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [33] Train CTC Loss : [0.03275] Val CTC Loss : [0.08055]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [34] Train CTC Loss : [0.02654] Val CTC Loss : [0.07774]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [35] Train CTC Loss : [0.02552] Val CTC Loss : [0.08185]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [36] Train CTC Loss : [0.02212] Val CTC Loss : [0.07831]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [37] Train CTC Loss : [0.02343] Val CTC Loss : [0.08257]
Epoch 00037: reducing learning rate of group 0 to 1.2500e-04.


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [38] Train CTC Loss : [0.01739] Val CTC Loss : [0.07514]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [39] Train CTC Loss : [0.01552] Val CTC Loss : [0.07157]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [40] Train CTC Loss : [0.01492] Val CTC Loss : [0.07097]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [41] Train CTC Loss : [0.01446] Val CTC Loss : [0.07239]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [42] Train CTC Loss : [0.01335] Val CTC Loss : [0.07141]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [43] Train CTC Loss : [0.01343] Val CTC Loss : [0.07211]
Epoch 00043: reducing learning rate of group 0 to 6.2500e-05.


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [44] Train CTC Loss : [0.01178] Val CTC Loss : [0.06943]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [45] Train CTC Loss : [0.01079] Val CTC Loss : [0.06808]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [46] Train CTC Loss : [0.01111] Val CTC Loss : [0.06758]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [47] Train CTC Loss : [0.01053] Val CTC Loss : [0.06891]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [48] Train CTC Loss : [0.01001] Val CTC Loss : [0.06589]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [49] Train CTC Loss : [0.00973] Val CTC Loss : [0.06769]


  0%|          | 0/259 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

Epoch : [50] Train CTC Loss : [0.00925] Val CTC Loss : [0.06733]


In [147]:
test = pd.read_csv('../open/test.csv')

In [148]:
test_dataset = CustomDataset(test['img_path'].values, None,test_transform)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=CFG['NUM_WORKERS'])

In [149]:
def decode_predictions(text_batch_logits):
    text_batch_tokens = F.softmax(text_batch_logits, 2).argmax(2) # [T, batch_size]
    text_batch_tokens = text_batch_tokens.numpy().T # [batch_size, T]

    text_batch_tokens_new = []
    for text_tokens in text_batch_tokens:
        text = [idx2char[idx] for idx in text_tokens]
        text = "".join(text)
        text_batch_tokens_new.append(text)

    return text_batch_tokens_new

def inference(model, test_loader, device):
    model.eval()
    preds = []
    with torch.no_grad():
        for image_batch in tqdm(iter(test_loader)):
            image_batch = image_batch.to(device)
            
            text_batch_logits = model(image_batch)
            
            text_batch_pred = decode_predictions(text_batch_logits.cpu())
            
            preds.extend(text_batch_pred)
    return preds

In [150]:
predictions = inference(infer_model, test_loader, device)

  0%|          | 0/290 [00:00<?, ?it/s]

In [151]:
# 샘플 별 추론결과를 독립적으로 후처리
def remove_duplicates(text):
    if len(text) > 1:
        letters = [text[0]] + [letter for idx, letter in enumerate(text[1:], start=1) if text[idx] != text[idx-1]]
    elif len(text) == 1:
        letters = [text[0]]
    else:
        return ""
    return "".join(letters)

def correct_prediction(word):
    parts = word.split("-")
    parts = [remove_duplicates(part) for part in parts]
    corrected_word = "".join(parts)
    return corrected_word

In [152]:
submit = pd.read_csv('../open/sample_submission.csv')
submit['label'] = predictions
submit['label'] = submit['label'].apply(correct_prediction)

In [153]:
submit.to_csv('./submission_50_eff.csv', index=False)

In [158]:
sub_text = pd.read_csv('./submission_50_eff.csv')
sub_text

Unnamed: 0,id,label
0,TEST_00000,남말
1,TEST_00001,상랑
2,TEST_00002,밤아들이다
3,TEST_00003,바구
4,TEST_00004,살
...,...,...
74116,TEST_74116,캐나다
74117,TEST_74117,사무
74118,TEST_74118,친절하다
74119,TEST_74119,쪽
