**Package Installation**

In [1]:
# %cd /home/2023-1_DL_TeamProject_t5
# %pip install -r requirements.txt

**Environment Variable Settings**

In [2]:
# Environment variables for Weights & Biases (wandb), set for this kernel session.
# NOTE(review): the WandbLogger created further down passes project="DL-2023"
# explicitly, which differs from the value set here -- confirm which is intended.
%env WANDB_PROJECT=DL2023_t5
%env WANDB_NOTEBOOK_NAME=./main.ipynb

env: WANDB_PROJECT=DL2023_t5
env: WANDB_NOTEBOOK_NAME=./main.ipynb


**Import**

In [3]:
import random
import pandas as pd
import numpy as np
import math
import os
from PIL import Image
from pathlib import Path
import re
from nltk import edit_distance

from torch.nn.utils.rnn import pad_sequence
from util import InverseSqrtScheduler


import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_only

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    DonutProcessor,
    VisionEncoderDecoderConfig,
    VisionEncoderDecoderModel,
    get_scheduler
)

import wandb

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device.type

'cuda'

**Hyperparameter Settings**

In [5]:
# Global settings shared by the cells below (paths, seed, image size, batch size).
CFG = dict(
    WORKING_DIR="/home/2023-1_DL_TeamProject_t5",  # project root the notebook chdir's into
    SEED=42,
    NUM_WORKERS=4,       # DataLoader worker processes
    IMG_HEIGHT=800,      # image height fed to the processor / encoder
    IMG_WIDTH=600,       # image width fed to the processor / encoder
    MAX_LEN=1024,        # token length used for labels and for generation
    BATCH_SIZE=1,
)

**Set Working Directory**

In [6]:
# Work from the project root so the relative paths used later ("./dataframes/...") resolve.
os.chdir(CFG['WORKING_DIR'])
print(Path.cwd())

/home/2023-1_DL_TeamProject_t5


**Fix Seeds**

In [7]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Exported for processes started after this point; the hash seed of the
    # already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so it
    # must be off for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before any dataset, model or dataloader is created.
seed_everything(CFG['SEED'])

**Dataset Building & Data Preprocessing**

In [8]:
# Tag name used in the target sequence for each integer text-type code of an annotated item.
type_dict = {0:"uni", 1:"nm", 2:"ing", 3:"exp", 4:"how", 5:"des", 9:"etc"}

class DonutDataset(Dataset):
    """Dataset of (image, tagged text) pairs for fine-tuning Donut.

    Every row of ``dataframe`` must provide:

    * ``image_path`` -- path of the image file, opened with PIL.
    * ``texts`` -- a string holding a Python literal: a sequence of items
      where ``item[0]`` is a key of ``type_dict`` and ``item[1]`` is the text.

    All ground-truth strings are built once in ``__init__``; image loading and
    tokenisation happen per sample in ``__getitem__``.

    Args:
        dataframe: Annotation table (its index is reset to 0..n-1 internally).
        max_length: Fixed token length the labels are padded/truncated to.
        processor: ``DonutProcessor`` supplying the image processor and tokenizer.
        split: Split name; random padding is requested only for ``"train"``.
        ignore_id: Label value written at padding positions.
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        max_length: int,
        processor: DonutProcessor,
        split: str = "train",
        ignore_id: int = -100,
    ):
        super().__init__()
        # Local import keeps this cell self-contained without touching the import cell.
        import ast

        self.max_length = max_length
        self.split = split
        self.ignore_id = ignore_id
        self.dataframe = dataframe.reset_index(drop=True)
        self.dataframe_length = len(self.dataframe)
        self.processor = processor

        # literal_eval only accepts Python literals, so a malformed or malicious
        # CSV cell raises instead of executing arbitrary code as eval() would.
        self.gt_container = [
            self.get_gt_strings(ast.literal_eval(raw_texts))
            for raw_texts in self.dataframe['texts']
        ]

    def get_gt_strings(self, ct):
        """Serialise annotated items into one tagged ground-truth string.

        Consecutive items sharing the same type code are merged into a single
        ``<tag>text text ...</tag>`` group, with texts joined by one space.

        Args:
            ct: Indexable sequence of items; ``item[0]`` is the type code
                (a key of ``type_dict``) and ``item[1]`` the text.

        Returns:
            The concatenated tagged string, or ``""`` when ``ct`` is empty.
        """
        pieces = []
        last_index = len(ct) - 1
        for i, item in enumerate(ct):
            tag = type_dict[item[0]]
            opens_group = i == 0 or ct[i - 1][0] != item[0]
            closes_group = i == last_index or ct[i + 1][0] != item[0]

            pieces.append(f'<{tag}>{item[1]}' if opens_group else f' {item[1]}')
            if closes_group:
                pieces.append(f'</{tag}>')

        return "".join(pieces)

    def __len__(self):
        """Return the number of samples (rows of the dataframe)."""
        return self.dataframe_length

    def __getitem__(self, idx: int):
        """Load and encode one sample.

        Args:
            idx: Row position in the index-reset dataframe.

        Returns:
            A tuple ``(pixel_values, labels, target_sequence)``: the processed
            image tensor, the token ids of the target (padding replaced by
            ``ignore_id``) and the raw ground-truth string without EOS.
        """
        sample = self.dataframe.loc[idx]
        # The context manager releases the file handle; convert() forces the
        # pixel data to be read and normalises grayscale/RGBA/palette images
        # to three channels.
        with Image.open(sample['image_path']) as raw_image:
            image = raw_image.convert("RGB")

        pixel_values = self.processor(
            image, random_padding=self.split == "train", return_tensors="pt"
        ).pixel_values.squeeze(0)  # drop only the batch dimension

        target_sequence = self.gt_container[idx]
        tokenizer = self.processor.tokenizer
        # add_special_tokens=False adds no EOS, so append it explicitly:
        # without an end-of-sequence label the decoder is never trained to
        # stop and generation always runs to max_length.
        input_ids = tokenizer(
            target_sequence + tokenizer.eos_token,
            add_special_tokens=False,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )["input_ids"].squeeze(0)

        labels = input_ids.clone()
        # Padding positions get ignore_id so they can be excluded from the loss.
        labels[labels == tokenizer.pad_token_id] = self.ignore_id

        return pixel_values, labels, target_sequence

In [9]:
# Checkpoint shared by the processor, the config and the model weights.
DONUT_CHECKPOINT = "naver-clova-ix/donut-base"

# Processor: resize inputs to the resolution chosen in CFG.
processor = DonutProcessor.from_pretrained(DONUT_CHECKPOINT)
processor.image_processor.size = dict(height=CFG['IMG_HEIGHT'], width=CFG['IMG_WIDTH'])

# Config: must be adjusted before the model is instantiated from it.
config = VisionEncoderDecoderConfig.from_pretrained(DONUT_CHECKPOINT)
config.encoder.image_size = [CFG['IMG_HEIGHT'], CFG['IMG_WIDTH']]
config.decoder.max_length = CFG['MAX_LEN']

model = VisionEncoderDecoderModel.from_pretrained(DONUT_CHECKPOINT, config=config)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [10]:
# Register one opening and one closing tag token per text type (all opening
# tags first, then all closing tags), then grow the decoder embedding matrix
# to the new vocabulary size.
tag_names = list(type_dict.values())
added_tokens = [f'<{name}>' for name in tag_names] + [f'</{name}>' for name in tag_names]

donut_tokenizer = processor.tokenizer
donut_tokenizer.add_tokens(added_tokens)
model.decoder.resize_token_embeddings(len(donut_tokenizer))

# Special token ids the model config needs for training and generation.
model.config.pad_token_id = donut_tokenizer.pad_token_id
model.config.decoder_start_token_id = donut_tokenizer.convert_tokens_to_ids(['<s>'])[0]

In [11]:
# Show the decoded text of the special token ids configured on the model.
for caption, special_id in (
    ("Pad token ID:", model.config.pad_token_id),
    ("Decoder start token ID:", model.config.decoder_start_token_id),
):
    print(caption, processor.decode([special_id]))

Pad token ID: <pad>
Decoder start token ID: <s>


In [12]:
from sklearn.model_selection import train_test_split

# Annotation tables: one for train+validation, one held-out test table.
train_val_df = pd.read_csv("./dataframes/train_annot_df.csv")
test_df = pd.read_csv("./dataframes/test_annot_df.csv")

# 80/20 train/validation split, seeded from CFG.
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=CFG['SEED'])

# Arguments common to all three datasets; only the frame and split name differ.
dataset_kwargs = dict(max_length=CFG['MAX_LEN'], processor=processor)
train_dataset = DonutDataset(train_df, split="train", **dataset_kwargs)
val_dataset = DonutDataset(val_df, split="validation", **dataset_kwargs)
test_dataset = DonutDataset(test_df, split="test", **dataset_kwargs)

**Dataloader Building**

In [13]:
# Settings common to every loader; only the training loader shuffles.
loader_kwargs = dict(batch_size=CFG['BATCH_SIZE'], num_workers=CFG['NUM_WORKERS'])
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [14]:
# Batch sanity check: pull one training batch and confirm the image tensor shape.
# The unpacked names (labels, target_sequences) are inspected by the next cells.
batch = next(iter(train_dataloader))
pixel_values, labels, target_sequences = batch
print(pixel_values.shape)

torch.Size([1, 3, 800, 600])


In [15]:
# Decode the first 30 label positions of the first sample one token at a time.
# -100 is the ignore value written at padding positions, so it is printed raw.
# The loop variable is named token_id so the builtin id() is not shadowed in
# the notebook's global namespace.
for token_id in labels[0].tolist()[:30]:
    if token_id != -100:
        print(processor.decode([token_id]))
    else:
        print(token_id)

<des>

락
티
케
어
</des>
<nm>
제
마
지
스
</nm>
<des>
로
션
0
.
25%

프
레
드
니
카
르
베
이
트
전문


In [16]:
# Raw ground-truth string of the first sample in the batch, for comparison with the decoded labels.
target_sequences[0]

'<des>락티케어</des><nm>제마지스</nm><des>로션 0.25% 프레드니카르베이트 전문의약품 본 상표는 GSK 그룹사 실시권을 [2017] GSK 허여 받은 그룹사 것입니다. 또는 소유이거나 실시권자 Stiefel 20g C</des>'

In [17]:
class DonutModelPLModule(pl.LightningModule):
    """Lightning module that fine-tunes a Donut ``VisionEncoderDecoderModel``.

    Training minimises the model's own loss; validation generates text greedily
    and logs the normalised edit distance to the ground-truth string.

    The module reads the notebook-level globals ``CFG``, ``train_dataset``,
    ``train_dataloader`` and ``val_dataloader``.

    Args:
        config: Mapping with at least ``lr``, ``weight_decay`` and
            ``warmup_ratio``; ``verbose`` is optional.
        processor: ``DonutProcessor`` whose tokenizer decodes the generations.
        model: The model to train.
    """

    def __init__(self, config, processor, model):
        super().__init__()
        self.config = config
        self.processor = processor
        self.model = model

    def training_step(self, batch, batch_idx):
        """Run one forward pass with labels and return the loss."""
        pixel_values, labels, _ = batch

        outputs = self.model(pixel_values, labels=labels)
        loss = outputs.loss
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx, dataset_idx=0):
        """Generate a sequence per image and score it against the answer.

        Returns:
            List with one normalised edit distance per sample
            (0.0 = identical, 1.0 = completely different).
        """
        pixel_values, labels, answers = batch
        batch_size = pixel_values.shape[0]
        tokenizer = self.processor.tokenizer

        # Every generation starts from the decoder start token.
        decoder_input_ids = torch.full((batch_size, 1), self.model.config.decoder_start_token_id, device=self.device)

        outputs = self.model.generate(pixel_values,
                                   decoder_input_ids=decoder_input_ids,
                                   max_length=CFG['MAX_LEN'],
                                   early_stopping=True,
                                   pad_token_id=tokenizer.pad_token_id,
                                   eos_token_id=tokenizer.eos_token_id,
                                   use_cache=True,
                                   num_beams=1,
                                   bad_words_ids=[[tokenizer.unk_token_id]],
                                   return_dict_in_generate=True,)

        predictions = []
        for seq in tokenizer.batch_decode(outputs.sequences):
            seq = seq.replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "")
            # Remove only the first <...> token, i.e. the start token the
            # generation was seeded with; the content tags are kept.
            seq = re.sub(r"<.*?>", "", seq, count=1).strip()
            predictions.append(seq)

        scores = []
        for pred, answer in zip(predictions, answers):
            longest = max(len(pred), len(answer))
            # Two empty strings are a perfect match; guard the division so
            # that case scores 0.0 instead of raising ZeroDivisionError.
            scores.append(edit_distance(pred, answer) / longest if longest else 0.0)

            # Print only the first sample of each batch to keep the log short.
            if self.config.get("verbose", False) and len(scores) == 1:
                print(f"Prediction: {pred}")
                print(f"    Answer: {answer}")
                print(f" Normed ED: {scores[0]}")

        self.log("val_edit_distance", np.mean(scores))

        return scores

    def configure_optimizers(self):
        """Create the AdamW optimizer and the inverse-sqrt LR scheduler."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.config.get("lr"), weight_decay=self.config.get("weight_decay"))
        # Warm-up length is a fraction of the number of training samples.
        scheduler = InverseSqrtScheduler(optimizer, len(train_dataset) * self.config.get('warmup_ratio'))

        # A bare scheduler is stepped once per epoch by Lightning, which would
        # leave the learning rate inside a warm-up that is thousands of steps
        # long for the entire run. Step it after every optimizer step instead.
        # NOTE(review): assumes InverseSqrtScheduler counts optimizer steps -- confirm in util.py.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

    def train_dataloader(self):
        """Return the notebook-level training DataLoader."""
        return train_dataloader

    def val_dataloader(self):
        """Return the notebook-level validation DataLoader."""
        return val_dataloader

In [18]:
# Training hyperparameters consumed by the Lightning module and the Trainer.
training_config = dict(
    max_epochs=10,
    val_check_interval=0.25,  # validate after every quarter of a training epoch
    check_val_every_n_epoch=1,
    gradient_clip_val=1.0,
    lr=3e-5,
    weight_decay=1e-2,
    accumulate_grad_batches=4,
    num_nodes=1,
    warmup_ratio=0.1,
    es_patience=3,
    result_path="./result",
    verbose=True,
)

# Wrap the model, processor and hyperparameters in the Lightning module defined above.
model_module = DonutModelPLModule(training_config, processor, model)

In [20]:
# Select the "medium" float32 matmul precision mode, which lets PyTorch trade
# some precision for speed (see the torch.set_float32_matmul_precision docs).
torch.set_float32_matmul_precision("medium")

In [21]:
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

# Experiment logger.
# NOTE(review): the project name given here ("DL-2023") differs from the
# WANDB_PROJECT value set via %env at the top of the notebook ("DL2023_t5");
# confirm which project the runs should be logged to.
wandb_logger = WandbLogger(project="DL-2023", name="demo-run")

# Stop when the validation edit distance (logged as "val_edit_distance" in
# validation_step; lower is better) stops improving.
# NOTE(review): validation runs several times per epoch (val_check_interval
# is 0.25), so patience presumably counts validation runs, not epochs -- confirm.
early_stop_callback = EarlyStopping(monitor="val_edit_distance", patience=training_config.get('es_patience'), verbose=False, mode="min")
# Record the learning rate at every step so the schedule can be inspected.
lr_monitor = LearningRateMonitor(logging_interval='step')

# Single-GPU trainer; most settings come from training_config.
trainer = pl.Trainer(
        accelerator="gpu",
        devices=1,
        max_epochs=training_config.get("max_epochs"),
        val_check_interval=training_config.get("val_check_interval"),
        check_val_every_n_epoch=training_config.get("check_val_every_n_epoch"),
        gradient_clip_val=training_config.get("gradient_clip_val"),
        # 16-bit automatic mixed precision (as reported in the run log below).
        precision=16,
        accumulate_grad_batches=training_config.get("accumulate_grad_batches"),
        # Skip the pre-training validation sanity check.
        num_sanity_val_steps=0,
        logger=wandb_logger,
        callbacks=[early_stop_callback, lr_monitor],
)

# Dataloaders are supplied by the module's train_dataloader()/val_dataloader() hooks.
trainer.fit(model_module)

[34m[1mwandb[0m: Currently logged in as: [33m2gnldud[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                      | Params
----------------------------------------------------
0 | model | VisionEncoderDecoderModel | 201 M 
----------------------------------------------------
201 M     Trainable params
0         Non-trainable params
201 M     Total params
807.465   Total estimated model params size (MB)


Epoch 0:   1%|          | 695/67337 [02:30<4:00:56,  4.61it/s, v_num=25il]