**Packages Installation**

In [1]:
# %cd /home/2023-1_DL_TeamProject_t5
# %pip install -r requirements.txt

**Environment Variables Settings**

In [2]:
%env WANDB_PROJECT=Comsmetics&Medicines_CORD_DLt5
%env WANDB_NOTEBOOK_NAME=./experiment.ipynb

env: WANDB_PROJECT=DL2023_t5
env: WANDB_NOTEBOOK_NAME=./experiment.ipynb


**Import**

In [3]:
import random
import pandas as pd
import numpy as np
import math
import os
from PIL import Image
from pathlib import Path
import re
from nltk import edit_distance

from util import LogPredictionsCallback

import pytorch_lightning as pl
# from pytorch_lightning.utilities import rank_zero_only

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    DonutProcessor,
    VisionEncoderDecoderConfig,
    VisionEncoderDecoderModel,
    get_scheduler
)

import wandb

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device.type

'cuda'

**Hyperparameter Settings**

In [5]:
CFG = {
    'WORKING_DIR': "/home/2023-1_DL_TeamProject_t5",
    'SEED':17,
    'NUM_WORKERS':4,
    'IMG_HEIGHT':800,
    'IMG_WIDTH':600,
    'MAX_LEN':1024,
    'BATCH_SIZE':1,
    'SAMPLING_RATE':1,
    'VAL_SPLIT': 0.2,
    'PIN_MEMORY': True,
    'SWEEP_NUM': 6
}

**Set Working Direcotry**

In [6]:
os.chdir(CFG['WORKING_DIR'])
print(os.getcwd())

/home/2023-1_DL_TeamProject_t5


**Fix Seeds**

In [7]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED'])

**Dataset Building & Data Preprocessing**

In [8]:
type_dict = {0:"uni", 1:"nm", 2:"ing", 3:"exp", 4:"how", 5:"des", 9:"etc"}

class DonutDataset(Dataset):

    def __init__(
        self,
        dataframe: pd.DataFrame,
        max_length: int,
        processor: DonutProcessor,
        split: str = "train",
        ignore_id: int = -100,
    ):
        super().__init__()

        self.max_length = max_length
        self.split = split
        self.ignore_id = ignore_id
        self.dataframe = dataframe.reset_index(drop=True)
        self.dataframe_length = len(self.dataframe)
        self.processor = processor
        self.gt_container = []
        
        for idx, sample in self.dataframe.iterrows():
            ground_truth = self.get_gt_strings(eval(sample['texts']))
            self.gt_container.append(ground_truth)

    def get_gt_strings(self, ct):
        
        gt_string = ""
        flag = 1
        tp = -1
        for i, item in enumerate(ct):
            if flag:
                gt_string = gt_string + f'<{type_dict[item[0]]}>'
                tp = item[0]
                flag = 0
                gt_string = gt_string + f'{item[1]}'
            
            elif not flag:
                gt_string = gt_string + f' {item[1]}'
            
            if i == len(ct)-1 or ct[i+1][0] != tp:
                gt_string = gt_string + f'</{type_dict[item[0]]}>'
                flag = 1
        
        return gt_string
    
    def __len__(self):
        
        return self.dataframe_length

    def __getitem__(self, idx: int):

        sample = self.dataframe.loc[idx]
        image = Image.open(sample['image_path'])
       
        pixel_values = self.processor(image, random_padding=self.split == "train", return_tensors="pt").pixel_values.squeeze()

        target_sequence = self.gt_container[idx] 
        input_ids = self.processor.tokenizer(
            target_sequence,
            add_special_tokens=False,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )["input_ids"].squeeze(0)

        labels = input_ids.clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = self.ignore_id  

        return pixel_values, labels, target_sequence

In [9]:
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
processor.image_processor.size = {"height": CFG['IMG_HEIGHT'],"width": CFG['IMG_WIDTH']}
added_tokens = [fr'<{x}>' for x in type_dict.values()] + [fr'</{x}>' for x in type_dict.values()]
processor.tokenizer.add_tokens(added_tokens)

donut_config = VisionEncoderDecoderConfig.from_pretrained("naver-clova-ix/donut-base")
donut_config.encoder.image_size = [CFG['IMG_HEIGHT'], CFG['IMG_WIDTH']]
donut_config.decoder.max_length = CFG['MAX_LEN']

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [10]:
def model_init():    
    model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base", config=donut_config, ignore_mismatched_sizes=True)
    model.decoder.resize_token_embeddings(len(processor.tokenizer))
    model.config.pad_token_id = processor.tokenizer.pad_token_id
    model.config.decoder_start_token_id = processor.tokenizer.convert_tokens_to_ids(['<s>'])[0]
    return model

model = model_init()

In [11]:
print("Pad token ID:", processor.decode([model.config.pad_token_id]))
print("Decoder start token ID:", processor.decode([model.config.decoder_start_token_id]))

Pad token ID: <pad>
Decoder start token ID: <s>


In [12]:
from sklearn.model_selection import train_test_split

train_val_df = pd.read_csv("./dataframes/train_annot_df.csv")
# test_df = pd.read_csv("./dataframes/test_annot_df.csv")

train_df, val_df = train_test_split(train_val_df, test_size=CFG['VAL_SPLIT'], random_state=CFG['SEED'])

train_df = train_df.sample(frac=CFG['SAMPLING_RATE'], random_state=CFG['SEED'], ignore_index=True)
val_df = val_df.sample(frac=CFG['SAMPLING_RATE'], random_state=CFG['SEED'], ignore_index=True)

train_dataset = DonutDataset(train_df, max_length=CFG['MAX_LEN'], processor=processor, split="train")
val_dataset = DonutDataset(val_df, max_length=CFG['MAX_LEN'], processor=processor, split="validation")
# test_dataset = DonutDataset(test_df, max_length=CFG['MAX_LEN'], processor=processor, split="test")

**Dataloader Building**

In [13]:
train_dataloader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True, num_workers=CFG['NUM_WORKERS'], pin_memory=CFG['PIN_MEMORY'])
val_dataloader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=CFG['NUM_WORKERS'], pin_memory=CFG['PIN_MEMORY'])
# test_dataloader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=CFG['NUM_WORKERS'], pin_memory=CFG['PIN_MEMORY'])

In [14]:
#Batch Verifying
batch = next(iter(train_dataloader))
pixel_values, labels, target_sequences = batch
print(pixel_values.shape)

torch.Size([1, 3, 800, 600])


In [15]:
for id in labels[0].tolist()[:30]:
  if id != -100:
    print(processor.decode([id]))
  else:
    print(id)

<des>
SO
LU
TION
C
ARE
여
드
름
성
피
부
사용
적
합
테
스트
Hid
den
Tag
App
설치
후
,
정
품
인
증
확인
Check


In [16]:
target_sequences[0]

"<des>SOLUTION CARE 여드름성 피부 사용 적합 테스트 Hidden Tag App 설치 후, 정품인증 확인 Check the authenticity after installing Hidden Tag App 블랙헤드 유분개선 화이트헤드 Dr.Different VITAACNAL TX Night Cream 각질케어 * 임상으로 확인한 피부 유분 감소 비타아크날 TX 피부 각질 감소 나이트 크림 효과 화이트헤드 감소 블랙헤드 감소 피부 사용 인원 : 20명 (만15세~45세 여드름성피부 남녀), 개선, 적합/유분/블랙&화이트헤드/각질 시험기관 : 효능효과는 개인에 따라 (주)글로벌의학연구센터. 차이가 시험기간 : 있을 수 있습니다. 2020.09.28~2020.10.28 * * * 레티날 사용시 피부가 올라올 수 있습니다. 처음 사용하거나 피부가 섞어서 사용하시다가 점차 양을 따갑고 붉어질 수 민감하실 경우 크림에 있으며 각질이 소량 피부자극 테스트 완료 여드름성 피부 사용 적합 테스트 완료 처음 사용하거나 적응하면 바르고 민감하실 경우 사용하시기 매일 늘려 사용해주세요. 일주일에 세 번 바랍니다. 이내로 제조번호 및 화장품 책임판매업자 (주)다른코스메틱스 층(논현동, 다른타워) 화장품 제조업자 (주)에코먼트 경기도 산단로 63-5(모곡동) www.drdifferent.com MADE IN KOREA 평택시 서울특별시 강남구 학동로335 11, 12 8 809641 690685 > H0315 본 제품에 이상이 소비자상담실 : '소비자분쟁해결기준'에 있을 경우 의거 보상해 080-766-5252 공정거래위원회 고시 드립니다.</des><how>사용시의 1. 화장품 사용 사용부위가 붉은 반점, 이상 증상이나 부작용이 주의사항 시 또는 사용 후 부어오름 있는 경우 부위 등에는 사용을 직사광선에 또는 가려움증 전문의 등과 자제할 것 3. 의하여 등의 상담할 것. 2. 보관 및 취급 상처가 않는 곳에 보관할 것. 있는 시의 나) 고유의 노란색상이 주의할 것. 원료

**Pytorch Lightning Module Define**

In [17]:
class DonutModelPLModule(pl.LightningModule):
    def __init__(self, config, processor, model):
        super().__init__()
        self.config = config
        self.processor = processor
        self.model = model
        self.save_hyperparameters()

    def training_step(self, batch, batch_idx):
        pixel_values, labels, _ = batch
        
        outputs = self.model(pixel_values, labels=labels)
        loss = outputs.loss
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx, dataset_idx=0):
        pixel_values, labels, answers = batch
        batch_size = pixel_values.shape[0]

        val_loss = self.model(pixel_values, labels=labels).loss

        decoder_input_ids = torch.full((batch_size, 1), self.model.config.decoder_start_token_id, device=self.device)
        
        outputs = self.model.generate(pixel_values,
                                   decoder_input_ids=decoder_input_ids,
                                   max_length=CFG['MAX_LEN'],
                                   early_stopping=True,
                                   pad_token_id=self.processor.tokenizer.pad_token_id,
                                   eos_token_id=self.processor.tokenizer.eos_token_id,
                                   use_cache=True,
                                   num_beams=1,
                                   bad_words_ids=[[self.processor.tokenizer.unk_token_id]],
                                   return_dict_in_generate=True,)
    
        predictions = []
        for seq in self.processor.tokenizer.batch_decode(outputs.sequences):
            seq = seq.replace(self.processor.tokenizer.eos_token, "").replace(self.processor.tokenizer.pad_token, "")
            seq = re.sub(r"<.*?>", "", seq, count=1).strip()
            predictions.append(seq)

        scores = []
        for pred, answer in zip(predictions, answers):
            # pred = re.sub(r"(?:(?<=>) | (?=", "", answer, count=1)
            # answer = answer.replace(self.processor.tokenizer.eos_token, "")
            scores.append(edit_distance(pred, answer) / max(len(pred), len(answer)))

            if self.config.get("verbose", False) and len(scores) == 1:
                print(f"Prediction: {pred}")
                print(f"    Answer: {answer}")
                print(f" Normed ED: {scores[0]}")
        
        self.log("val_loss", val_loss)
        self.log("val_edit_distance", np.mean(scores))
        
        return pred 

    def configure_optimizers(self):

        num_total_steps = len(train_dataloader)*self.config.get('max_epochs')

        optimizer = torch.optim.AdamW(self.parameters(), lr=self.config.get("lr"), weight_decay=self.config.get("weight_decay"))
        scheduler = get_scheduler(
                        self.config.get("sch_type"),
                        optimizer=optimizer,
                        num_warmup_steps=math.ceil(num_total_steps*self.config.get("warmup_ratio")),
                        num_training_steps=num_total_steps,
                    )
        sch_config = {
            "scheduler": scheduler,
            "interval": "step",
            "name": self.config.get("sch_type") + "_scheduler",
	    }
    
        return [optimizer], [sch_config]

In [18]:
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
import gc

torch.set_float32_matmul_precision("medium")

def train(config=None):
    with wandb.init(config=config):
        config = wandb.config
        wandb_logger = WandbLogger(log_model = "all")

        training_args = {
            "max_epochs":1,
            "val_check_interval": 0.2,
            "check_val_every_n_epoch": 1,
            "log_every_n_steps": 100,
            'grad_logging_step': 1000,
            "gradient_clip_val": 1.0,
            "lr": config.learning_rate,
            "weight_decay": config.weight_decay,
            "accumulate_grad_batches": 4,
            "num_nodes": 1,
            "warmup_ratio": config.warmup_ratio,
            "es_patience": 1,
            "sch_type": config.lr_scheduler_type,
            "verbose": False,
            }

        model_module = DonutModelPLModule(training_args, processor, model_init())
        
        wandb_logger.watch(model_module, log='all', log_freq=training_args.get('grad_logging_step'), log_graph=False)
        checkpoint_callback = ModelCheckpoint(monitor="val_edit_distance", mode="min", auto_insert_metric_name=True)
        early_stop_callback = EarlyStopping(monitor="val_edit_distance", patience=training_args.get('es_patience'), verbose=False, mode="min")
        lr_monitor = LearningRateMonitor(logging_interval='step')
        log_predictions_callback = LogPredictionsCallback()

        trainer = pl.Trainer(
            accelerator="gpu",
            devices=1,
            max_epochs=training_args.get("max_epochs"),
            val_check_interval=training_args.get("val_check_interval"),
            check_val_every_n_epoch=training_args.get("check_val_every_n_epoch"),
            gradient_clip_val=training_args.get("gradient_clip_val"),
            precision=16,
            accumulate_grad_batches=training_args.get("accumulate_grad_batches"),
            num_sanity_val_steps=2,
            logger=wandb_logger,
            log_every_n_steps=training_args.get("log_every_n_steps"),
            callbacks=[early_stop_callback, lr_monitor, checkpoint_callback, log_predictions_callback],
        )

        trainer.fit(model_module, train_dataloader, val_dataloader)

        del model_module
        del wandb_logger
        del checkpoint_callback
        del early_stop_callback
        del log_predictions_callback
        del lr_monitor
        del trainer
            
        gc.collect()
        torch.cuda.empty_cache()

In [19]:
sweep_config = {
    'method': 'random',
    'metric' : {
        'name': 'val_edit_distance',
        'goal': 'minimize',   
        },
    'parameters' : {
        'learning_rate': {
            'distribution': 'log_uniform_values',
            'min': 1e-5,
            'max': 1e-4,
        },
        'weight_decay': {
            'distribution': 'q_uniform',
            'min': 0,
            'max': 1e-2,
            'q': 0.001,
        },
        'warmup_ratio':{
            'values': [0.1, 0.2]
        },
        'lr_scheduler_type':{
            'values': ['inverse_sqrt', 'cosine']
        },
    },
}

In [20]:
sweep_id = wandb.sweep(sweep_config, project=os.environ['WANDB_PROJECT'])
wandb.agent(sweep_id, train, count=CFG['SWEEP_NUM'])



Create sweep with ID: qeqll1ix
Sweep URL: https://wandb.ai/2gnldud/DL2023_t5/sweeps/qeqll1ix


[34m[1mwandb[0m: Agent Starting Run: cw6hecok with config:
[34m[1mwandb[0m: 	learning_rate: 0.00010239531115560212
[34m[1mwandb[0m: 	lr_scheduler_type: inverse_sqrt
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.008
[34m[1mwandb[0m: Currently logged in as: [33m2gnldud[0m. Use [1m`wandb login --relogin`[0m to force relogin


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at naver-clova-ix/donut-base and are newly initialized because the shapes did not match:
- decoder.model.decoder.embed_tokens.weight: found shape torch.Size([57525, 1024]) in the checkpoint and torch.Size([57539, 1024]) in the model instantiated
- decoder.lm_head.weight: found shape torch.Size([57525, 1024]) in the checkpoint and torch.Size([57539, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                      | Params
----------------------------------------------------
0 | model | VisionEncoderDecoderModel | 201 M 
------------------------------

Epoch 0:  25%|██▍       | 208/833 [01:01<03:04,  3.39it/s, v_num=ecok]Prediction: <des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><des><de

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Epoch 0:  53%|█████▎    | 439/833 [04:42<04:13,  1.55it/s, v_num=ecok]