In [1]:
from func_utils.plot_utils import show_image
import matplotlib.pyplot as plt 
from glob import glob
import pandas as pd 
import numpy as np 
import json
import os 

import torch 
from func_utils.pydataloader import SynthDogDataset
from func_utils.trainer_utils import *
from encoder_decoder_model import init_dit_mbert_models_fixed, init_dit_dbart_models, print_model_layer_sizes, load_pretrained_enc_dec_model

import wandb
import gc

torch.cuda.empty_cache()
gc.collect()
wandb.login()

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Currently logged in as: [33mbeasted90[0m ([33mbeasted90-comudel[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
def get_synth_images_json_path(data_root= os.path.join('synthdog','outputs'), split='train'):
    ipath = os.path.join(data_root, '*', split, '*.jpg')
    json_path = os.path.join(data_root, '*', split, 'metadata.jsonl')

    return glob(ipath), glob(json_path)


torch.cuda.empty_cache()

root_path = os.path.join('synthdog', 'outputs_ol')
train_ipath, train_json_metadata = get_synth_images_json_path(data_root=root_path, split='train')
val_ipath, val_json_metadata = get_synth_images_json_path(data_root=root_path, split='validation')
test_ipath, test_json_metadata = get_synth_images_json_path(data_root=root_path, split='test')
processor, text_tokenizer = init_dit_dbart_models(load_model=False)
# model.gradient_checkpointing_enable()

peak_mem = torch.cuda.max_memory_allocated()
print(f"The model as is is holding: {peak_mem / 1024**3:.2f} of GPU RAM")

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


The model as is is holding: 0.00 of GPU RAM


In [3]:
text = 'ÁGUA É ESSENCIAL PARA A COMPREENSÃO E AÇÃO; CÂNCER, ÓRGÃOS, EMOÇÃO, TÊM INFLUÊNCIA, E ÍNDICES MOSTRAM EVOLUÇÃO.'
text2 = "água é essencial para a compreensão e ação; câncer, órgãos, emoção, têm influência, e índices mostram evolução."

text_tokenizer.decode(text_tokenizer(text).input_ids)

'<s> ÁGUA É ESSENCIAL PARA A COMPREENSÃ O E AÇÃ O; CÂ NCER, ÓRGÃ OS, EMOÇÃ O, TÊ M INFLUÊ NCIA, E ÍNDICES MOSTRAM EVOLUÇÃ O.</s>'

In [4]:
run_name = "dtesting"
wandb.init(project="ocr model", name=run_name)

In [5]:
max_token_size = 1056
sample_size = 32
train_synthdataset = SynthDogDataset(train_ipath, train_json_metadata, image_feature_extractor=processor, 
                                     text_tokenizer=text_tokenizer, max_token_size=max_token_size, sample_size=sample_size)
val_synthdataset = SynthDogDataset(val_ipath, val_json_metadata, image_feature_extractor=processor, 
                                   text_tokenizer=text_tokenizer, max_token_size=max_token_size, sample_size=4)
# test_synthdataset = SynthDogDataset(test_ipath, test_json_metadata, image_feature_extractor=processor, 
#                                     text_tokenizer=text_tokenizer, max_token_size=max_token_size, sample_size=20)

['synthdog\\outputs_ol\\SynthDoG_en\\train\\image_0.jpg', 'synthdog\\outputs_ol\\SynthDoG_en\\train\\image_1.jpg']
Sampled lang counter: {'en': 16, 'pt': 16}
Length of _.images: 32 | Length of _.json_metadata: 32011
['synthdog\\outputs_ol\\SynthDoG_en\\validation\\image_10007.jpg', 'synthdog\\outputs_ol\\SynthDoG_en\\validation\\image_10017.jpg']
Sampled lang counter: {'pt': 2, 'en': 2}
Length of _.images: 4 | Length of _.json_metadata: 4008


In [6]:
# r=32
# alpha=r*2
# dropout=0.3
# target_modules = [
#         "q_proj", "k_proj", "v_proj", "out_proj",
# ]
# modules_to_save = None

num_epochs = 20
training_args = Seq2SeqTrainingArguments(
        output_dir=f"./{run_name}",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=1,
        learning_rate=5e-4,  
        lr_scheduler_type="cosine",
        num_train_epochs=num_epochs,
        warmup_ratio=0.1,  
        logging_steps=10,
        logging_strategy="steps",
        save_total_limit=3,
        fp16=False,
        max_grad_norm=10,  
        weight_decay=0.01,
        
        dataloader_pin_memory=False,
        predict_with_generate=True,
        generation_max_length=max_token_size,
        generation_num_beams=6,
        report_to=["wandb"],
        run_name=run_name,
        save_safetensors=False,

        eval_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model="eval_loss",
        load_best_model_at_end=True,  
        greater_is_better=False,
        )

In [7]:
image_processor, text_tokenizer = init_dit_dbart_models(load_model=False)
decoder = "naver-clova-ix/donut-base"
ckpt_path = 'saved_models\mydit_dbart'
ovmodel = load_pretrained_enc_dec_model(ckpt_path, base_encoder_model=None, 
                                        base_decoder_model=decoder, 
                                        lora_applied=False, 
                                        new_tokens=['Ã', 'Ê', 'Â']
                                    )

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


Loaded the pre-trained model successfully...


In [8]:
text_tokenizer

XLMRobertaTokenizerFast(name_or_path='naver-clova-ix/donut-base', vocab_size=57522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['<s_iitcdip>', '<s_synthdog>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	57521: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True

In [9]:
ovmodel

VisionEncoderDecoderModel(
  (encoder): BeitModel(
    (embeddings): BeitEmbeddings(
      (patch_embeddings): BeitPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): BeitEncoder(
      (layer): ModuleList(
        (0): BeitLayer(
          (attention): BeitAttention(
            (attention): BeitSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): BeitSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): BeitIntermediate(
            (dense): Linear(in_

In [10]:
ovmodel.config.max_length = max_token_size
ovmodel.config.decoder.max_length = max_token_size
# dropout = 0.2
ovmodel.config.min_length = 1
ovmodel.config.decoder.min_length = 1
ovmodel.config.no_repeat_ngram_size = 0
ovmodel.config.repetition_penalty = 1.2
ovmodel.config.length_penalty = 1.0 
ovmodel.config.early_stopping = True
ovmodel.config.num_beams = 6
ovmodel.config.use_cache = False  
ovmodel.config.is_encoder_decoder = True
ovmodel.config.do_sample = False  
ovmodel.config.tie_word_embeddings = True
print_trainable_prams(ovmodel)

✅ Trainable: encoder.embeddings.cls_token
✅ Trainable: encoder.embeddings.position_embeddings
✅ Trainable: encoder.embeddings.patch_embeddings.projection.weight
✅ Trainable: encoder.embeddings.patch_embeddings.projection.bias
✅ Trainable: encoder.encoder.layer.0.lambda_1
✅ Trainable: encoder.encoder.layer.0.lambda_2
✅ Trainable: encoder.encoder.layer.0.attention.attention.query.weight
✅ Trainable: encoder.encoder.layer.0.attention.attention.query.bias
✅ Trainable: encoder.encoder.layer.0.attention.attention.key.weight
✅ Trainable: encoder.encoder.layer.0.attention.attention.value.weight
✅ Trainable: encoder.encoder.layer.0.attention.attention.value.bias
✅ Trainable: encoder.encoder.layer.0.attention.output.dense.weight
✅ Trainable: encoder.encoder.layer.0.attention.output.dense.bias
✅ Trainable: encoder.encoder.layer.0.intermediate.dense.weight
✅ Trainable: encoder.encoder.layer.0.intermediate.dense.bias
✅ Trainable: encoder.encoder.layer.0.output.dense.weight
✅ Trainable: encoder.enco

In [11]:
ovmodel.config.decoder.max_position_embeddings

1536

In [12]:
text_tokenizer.bos_token_id

0

In [13]:
ovmodel.config.vocab_size, ovmodel.config.decoder.vocab_size, len(text_tokenizer)

(57528, 57528, 57528)

In [23]:
text_tokenizer

XLMRobertaTokenizerFast(name_or_path='naver-clova-ix/donut-base', vocab_size=57522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['<s_iitcdip>', '<s_synthdog>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	57521: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True

In [25]:
ovmodel.config.max_length

1056

In [26]:
print(ovmodel.decoder.get_input_embeddings().weight.shape)

torch.Size([57528, 1024])


In [27]:
ovmodel.decoder.lm_head

Linear(in_features=1024, out_features=57528, bias=False)

In [25]:
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10, 
)
trainer = setup_dit_bart_training(
        train_synthdataset, val_synthdataset, training_args=training_args, model=ovmodel, text_tokenizer=text_tokenizer,
        run_name = run_name, 
        callbacks=[early_stopping_callback]
    )

  trainer = Seq2SeqTrainer(


In [26]:
history = trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 0}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss,Bleu,Pred Intersect Labels
1,10.319,9.81922,0.0,0.00464
2,7.4842,7.685655,0.0,0.025
3,6.8435,7.306023,0.0,0.0
4,6.6315,7.308141,0.0,0.075
5,6.5865,7.191164,0.0,0.175
6,6.4501,7.352375,0.0,0.175
7,6.1878,7.256555,0.0,0.166606
8,6.2733,7.181708,0.0,0.083333
9,6.191,7.287179,0.0,0.041667
10,6.1747,7.250978,0.0,0.168333




KeyboardInterrupt: 

In [32]:
import numpy as np 

ovmodel.eval()
vind = np.random.randint(0, len(train_synthdataset))
sample = train_synthdataset[vind]  
inputs = sample["pixel_values"].unsqueeze(0).to(ovmodel.device)
image = sample['image']
text = sample['text']
output_ids = ovmodel.generate(inputs, max_length=100, num_beams=6)
prediction = text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Prediction:", prediction)
print("Ground Truth:", sample["text"])

Prediction: des des des
Ground Truth: outras pessoas, sejam materiais ou corporais. O pagamento poderá ser feito diretamente ao terceiro


In [None]:
from transformers import VisionEncoderDecoderModel, AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("naver-clova-ix/donut-base")
tmodel = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")
tmodel.eval()
vind = np.random.randint(0, len(train_synthdataset))
sample = train_synthdataset[vind]  
inputs = processor(sample['image'],return_tensors="pt").pixel_values.to(tmodel.device)
image = sample['image']
text = sample['text']
text

'called the "n"-isomer . However the'

In [2]:
import matplotlib.pyplot as plt 
plt.imshow(image)

NameError: name 'image' is not defined

In [47]:
output_ids = tmodel.generate(inputs, max_length=100, num_beams=6)
prediction = text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Prediction:", prediction)
print("Ground Truth:", sample["text"])

Prediction: 
Ground Truth: called the "n"-isomer . However the


In [48]:
inputs = sample["pixel_values"].unsqueeze(0).to(ovmodel.device)
output_ids = ovmodel.generate(inputs, max_length=100, num_beams=6)
prediction = text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Prediction:", prediction)
print("Ground Truth:", sample["text"])

Prediction: thess,
Ground Truth: called the "n"-isomer . However the
