In [5]:
from func_utils.plot_utils import show_image
import matplotlib.pyplot as plt 
from glob import glob
import pandas as pd 
import numpy as np 
import json
import os 

import torch 
from func_utils.pydataloader import SynthDogDataset
from func_utils.trainer_utils import *
from encoder_decoder_model import init_dit_t5_models_fixed

import wandb
import gc

torch.cuda.empty_cache()
gc.collect()
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbeasted90[0m ([33mbeasted90-comudel[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
def get_synth_images_json_path(data_root= os.path.join('synthdog','outputs'), split='train'):
    ipath = os.path.join(data_root, '*', split, '*.jpg')
    json_path = os.path.join(data_root, '*', split, 'metadata.jsonl')

    return glob(ipath), glob(json_path)


torch.cuda.empty_cache()

root_path = os.path.join('synthdog', 'outputs_ol')
train_ipath, train_json_metadata = get_synth_images_json_path(data_root=root_path, split='train')
val_ipath, val_json_metadata = get_synth_images_json_path(data_root=root_path, split='validation')
test_ipath, test_json_metadata = get_synth_images_json_path(data_root=root_path, split='test')
processor, text_tokenizer, _ = init_dit_t5_models_fixed()
# model.gradient_checkpointing_enable()

peak_mem = torch.cuda.max_memory_allocated()
print(f"The model as is is holding: {peak_mem / 1024**3:.2f} of GPU RAM")

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


The model as is is holding: 0.00 of GPU RAM


In [3]:
run_name = "dit_t5_raw_full_model_unfreeze"
wandb.init(project="ocr model", name=run_name)

In [4]:
max_token_size = 512
sample_size = 1000
train_synthdataset = SynthDogDataset(train_ipath, train_json_metadata, image_feature_extractor=processor, 
                                     text_tokenizer=text_tokenizer, max_token_size=max_token_size, sample_size=sample_size)
val_synthdataset = SynthDogDataset(val_ipath, val_json_metadata, image_feature_extractor=processor, 
                                   text_tokenizer=text_tokenizer, max_token_size=max_token_size, sample_size=20)
test_synthdataset = SynthDogDataset(test_ipath, test_json_metadata, image_feature_extractor=processor, 
                                    text_tokenizer=text_tokenizer, max_token_size=max_token_size, sample_size=20)

['synthdog\\outputs_ol\\SynthDoG_en\\train\\image_0.jpg', 'synthdog\\outputs_ol\\SynthDoG_en\\train\\image_1.jpg']
Sampled lang counter: {'pt': 500, 'en': 500}
Length of _.images: 1000 | Length of _.json_metadata: 32011
['synthdog\\outputs_ol\\SynthDoG_en\\validation\\image_10007.jpg', 'synthdog\\outputs_ol\\SynthDoG_en\\validation\\image_10017.jpg']
Sampled lang counter: {'pt': 10, 'en': 10}
Length of _.images: 20 | Length of _.json_metadata: 4008
['synthdog\\outputs_ol\\SynthDoG_en\\test\\image_10.jpg', 'synthdog\\outputs_ol\\SynthDoG_en\\test\\image_10003.jpg']
Sampled lang counter: {'pt': 10, 'en': 10}
Length of _.images: 20 | Length of _.json_metadata: 3978


In [5]:
r=32
alpha=r*2
dropout=0.3
target_modules = [
        "q_proj", "k_proj", "v_proj", "out_proj",
        # "fc1", "fc2"
]
# modules_to_save = ["embed_tokens", "lm_head"]
modules_to_save = None

num_epochs = 500
training_args = Seq2SeqTrainingArguments(
        output_dir=f"./{run_name}",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=1,
        learning_rate=9e-5,  
        lr_scheduler_type="cosine",
        num_train_epochs=num_epochs,
        warmup_ratio=0.1,  
        logging_steps=50,
        # save_steps=50,
        # eval_steps=50,
        logging_strategy="steps",
        save_total_limit=3,
        fp16=False,
        max_grad_norm=10,  
        weight_decay=0.01,
        
        dataloader_pin_memory=False,
        predict_with_generate=True,
        generation_max_length=512,
        generation_num_beams=6,
        report_to=["wandb"],
        run_name=run_name,
        save_safetensors=False,

        eval_strategy="epoch",
        save_strategy="epoch",
        metric_for_best_model="eval_loss",
        load_best_model_at_end=True,  
        greater_is_better=False,

        # label_smoothing_factor=0.1
        )

In [6]:
image_processor, text_tokenizer, ovmodel = init_dit_t5_models_fixed()
# ovmodel = add_lora_to_decoder(ovmodel, r=r, alpha=alpha, dropout=dropout, target_modules=target_modules, modules_to_save=modules_to_save)
# ovmodel = unfreeze_all_params(ovmodel, unfreeze_encoder=False, unfreeze_decoder=True)
# ovmodel = unfreeze_last_n_encoder(ovmodel, unfreeze_last_n_layer_block=1, unfreeze_attention_layers=True,skip_encoder=True, skip_decoder=True)
# ovmodel = freeze_encoder_unfreeze_decoder(ovmodel, applied_lora=True)

ovmodel.add_cross_attention = True
ovmodel.config.max_length = max_token_size
ovmodel.config.decoder.max_length = max_token_size
ovmodel.config.min_length = 1
ovmodel.config.decoder.min_length = 1
ovmodel.config.no_repeat_ngram_size = 0
ovmodel.config.repetition_penalty = 1.5
ovmodel.config.length_penalty = 1.0 
ovmodel.config.early_stopping = True
ovmodel.config.num_beams = 6
ovmodel.config.use_cache = False  
ovmodel.config.is_encoder_decoder = True
ovmodel.config.do_sample = False  
ovmodel.config.tie_word_embeddings = True
ovmodel.config.decoder.dropout = dropout
ovmodel.config.decoder.attention_dropout = 0.2
ovmodel.config.decoder.decoder_layerdrop = 0.15
print_trainable_prams(ovmodel)

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


✅ Trainable: encoder.embeddings.cls_token
✅ Trainable: encoder.embeddings.position_embeddings
✅ Trainable: encoder.embeddings.patch_embeddings.projection.weight
✅ Trainable: encoder.embeddings.patch_embeddings.projection.bias
✅ Trainable: encoder.encoder.layer.0.lambda_1
✅ Trainable: encoder.encoder.layer.0.lambda_2
✅ Trainable: encoder.encoder.layer.0.attention.attention.query.weight
✅ Trainable: encoder.encoder.layer.0.attention.attention.query.bias
✅ Trainable: encoder.encoder.layer.0.attention.attention.key.weight
✅ Trainable: encoder.encoder.layer.0.attention.attention.value.weight
✅ Trainable: encoder.encoder.layer.0.attention.attention.value.bias
✅ Trainable: encoder.encoder.layer.0.attention.output.dense.weight
✅ Trainable: encoder.encoder.layer.0.attention.output.dense.bias
✅ Trainable: encoder.encoder.layer.0.intermediate.dense.weight
✅ Trainable: encoder.encoder.layer.0.intermediate.dense.bias
✅ Trainable: encoder.encoder.layer.0.output.dense.weight
✅ Trainable: encoder.enco

In [7]:
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10, 
)
trainer = setup_dit_bart_training(
        train_synthdataset, val_synthdataset, training_args=training_args, model=ovmodel, text_tokenizer=text_tokenizer,
        run_name = run_name, 
        callbacks=[early_stopping_callback]
    )

  trainer = Seq2SeqTrainer(


In [None]:
from transformers import T5ForConditionalGeneration

tmodel = T5ForConditionalGeneration.from_pretrained('t5-base')
tmodel.forward

['__call__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__func__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__self__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [8]:
history = trainer.train()

TypeError: T5ForConditionalGeneration.forward() got an unexpected keyword argument 'encoder_hidden_states'

In [None]:
wandb.finish()

In [None]:
model.eval()
vind = np.random.randint(0, len(train_synthdataset))
sample = train_synthdataset[vind]  
inputs = sample["pixel_values"].unsqueeze(0).to(model.device)
image = sample['image']
text = sample['text']
output_ids = model.generate(inputs, max_length=100, num_beams=6)
prediction = text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Prediction:", prediction)
print("Ground Truth:", sample["text"])