In [1]:
import numpy as np

import torch
import pytorch_lightning as pl
from transformers import Dinov2Config

from dataset import ImageCaptionDataset, ImageTextCollator
from model import (
    load_tokenizer,
    load_dinov2_image_encoder,
    load_lstm_text_encoder,
    load_show_and_tell,
    load_lightning_model,
)
from utils import (
    read_yaml,
    get_split_config,
    get_dataset_config,
    get_image_model_config,
    get_tokenizer_config,
    get_text_model_config,
    get_model_config,
    get_trainer_config,
    get_show_and_tell_model_config,
)

In [2]:
config = read_yaml("./configs/train_config.yaml")
_, (train_split_config, val_split_config, _) = get_split_config(config)
config.keys()

dict_keys(['dataset_config', 'tokenizer_config', 'model_config', 'trainer_config'])

In [3]:
train_dataset_path, train_sampling_config, train_dataset_config = get_dataset_config(train_split_config)
if train_sampling_config:
    sampling_fn_name, sampling_fn_args = (
        train_sampling_config.pop("sampling_fn_name", None),
        train_sampling_config,
    )
train_dataset_path, train_dataset_config, sampling_fn_name, sampling_fn_args

('./coco-2014/dataset.json',
 {'return_dict': True,
  'padding': 'longest',
  'image_size': [518, 518],
  'max_length': 512},
 'choose_index',
 {'index': 0})

In [4]:
val_dataset_path, val_sampling_config, val_dataset_config = get_dataset_config(
    val_split_config
)

if val_sampling_config:
    val_sampling_fn_name, val_sampling_fn_args = (
        val_sampling_config.pop("sampling_fn_name", None),
        val_sampling_config,
    )

In [6]:
tokenizer_name, tokenizer_path, tokenizer_config = get_tokenizer_config(config)
tokenizer_name, tokenizer_path, tokenizer_config

('gpt2', 'openai-community/gpt2', {})

In [7]:
tokenizer = load_tokenizer(
    tokenizer_name=tokenizer_name,
    tokenizer_path=tokenizer_path,
    tokenizer_config=tokenizer_config,
)
# tokenizer.add_bos_token('<|startoftext|>')
tokenizer.vocab_size, tokenizer.special_tokens_map

(50257,
 {'bos_token': '<|startoftext|>',
  'eos_token': '<|endoftext|>',
  'unk_token': '<|endoftext|>',
  'pad_token': '<|pad|>'})

In [8]:
train_dataset = ImageCaptionDataset(
    tokenizer=tokenizer,
    dataset_path=train_dataset_path,
    sampling_fn=sampling_fn_name,
    sampling_fn_args=sampling_fn_args,
    return_tensors=None,
    **train_dataset_config,
)

ValueError: Sampling function 'None' should be either a callable type or str, found type <class 'NoneType'>.

In [8]:
batch = next(iter(train_dataset))
batch["pixel_values"].shape, len(batch["input_ids"])

((3, 518, 518), 14)

# Model dry run


In [9]:
model_config = get_model_config(config)
model_config

{'showandtell_model': {'model_name': 'showandtell', 'model_path': None},
 'image_model': {'model_name': 'dinov2',
  'model_path': './weights/dinov2-base-weights.pth',
  'freeze': True,
  'config': {'hidden_size': 768, 'image_size': 518, 'patch_size': 14}},
 'text_model': {'model_name': 'lstm',
  'model_path': None,
  'config': {'num_layers': 1, 'hidden_size': 768, 'bidirectional': False}}}

## Image Model


In [10]:
image_model_name, image_model_path, freeze, image_model_config = get_image_model_config(
    model_config
)
image_model_name, image_model_path, image_model_config, freeze

('dinov2',
 './weights/dinov2-base-weights.pth',
 {'hidden_size': 768, 'image_size': 518, 'patch_size': 14},
 True)

In [11]:
dinov2_config = Dinov2Config(**image_model_config)
image_encoder = load_dinov2_image_encoder(dinov2_config, freeze, image_model_path)
dinov2_config.image_size

  model.encoder.load_state_dict(torch.load(model_path))


518

In [12]:
# outs = image_encoder(batch['pixel_values'])
# outs[0].shape, outs[1].shape

## Text Model


In [13]:
text_model_name, text_model_path, text_model_config = get_text_model_config(
    model_config
)
text_model_name, text_model_path, text_model_config

('lstm', None, {'num_layers': 1, 'hidden_size': 768, 'bidirectional': False})

In [14]:
text_encoder = load_lstm_text_encoder(
    len(tokenizer), pretrained_model_path=text_model_path, **text_model_config
)

# ShowAndTell Core


In [15]:
showtell_core_model_name, showtell_core_model_path, showtell_core_config = (
    get_show_and_tell_model_config(model_config)
)
showtell_core_model_name, showtell_core_model_path, showtell_core_config

('showandtell', None, {})

In [16]:
showtell_core = load_show_and_tell(
    tokenizer,
    image_encoder,
    text_encoder,
    pretrained_model_path=showtell_core_model_path,
)
showtell_core.vocab_size, showtell_core.tokenizer.bos_token_id, batch.keys(), batch[
    "input_ids"
]

(50259,
 50258,
 dict_keys(['pixel_values', 'input_ids']),
 [50258,
  1969,
  929,
  286,
  41701,
  286,
  2057,
  326,
  2291,
  44653,
  290,
  8509,
  220,
  50256])

# Lightning Model


In [18]:
model = load_lightning_model(tokenizer, showtell_core)

# Dry Run Train


In [19]:
trainer_config, batch_size, logger_config, logger_name = get_trainer_config(config)
trainer_config, batch_size, logger_config, logger_name

({'accelerator': 'gpu',
  'max_epochs': 2,
  'log_every_n_steps': 1,
  'enable_progress_bar': True,
  'overfit_batches': 1},
 10,
 {'save_dir': './TrainingLogs/',
  'name': 'test1',
  'log_graph': True,
  'prefix': ''},
 'tensorboard')

In [20]:
# Dataloader
image_text_collator = ImageTextCollator(
    tokenizer,
    padding=train_dataset_config.get("padding", True),
    return_tensors=train_dataset_config.get("return_tensors", "pt"),
)
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, collate_fn=image_text_collator
)
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([10, 14]),
 'attention_mask': torch.Size([10, 14]),
 'pixel_values': torch.Size([10, 3, 518, 518])}

In [21]:
logger_name = logger_config.pop
train_logger = pl.loggers.TensorBoardLogger(**logger_config)
trainer = pl.Trainer(logger=train_logger, **trainer_config)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(overfit_batches=1)` was configured so 1 batch will be used.


In [22]:
trainer.fit(model, train_dataloaders=train_dataloader)

/home/harsh/anaconda3/envs/DL/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | showandtell_core | ShowAndTell      | 168 M  | train
1 | criterion        | CrossEntropyLoss | 0      | train
--------------------------------------------------------------
82.0 M    Trainable params
86.6 M    Non-trainable params
168 M     Total params
674.213

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


In [23]:
# Dataloader
image_text_collator = ImageTextCollator(
    tokenizer,
    padding=train_dataset_config.get("padding", True),
    return_tensors=train_dataset_config.get("return_tensors", "pt"),
)
val_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=1, collate_fn=image_text_collator
)
batch = next(iter(val_dataloader))
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([1, 14]),
 'attention_mask': torch.Size([1, 14]),
 'pixel_values': torch.Size([1, 3, 518, 518])}

In [24]:
# batch = next(iter(val_dataloader))
# pixel_values, labels = batch["pixel_values"], batch["input_ids"][:, 1:]
# label_sequence = list(
#     map(
#         "".join,
#         list(map(tokenizer.batch_decode, labels.detach().cpu().numpy().tolist())),
#     )
# )

# logits, loss = model._step(pixel_values, labels)
# prediction = tokenizer.batch_decode(logits.argmax(-1))
# print(prediction, label_sequence)

# from model import calculate_bleu

# bleu = calculate_bleu(prediction, label_sequence)
# print(bleu)

In [25]:
trainer.validate(model, dataloaders=val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/harsh/anaconda3/envs/DL/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=21` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 9.491268157958984, 'bleu_score': 0.8528028726577759}]