In [14]:
import os
import pickle

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import wandb
from transformers import Dinov2Config

from dataset import ImageCaptionDataset, Vocab
from model import Dinov2Encoder, TextEncoder, ShowAndTell, Model, to_device

In [15]:
# Restore the pickled vocabulary object and show how many tokens it holds.
# NOTE: unpickling executes arbitrary code -- only load vocab files you created yourself.
with open('./coco-2014/vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

vocab.size

924

In [16]:
# Build the image/caption dataset from the COCO-2014 JSON index, sharing the loaded vocab,
# then display the object, its length and the vocabulary size as a quick sanity check.
dataset = ImageCaptionDataset(dataset_path="./coco-2014/dataset.json", vocab=vocab)
dataset, len(dataset), vocab.size

(<dataset.ImageCaptionDataset at 0x70af83d9c730>, 100, 924)

In [17]:
# Image side: DINOv2 encoder built from a patch-size-14 config and local weights,
# constructed with freeze=True.
config = Dinov2Config(patch_size=14)
image_encoder = Dinov2Encoder(
    config=config,
    dinov2_weights_path="./dinov2-base-weights.pth",
    freeze=True,
)

# Text side: encoder sized to the vocabulary.
text_encoder = TextEncoder(vocab_size=vocab.size)

# Combine both encoders into the captioning core and move it to the target device.
showtell_core = to_device(ShowAndTell(vocab, image_encoder, text_encoder))

In [18]:
# Wrap the dataset in a DataLoader (no extra arguments, so PyTorch defaults apply)
# and pull one batch to inspect.
dataloader = torch.utils.data.DataLoader(dataset)
batch = next(iter(dataloader))
# A batch unpacks as `image, tokens, (image_path, image_id)`; the two tensors
# still have to go through `to_device(...)` before being fed to the model.

In [19]:
# Wrap the captioning core in `Model`, the object that is later handed to `trainer.fit`.
model = Model(vocab=vocab, showtell_core=showtell_core)

In [20]:
# Experiment tracking: start one wandb run and attach the Lightning logger to it.
project_name = "ShowAndTell"
run_name = "full_coverage"

# Use `project_name` rather than repeating the literal so the run, the logger and the
# saved weight file name can never disagree.
run = wandb.init(name=run_name, project=project_name)

# `WandbLogger` takes an already-created run through its `experiment` argument.
# (`run=` is not a logger parameter: it was only forwarded as a stray kwarg and the
# logger fell back to the global run with a "run already in progress" warning.)
wandb_logger = pl.loggers.WandbLogger(
    name=run_name,
    project=project_name,
    experiment=run,
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhmankodiya[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [21]:
# Train for ten epochs on GPU index 0, reporting metrics through the wandb logger.
trainer = pl.Trainer(
    max_epochs=10,
    overfit_batches=0,
    accelerator="gpu",
    devices=[0],
    logger=[wandb_logger],
)
trainer.fit(model, train_dataloaders=dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/harsh/anaconda3/envs/DL/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | showandtell_core | ShowAndTell      | 92.7 M | train
1

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
# Persist the trained core's weights locally and upload them to the wandb run as an artifact.
# The path is built once so the saved file and the uploaded file cannot diverge.
weights_path = f'./weights/{run_name}-{project_name}.pth'

# `torch.save` does not create missing parent directories, so make sure it exists first.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(showtell_core.state_dict(), weights_path)

model_artifact = wandb.Artifact(name='weights', type='model')
model_artifact.add_file(weights_path)
run.log_artifact(model_artifact)

In [22]:
# Close the wandb run; this prints the run history and summary shown in the output below.
run.finish()

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▅▄▃▂▂▁▂▁▁
trainer/global_step,▁▂▃▃▄▅▆▆▇█

0,1
epoch,9.0
train_loss,3.25975
trainer/global_step,999.0


# -------------------------------------------------------------------------------

In [9]:
# Grab one sample for a qualitative check: unpack the batch, move both tensors to the
# model's device, and show the image tensor's shape and device.
batch = next(iter(dataloader))
image, tokens, (image_path, image_id) = batch
image, tokens = map(to_device, (image, tokens))
image.shape, image.device

(torch.Size([1, 3, 480, 640]), device(type='cuda', index=0))

In [10]:
# Generate a caption for the sampled image without teacher forcing.
showtell_core = to_device(showtell_core)

# Switch to eval mode before inference so any train-only layer behaviour
# (e.g. dropout / batch-norm statistics updates) is disabled.
showtell_core.eval()

with torch.no_grad():  # no gradients needed for generation
    logits = showtell_core(image, teacher_forcing=False)

# Pick the highest-scoring token at each step and drop the leading batch axis.
out_tokens = logits.argmax(-1).detach().cpu().squeeze(0).numpy()
logits.shape, out_tokens.shape

(torch.Size([1, 12, 924]), (12,))

In [11]:
# Show the predicted token ids next to the ground-truth ids for the sampled batch.
out_tokens, tokens

(array([  0, 154, 521,  78, 521, 289, 803, 389, 101,  22,  97,   1]),
 tensor([[  0, 154, 521,  78, 521, 289, 803, 389, 101,  22,  97,   1]],
        device='cuda:0'))

In [12]:
# Decode both id sequences back to text and print ground truth above prediction.
gt_caption = vocab.decode_indexes(tokens.detach().cpu().squeeze(0).numpy())
print(f'GT {gt_caption}')
pred_caption = vocab.decode_indexes(out_tokens)
print(f'Pred {pred_caption}')

GT <start> closeup of bins of food that include broccoli and bread <end>
Pred <start> closeup of bins of food that include broccoli and bread <end>


In [13]:
# Decode ten random 12-token id sequences to see what arbitrary output looks like.
# The upper bound comes from the vocabulary itself instead of a hard-coded 924, so the
# cell keeps producing valid ids if the vocabulary is rebuilt with a different size.
temp = np.random.randint(low=0, high=vocab.size, size=(10, 12))
[vocab.decode_indexes(row) for row in temp]

['jetliner hillside carryout los scissors savanna include they yellow selfie grass leaves',
 'assorted outdoors pomeranian horse clear fry stove lamppost from distance sunny salad',
 'cup mostly taxiing backs ramp without fliers graffiti porch door split mostly',
 'passenger signs lamps spoon walking coming graffiti knife still <start> tennis their',
 'view laptops curve boys ancient living oval pizza soccer filled that hind',
 'boats doubles elephant riding loaded groups tiny curious ramp chewing motorcycle bear',
 'calves skier seen jumping smaller split neck photo body tips pasta standby',
 'soldiers long pictures shoe orange kind laptop video desktop jelly eating traveler',
 'canopy boxes fast school make peppers toppings waterfront few peanut bushy runway',
 'and pub shows together lots kickstand river stove show horse closeup turned']