In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os

import torch
from llava.mm_utils import process_images
from tqdm import tqdm
from transformers import AutoTokenizer

# Add the src directory to the Python path. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
SRC_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "src"))
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from dataset.dataset import DatasetConfig, build_dataloader
from dataset.processor import Processor
from model.model import VisionLanguageModel
from utils.train_utils import ExperimentConfig


import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
from hydra.core.config_store import ConfigStore

# Custom "ifel" resolver: returns `val_true` when `flag` is truthy, else `val_false`.
# replace=True keeps this cell re-runnable; without it, executing the cell a second
# time in the same kernel fails because the resolver name is already registered.
OmegaConf.register_new_resolver(
    "ifel",
    lambda flag, val_true, val_false: val_true if flag else val_false,
    replace=True,
)

In [None]:
# Register the structured config nodes with Hydra's ConfigStore, then compose
# the "train" config from ../conf with the local-test experiment overrides.
cs = ConfigStore.instance()
for store_kwargs in (
    {"name": "ExperimentConfig", "node": ExperimentConfig},
    {"name": "DatasetConfig", "group": "dataset", "node": DatasetConfig},
):
    cs.store(**store_kwargs)
# OmegaConf.register_new_resolver("models_dir", lambda: MODELS_DIR)

hydra_overrides = ["+experiment=train_local_test", "main_dir='..'"]

with initialize(version_base=None, config_path="../conf"):
    config = compose(config_name="train", overrides=hydra_overrides)
    # Dump the composed config as YAML for inspection
    print(OmegaConf.to_yaml(config))

In [None]:
# Paths to the COCO images/annotations and the pretrained model identifier
model_name = "lmms-lab/llava-onevision-qwen2-0.5b-si"
train_data_dir = "../data/coco/images/train2017"
train_annotation_file = "../data/coco/annotations/instances_train2017.json"

# Number of image tokens passed to the dataloader
num_img_tokens = 729
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Small 10-sample subset built with is_train=False (val_dataset behaviour),
# using a fixed batch size of 2 instead of config.batch_size.
dataloader_kwargs = dict(
    config=config,
    dataset_config=config.train_dataset,
    batch_size=2,
    is_train=False,
    num_workers=config.num_workers,
    # image_size=config.transform.image_size,
    num_image_tokens=num_img_tokens,
    subset_size=10,
    # use_random_subset=True,
)
dataloader = build_dataloader(**dataloader_kwargs)

In [None]:
# Take the first batch from the dataloader; the cells below inspect it.
batch_iterator = iter(dataloader)
batch = next(batch_iterator)

# test labels for train dataset
# labels = batch["labels"][batch["labels"] != -100]
# print(tokenizer.decode(labels))

In [None]:
from utils.data_utils import show_img_with_bbox
import matplotlib.pyplot as plt
import numpy as np
import torchvision.transforms as T
from PIL import Image

# If the loader wraps a Subset-like object (anything exposing `.dataset`),
# unwrap it to reach the original dataset; otherwise use it directly.
loader_dataset = dataloader.dataset
dataset = getattr(loader_dataset, "dataset", loader_dataset)

# Lookup from category index to category name, as provided by the dataset
class_id_to_name = dataset.index_to_cat_name

# Render the batch images with their bounding boxes
axes = show_img_with_bbox(batch, dataset, figsize=(10, 10))


In [None]:
device = torch.device("cpu")

model = VisionLanguageModel(model_name, config).to(device)

# Restore the trained projector weights from a local checkpoint.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints from a trusted source (consider weights_only=True if the
# checkpoint holds nothing but tensors and plain containers).
# checkpoint_0_1740259806.pt
# checkpoint_0_1740425842.pt
state_dict = torch.load(
    "../../checkpoints-trained/last_model_hardy-energy-102.pt", map_location=device
)

# Fail with a clear message when the expected entry is missing, instead of
# silently handing None to load_state_dict (which is what `.get` would do).
if "model_state_dict" not in state_dict:
    raise KeyError(
        "Checkpoint has no 'model_state_dict' entry; available keys: "
        f"{list(state_dict.keys())}"
    )
model.projector.load_state_dict(state_dict["model_state_dict"])

In [None]:
# Decode the first sample's input ids back to text. The printoptions context
# only matters for the (commented-out) raw array dumps.
first_sample_ids = batch["input_ids"][0].numpy()
with np.printoptions(threshold=np.inf):
    # print(first_sample_ids)
    print(tokenizer.decode(first_sample_ids))
    # print(batch["attention_mask"][0].numpy())

# Decode every sample in the batch, skipping special tokens (shown as cell output)
tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)

In [None]:
# Generate bounding-box predictions with the model (pretrained projection layer loaded)
from utils.train_utils import JSONStoppingCriteria, parse_model_output_to_boxes

model.eval()
# TODO: use the val set, so info about the bboxes is not in input_ids; check if image tokens are filled with image info

# Sampling (do_sample=True) is stochastic: seed the RNG so that re-running the
# notebook reproduces the same generations.
torch.manual_seed(0)

# Inference only: disable autograd so no computation graph is kept in memory.
with torch.no_grad():
    outputs = model.generate(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        image=batch["images"].to(device),
        stopping_criteria=[JSONStoppingCriteria(model.tokenizer)],
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        top_k=100,
        # max_new_tokens=1000,
    )

# Decode predictions (special tokens are kept in the decoded text)
generated_text = model.tokenizer.batch_decode(outputs, skip_special_tokens=False)
print(generated_text)

# Parse the generated text into per-image box dictionaries
predicted_boxes = parse_model_output_to_boxes(generated_text, dataset, device)

predicted_boxes

In [None]:
# Plot predicted boxes (red) and target boxes (green) with their labels on each image
id_to_cat_name = dataset.index_to_cat_name
print(predicted_boxes)

#predicted_boxes = [{"class": [1, 32], "bbox": [[0.4879453125, 0.6142578125, 0.6474609375, 0.814453125], [0.0, 0.0, 0.99951171875, 0.9990234375]]}]


def _draw_boxes(ax, boxes, labels, color, scale=None):
    """Draw labelled bounding boxes on a matplotlib axes.

    Args:
        ax: Axes to draw on.
        boxes: Iterable of boxes given as (x_min, y_min, x_max, y_max).
        labels: Iterable of category ids exposing ``.item()``, one per box.
        color: Colour used for both the rectangle edge and the label text.
        scale: Optional ``(width, height)`` multiplied onto the x / y
            coordinates before drawing; ``None`` draws the boxes as given.
    """
    for cat, bbox in zip(labels, boxes):
        x1, y1, x2, y2 = bbox  # x_min, y_min, x_max, y_max (corner format, not YOLO's centre/size)
        if scale is not None:
            width, height = scale
            x1, y1, x2, y2 = x1 * width, y1 * height, x2 * width, y2 * height
        rect = plt.Rectangle(
            (x1, y1), x2 - x1, y2 - y1, linewidth=1, edgecolor=color, facecolor="none"
        )
        ax.add_patch(rect)

        # Resolve the name for every box individually. Previously the target loop
        # had no fallback, so an unknown id reused the name of the previous box
        # (or raised NameError on the very first one).
        cat_id = cat.item()
        class_name = id_to_cat_name[cat_id] if cat_id in id_to_cat_name else "Unknown"
        ax.text(x1, y1 - 5, class_name, fontsize=12, color=color)


for i in range(len(batch["images"])):
    fig, ax = plt.subplots()

    # Move the first axis last for imshow (presumably CHW -> HWC) and min-max
    # rescale the values into [0, 1].
    img = batch["images"][i].permute(1, 2, 0).numpy()
    img = img - img.min()
    peak = img.max()
    if peak > 0:  # a constant image would otherwise produce 0/0 = NaN
        img = img / peak
    ax.imshow(img)

    # Predictions are drawn unscaled (presumably already in pixel coordinates - TODO confirm)
    _draw_boxes(ax, predicted_boxes[i]["boxes"], predicted_boxes[i]["labels"], "red")

    # Targets are multiplied by the image width / height before drawing
    _draw_boxes(
        ax,
        batch["instance_bboxes"][i],
        batch["instance_classes_id"][i],
        "green",
        scale=(img.shape[1], img.shape[0]),
    )

    plt.show()

## Test Train Metrics

In [None]:
# Compute the train metrics for the predictions generated on the example batch
from utils.train_metrics import TrainMetrics
from utils.train_utils import unnormalize_bbox

device = "cpu"
metrics = TrainMetrics(device=device)


def _as_target(boxes, labels):
    """Build one target dict: boxes unnormalized with model.image_size, labels on `device`."""
    pixel_boxes = unnormalize_bbox(
        boxes.to(device),
        model.image_size,
        model.image_size,
    )
    return {"boxes": pixel_boxes, "labels": labels.to(device)}


# One target dict per image in the batch
target_boxes = [
    _as_target(boxes, labels)
    for boxes, labels in zip(batch["instance_bboxes"], batch["instance_classes_id"])
]

metrics.update(
    predicted_boxes=predicted_boxes,
    target_boxes=target_boxes,
    target_texts=batch["bbox_str"],
    generated_text=generated_text,
)
metrics.compute()

In [None]:
def _detections(boxes, labels):
    """Build a detection dict from hand-written boxes/labels, with a score of 1.0 per box."""
    return {
        "boxes": torch.tensor(boxes),
        "labels": torch.tensor(labels),
        "scores": torch.ones(len(labels)),
    }


# Hand-crafted predictions: image 0 has one shifted box, image 1 matches its target exactly
pred_boxes_test = [
    _detections(
        [[183.7680, 92.0112, 332.6478, 226.5556], [169.6680, 5.6480, 324.7800, 377.7072]],
        [17, 70],
    ),
    _detections(
        [[305.1480, 223.1204, 320.6880, 235.3108], [73.9140, 37.1321, 219.9680, 262.9113]],
        [15, 1],
    ),
]

# Hand-crafted targets (image 0 lists its boxes in the opposite order to the predictions)
target_boxes_test = [
    _detections(
        [[169.6680, 5.6480, 324.7800, 377.7072], [225.7680, 92.0112, 332.6478, 226.5556]],
        [70, 17],
    ),
    _detections(
        [[305.1480, 223.1204, 320.6880, 235.3108], [73.9140, 37.1321, 219.9680, 262.9113]],
        [15, 1],
    ),
]

# Fresh metrics object so the result is independent of earlier updates;
# the text inputs are reused from the real batch / generation.
test_metrics = TrainMetrics(device=device)
test_metrics.update(
    predicted_boxes=pred_boxes_test,
    target_boxes=target_boxes_test,
    target_texts=batch["bbox_str"],
    generated_text=generated_text,
)
test_metrics.compute()

## Miscellaneous experiments

In [None]:
# Build the training dataloader from the experiment config and display its length
from utils.train_utils import build_train_dataloader

dataloader = build_train_dataloader(config, model)
num_train_batches = len(dataloader)
num_train_batches

In [None]:
# Prefer the GPU, but fall back to the CPU so the following cells also run on
# machines without CUDA (previously the device was "cuda" unconditionally and
# the check below was always true).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("Using CUDA")
else:
    print("CUDA not available, using CPU")


In [None]:
# Cast the model to bfloat16 AND move it to `device`. The inputs below are moved
# to `device`, so leaving the model where it was created (CPU) would cause a
# device mismatch as soon as `device` is a GPU.
model = model.to(device=device, dtype=torch.bfloat16)

# Sampled generation in bfloat16; the returned value is shown as the cell output
model.generate(
    input_ids=batch["input_ids"].to(device),
    attention_mask=batch["attention_mask"].to(device),
    image=batch["images"].to(device, torch.bfloat16),
    stopping_criteria=[JSONStoppingCriteria(model.tokenizer)],
    do_sample=True,
    temperature=0.3,
    top_p=0.9,
    top_k=50,
    # max_new_tokens=1000,
)

In [None]:
# Build the training dataloader, fetch its first batch and decode the prompts
from utils.train_utils import build_train_dataloader

train_dl = build_train_dataloader(config, model)
batch = next(iter(train_dl))

# Decoded input text per sample, special tokens skipped (shown as cell output)
train_prompts = model.tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
train_prompts