In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project sources take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
import sys
import os

import torch
from llava.mm_utils import process_images
from tqdm import tqdm
from transformers import AutoTokenizer

# Add the src directory to the Python path so the project-local packages
# imported right after this line can be resolved. The location is derived from
# the current working directory (its sibling "../src").
SRC_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "src"))
# Guard the append so re-running this cell does not accumulate duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from utils.train_utils import build_dataloader, build_train_dataloader, build_val_dataloader
from dataset.processor import Processor
from model.model import VisionLanguageModel
from model.fastrcnn_adapter import FastRCNNAdapter
from utils.config import DatasetConfig, ExperimentConfig
from utils.train_metrics import TrainMetrics


  from .autonotebook import tqdm as notebook_tqdm


Please install pyav to use video processing functions.
OpenCLIP not installed


In [3]:
#hydra imports
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
from hydra.core.config_store import ConfigStore

# Register the "ifel" resolver: it yields `val_true` when `flag` is truthy and
# `val_false` otherwise.
# `replace=True` keeps this cell re-runnable: without it, executing the cell a
# second time in the same kernel raises because "ifel" is already registered.
OmegaConf.register_new_resolver(
    "ifel",
    lambda flag, val_true, val_false: val_true if flag else val_false,
    replace=True,
)

## Load config

In [4]:
# Load the Hydra configuration used by the rest of the notebook.

# Register the structured-config nodes with Hydra's ConfigStore.
cs = ConfigStore.instance()
cs.store(name="ExperimentConfig", node=ExperimentConfig)
cs.store(name="DatasetConfig", group="dataset", node=DatasetConfig)

# Overrides applied on top of the "train" config: select the local-test
# experiment and point `main_dir` at the parent directory.
hydra_overrides = ["+experiment=train_local_test", "main_dir='..'"]

# Compose the config inside a temporary Hydra initialization context.
with initialize(version_base=None, config_path="../conf"):
    config = compose(config_name="train", overrides=hydra_overrides)

# Dump the composed config as YAML for inspection.
print(OmegaConf.to_yaml(config))

name: local_test
seed: 43
train_dataset:
  name: coco_train
  data_dir: ${main_dir}/data/coco/images/train2017
  annotations_dir: ${main_dir}/data/coco/annotations/instances_train2017.json
val_dataset:
  name: coco_val
  data_dir: ${main_dir}/data/coco/images/val2017
  annotations_dir: ${main_dir}/data/coco/annotations/instances_val2017.json
test_dataset:
  name: coco_test
  data_dir: ${main_dir}/data/coco/images/test2017
  annotations_dir: ${main_dir}/data/coco/annotations/image_info_test2017.json
main_dir: ..
model_name: lmms-lab/llava-onevision-qwen2-0.5b-si
checkpoint_dir: checkpoints
load_checkpoint: null
train: true
evaluate: true
eval_mode: val
num_samples: 15
val_num_samples: 2
max_tokens: 3200
pad_to_multiple_of: 128
batch_size: 1
total_batch_size: 2
epochs: 2
lr: 0.001
warmup_ratio: 0.1
weight_decay: null
max_grad_norm: null
val_freq: 5
val_ep: null
print_freq: 1
num_workers: 0
device: cpu
debug: true
save_components: []
temperature: 0.3
use_amp: false
torch_dtype: null
image

## Load processor, tokenizer, val_dataloader, batch

In [5]:
# Checkpoint file name for this session (not referenced by any cell visible in
# this notebook excerpt).
# Other checkpoints that were tried here:
#   "last_model_silver-field-126.pt"
#   "checkpoint_1_vital-sound-133_1741647312.pt"
#   "last_model_legendary-cloud-125.pt"
#   "checkpoint_3_balmy-snow-134_1741766686.pt"
MODEL_NAME = "checkpoint_3_rare-fire-135_1741767317.pt"

# Notebook-specific settings written onto the composed config.
config.num_coordinate_bins = 100
config.add_special_tokens = True  # alternative: False

# Build the processor from the config and keep a handle on its tokenizer,
# which later cells use to decode label ids.
processor = Processor.from_config(config, add_special_tokens=config.add_special_tokens)
tokenizer = processor.tokenizer

In [6]:
# Build a validation dataloader over a 10-item subset and pull its first batch.
val_dataloader = build_val_dataloader(config=config, processor=processor, subset_size=10)
val_batch = next(iter(val_dataloader))

# Inspect the labels of the first validation sample: drop every position equal
# to -100, then show how many ids remain and what they decode to.
first_sample_labels = val_batch["labels"][0]
labels = first_sample_labels[first_sample_labels != -100]
print(labels.shape)
print(tokenizer.decode(labels))

# Report when the whole batch (not just the first sample) contains nothing but -100.
if (val_batch["labels"] == -100).all():
    print("All labels are -100")

# Display the batch's bounding-box annotation strings as the cell output.
val_batch["bbox_str"]

loading annotations into memory...
Done (t=0.16s)
creating index...
index created!
torch.Size([0])

All labels are -100


['<annotation><object><class>car</class><bbox><x14/><y36/><x18/><y43/></bbox></object><object><class>surfboard</class><bbox><x00/><y40/><x85/><y67/></bbox></object></annotation>']

In [7]:
# Build a training dataloader over a 10-item subset and take its first batch;
# `train_batch` is what the model cells further down consume.
train_dataloader = build_train_dataloader(config=config, processor=processor, subset_size=10)
train_batch = next(iter(train_dataloader))

loading annotations into memory...
Done (t=4.00s)
creating index...
index created!


In [34]:
# Instantiate the Fast R-CNN adapter model from the composed config.
# NOTE(review): the recorded execution counter jumps from 7 to 34 at this cell,
# so it was run after other (possibly deleted) cells — confirm the notebook
# still works with Restart Kernel -> Run All.
model = FastRCNNAdapter(config)

You are using a model of type llava to instantiate a model of type llava_qwen. This is not supported for all configurations of models and can yield errors.


Loading vision tower: google/siglip-so400m-patch14-384
FasterRCNNWithLLaVA(
  (backbone): LLaVAFeatureExtractor(
    (llava_encoder): SigLipVisionTower(
      (vision_tower): SigLipVisionModel(
        (vision_model): SigLipVisionTransformer(
          (embeddings): SigLipVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
            (position_embedding): Embedding(729, 1152)
          )
          (encoder): SigLipEncoder(
            (layers): ModuleList(
              (0-25): 26 x SigLipEncoderLayer(
                (self_attn): SigLipAttention(
                  (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
                  (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
                  (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
                  (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
                )
                (layer_nor

In [35]:
# Smoke-test generation on the first training batch and score the result.
input_ids = train_batch["input_ids"]
images = train_batch["images"]

print(images.shape)

# Generation is inference only, so run it without autograd bookkeeping to avoid
# building (and holding on to) a computation graph.
# NOTE(review): generate() is called with `image=` while the forward-pass cell
# in this notebook passes `images=` — confirm both keyword names are intended.
with torch.no_grad():
    output = model.generate(input_ids=input_ids, image=images)

# Convert the batch annotations into the target format consumed by the metric.
target_boxes = processor.postprocess_target_batch(batch=train_batch, device=config.device)

# Accumulate this single batch and print the computed metric dictionary.
# The last two positional arguments of update() are passed as None.
metric = TrainMetrics(config.device, download_nltk=False)
metric.update(output, target_boxes, None, None)
print(metric.compute())

torch.Size([1, 3, 384, 384])
{'map': 0.0, 'map_50': 0.0, 'map_75': 0.0, 'bleu_score': 0, 'meteor_score': 0}


In [36]:
from llava.model.language_model.llava_qwen import LlavaQwenForCausalLM

# Model identifier handed to `from_pretrained` for the stand-alone encoder check.
llava_checkpoint = "lmms-lab/llava-onevision-qwen2-0.5b-si"

# Load the LLaVA-Qwen model and keep only its vision tower; the rest of the
# loaded model is never bound to a name.
image_encoder = LlavaQwenForCausalLM.from_pretrained(llava_checkpoint).get_vision_tower()

# Run the vision tower directly on the training-batch images; the returned
# tensor is displayed as the cell output.
image_encoder(images)

You are using a model of type llava to instantiate a model of type llava_qwen. This is not supported for all configurations of models and can yield errors.


Loading vision tower: google/siglip-so400m-patch14-384


tensor([[[-0.6366, -0.7641, -0.2013,  ..., -1.2936,  2.3574, -3.9616],
         [-1.9123, -2.6315, -0.3732,  ..., -2.0648, -0.5618,  0.7966],
         [-1.7799, -2.5519, -0.2654,  ..., -1.9276, -0.8101,  0.4554],
         ...,
         [-0.6656, -1.5752, -1.7274,  ...,  1.2125, -0.9415,  0.0773],
         [-1.0145, -4.4144,  0.3728,  ..., -0.7377,  0.8733,  2.7178],
         [ 0.0412,  0.3303,  1.5648,  ..., -3.3311,  2.6198, -0.7386]]])

In [37]:
# test model forward
# Call the model on the training-batch inputs with the post-processed targets
# passed as `labels`, then print the returned output object.
# Note: this rebinds `output`, which the generation cell also assigns.
output = model(input_ids=input_ids, images=images, labels=target_boxes)
print(output)

CausalLMOutputWithPast(loss=tensor(5.2896, grad_fn=<AddBackward0>), logits=None, past_key_values=None, hidden_states=None, attentions=None)
