In [1]:
import os
import json
from PIL import Image
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from dataclasses import asdict
from typing import Optional, NamedTuple
from vllm.lora.request import LoRARequest
import numpy as np
from transformers import AutoProcessor, AutoTokenizer

# set cuda devices to specific gpus
os.environ["CUDA_VISIBLE_DEVICES"] = "7"  # Set to the GPU IDs you want to use

  from .autonotebook import tqdm as notebook_tqdm


INFO 09-12 17:20:36 [__init__.py:241] Automatically detected platform cuda.


In [2]:
# ---- 2. DMV-style prompt ----
SEGMENT_PROMPT = """
You are a driving examiner evaluating a 5-second driving video segment. 
The video consists of consecutive stitched front and rear views from the ego vehicle, sampled at 1 frames per second. 
Based on the full sequence of images, write an observer’s note that addresses:

1. Traffic environment: road type, other vehicles, pedestrians, cyclists.
2. Ego vehicle control: lane position, spacing, following distance.
3. Compliance: traffic signs, signals, right-of-way.
4. Maneuvers: turns, lane changes, or stops.
5. Hazards: any potential risks or conflicts.
6. Overall driving performance for this short segment.

Write the response as if it were part of a DMV behind the wheel driving performance evaluation sheet.
Be concise but specific, describing what happens across the 5 seconds.
"""

In [3]:
class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: list[str]
    stop_token_ids: Optional[list[int]] = None
    lora_requests: Optional[list[LoRARequest]] = None

In [6]:
def load_llava_next(question: str, image_list) -> ModelRequestData:
    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_list)},
    )

    placeholders = [{"type": "image", "image": img} for img in image_list]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[img for img in image_list],
    )

In [7]:
def run_generate(question: str, image_list, seed: Optional[int]):
    req_data = load_llava_next(question, image_list)

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

    sampling_params = SamplingParams(
        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
    )

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)

In [2]:
def get_front3_stitched_image(video_dir,frame_id):
    front_views = [
        np.array(Image.open(os.path.join(video_dir, f"FRONT_LEFT", f"{frame_id}.png"))),
        np.array(Image.open(os.path.join(video_dir, f"FRONT", f"{frame_id}.png"))),
        np.array(Image.open(os.path.join(video_dir, f"FRONT_RIGHT", f"{frame_id}.png"))),
    ]
    stitched_front = np.concatenate(front_views, axis=1)
    # resize the stitched image to 224x224
    stitched_front = np.array(Image.fromarray(stitched_front).resize((224, 224)))

    return stitched_front

def get_back3_stitched_image(video_dir,frame_id):
    back_views = [
        np.array(Image.open(os.path.join(video_dir, f"REAR_RIGHT", f"{frame_id}.png"))),
        np.array(Image.open(os.path.join(video_dir, f"REAR", f"{frame_id}.png"))),
        np.array(Image.open(os.path.join(video_dir, f"REAR_LEFT", f"{frame_id}.png"))),
    ]
    stitched_back = np.concatenate(back_views, axis=1)
    # resize the stitched image to 224x224
    stitched_back = np.array(Image.fromarray(stitched_back).resize((224, 224)))
    return stitched_back
    

In [3]:
video_dir = "/datassd4/users/hardik/dev/waymo_e2e/data/0b8097d37ac2cc7a832e2978f431f843"
frame_ids = ["010", "050"]
stitched_images = []
for frame_id in frame_ids:
    front_img = get_front3_stitched_image(video_dir,frame_id)
    # back_img = get_back3_stitched_image(video_dir,frame_id)
    stitched_images.append(Image.fromarray(front_img))
    # stitched_images.append(Image.fromarray(back_img))

print(f"Total stitched images: {len(stitched_images)}" )

Total stitched images: 2


In [22]:
stitched_images[0].size

(224, 224)

In [5]:
# Add 10 <image> tokens, one for each stitched frame
image_placeholders = "\n".join([f"<image>" for i in range(len(stitched_images))])

SEGMENT_PROMPT = f"""[INST]{image_placeholders} You are a driving examiner evaluating a 5-second driving video segment. The video consists of 2 consecutive stitched front views from the ego vehicle, sampled at 2 frames per 5 second.
The images are provided above. Based on the full sequence of images, write an observer’s note that addresses:
1. Ego vehicle control: lane position, spacing, following distance.
2. Compliance: traffic signs, signals, right-of-way.
3. Maneuvers: turns, lane changes, or stops.
4. Hazards: any potential risks or conflicts.
5. Overall driving performance for this short segment.
Write the response as if it were part of a DMV behind the wheel driving performance evaluation sheet.
Be concise but specific, describing what happens across the 5 seconds.
[/INST]"""


In [6]:
# Setup VLLM engine
engine_args = EngineArgs(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    max_model_len=32768,
    limit_mm_per_prompt={"image": len(stitched_images)},  # we are passing 5 images
    # tensor_parallel_size=7,  # Use 4 GPUs for tensor parallelism
)
llm = LLM(**asdict(engine_args))

# Sampling parameters
sampling_params = SamplingParams(
    temperature=0.2,
    max_tokens=512
)

# Prepare input (all 5 images in one prompt)
inputs = [{
    "prompt": SEGMENT_PROMPT,
    "multi_modal_data": {
        "image": stitched_images
    }
}]

outputs = llm.generate(inputs, sampling_params=sampling_params)

# Extract text
note = outputs[0].outputs[0].text.strip()
print("=== DMV Observer Note ===")
print(note)

# Save to JSON
results = {"video_id": os.path.basename(video_dir), "note": note}
with open("waymo_dmv_notes.json", "w") as f:
    json.dump(results, f, indent=4)


INFO 09-12 17:22:54 [utils.py:326] non-default args: {'model': 'llava-hf/llava-onevision-qwen2-0.5b-ov-hf', 'max_model_len': 32768, 'limit_mm_per_prompt': {'image': 2}}
INFO 09-12 17:23:03 [__init__.py:711] Resolved architecture: LlavaOnevisionForConditionalGeneration
INFO 09-12 17:23:03 [__init__.py:1750] Using max model len 32768


2025-09-12 17:23:04,049	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 09-12 17:23:04 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 09-12 17:23:12 [__init__.py:241] Automatically detected platform cuda.
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:13 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:13 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='llava-hf/llava-onevision-qwen2-0.5b-ov-hf', speculative_config=None, tokenizer='llava-hf/llava-onevision-qwen2-0.5b-ov-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False,

[1;36m(EngineCore_0 pid=1073580)[0;0m Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:22 [gpu_model_runner.py:1953] Starting to load model llava-hf/llava-onevision-qwen2-0.5b-ov-hf...
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:23 [gpu_model_runner.py:1985] Loading model from scratch...
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:23 [cuda.py:345] Using FlexAttention backend for head_size=72 on V1 engine.
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:23 [cuda.py:328] Using Flash Attention backend on V1 engine.
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:23 [weight_utils.py:296] Using model weights format ['*.safetensors']
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:23 [weight_utils.py:349] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.79it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.79it/s]
[1;36m(EngineCore_0 pid=1073580)[0;0m 


[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:24 [default_loader.py:262] Loading weights took 0.62 seconds
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:24 [gpu_model_runner.py:2007] Model loading took 1.6818 GiB and 1.322385 seconds
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:25 [gpu_model_runner.py:2591] Encoder cache will be initialized with a budget of 8748 tokens, and profiled with 1 image items of the maximum feature size.
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:31 [backends.py:548] Using cache directory: /home/hardik/.cache/vllm/torch_compile_cache/4601c77de3/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:31 [backends.py:559] Dynamo bytecode transform time: 5.72 s
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:36 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 4.180 s
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:02<00:00, 29.14it/s]


[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:40 [gpu_model_runner.py:2708] Graph capturing finished in 3 secs, took 1.34 GiB
[1;36m(EngineCore_0 pid=1073580)[0;0m INFO 09-12 17:23:40 [core.py:214] init engine (profile, create kv cache, warmup model) took 15.54 seconds
INFO 09-12 17:23:42 [llm.py:298] Supported_tasks: ['generate']


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Adding requests: 100%|██████████| 1/1 [00:03<00:00,  3.45s/it]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 10.58it/s, est. speed input: 33802.86 toks/s, output: 10.78 toks/s]

=== DMV Observer Note ===






In [11]:
outputs[0]

RequestOutput(request_id=0, prompt='[INST]<image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image