In [1]:
import gc

import numpy as np
from pdf2image import convert_from_path
from vllm import LLM
from vllm import SamplingParams

from helpers import (
    prepare_vllm_inputs_embedding,
    get_rank_scores
)

In [2]:
# Load the Qwen3-VL embedding checkpoint with vLLM's pooling runner; the
# recorded run reports the supported tasks as 'embed' and 'token_embed'.
embedding_model = LLM(
    model="Qwen/Qwen3-VL-Embedding-2B",
    runner="pooling",
    trust_remote_code=True,
    dtype="bfloat16",
    # -1 was resolved to a max model length of 262144 in the recorded run.
    max_model_len=-1,
    # Only claim part of the GPU; further models are loaded later in the notebook.
    gpu_memory_utilization=0.5,
)

INFO 02-03 12:14:30 [utils.py:263] non-default args: {'runner': 'pooling', 'trust_remote_code': True, 'dtype': 'bfloat16', 'max_model_len': -1, 'gpu_memory_utilization': 0.5, 'disable_log_stats': True, 'model': 'Qwen/Qwen3-VL-Embedding-2B'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 02-03 12:14:32 [model.py:859] Resolved `--convert auto` to `--convert embed`. Pass the value explicitly to silence this message.
INFO 02-03 12:14:32 [model.py:530] Resolved architecture: Qwen3VLForConditionalGeneration
INFO 02-03 12:14:32 [model.py:1545] Using max model len 262144
INFO 02-03 12:14:32 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 02-03 12:14:32 [vllm.py:630] Asynchronous scheduling is enabled.
INFO 02-03 12:14:32 [vllm.py:637] Disabling NCCL for DP synchronization when using async scheduling.


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


[0;36m(EngineCore_DP0 pid=27971)[0;0m INFO 02-03 12:14:36 [core.py:97] Initializing a V1 LLM engine (v0.14.0) with config: model='Qwen/Qwen3-VL-Embedding-2B', speculative_config=None, tokenizer='Qwen/Qwen3-VL-Embedding-2B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_trac

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=27971)[0;0m INFO 02-03 12:14:58 [default_loader.py:291] Loading weights took 4.00 seconds
[0;36m(EngineCore_DP0 pid=27971)[0;0m INFO 02-03 12:14:59 [gpu_model_runner.py:3905] Model loading took 4.31 GiB memory and 6.071407 seconds
[0;36m(EngineCore_DP0 pid=27971)[0;0m INFO 02-03 12:14:59 [gpu_model_runner.py:4715] Encoder cache will be initialized with a budget of 12288 tokens, and profiled with 1 video items of the maximum feature size.
[0;36m(EngineCore_DP0 pid=27971)[0;0m INFO 02-03 12:15:20 [backends.py:644] Using cache directory: /home/jamesloy/.cache/vllm/torch_compile_cache/001baba539/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=27971)[0;0m INFO 02-03 12:15:20 [backends.py:704] Dynamo bytecode transform time: 7.54 s
[0;36m(EngineCore_DP0 pid=27971)[0;0m INFO 02-03 12:15:25 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
[0;36m(EngineCore_DP0 pid=27971)[0;0m INFO 02-03 12:15:27 [backends.py

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:02<00:00, 18.28it/s]


[0;36m(EngineCore_DP0 pid=27971)[0;0m INFO 02-03 12:15:31 [gpu_model_runner.py:4856] Graph capturing finished in 3 secs, took 0.30 GiB
[0;36m(EngineCore_DP0 pid=27971)[0;0m INFO 02-03 12:15:31 [core.py:273] init engine (profile, create kv cache, warmup model) took 32.65 seconds
INFO 02-03 12:15:32 [llm.py:347] Supported tasks: ['embed', 'token_embed']


In [3]:
class MultiModalRAG:
    """In-memory retrieval index over PDF pages rendered as images.

    Every indexed page is stored as a record in ``self.documents`` and its
    embedding at the same position in ``self.embeddings``; ``search`` relies
    on the two lists staying index-aligned.
    """

    def __init__(self, embedding_model=None):
        """Create an empty index.

        Args:
            embedding_model: Object exposing ``embed(prompts)``; in this
                notebook a vLLM ``LLM`` created with ``runner="pooling"``.
        """
        self.embedding_model = embedding_model
        self.documents = []   # one dict per page: 'pdf_path', 'page_num', 'image'
        self.embeddings = []  # embeddings[i] belongs to documents[i]

    def embed_pdf(self, pdf_path):
        """Render each page of ``pdf_path`` to an image, embed it and index it.

        Pages are committed to the index only after the whole batch has been
        embedded, so a failure while embedding cannot leave ``documents`` and
        ``embeddings`` with different lengths.

        Args:
            pdf_path: Path of the PDF handed to ``convert_from_path``.

        Raises:
            RuntimeError: If the model returns a different number of outputs
                than pages submitted.
        """
        images = convert_from_path(pdf_path)
        if not images:
            return  # nothing to embed

        prompts = [
            prepare_vllm_inputs_embedding({"image": img}, self.embedding_model)
            for img in images
        ]
        emb_outputs = self.embedding_model.embed(prompts)
        if len(emb_outputs) != len(images):
            raise RuntimeError(
                f"Expected {len(images)} embeddings for {pdf_path!r}, "
                f"got {len(emb_outputs)}"
            )

        # page_num is the 0-based position of the page within this PDF.
        for idx, (img, output) in enumerate(zip(images, emb_outputs)):
            self.documents.append({'pdf_path': pdf_path, 'page_num': idx, 'image': img})
            self.embeddings.append(output.outputs.embedding)

    def embed_query(self, query):
        """Embed a text query together with the retrieval instruction.

        Args:
            query: The user's question as a string.

        Returns:
            The embedding of the first (only) output returned by the model.
        """
        emb_input = {
            "text": query,
            "instruction": "Retrieve images or text relevant to the user's query.",
        }
        emb_input_for_vllm = prepare_vllm_inputs_embedding(emb_input, self.embedding_model)
        emb_output = self.embedding_model.embed(emb_input_for_vllm)
        return emb_output[0].outputs.embedding

    def search(self, query, top_k=5):
        """Return the ``top_k`` indexed pages with the highest similarity to ``query``.

        Similarity is the dot product between the query embedding and each page
        embedding (this equals cosine similarity only if the model returns
        unit-length vectors — TODO confirm). The score vector is printed, as
        before.

        Args:
            query: The user's question as a string.
            top_k: Maximum number of page records to return (default 5).

        Returns:
            Up to ``top_k`` document dicts, best match first; ``[]`` when
            nothing has been indexed yet.

        Raises:
            ValueError: If ``top_k`` is smaller than 1.
        """
        if top_k < 1:
            raise ValueError("top_k must be a positive integer")
        if not self.embeddings:
            # Without this guard the matrix product below fails on an empty index.
            return []

        query_embeddings = self.embed_query(query)
        similarity_scores = np.array(query_embeddings) @ np.array(self.embeddings).T
        print("Similarity Score Matrix:")
        print(similarity_scores)

        # Stable descending sort: ties keep their original index order.
        top_indices = np.argsort(-similarity_scores, kind="stable")[:top_k]
        return [self.documents[i] for i in top_indices]



# Build the retriever around the embedding engine loaded earlier in the notebook.
multimodal_rag = MultiModalRAG(embedding_model=embedding_model)

In [4]:
# Index every page of the earnings slide deck; the path is relative to the
# notebook's working directory.
multimodal_rag.embed_pdf("data/slides/2025q1-alphabet-earnings-slides.pdf")

Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [5]:
# Retrieve the best-matching pages for the question; `search` also prints the
# raw similarity scores for every indexed page.
search_results = multimodal_rag.search("What is Alphabet's Q1 2025 revenue?")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Similarity Score Matrix:
[0.56046956 0.39872584 0.56346045 0.67699222 0.56930597 0.5670403
 0.63675879 0.58206822 0.42043154 0.38459344]


# Reranker

In [6]:
# `del embedding_model` on its own only removes the notebook-level name: the
# MultiModalRAG instance stores the same object in its `embedding_model`
# attribute, which keeps it alive. Drop both references, then force a
# collection so the object can be reclaimed before the next model is loaded.
# NOTE: `multimodal_rag.search` cannot embed new queries after this cell until
# an embedding model is attached again.
try:
    multimodal_rag.embedding_model = None
except NameError:
    pass  # retriever was never created in this kernel session
try:
    del embedding_model
except NameError:
    pass  # already released, e.g. the cell was re-run
gc.collect()

In [7]:
# Load the Qwen3-VL reranker checkpoint with vLLM's pooling runner. The
# overrides select the sequence-classification architecture; the recorded run
# reports the supported tasks as 'score', 'token_classify' and 'classify'.
reranker = LLM(
    model="Qwen/Qwen3-VL-Reranker-2B",
    runner="pooling",
    trust_remote_code=True,
    dtype="bfloat16",
    # -1 was resolved to a max model length of 262144 in the recorded run.
    max_model_len=-1,
    gpu_memory_utilization=0.5,
    hf_overrides={
        "architectures": ["Qwen3VLForSequenceClassification"],
        "classifier_from_token": ["no", "yes"],
        "is_original_qwen3_reranker": True,
    },
)

print("Reranker initialized successfully!")

INFO 02-03 12:15:46 [utils.py:263] non-default args: {'runner': 'pooling', 'trust_remote_code': True, 'dtype': 'bfloat16', 'max_model_len': -1, 'gpu_memory_utilization': 0.5, 'disable_log_stats': True, 'hf_overrides': {'architectures': ['Qwen3VLForSequenceClassification'], 'classifier_from_token': ['no', 'yes'], 'is_original_qwen3_reranker': True}, 'model': 'Qwen/Qwen3-VL-Reranker-2B'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 02-03 12:15:48 [model.py:859] Resolved `--convert auto` to `--convert classify`. Pass the value explicitly to silence this message.
INFO 02-03 12:15:48 [model.py:530] Resolved architecture: Qwen3VLForSequenceClassification
INFO 02-03 12:15:48 [model.py:1545] Using max model len 262144
INFO 02-03 12:15:48 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=8192.
[0;36m(EngineCore_DP0 pid=28465)[0;0m INFO 02-03 12:15:50 [core.py:97] Initializing a V1 LLM engine (v0.14.0) with config: model='Qwen/Qwen3-VL-Reranker-2B', speculative_config=None, tokenizer='Qwen/Qwen3-VL-Reranker-2B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtyp

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=28465)[0;0m INFO 02-03 12:16:14 [default_loader.py:291] Loading weights took 6.35 seconds
[0;36m(EngineCore_DP0 pid=28465)[0;0m INFO 02-03 12:16:15 [gpu_model_runner.py:3905] Model loading took 4.31 GiB memory and 8.076468 seconds
[0;36m(EngineCore_DP0 pid=28465)[0;0m INFO 02-03 12:16:15 [gpu_model_runner.py:4715] Encoder cache will be initialized with a budget of 12288 tokens, and profiled with 1 video items of the maximum feature size.
[0;36m(EngineCore_DP0 pid=28465)[0;0m INFO 02-03 12:16:38 [backends.py:644] Using cache directory: /home/jamesloy/.cache/vllm/torch_compile_cache/34a3b2e8da/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=28465)[0;0m INFO 02-03 12:16:38 [backends.py:704] Dynamo bytecode transform time: 9.43 s
[0;36m(EngineCore_DP0 pid=28465)[0;0m INFO 02-03 12:16:43 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 1.420 s
[0;36m(EngineCore_DP0 pid=28465)[0

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:02<00:00, 20.46it/s]


[0;36m(EngineCore_DP0 pid=28465)[0;0m INFO 02-03 12:16:48 [gpu_model_runner.py:4856] Graph capturing finished in 3 secs, took 0.30 GiB
[0;36m(EngineCore_DP0 pid=28465)[0;0m INFO 02-03 12:16:48 [core.py:273] init engine (profile, create kv cache, warmup model) took 32.77 seconds
INFO 02-03 12:16:48 [llm.py:347] Supported tasks: ['score', 'token_classify', 'classify']
Reranker initialized successfully!


In [8]:
# Bundle the instruction, the text query and the retrieved page records into
# the single dict that is handed to the reranking helper.
inputs = {
    "instruction": "Retrieve images or text relevant to the user's query.",
    "query": {"text": "What is Alphabet's Q1 2025 revenue?"},
    "documents": search_results,
}

print(f"Prepared query with {len(inputs['documents'])} candidate documents")

Prepared query with 5 candidate documents


In [9]:
# Get relevance scores for each document
# `get_rank_scores` comes from the local `helpers` module; it is used here as
# if it returns one numeric score per entry of inputs["documents"], in the same
# order — TODO confirm against the helper.
scores = get_rank_scores(reranker, inputs)

print("Relevance Scores:")
# "Document N" is the 1-based position within `search_results`, not the PDF page number.
for i, score in enumerate(scores):
    print(f"Document {i+1}: {score:.4f}")

def get_top_reranker_result(scores, search_results):
    """Return the entry of ``search_results`` whose reranker score is highest.

    Args:
        scores: Sequence of comparable scores (list, tuple, numpy array, ...),
            one per entry of ``search_results`` and in the same order.
        search_results: Sequence of candidate documents.

    Returns:
        The candidate at the position of the largest score; on ties the
        earliest candidate wins.

    Raises:
        ValueError: If ``scores`` is empty.
    """
    if len(scores) == 0:
        raise ValueError("scores is empty; there is no candidate to select")
    # Index-based argmax works for any indexable sequence, not only types that
    # provide `.index()`.
    top_document_idx = max(range(len(scores)), key=scores.__getitem__)
    return search_results[top_document_idx]

# Keep the candidate the reranker scored highest; its image is passed to the VLM later.
top_reranker_result = get_top_reranker_result(scores, search_results)

Adding requests:   0%|          | 0/5 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/5 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Relevance Scores:
Document 1: 0.7822
Document 2: 0.7608
Document 3: 0.5079
Document 4: 0.6863
Document 5: 0.7393


In [10]:
# Display the selected page record (PDF path, page number, PIL image).
top_reranker_result

{'pdf_path': 'data/slides/2025q1-alphabet-earnings-slides.pdf',
 'page_num': 3,
 'image': <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=2000x1125>}

# Multimodal Q&A

In [11]:
# Release the reranker before the generation model is loaded. Only a missing
# name is tolerated (e.g. the cell was re-run); any other error should surface.
try:
    del reranker
except NameError:
    pass  # already released
# Collect immediately so the object is not kept around by reference cycles.
gc.collect()

In [12]:
# Load the Qwen3-VL instruct model that answers the question from the selected
# page image (generation runner; supported task in the recorded run: 'generate').
vlm = LLM(
    model="Qwen/Qwen3-VL-2B-Instruct",
    max_num_seqs=1,
    max_model_len=4 * 1024,
    gpu_memory_utilization=0.6,
    # Optional multimodal limits, currently disabled:
    # limit_mm_per_prompt={"image": 1, "video": 0},
    # mm_processor_kwargs={
    #     "min_pixels": 28 * 28,
    #     "max_pixels": 1280 * 28 * 28,
    # },
)

INFO 02-03 12:17:00 [utils.py:263] non-default args: {'max_model_len': 4096, 'gpu_memory_utilization': 0.6, 'max_num_seqs': 1, 'disable_log_stats': True, 'model': 'Qwen/Qwen3-VL-2B-Instruct'}
INFO 02-03 12:17:02 [model.py:530] Resolved architecture: Qwen3VLForConditionalGeneration
INFO 02-03 12:17:02 [model.py:1545] Using max model len 4096
INFO 02-03 12:17:02 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=4096.
[0;36m(EngineCore_DP0 pid=28921)[0;0m INFO 02-03 12:17:04 [core.py:97] Initializing a V1 LLM engine (v0.14.0) with config: model='Qwen/Qwen3-VL-2B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen3-VL-2B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=28921)[0;0m INFO 02-03 12:17:21 [default_loader.py:291] Loading weights took 4.39 seconds
[0;36m(EngineCore_DP0 pid=28921)[0;0m INFO 02-03 12:17:22 [gpu_model_runner.py:3905] Model loading took 4.26 GiB memory and 6.129570 seconds
[0;36m(EngineCore_DP0 pid=28921)[0;0m INFO 02-03 12:17:22 [gpu_model_runner.py:4715] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 image items of the maximum feature size.
[0;36m(EngineCore_DP0 pid=28921)[0;0m INFO 02-03 12:17:32 [backends.py:644] Using cache directory: /home/jamesloy/.cache/vllm/torch_compile_cache/b9d5ea5080/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=28921)[0;0m INFO 02-03 12:17:32 [backends.py:704] Dynamo bytecode transform time: 7.90 s
[0;36m(EngineCore_DP0 pid=28921)[0;0m INFO 02-03 12:17:35 [backends.py:261] Cache the graph of compile range (1, 4096) for later use
[0;36m(EngineCore_DP0 pid=28921)[0;0m INFO 02-03 12:17:37 [backends.py

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 2/2 [00:00<00:00, 18.45it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 1/1 [00:00<00:00, 12.24it/s]


[0;36m(EngineCore_DP0 pid=28921)[0;0m INFO 02-03 12:17:39 [gpu_model_runner.py:4856] Graph capturing finished in 1 secs, took 0.04 GiB
[0;36m(EngineCore_DP0 pid=28921)[0;0m INFO 02-03 12:17:40 [core.py:273] init engine (profile, create kv cache, warmup model) took 17.99 seconds
INFO 02-03 12:17:40 [llm.py:347] Supported tasks: ['generate']


In [16]:
# Build the generation prompt in one step: instruction, question and the page
# image chosen by the reranker go through the same helper used for embedding
# inputs (its suitability for generation prompts is assumed — TODO confirm).
vlm_input = prepare_vllm_inputs_embedding(
    {
        "instruction": "You are an expert financial analyst.",
        "text": "What is Alphabet's revenue in Q1 2025?",
        "image": top_reranker_result['image'],
    },
    vlm,
)

In [17]:
# Very low temperature keeps the answer close to greedy decoding; the answer
# is capped at 1024 generated tokens.
sampling_params = SamplingParams(temperature=0.01, max_tokens=1024)

vlm_output = vlm.generate(vlm_input, sampling_params)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [18]:
# Show the text of the first completion of the first (only) request.
print(vlm_output[0].outputs[0].text)

Based on the provided chart titled "Alphabet Revenues and Operating Income," we can determine Alphabet's revenue for Q1 2025.

The chart on the left displays "Revenues ($MM)" over two periods:
- Q1'24 (Q1 2024): $80,539 million
- Q1'25 (Q1 2025): $90,234 million

The chart explicitly states that the revenue figures are in millions of dollars.

Therefore, Alphabet's revenue in Q1 2025 was $90,234 million.
