# **Environment Setup and Library Imports**
This block installs and imports the necessary libraries (vllm, transformers, sqlite3, etc.) and creates a local data directory.



In [None]:
!pip install vllm
!pip install qwen-vl-utils
!pip install -U transformers accelerate


import os
os.makedirs("data", exist_ok=True)
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from qwen_vl_utils import process_vision_info
import sqlite3
import pandas as pd

# **Model Configuration and Database Initialization**
Defines the model path and initializes the SQLite database. It connects to the file qwen_t4_captions.db and executes a SQL command to create the captions table, defining its structure with fields for image id, url, and the generated caption.

In [None]:
# Hugging Face model id used for both the vLLM engine and the tokenizer.
MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"

# Make sure the `captions` table exists before any captions are generated.
# `IF NOT EXISTS` makes this cell idempotent across re-runs.
# The connection is closed as soon as the schema is in place: a later cell
# opens its own connection for the inserts, so keeping this one open would
# only leak a handle on the database file.
conn = sqlite3.connect("qwen_t4_captions.db")
try:
    conn.execute('CREATE TABLE IF NOT EXISTS captions (id INTEGER PRIMARY KEY, url TEXT, caption TEXT)')
    conn.commit()
finally:
    conn.close()

# **VLLM and Sampling Parameter Setup**
Initialize the VLLM engine (llm) on the GPU and configure the tokenizer and generation parameters (sampling_params).

This is the engine setup phase. It initializes the LLM object, loading the Qwen model onto the GPU. It also loads the corresponding tokenizer and defines the SamplingParams that control generation, setting a low temperature=0.2 and limiting the length with max_tokens=128.

In [None]:

# Engine options, gathered in one place so they are easy to tune.
engine_options = dict(
    model=MODEL_PATH,
    dtype="half",                 # presumably fp16 weights/activations — confirm against vLLM docs
    max_model_len=4096,           # upper bound on context length handled by the engine
    gpu_memory_utilization=0.95,  # fraction of GPU memory handed to vLLM
    trust_remote_code=True,
)
llm = LLM(**engine_options)

# Tokenizer for the same checkpoint; used later to render the chat template.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Decoding settings: low temperature and a 128-token cap on each caption.
# NOTE(review): 151645 is presumably the Qwen2 end-of-turn token id — verify
# against the tokenizer before reusing this cell with another model.
sampling_params = SamplingParams(
    temperature=0.2,
    max_tokens=128,
    stop_token_ids=[151645],
)

# **Input Data Definition (COCO Subset)**
This block explicitly defines the input data structure: a list named coco_subset. This list contains dictionaries, each linking a unique COCO image identifier (id) to its direct HTTP URL. The database connection is also re-established to ensure it's ready for insertion in the subsequent loop.

In [None]:
# Image ids to caption. Each URL below is the id zero-padded to 12 digits
# under the val2017 image host.
_COCO_IDS = (397133, 785, 87038, 174482)

coco_subset = [
    {
        "id": image_id,
        "url": f"http://images.cocodataset.org/val2017/{image_id:012d}.jpg",
    }
    for image_id in _COCO_IDS
]

# Open the connection and cursor that the caption-writing loop inserts through.
conn = sqlite3.connect("qwen_t4_captions.db")
cursor = conn.cursor()

# **The Main Processing Loop and Database Storage**
Loop through each image, format the multimodal prompt, use VLLM to generate the caption, and store the result (ID, URL, caption) in the SQLite database.
It iterates over each image in the coco_subset. Inside the loop, it constructs the multimodal chat prompt (image URL + text instruction) and prepares the input data for VLLM. It then calls llm.generate() to get the output caption. The generated text is extracted and stored in the SQLite database using an INSERT OR REPLACE command. A try...except block handles runtime errors. Finally, the database connection is closed.

In [None]:
# Caption every image and persist the result.
# The outer try/finally guarantees the connection is closed even if the run
# is interrupted. The inner try keeps the run best-effort: one bad image is
# reported and skipped instead of aborting the remaining items.
try:
    for item in coco_subset:
        try:
            # Single-turn chat: one image plus a short text instruction.
            messages = [
                {"role": "user", "content": [
                    {"type": "image", "image": item['url']},
                    {"type": "text", "text": "Describe this image in one sentence."}
                ]}
            ]

            # Render the chat template to a plain prompt string and resolve
            # the image referenced in `messages`. Both steps sit inside the
            # try because resolving a URL can fail (network, bad image).
            prompt = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_data, _ = process_vision_info(messages)  # video part unused

            inputs = {
                "prompt": prompt,
                "multi_modal_data": {"image": image_data},
            }

            outputs = llm.generate([inputs], sampling_params=sampling_params)
            caption = outputs[0].outputs[0].text.strip()

            # INSERT OR REPLACE keyed on the primary key `id`, so re-running
            # the cell overwrites earlier captions instead of failing.
            cursor.execute("INSERT OR REPLACE INTO captions (id, url, caption) VALUES (?, ?, ?)",
                           (item['id'], item['url'], caption))
            conn.commit()
            print(f"ID {item['id']}: {caption}")

        except Exception as e:
            print(f"Error on {item['id']}: {e}")
finally:
    conn.close()

# **Display**
This final block reconnects to the database and uses the Pandas library (pd.read_sql) to query and load all stored data from the captions table into a DataFrame. It configures Pandas to display the full caption text (pd.set_option) and displays the final table, confirming the captions were successfully generated and stored.

In [None]:
# Load every stored caption into a DataFrame. The connection is closed in
# `finally` so a failed query (e.g. missing table) does not leave it open.
conn = sqlite3.connect("qwen_t4_captions.db")
try:
    df = pd.read_sql("SELECT * FROM captions", conn)
finally:
    conn.close()

# Show full caption text instead of pandas' default truncated column width.
pd.set_option('display.max_colwidth', None)
display(df)