In [1]:
# !pip install qwen_vl_utils
# !pip install sentence-transformers faiss-cpu transformers

In [2]:
import subprocess

def check_cuda_memory():
    try:
        output = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.total,memory.used,memory.free", "--format=csv,nounits,noheader"])
        total, used, free = map(int, output.decode("utf-8").strip().split("\n")[0].split(", "))
        print(f"Total GPU Memory: {total} MB")
        print(f"Used GPU Memory: {used} MB")
        print(f"Free GPU Memory: {free} MB")
    except Exception as e:
        print(f"Error: {e}")

check_cuda_memory()


Total GPU Memory: 24576 MB
Used GPU Memory: 8672 MB
Free GPU Memory: 15474 MB


In [3]:
# Testing with local dataset of oranges. Why not banana? Later ;3
documents = [
    "Oranges are citrus fruits rich in vitamin C.",
    "An orange tree can live for 20 to 30 years, but some can reach 100 years.",
    "Orange juice is a popular breakfast drink worldwide.",
    "Oranges contain antioxidants that boost the immune system.",
    "Brazil is the world's largest producer of oranges.",
    "The peel of an orange can be used for essential oils and flavoring.",
    "Oranges originated in Southeast Asia thousands of years ago."
]

print(f"Loaded {len(documents)} documents about oranges.")


Loaded 7 documents about oranges.


In [4]:
# Convert documents to embeddings
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("all-MiniLM-L6-v2")

document_embeddings = [embed_model.encode(doc) for doc in documents]

print(f"Generated {len(document_embeddings)} embeddings.")

Generated 7 embeddings.


In [5]:
# Convert embeddings to NumPy array and add to FAISS index
import faiss
import numpy as np

dimension = len(document_embeddings[0])  # Vector size 
index = faiss.IndexFlatL2(dimension)

vectors = np.array(document_embeddings, dtype="float32")
index.add(vectors)

print("Stored document embeddings in FAISS vector database.")


Stored document embeddings in FAISS vector database.


In [6]:
# Code from https://github.com/QwenLM/Qwen2.5-VL, adjusted with 3B version

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct", 
    torch_dtype=torch.bfloat16, 
    attn_implementation="flash_attention_2",
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# No RAG
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                # "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
                # "image": "color_img/color_img_7.png",
                "image": "orange.jpg",
                # "image": "flowchart_img/flowchart_1.png",
            },
            {"type": "text", "text": "This is an orange. How long can their trees live for? And who is producing them? Is Brazil the largest producer?"},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=512)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


["Oranges are typically grown on trees that can live for several decades to over 100 years. The lifespan of an orange tree depends on various factors such as climate, soil quality, and care practices.\n\nAs for who produces oranges, they are primarily produced by farmers and orchardists around the world. Some of the major producers include China, India, Brazil, and the United States.\n\nBrazil is indeed one of the largest producers of oranges in the world, with a significant portion of its production coming from the state of São Paulo. However, it's important to note that other countries also produce large quantities of oranges, making Brazil just one of many major producers globally."]


In [8]:
# With RAG
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
import torch

# Function to retrieve the most relevant document based on the user's actual question
def retrieve_relevant_text(user_question, top_k=2):
    query_embedding = embed_model.encode(user_question).reshape(1, -1)
    _, retrieved_indices = index.search(query_embedding, top_k)
    retrieved_docs = [documents[i] for i in retrieved_indices[0]]
    return retrieved_docs

# SET USER QUERY HERE
user_question = "This is an orange. How long can their trees live for? And who is producing them? Is Brazil the largest producer?"

# Use the user's actual question as the query
retrieved_context = retrieve_relevant_text(user_question)

# Debug output: Print retrieved context separately
print("\n[DEBUG] Retrieved Context:")
for i, fact in enumerate(retrieved_context, 1):
    print(f"{i}. {fact}")

# Combine retrieved context into a single string for input
retrieved_text = " ".join(retrieved_context)

# Prepare chat message with retrieved context and image
messages = [
    {"role": "system", "content": "Use the following information to enhance your answer."},
    {"role": "user", "content": [
        {
            "type": "image",
            "image": "orange.jpg",
        },
        {"type": "text", "text": f"{user_question}"},
    ]}
]

# Convert to model input
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

# Generate response
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = processor.batch_decode(generated_ids, skip_special_tokens=True)

# Print final output in a readable format
print("\n=== 💬 AI Response ===")
print(response[0])
print("=======================")




[DEBUG] Retrieved Context:
1. Brazil is the world's largest producer of oranges.
2. An orange tree can live for more than 69 years.

=== 💬 AI Response ===
system
Use the following information to enhance your answer.
user
This is an orange. How long can their trees live for? And who is producing them? Is Brazil the largest producer?
assistant
Orange trees can live for several decades, with some reaching up to 50 years or more under ideal conditions. The lifespan of an orange tree depends on factors such as climate, soil quality, and care practices.

Oranges are typically grown by farmers and orchardists around the world. They are a popular crop in many countries, including Brazil, which is indeed one of the largest producers of oranges globally. Brazil's production is significant due to its favorable climate, large agricultural land, and advanced technology in citrus farming.


In [9]:
import subprocess

def check_cuda_memory():
    try:
        output = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.total,memory.used,memory.free", "--format=csv,nounits,noheader"])
        total, used, free = map(int, output.decode("utf-8").strip().split("\n")[0].split(", "))
        print(f"Total GPU Memory: {total} MB")
        print(f"Used GPU Memory: {used} MB")
        print(f"Free GPU Memory: {free} MB")
    except Exception as e:
        print(f"Error: {e}")

check_cuda_memory()


Total GPU Memory: 24576 MB
Used GPU Memory: 16873 MB
Free GPU Memory: 7273 MB
