In [1]:
# !pip install qwen_vl_utils
# !pip install sentence-transformers faiss-cpu transformers

In [2]:
import subprocess

def check_cuda_memory():
    try:
        output = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.total,memory.used,memory.free", "--format=csv,nounits,noheader"])
        total, used, free = map(int, output.decode("utf-8").strip().split("\n")[0].split(", "))
        print(f"Total GPU Memory: {total} MB")
        print(f"Used GPU Memory: {used} MB")
        print(f"Free GPU Memory: {free} MB")
    except Exception as e:
        print(f"Error: {e}")

check_cuda_memory()


Total GPU Memory: 24576 MB
Used GPU Memory: 9060 MB
Free GPU Memory: 15086 MB


In [3]:
# Testing with local dataset of oranges. Why not banana? Later ;3
documents = [
    "Oranges are citrus fruits rich in vitamin C.",
    "An orange tree can live for 20 to 30 years, but some can reach 100 years.",
    "Orange juice is a popular breakfast drink worldwide.",
    "Oranges contain antioxidants that boost the immune system.",
    "Brazil is no longer the world's largest producer of oranges. It is China now, producing up to 60% of the world's orange population.",
    "The peel of an orange can be used for essential oils and flavoring.",
    "Oranges originated in Southeast Asia thousands of years ago."
]

print(f"Loaded {len(documents)} documents about oranges.")


Loaded 7 documents about oranges.


In [4]:
# Convert documents to embeddings
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("all-MiniLM-L6-v2")

document_embeddings = [embed_model.encode(doc) for doc in documents]

print(f"Generated {len(document_embeddings)} embeddings.")

Generated 7 embeddings.


In [5]:
# Convert embeddings to NumPy array and add to FAISS index
import faiss
import numpy as np

dimension = len(document_embeddings[0])  # Vector size 
index = faiss.IndexFlatL2(dimension)

vectors = np.array(document_embeddings, dtype="float32")
index.add(vectors)

print("Stored document embeddings in FAISS vector database.")


Stored document embeddings in FAISS vector database.


In [6]:
# Code from https://github.com/QwenLM/Qwen2.5-VL, adjusted with 3B version

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct", 
    torch_dtype=torch.bfloat16, 
    attn_implementation="flash_attention_2",
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# No RAG
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
import torch

# Load Qwen model and processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# Define user query with image
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "orange.jpg",
            },
            {"type": "text", "text": "What fruit is this? Is Brazil its largest producer?"},
        ],
    }
]

# Prepare input for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

# Generate response
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = processor.batch_decode(generated_ids, skip_special_tokens=True)

# Function to format AI response like RAG-style output
def format_ai_response(response_text, user_message):
    # Structure the response in an easy-to-read format
    formatted_response = f"=== 💬 AI NO RAG Response ===\n"
    formatted_response += "system\nUse the following information to enhance your answer.\n"
    formatted_response += f"user\n{user_message}\n"
    formatted_response += f"assistant\n{response_text.strip()}\n"
    formatted_response += "======================="
    
    return formatted_response

# Extract user message text for structured output
user_message_text = messages[0]["content"][1]["text"]

# Print formatted AI response
print(format_ai_response(response[0], user_message_text))


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


=== 💬 AI NO RAG Response ===
system
Use the following information to enhance your answer.
user
What fruit is this? Is Brazil its largest producer?
assistant
system
You are a helpful assistant.
user
What fruit is this? Is Brazil its largest producer?
assistant
The fruit in the picture is an orange. Oranges are indeed one of the most widely produced fruits globally, with Brazil being one of the largest producers. Brazil is known for its extensive citrus farming and is a major exporter of oranges and other citrus products.


In [8]:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
import torch

# Function to retrieve the most relevant document based on the user's actual question
def retrieve_relevant_text(user_question, top_k=2):
    query_embedding = embed_model.encode(user_question).reshape(1, -1)
    _, retrieved_indices = index.search(query_embedding, top_k)
    retrieved_docs = [documents[i] for i in retrieved_indices[0]]
    return retrieved_docs

# SET USER QUERY HERE
user_question = "What fruit is this? Is Brazil still its largest producer?"

# Use the user's actual question as the query
retrieved_context = retrieve_relevant_text(user_question)

# Debug output: Print retrieved context separately
print("\n[DEBUG] Retrieved Context:")
for i, fact in enumerate(retrieved_context, 1):
    print(f"{i}. {fact}")

# Combine retrieved context into a single string for input
retrieved_text = " ".join(retrieved_context)

# 🛠️ Prioritize dataset knowledge but allow fallback to general knowledge
messages = [
    {"role": "system", "content": (
        "Use the following information to answer the question. "
        "Prioritize this information over other knowledge, but if additional details are needed, "
        "you may supplement with general knowledge."
    )},
    {"role": "user", "content": [
        {
            "type": "image",
            "image": "orange.jpg",
        },
        {"type": "text", "text": f"Context: {retrieved_text}\n\n{user_question}"},
    ]}
]

# Convert to model input
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

# Generate response
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = processor.batch_decode(generated_ids, skip_special_tokens=True)

# Print final output in a structured format
print("\n=== 💬 AI with RAG Response ===")
print("system\nUse the following information to answer the question. Prioritize this information over other knowledge, but you may supplement with general knowledge if necessary.")
print()
print("Result:")
print(f"user\n{user_question}\n{response[0]}")
print("=======================")



[DEBUG] Retrieved Context:
1. Brazil is no longer the world's largest producer of oranges. It is China now, producing up to 60% of the world's orange population.
2. Orange juice is a popular breakfast drink worldwide.

=== 💬 AI with RAG Response ===
system
Use the following information to answer the question. Prioritize this information over other knowledge, but you may supplement with general knowledge if necessary.

Result:
user
What fruit is this? Is Brazil still its largest producer?
system
Use the following information to answer the question. Prioritize this information over other knowledge, but if additional details are needed, you may supplement with general knowledge.
user
Context: Brazil is no longer the world's largest producer of oranges. It is China now, producing up to 60% of the world's orange population. Orange juice is a popular breakfast drink worldwide.

What fruit is this? Is Brazil still its largest producer?
assistant
The fruit in the picture is an orange. Brazil 

In [9]:
import subprocess

def check_cuda_memory():
    try:
        output = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.total,memory.used,memory.free", "--format=csv,nounits,noheader"])
        total, used, free = map(int, output.decode("utf-8").strip().split("\n")[0].split(", "))
        print(f"Total GPU Memory: {total} MB")
        print(f"Used GPU Memory: {used} MB")
        print(f"Free GPU Memory: {free} MB")
    except Exception as e:
        print(f"Error: {e}")

check_cuda_memory()


Total GPU Memory: 24576 MB
Used GPU Memory: 17437 MB
Free GPU Memory: 6709 MB


In [None]:
# Alright, here's what I need you to do.
# A) I need you to generate an image of a flowchart using python via mermaid, something related with business workflow
# B) Generate a corresponding PDF file that contains refined details of said flowchart
# C) Write the mermaid code in the PDF file as well