In [None]:
Q~!34567890

In [40]:
url = "https://magazine.sebastianraschka.com/p/understanding-multimodal-llms"
url2 = "https://medium.com/user-experience-design-1/cracking-the-code-of-vibe-coding-124b9288e551"
data = scrape_website(url2)
print((len(data[0]), len(data[1])))

(18619, 8)


In [41]:
import re
import string
import spacy
from nltk.corpus import stopwords

# Load spaCy model and stop words
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

# Custom unwanted words/phrases typical in web-scraped UI content
custom_unwanted_phrases = [
    "login", "sign up", "signup", "create account", "forgot password", 
    "click here", "read more", "terms of service", "privacy policy",
    "subscribe", "register", "next page", "previous page", "contact us",
    "about us", "home", "back to top", "follow us", "view more"
]

def preprocess_text(text):
    """
    Clean and preprocess text content, removing web-scraped UI junk.
    """
    if not text:
        return ""
    
    # Lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove unwanted phrases
    for phrase in custom_unwanted_phrases:
        text = text.replace(phrase, '')

    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Lemmatization and stopword removal
    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc 
        if token.text not in stop_words 
        and token.text not in string.punctuation
        and token.lemma_ not in custom_unwanted_phrases
    ]
    
    return ' '.join(tokens)


In [42]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import requests
from io import BytesIO

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model.to(device)

def get_text_embedding(text):
    inputs = processor(text=text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)
    return text_features.cpu().numpy()

def get_image_embedding(image_url):
    try:
        response = requests.get(image_url)
        image = Image.open(BytesIO(response.content))
        inputs = processor(images=image, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
        return image_features.cpu().numpy()
    except Exception as e:
        print(f"Error processing image {image_url}: {str(e)}")
        return None

In [49]:
get_text_embedding("This is a test sentence.")

array([[ 1.07689589e-01, -1.29972696e-01, -1.50694892e-01,
        -2.86011547e-01, -2.99172521e-01, -1.29091665e-01,
        -1.23714887e-01, -1.31375062e+00, -1.60170764e-01,
         1.09250106e-01, -1.23362169e-01,  3.32235098e-01,
         1.28503039e-01, -1.25202835e-02,  2.03039885e-01,
        -2.05394387e-01, -5.69737665e-02,  7.06199706e-02,
        -3.75526935e-01, -1.10198423e-01, -3.41495872e-03,
        -3.45017970e-01,  4.62136269e-02, -3.99029732e-01,
        -1.64712101e-01,  1.66862607e-01,  1.19063042e-01,
        -3.19007337e-01,  5.79217523e-02, -2.54918635e-03,
        -2.61154056e-01, -2.09452629e-01,  2.71080792e-01,
         1.62050739e-01,  2.22844034e-02,  1.71472698e-01,
         8.56696516e-02, -1.40662462e-01,  1.06197596e-02,
        -2.51558602e-01,  4.29157317e-02, -3.73860568e-01,
         1.42899960e-01,  8.22819844e-02, -1.10874474e-02,
         2.87431777e-01,  2.66495347e-01, -1.55539393e-01,
         2.66740113e-01,  1.02855302e-01, -4.56015944e-0

In [51]:
import chromadb
from chromadb.utils import embedding_functions
import numpy as np

# Initialize ChromaDB client
chroma_client = chromadb.Client()

# Create embedding function (using same CLIP dimensions)
clip_embedding_function = embedding_functions.DefaultEmbeddingFunction()

def initialize_collection(collection_name="multimodal_rag"):
    """
    Initialize ChromaDB collection with multimodal support
    """
    collection = chroma_client.create_collection(
        name=collection_name,
        embedding_function=clip_embedding_function,
        metadata={"hnsw:space": "cosine"}  # Similarity metric
    )
    return collection

def store_data(collection_name, text, text_embedding, image_url=None, image_embedding=None):
    """
    Store data in ChromaDB with multimodal support
    """
    collection = chroma_client.get_collection(collection_name)
    
    # Create document ID
    doc_id = str(hash(text + (image_url if image_url else "")))
    
    # Prepare metadata
    metadata = {
        "image_url": image_url if image_url else "",
        "source_type": "multimodal" if image_url else "text_only",
        "processed_text": preprocess_text(text)
    }
    
    # Convert embeddings to list format
    text_embedding = text_embedding[0].tolist() if isinstance(text_embedding, np.ndarray) else text_embedding
    image_embedding = image_embedding[0].tolist() if image_embedding and isinstance(image_embedding, np.ndarray) else None
    
    # Store with both embeddings in metadata
    collection.add(
        ids=[doc_id],
        documents=[text],
        embeddings=[text_embedding],
        metadatas=[{
            **metadata,
            "image_embedding": image_embedding if image_embedding else None
        }]
    )

def retrieve_relevant_chunks(query, collection_name="multimodal_rag", top_k=3):
    """
    Retrieve relevant chunks from ChromaDB
    """
    collection = chroma_client.get_collection(collection_name)
    
    # Get query embedding
    query_embedding = get_text_embedding(query)[0].tolist()
    
    # Query the collection
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )
    
    # Format results similar to Qdrant's output
    class Result:
        def __init__(self, payload, score):
            self.payload = payload
            self.score = 1 - score  # Convert distance to similarity
    
    formatted_results = []
    for doc, meta, dist in zip(results['documents'][0], 
                              results['metadatas'][0], 
                              results['distances'][0]):
        formatted_results.append(
            Result(payload={
                "text": doc,
                "image_url": meta.get("image_url"),
                "processed_text": meta.get("processed_text"),
                "metadata": meta
            }, score=1 - dist)
        )
    
    return formatted_results

Failed to send telemetry event ClientStartEvent: module 'chromadb' has no attribute '__version__'


In [None]:
# Initialize
collection = initialize_collection()

# Store data (example)
text = "Example content about AI"
text_embedding = get_text_embedding(text)
image_url = "http://example.com/image.jpg"
image_embedding = get_image_embedding(image_url)

store_data("multimodal_rag", text, text_embedding, image_url, image_embedding)

# Retrieve
results = retrieve_relevant_chunks("What is AI?", "multimodal_rag")
for result in results:
    print(f"Score: {result.score:.3f}")
    print(f"Text: {result.payload['text'][:100]}...")
    if result.payload['image_url']:
        print(f"Image: {result.payload['image_url']}")

In [44]:
from ollama import Client
import numpy as np

# Initialize Ollama client
ollama_client = Client(host='http://localhost:11434')

def retrieve_relevant_chunks(query, collection_name="multimodal_rag", top_k=3):
    """Enhanced retrieval with hybrid search (text + image)"""
    # Get query embedding
    query_embedding = get_text_embedding(query)
    
    # Search in Qdrant with hybrid approach
    results = client.search(
        collection_name=collection_name,
        query_vector=("text", query_embedding[0].tolist()),
        query_filter=None,  # Add filters here if needed
        limit=top_k,
        with_vectors=True,
        with_payload=True,
        score_threshold=0.3  # Minimum similarity score
    )
    
    return results

def generate_response(query, retrieved_chunks, use_images=False):
    """
    Generate response using deepseek-r1:14b with enhanced prompting
    Args:
        use_images: Whether to include image context in the prompt
    """
    # Prepare text context
    text_context = "\n".join([
        f"Source {i+1}:\n{chunk.payload['text']}\n" 
        for i, chunk in enumerate(retrieved_chunks)
    ])
    
    # Prepare image context if available and requested
    image_context = ""
    if use_images:
        image_descriptions = []
        for chunk in retrieved_chunks:
            if chunk.payload.get('image_url'):
                img_desc = f"Image from {chunk.payload['image_url']} related to: {chunk.payload['text'][:100]}..."
                image_descriptions.append(img_desc)
        if image_descriptions:
            image_context = "\nVisual Context:\n" + "\n".join(image_descriptions) + "\n"
    
    # Craft optimized prompt for deepseek-r1:14b
    prompt = f"""<|im_start|>system
You are an AI assistant that answers questions using the provided context. 
Be concise yet informative. If unsure, say you don't know.<|im_end|>

<|im_start|>user
Context Information:
{text_context}
{image_context}

Question: {query}

Provide a detailed answer based on the context. If the context doesn't contain the answer, 
say "I couldn't find that information in my sources."<|im_end|>

<|im_start|>assistant
"""
    
    # Generate response with optimized parameters for deepseek-r1
    response = ollama_client.generate(
        model='deepseek-r1:14b',
        prompt=prompt,
        options={
            'temperature': 0.3,  # Lower for more factual responses
            'top_p': 0.9,
            'num_ctx': 4096,     # Utilize full context window
            'repeat_penalty': 1.1
        },
        stream=False
    )
    
    return response['response'].strip()

def multimodal_qa(query, collection_name="multimodal_rag", top_k=3):
    """End-to-end QA pipeline with multimodal capabilities"""
    # Retrieve relevant chunks
    chunks = retrieve_relevant_chunks(query, collection_name, top_k)
    
    # First try with text only
    answer = generate_response(query, chunks, use_images=False)
    
    # If answer is uncertain, try including image context
    if "I couldn't find" in answer or "don't know" in answer.lower():
        answer = generate_response(query, chunks, use_images=True)
    
    return answer

In [45]:
def process_website(url, collection_name="multimodal_rag", max_images=20):
    """
    Enhanced website processing with better error handling and metadata capture
    Args:
        url: Website URL to process
        collection_name: VectorDB collection name
        max_images: Maximum number of images to process per page
    Returns:
        Tuple (success: bool, num_images_processed: int)
    """
    try:
        # Scrape website with enhanced metadata
        text_content, image_urls = scrape_website(url)
        
        if not text_content:
            print(f"No text content found at {url}")
            return False, 0
        
        # Process text with additional metadata
        processed_text = preprocess_text(text_content)
        text_embedding = get_text_embedding(processed_text)
        
        # Process images with better error handling
        image_embeddings = []
        successful_images = 0
        for img_url in image_urls[:max_images]:
            try:
                img_embedding = get_image_embedding(img_url)
                if img_embedding is not None:
                    image_embeddings.append(img_embedding)
                    successful_images += 1
            except Exception as img_e:
                print(f"Failed to process image {img_url}: {str(img_e)}")
                continue
        
        # Prepare metadata payload
        metadata = {
            "source_url": url,
            "timestamp": datetime.datetime.now().isoformat(),
            "text_length": len(text_content),
            "num_images": successful_images,
            "processed_text_length": len(processed_text)
        }
        
        # Store in vector DB - now supports multiple images
        for i, (img_url, img_embedding) in enumerate(zip(image_urls[:max_images], image_embeddings)):
            store_data(
                collection_name=collection_name,
                text=text_content,
                text_embedding=text_embedding,
                image_url=img_url,
                image_embedding=img_embedding,
                metadata={**metadata, "image_index": i}
            )
        
        # Also store a text-only version if we have images
        if successful_images > 0:
            store_data(
                collection_name=collection_name,
                text=text_content,
                text_embedding=text_embedding,
                image_url=None,
                image_embedding=None,
                metadata=metadata
            )
        
        return True, successful_images
    
    except Exception as e:
        print(f"Error processing website {url}: {str(e)}")
        return False, 0

def ask_question(query, collection_name="multimodal_rag", include_sources=False):
    """
    Enhanced QA with multimodal support and source attribution
    Args:
        query: User question
        collection_name: VectorDB collection name
        include_sources: Whether to include source URLs in response
    Returns:
        Dict: {
            "answer": str,
            "sources": list[str] (if include_sources=True),
            "confidence": float (0-1)
        }
    """
    try:
        # Retrieve relevant chunks with hybrid search
        retrieved_chunks = retrieve_relevant_chunks(query, collection_name)
        
        if not retrieved_chunks:
            return {
                "answer": "I couldn't find any relevant information to answer your question.",
                "sources": [],
                "confidence": 0.0
            }
        
        # Generate initial text-based response
        answer = generate_response(query, retrieved_chunks, use_images=False)
        
        # Calculate confidence score based on retrieval scores
        confidence = min(1.0, max(0.0, sum(c.score for c in retrieved_chunks) / len(retrieved_chunks)))
        
        # If low confidence, try with image context
        if confidence < 0.5:
            image_answer = generate_response(query, retrieved_chunks, use_images=True)
            if "I couldn't find" not in image_answer:
                answer = image_answer
                confidence = min(1.0, confidence + 0.2)  # Boost confidence
        
        # Prepare sources if requested
        sources = []
        if include_sources:
            sources = list({c.payload.get('metadata', {}).get('source_url') 
                          for c in retrieved_chunks if c.payload.get('metadata', {}).get('source_url')})
        
        return {
            "answer": answer,
            "sources": sources if include_sources else None,
            "confidence": round(confidence, 2)
        }
    
    except Exception as e:
        print(f"Error answering question: {str(e)}")
        return {
            "answer": "An error occurred while processing your question.",
            "sources": [],
            "confidence": 0.0
        }

In [47]:
# Process a website
success, num_images = process_website(url, max_images=8)
print(f"Processed {num_images} images")

# Ask a question with sources
response = ask_question("What is this website about?", include_sources=True)
print("Answer:", response["answer"])
print("Sources:", response["sources"])
print("Confidence:", response["confidence"])

# Simple query
answer = ask_question("When was this content last updated?")
print(answer["answer"])

Error processing website https://magazine.sebastianraschka.com/p/understanding-multimodal-llms: store_data() got an unexpected keyword argument 'metadata'
Processed 0 images
Error answering question: Collection multimodal_rag not found
Answer: An error occurred while processing your question.
Sources: []
Confidence: 0.0
Error answering question: Collection multimodal_rag not found
An error occurred while processing your question.


  results = client.search(
