In [3]:
# <a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/docs/examples/multi_modal/multi_modal_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
# 
# # Multi-Modal Retrieval using Groq Text Embedding and CLIP Image Embedding for Wikipedia Articles
# 
# In this notebook, we demonstrate how to build a multi-modal retrieval system using LlamaIndex with Groq text embeddings and CLIP image embeddings.
# 
# **Wikipedia Text Embedding Index:** Generate Groq text embeddings for the article text.
# 
# **Wikipedia Images Embedding Index:** [CLIP](https://github.com/openai/CLIP) embeddings from OpenAI for images.
# 
# **Query Encoder:**
# - Encode query text for the text index using Groq embeddings.
# - Encode query text for the image index using CLIP embeddings.
# 
# **Framework:** [LlamaIndex](https://github.com/run-llama/llama_index)
# 
# **Steps:**
# 1. Download texts and images from Wikipedia articles.
# 2. Build a text index for the vector store using Groq embeddings.
# 3. Build an image index for the vector store using CLIP embeddings.
# 4. Retrieve relevant text and images simultaneously, encoding the query separately for the text and image vector stores.

In [1]:
# Step 1: Install Required Packages
%pip install -q llama-index-vector-stores-qdrant

You should consider upgrading via the '/Users/taurangela/Desktop/Github/Multimodal-RAG/env/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
%pip install llama_index ftfy regex tqdm
%pip install git+https://github.com/openai/CLIP.git
%pip install torch torchvision
%pip install matplotlib scikit-image
%pip install -U qdrant_client
%pip install requests

Collecting llama_index
  Downloading llama_index-0.11.8-py3-none-any.whl (6.8 kB)
Collecting ftfy
  Downloading ftfy-6.2.3-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.0/43.0 KB[0m [31m400.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting llama-index-agent-openai<0.4.0,>=0.3.1
  Downloading llama_index_agent_openai-0.3.1-py3-none-any.whl (13 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48
  Downloading llama_index_legacy-0.9.48.post3-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting llama-index-multi-modal-llms-openai<0.3.0,>=0.2.0
  Downloading llama_index_multi_modal_llms_openai-0.2.0-py3-none-any.whl (5.9 kB)
Collecting llama-index-cli<0.4.0,>=0.3.1
  Downloading llama_index_cli-0.3.1-py3-none-any.whl (27 kB)
Collecting llama-index-program-openai<0.3.0,>=0.2.0
  Downloading llama_index_program_openai-

In [6]:
%pip install -qU langchain-groq python-dotenv

You should consider upgrading via the '/Users/taurangela/Desktop/Github/Multimodal-RAG/env/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [7]:
# Step 2: Load and Download Multi-Modal Datasets Including Texts and Images from Wikipedia

from pathlib import Path
import requests

# Articles to download; each one is saved as data_wiki/<title>.txt.
wiki_titles = ["RoboCop", "Labour Party (UK)", "SpaceX", "OpenAI"]
data_path = Path("data_wiki")

data_path.mkdir(parents=True, exist_ok=True)

for title in wiki_titles:
    # Bound the request so a stalled connection cannot hang the cell, and
    # surface HTTP errors here rather than as a confusing KeyError below.
    http_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={"action": "query", "format": "json", "titles": title, "prop": "extracts", "explaintext": True},
        timeout=30,
    )
    http_response.raise_for_status()
    response = http_response.json()

    # A single title was requested, so only the first page entry is read.
    page = next(iter(response["query"]["pages"].values()))
    wiki_text = page.get("extract")
    if wiki_text is None:
        # No "extract" key in the page entry: skip it instead of crashing,
        # so the remaining titles are still downloaded.
        print(f"No text extract returned for Wikipedia page: {title}")
        continue

    # Write as UTF-8 explicitly: the platform default encoding may not be able
    # to represent every character in the article text.
    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

In [8]:
# Step 3: Parse Wikipedia Images and Load into Local Folder

import wikipedia
import urllib.request
import os

image_path = Path("data_wiki")
image_uuid = 0  # id of the most recently *saved* image; ids start at 1
image_metadata_dict = {}  # image id -> {"filename": ..., "img_path": ...}
MAX_IMAGES_PER_WIKI = 30

image_path.mkdir(parents=True, exist_ok=True)

for title in wiki_titles:
    images_per_wiki = 0

    # Only the page lookup is guarded here, so that a failure while
    # downloading one image does not discard the rest of the page's images.
    try:
        page_py = wikipedia.page(title)
        list_img_urls = page_py.images
    except Exception as e:
        print(f"Error retrieving images for Wikipedia page: {title}. {e}")
        continue

    for url in list_img_urls:
        if not url.endswith((".jpg", ".png")):
            continue

        # Reserve the next id, but commit it only once the file is on disk,
        # so image_metadata_dict never points at an image that was not saved.
        candidate_uuid = image_uuid + 1
        image_file_name = f"{title}_{url.split('/')[-1]}"
        # NOTE(review): every image is stored with a ".jpg" suffix, including
        # PNG sources; kept as-is so the on-disk naming is unchanged.
        image_file_path = image_path / f"{candidate_uuid}.jpg"

        try:
            urllib.request.urlretrieve(url, image_file_path)
        except Exception as e:
            print(f"Error downloading image {url} for Wikipedia page: {title}. {e}")
            continue

        image_uuid = candidate_uuid
        image_metadata_dict[image_uuid] = {"filename": image_file_name, "img_path": str(image_file_path)}
        images_per_wiki += 1
        if images_per_wiki >= MAX_IMAGES_PER_WIKI:
            break

ModuleNotFoundError: No module named 'wikipedia'

In [None]:
# Step 4: Set Up API Key for Groq

import os
from dotenv import load_dotenv

# Populate the process environment from a .env file
load_dotenv()

# Look up the Groq key; api_key is None when GROQ_API_KEY is not set
api_key = os.environ.get("GROQ_API_KEY")

In [None]:
# Step 5: Build Multi-Modal Vector Store Using Text and Image Embeddings Under Different Collections

# `qdrant_client` is the module that provides QdrantClient, used just below.
import qdrant_client
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.indices import MultiModalVectorStoreIndex

# Qdrant client pointed at the local path "qdrant_d_0".
client = qdrant_client.QdrantClient(path="qdrant_d_0")

# Text and image vectors are kept in separate collections on the same client.
text_store = QdrantVectorStore(client=client, collection_name="text_collection_0")
image_store = QdrantVectorStore(client=client, collection_name="image_collection_0")
storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store)

# Load documents from local directory
documents = SimpleDirectoryReader("./data_wiki/").load_data()

# Create a MultiModal index
# NOTE(review): no embedding model is passed here, so the index uses whatever
# text/image embedding models LlamaIndex is configured with by default —
# confirm this matches the embeddings described in the introduction.
index = MultiModalVectorStoreIndex.from_documents(documents, storage_context=storage_context)


In [None]:
# Step 6: Plot Downloaded Images from Wikipedia

from PIL import Image
import matplotlib.pyplot as plt

def plot_images(image_metadata_dict, max_images=64):
    """Show images in a square grid of thumbnails.

    Args:
        image_metadata_dict: Either a mapping of image id to a metadata dict
            containing an "img_path" entry, or a plain list/tuple of image
            file paths.
        max_images: Maximum number of entries to consider. Only the first
            ``max_images`` entries are looked at; entries whose file does not
            exist are skipped and leave their grid slot empty.
    """
    import math

    # Keep the 8x8 layout for up to 64 images, and grow the grid beyond that
    # so a larger max_images no longer asks for a subplot slot that does not
    # exist.
    grid_size = max(8, math.ceil(math.sqrt(max_images)))
    paths_given_directly = isinstance(image_metadata_dict, (list, tuple))

    plt.figure(figsize=(16, 16))
    for idx, image_id in enumerate(image_metadata_dict):
        if idx >= max_images:
            break
        if paths_given_directly:
            img_path = image_id
        else:
            img_path = image_metadata_dict[image_id]["img_path"]
        if os.path.isfile(img_path):
            # Close the underlying file once the pixel data has been converted.
            with Image.open(img_path) as opened:
                image = opened.convert("RGB")
            plt.subplot(grid_size, grid_size, idx + 1)
            plt.imshow(image)
            plt.xticks([])
            plt.yticks([])

    plt.tight_layout()
    plt.show()

plot_images(image_metadata_dict)

In [None]:
# Step 7: Get Multi-Modal Retrieval Results for Some Example Queries

from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.schema import ImageNode

def retrieve_and_plot(query, top_k_text=3, top_k_image=5):
    """Retrieve text and image matches for ``query`` and display them.

    Text results are rendered with ``display_source_node``; image results are
    collected and plotted together once all results have been processed.

    Args:
        query: Query string passed to the multi-modal retriever.
        top_k_text: Value passed as ``similarity_top_k`` to the retriever.
        top_k_image: Value passed as ``image_similarity_top_k`` to the
            retriever.
    """
    # The query string goes straight to the index retriever; no separate
    # embedding call is needed (and no `groq_client` module is available).
    retriever = index.as_retriever(similarity_top_k=top_k_text, image_similarity_top_k=top_k_image)
    retrieval_results = retriever.retrieve(query)

    retrieved_image = []
    for res_node in retrieval_results:
        if isinstance(res_node.node, ImageNode):
            retrieved_image.append(res_node.node.metadata["file_path"])
        else:
            display_source_node(res_node, source_length=200)

    # plot_images looks images up as {id: {"img_path": ...}}, so wrap the
    # retrieved paths in that shape instead of passing the bare list.
    plot_images({position: {"img_path": path} for position, path in enumerate(retrieved_image)})

In [None]:
# Sample questions to run through the retrieval pipeline
queries = [
    "What is the Labour Party?",
    "Who created RoboCop?",
    "What does OpenAI do?",
    "Which company makes Tesla?",
]

# Show the retrieved text and images for each question, in order
for query in queries:
    retrieve_and_plot(query)