## Multimodal Retrieval-Augmented Generation (RAG) with LlamaParse

### 1. Dependencies, Imports & Setup

In [2]:
from llama_parse import LlamaParse
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.vector_stores.kdbai import KDBAIVectorStore
from llama_index.postprocessor.cohere_rerank import CohereRerank
from getpass import getpass
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import Settings

import requests
import os
import io
import pandas as pd
import torch
from dotenv import load_dotenv

from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from llama_index.llms.groq import Groq

from unstructured.partition.pdf import partition_pdf
import chromadb

  from .autonotebook import tqdm as notebook_tqdm


### 2. Loading the Required API Keys

In [8]:
# nest_asyncio is applied before any async client is used in this notebook.
import nest_asyncio

nest_asyncio.apply()

# Read the API keys from the project-level .env file. A key that is missing
# from both the file and the process environment ends up as None.
load_dotenv('../.env')

# NOTE(review): 'LLMA_CLOUD_API' looks like a typo of 'LLAMA_CLOUD_API' --
# confirm against the .env file before renaming it.
llama_cloud, open_ai, groq = (
    os.environ.get(key_name)
    for key_name in ('LLMA_CLOUD_API', 'OPENAI_API', 'GROQ_API')
)

### 3. Setting up Chroma DB

In [None]:
def add_collection(file_path, collection_name, embedding=None):
    '''
    Load a PDF, split it into chunks and store them in a persistent Chroma collection.

    Args:
        file_path: Path of the PDF file to ingest.
        collection_name: Name of the Chroma collection that receives the chunks.
        embedding: Optional LangChain embedding object handed to Chroma. When
            omitted, ``OllamaEmbeddings(model="nomic-embed-text")`` is created
            (presumably this needs a reachable Ollama server with that model
            pulled -- verify in your environment).

    Returns:
        The vector store object returned by ``Chroma.from_documents``.

    Raises:
        ValueError: If the STORAGE_PATH environment variable is not set.
        NotADirectoryError: If STORAGE_PATH is not an existing directory.
    '''
    # NOTE(review): the API-key cell loads '../.env' while this loads '.env';
    # confirm which file is meant to define STORAGE_PATH.
    load_dotenv('.env')

    storage_path = os.getenv('STORAGE_PATH')
    if storage_path is None:
        raise ValueError('STORAGE_PATH environment variable is not set')

    if not os.path.isdir(storage_path):
        raise NotADirectoryError('STORAGE_PATH must be an existing directory')

    # Build the default embedding lazily so that nothing is instantiated when
    # the caller supplies their own. The previous code read an undefined
    # module-level name here and failed with NameError on a fresh kernel.
    if embedding is None:
        embedding = OllamaEmbeddings(model="nomic-embed-text")

    # Local PDF file uploads
    print("Reading pdf...")
    loader = UnstructuredPDFLoader(file_path=file_path)
    data = loader.load()
    print("Done!")

    # Split and chunk
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    chunks = text_splitter.split_documents(data)

    # Add to vector database
    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        collection_name=collection_name,
        persist_directory=storage_path,
    )

    print(f'File {file_path} uploaded to collection {collection_name}')

    # Hand the store back so the caller can query it without reopening it.
    return vector_db

### 4. LlamaParse & LlamaIndex Setup

In [4]:
# Model identifiers for the local Ollama variant; only the disabled
# alternatives listed below refer to them.
EMBEDDING_MODEL, GENERATION_MODEL = "nomic-embed-text", "mistral"
local_model = "mistral"

# Active backends: a Groq-hosted Mixtral model for generation and a FastEmbed
# BGE-small model for embeddings.
# Local alternatives, currently disabled:
#   llm_local = ChatOllama(model=local_model)
#   embed_model = OllamaEmbeddings(model=EMBEDDING_MODEL,show_progress=True)
embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
llm_local = Groq(api_key=groq, model="mixtral-8x7b-32768")

# Assign the chunk size, embedding model and LLM on the LlamaIndex Settings object.
Settings.chunk_size = 1024
Settings.embed_model = embed_model
Settings.llm = llm_local

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 18063.32it/s]


In [5]:
# Relative paths of the input PDFs: the complete manual and a shorter excerpt.
pdf_path = "../pdf_files/owner_manual_full.pdf"
short_pdf = "../pdf_files/owner_manual_p283-p300.pdf"

In [None]:
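# NOTE: disabled draft. The text below describes a different document (an LLM
# in-context recall preprint), not the owner's manual referenced by pdf_path.
# parsing_instructions = '''The document titled "LLM In-Context Recall is Prompt Dependent"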
#     is an academic preprint from April 2024, authored by Daniel Machlab and Rick Battle from the VMware NLP Lab.
#     It explores the in-context recall capabilities of Large Language Models (LLMs) using a method called
#     "needle-in-a-haystack," where a specific factoid is embedded in a block of unrelated text.
#     The study investigates how the recall performance of various LLMs is influenced by the content
#     of prompts and the biases in their training data. The research involves testing multiple LLMs with
#     varying context window sizes to assess their ability to recall information accurately when prompted differently.
#     The paper includes detailed methodologies, results from numerous tests, discussions on the impact of prompt variations
#     and training data, and conclusions on improving LLM utility in practical applications. It contains many tables.
#     Answer questions using the information in this article and be precise.'''


### 5. Parse the document with LlamaParse into markdown format

In [6]:
# Parse the PDF at pdf_path with LlamaParse, requesting markdown output.
documents = LlamaParse(
    result_type="markdown",
    api_key=llama_cloud,
).load_data(pdf_path)

Started parsing the file under job_id e2c09d41-429b-48cd-9cae-14372fd4cd6d
...

### 6. Extract Text and Table nodes from Markdown Document

In [7]:
# Construct the parser directly. Do not chain `.from_defaults()` onto the
# instance: it is a classmethod, so it builds a brand-new parser and drops the
# `llm` / `num_workers` arguments passed to the constructor.
#
# num_workers is deliberately low: this cell previously stopped with a Groq
# 429 "tokens per minute" rate-limit error, and fewer parallel workers reduce
# the request burst. NOTE(review): this may still exceed a 5000 TPM limit on the
# full manual -- consider parsing `short_pdf` or using a higher-quota key.
node_parser = MarkdownElementNodeParser(llm=llm_local, num_workers=1)

# First turn the parsed markdown documents into nodes, then separate them into
# base nodes and objects for indexing.
nodes = node_parser.get_nodes_from_documents(documents)
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 7133.17it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 9383.23it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 11848.32it/s]
1it [00:00, 4712.70it/s]
1it [00:00, 3533.53it/s]
2it [00:00, 8004.40it/s]
1it [00:00, 4232.40it/s]
1it [00:00, 3971.88it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 7002.18it/s]
0it [00:00, ?it/s]
1it [00:00, 10356.31it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 10810.06it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 10782.27it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 6502.80it/s]
0it [00:00, ?it/s]
0it [0

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `mixtral-8x7b-32768` in organization `org_01j92rtewxfa8s38dr38s3yz1n` on tokens per minute (TPM): Limit 5000, Used 11309, Requested 253. Please try again in 1m18.754s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

### 7. Use a Reranker to improve retrieval

In [83]:
# Build one vector index over the base nodes followed by the extracted objects.
recursive_index = VectorStoreIndex(nodes=[*base_nodes, *objects])

In [85]:
# Re-imported in this cell; the import cell at the top already provides it.
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# The reranker is configured to keep the 5 best-scoring nodes.
reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=5)

# Query engine: fetch 15 candidates by similarity, then pass them through the
# reranker as a node postprocessor.
recursive_query_engine = recursive_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=15,
    verbose=True,
)

OllamaEmbeddings:   0%|          | 0/10 [12:04:50<?, ?it/s]
OllamaEmbeddings:   0%|          | 0/10 [12:02:39<?, ?it/s]
OllamaEmbeddings:   0%|          | 0/10 [12:01:13<?, ?it/s]
OllamaEmbeddings:   0%|          | 0/10 [9:46:57<?, ?it/s]


In [90]:
# Run a sample question through the reranking query engine and print the answer text.
query = "Can you show me any table?"
response = recursive_query_engine.query(query)
print(str(response))

[1;3;38;2;11;159;203mRetrieval entering 51b418eb-1700-4e15-ad7f-9c8a0efdfbd3: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Can you show me any table?
[0m[1;3;38;2;11;159;203mRetrieval entering be878eff-e32b-4bc6-9cba-9f875da90a8a: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Can you show me any table?
[0m[1;3;38;2;11;159;203mRetrieval entering bbcc67a7-7f3b-4bbc-aa5a-62296fdc27eb: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Can you show me any table?
[0m[1;3;38;2;11;159;203mRetrieval entering c6dcaf6f-a9e2-4221-81a2-6195bc22b012: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query Can you show me any table?
[0mSure, I can present you with the "Towing Options" table from the context information:

Towing Options

|Wheels Off The Ground|Two-Wheel Drive Models|
|---|---|
|Flat Tow|NONE|
|Dolly Tow|Front|
| |NOT ALLOWED|
| |Rear|
| |OK|
|On Trailer|ALL|
| |OK