## Multimodal Retrieval-Augmented Generation (RAG) with LlamaParse

### 1. Dependencies, Imports & Setup

In [1]:
from llama_parse import LlamaParse
from llama_index.core import Settings,SimpleDirectoryReader,StorageContext,VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.vector_stores.kdbai import KDBAIVectorStore
from llama_index.postprocessor.cohere_rerank import CohereRerank
from getpass import getpass
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.ollama import Ollama


import requests
import os
import io
import pandas as pd
import torch
from dotenv import load_dotenv

from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from llama_index.llms.groq import Groq

from unstructured.partition.pdf import partition_pdf
import chromadb

[nltk_data] Downloading package punkt_tab to /home/joaocosentino/.pyen
[nltk_data]     v/versions/user_manual/lib/python3.10/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


ImportError: cannot import name 'field_validator' from 'llama_index.core.bridge.pydantic' (/home/joaocosentino/.pyenv/versions/user_manual/lib/python3.10/site-packages/llama_index/core/bridge/pydantic.py)

### 2. Loading the Required API Keys

In [2]:
# nest_asyncio patches asyncio so an event loop can be re-entered, which is
# needed when async clients run inside the notebook's already-running loop.
import nest_asyncio

nest_asyncio.apply()

# Read the API keys from the project-level .env file; a missing key yields None.
load_dotenv('../.env')
# NOTE(review): the key is spelled 'LLMA_CLOUD_API' — confirm it matches the .env entry.
llama_cloud = os.environ.get('LLMA_CLOUD_API')
open_ai = os.environ.get('OPENAI_API')
groq = os.environ.get('GROQ_API')

### 3. Setting up Chroma DB

In [None]:
def add_collection(file_path, collection_name, embedding=None):
    '''
    Load a PDF, split it into chunks and store them in a persistent Chroma collection.

    The storage location is read from the ``STORAGE_PATH`` variable, which is
    loaded from a local ``.env`` file on every call.

    Args:
        file_path: Path of the PDF handed to ``UnstructuredPDFLoader``.
        collection_name: Name of the Chroma collection that receives the chunks.
        embedding: Embedding function passed to ``Chroma.from_documents``.
            When omitted, ``OllamaEmbeddings(model="nomic-embed-text")`` is used.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.

    Raises:
        ValueError: If ``STORAGE_PATH`` is not set.
        NotADirectoryError: If ``STORAGE_PATH`` is not an existing directory.
    '''
    load_dotenv('.env')

    storage_path = os.getenv('STORAGE_PATH')
    if storage_path is None:
        raise ValueError('STORAGE_PATH environment variable is not set')

    if not os.path.isdir(storage_path):
        raise NotADirectoryError('STORAGE_PATH must be an existing directory')

    # Build the default embedding lazily so callers that pass their own
    # embedding never need an Ollama server.
    if embedding is None:
        embedding = OllamaEmbeddings(model="nomic-embed-text")

    # Local PDF file uploads
    print("Reading pdf...")
    loader = UnstructuredPDFLoader(file_path=file_path)
    data = loader.load()
    print("Done!")

    # Split and chunk
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    chunks = text_splitter.split_documents(data)

    # Add to vector database
    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        collection_name=collection_name,
        persist_directory=storage_path,
    )

    print(f'File {file_path} uploaded to collection {collection_name}')
    return vector_db

### 4. LlamaParse & LlamaIndex Setup

In [3]:
# Names of Ollama models; the active code in this cell does not reference them.
EMBEDDING_MODEL = "nomic-embed-text"
GENERATION_MODEL = "mistral"
local_model = "mistral"

# Generation goes through Groq and embeddings come from FastEmbed.
# Ollama alternatives kept for reference:
#   llm_local = ChatOllama(model=local_model)
#   embed_model = OllamaEmbeddings(model=EMBEDDING_MODEL, show_progress=True)
llm_local = Groq(
    model="mixtral-8x7b-32768",
    api_key=groq,
)
embed_model = FastEmbedEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

# Register the models and chunk size on the global LlamaIndex Settings object.
Settings.llm = llm_local
Settings.embed_model = embed_model
Settings.chunk_size = 1024

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 31823.25it/s]


### PDF Paths

In [4]:
# Paths are relative to the notebook's working directory.
# Excerpt of the manual (file name suggests pages 283-300), used for quick parsing runs.
short_pdf = "../pdf_files/owner_manual_p283-p300.pdf"
# Complete owner's manual.
pdf_path = "../pdf_files/owner_manual_full.pdf"

In [23]:
# Prompt that asks an LLM to rewrite a user question into five alternative
# phrasings for vector-database retrieval over the RAM 1500 owner's manual.
# NOTE(review): despite its name this is not the text given to LlamaParse (the
# parse call passes `ins`), and no visible cell references this variable.
parsing_instructions = '''You will answer questions about information that can be found in the owner's manual of the RAM 1500 vehicle, model year 2025, Crew Cab version.
    Your task is to generate five different versions of the given user question to retrieve relevant documents
    from a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.'''


In [14]:
# Parsing prompt handed to LlamaParse: tells the parser how to turn each page
# into markdown (text, hyperlinks, formulas as LaTeX, image/graph descriptions).
# The text is sent to the service as-is, so edits here change parsing results.
ins = """
You are a highly proficient language model designed to convert pages from PDF, PPT and other files into structured markdown text. Your goal is to accurately transcribe text, represent formulas in LaTeX MathJax notation, and identify and describe images, particularly graphs and other graphical elements.

You have been tasked with creating a markdown copy of each page from the provided PDF or PPT image. Each image description must include a full description of the content, a summary of the graphical object.

Maintain the sequence of all the elements.

For the following element, follow the requirement of extraction:
for Text:
   - Extract all readable text from the page.
   - Exclude any diagonal text, headers, and footers.

for Text which includes hyperlink:
    -Extract hyperlink and present it with the text

for Formulas:
   - Identify and convert all formulas into LaTeX MathJax notation.

for Image Identification and Description:
   - Identify all images, graphs, and other graphical elements on the page.
   - If image contains wording that is hard to extract , flag it with <unidentifiable section> instead of parsing.
   - For each image, include a full description of the content in the alt text, followed by a brief summary of the graphical object.
   - If the image has a subtitle or caption, include it in the description.
   - If the image has a formula convert it into LaTeX MathJax notation.
   - If the image has a organisation chart , convert it into a hierachical understandable format.
   - for graph , extract the value in table form as markdown representation


# OUTPUT INSTRUCTIONS

- Ensure all formulas are in LaTeX MathJax notation.
- Exclude any diagonal text, headers, and footers from the output.
- For each image and graph, provide a detailed description and summary.
"""

### 5. Parse the document with LlamaParse into markdown format

In [15]:
# Parse the short PDF into markdown documents via the LlamaParse cloud service.
# The keyword is `parsing_instruction` (singular); a misspelled keyword is not
# applied, so the prompt in `ins` would never reach the parser.
documents = LlamaParse(
    api_key=llama_cloud,
    result_type="markdown",
    parsing_instruction=ins,
).load_data(short_pdf)

Started parsing the file under job_id 779544e4-0959-4339-b643-cc5530ba2337


### 6. Extract Text and Table nodes from Markdown Document

In [16]:
# Construct the parser directly. `from_defaults()` is a classmethod that builds
# a fresh parser, so chaining it onto a configured instance would discard the
# `llm` and `num_workers` arguments given here.
node_parser = MarkdownElementNodeParser(llm=llm_local, num_workers=8)

# Split the markdown documents into nodes, then separate them into the base
# nodes and the objects returned by `get_nodes_and_objects`.
nodes = node_parser.get_nodes_from_documents(documents)
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

1it [00:00, 2998.07it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 10782.27it/s]
0it [00:00, ?it/s]
1it [00:00, 6553.60it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 8756.38it/s]
0it [00:00, ?it/s]


### 7. Use a Reranker to improve retrieval

In [17]:
# Index the base nodes together with the objects produced by the node parser.
recursive_index = VectorStoreIndex(
    nodes=[*base_nodes, *objects],
)

In [19]:
# FlagEmbeddingReranker is already imported in the notebook's import cell.
# The reranker keeps the 5 highest-scoring nodes of those it is given.
reranker = FlagEmbeddingReranker(
    top_n=5,
    model="BAAI/bge-reranker-large",
)

# Retrieve 15 candidates by similarity, then pass them through the reranker.
recursive_query_engine = recursive_index.as_query_engine(
    similarity_top_k=15,
    node_postprocessors=[reranker],
    verbose=True,
)

In [36]:
# Probe question that is unrelated to the parsed manual, sent through the
# reranking query engine; the printed response is shown in the cell output.
query = "can you tell me which LLM are you based on?"
response = recursive_query_engine.query(query)
print(response)

[1;3;38;2;11;159;203mRetrieval entering 32cc2375-fff9-4404-9241-be54f33ea61d: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query can you tell me which LLM are you based on?
[0m[1;3;38;2;11;159;203mRetrieval entering e1078670-a011-49fe-88b0-921aa4959b8b: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query can you tell me which LLM are you based on?
[0m[1;3;38;2;11;159;203mRetrieval entering 25f5be44-08fb-4f7a-9979-8776a4bcfa03: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query can you tell me which LLM are you based on?
[0m[1;3;38;2;11;159;203mRetrieval entering ae5caff0-cb51-43e3-bb91-063d798bc25f: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query can you tell me which LLM are you based on?
[0mBased on the provided context, there is no information about an "LLM" or any system, organization, or entity by that acronym. Therefore, I cannot determine any association with an 

: 

In [28]:


# Load every readable file in the PDF folder with SimpleDirectoryReader.
# The folder path is named `pdf_dir` so the builtin `dir()` is not shadowed
# for the rest of the kernel session.
pdf_dir = "../pdf_files"
reader = SimpleDirectoryReader(input_dir=pdf_dir)
docs = reader.load_data()

In [31]:
# Number of items returned by `reader.load_data()`.
len(docs)

396