In [4]:
from fastapi import FastAPI, File, UploadFile, HTTPException
from pydantic import BaseModel
from typing import List
import os
from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from rank_bm25 import BM25Okapi
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from flashrank import Ranker
import pickle
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
embeddings = OpenAIEmbeddings()
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
ranker = Ranker()

  embeddings = OpenAIEmbeddings()
  llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
INFO:flashrank.Ranker:Downloading ms-marco-TinyBERT-L-2-v2...
ms-marco-TinyBERT-L-2-v2.zip: 100%|██████████| 3.26M/3.26M [00:01<00:00, 2.91MiB/s]


#### Utility functions

In [5]:
ARCHIVE_DIR = "./archives"
def load_or_initialize_indices():
  try:
    with open(ARCHIVE_DIR+"/bm25_index.pkl", "rb") as f:
      bm25_index = pickle.load(f)
  except FileNotFoundError:
    bm25_index = BM25Okapi([])

  try:
    faiss_index = FAISS.load_local(ARCHIVE_DIR+"/faiss_index", embeddings)
  except FileNotFoundError:
    faiss_index = None

  return bm25_index, faiss_index

def save_indices(bm25_index, faiss_index):
  with open(ARCHIVE_DIR+"/bm25_index.pkl", "wb") as f:
    pickle.dump(bm25_index, f)
  faiss_index.save_local(ARCHIVE_DIR+"/faiss_index")

def rerank_documents(query, documents):
  reranked_docs = ranker.rerank(query, documents)
  return reranked_docs

#### Document ingestion

In [18]:
instruction = """The provided document is a PDF file containing structured and unstructured content.
It may include financial information, tables, management discussions, and analyses.
Try to capture the essence of the document, including text, tables, and key highlights.
Be precise and ensure data integrity while processing."""

async def parse_pdf(file_path: str):
  parser = LlamaParse(
      result_type="markdown",
      parsing_instruction=instruction,
      max_timeout=5000,
  )
  return await parser.aload_data(file_path)

async def ingest_documents(files: List[str]):
  try:
    parsed_docs = []
    for file_path in files:
      root, ext = os.path.splitext(file_path)
      if ext == ".pdf":
        with open(file_path, "rb") as file:
          parsed_content = await parse_pdf(file_path)
          parsed_docs.extend(parsed_content)

    documents = []
    for doc in parsed_docs:
      md_content = doc.to_markdown()
      chunks = text_splitter.split_text(md_content)
      documents.extend(chunks)

    bm25_index, faiss_index = load_or_initialize_indices()

    new_corpus = bm25_index.corpus + [doc.split() for doc in documents]
    bm25_index = BM25Okapi(new_corpus)

    if faiss_index:
      faiss_index.add_texts(documents)
    else:
      faiss_index = FAISS.from_texts(documents, embeddings)

    save_indices(bm25_index, faiss_index)

    return {"message": "Documents ingested and indexed successfully."}
  except Exception as e:
    raise HTTPException(status_code=500, detail=str(e))

#### Query for answer

In [10]:
class Query(BaseModel):
  question: str

def query_documents(query: Query):
  try:
    bm25_index, faiss_index = load_or_initialize_indices()

    bm25_docs = bm25_index.get_top_n(query.question.split(), bm25_index.corpus, n=5)

    faiss_docs = faiss_index.similarity_search(query.question, k=5)

    combined_docs = list(set(bm25_docs + faiss_docs))
    reranked_docs = rerank_documents(query.question, combined_docs)

    qa_chain = RetrievalQA.from_chain_type(llm, retriever=faiss_index.as_retriever())
    answer = qa_chain.run(query.question)

    return {"answer": answer, "reranked_docs": reranked_docs}
  except Exception as e:
    raise HTTPException(status_code=500, detail=str(e))

#### Begin data ingestion

In [19]:
files = ["./dataset/ColPali_2407.01449v3.pdf"]
await ingest_documents(files)

INFO:httpx:HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 0320b55d-cd25-4023-92f7-569feb2f5819


INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/0320b55d-cd25-4023-92f7-569feb2f5819 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/0320b55d-cd25-4023-92f7-569feb2f5819 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/0320b55d-cd25-4023-92f7-569feb2f5819 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/0320b55d-cd25-4023-92f7-569feb2f5819/result/markdown "HTTP/1.1 200 OK"


HTTPException: 500: 'Document' object has no attribute 'to_markdown'