In [5]:
import nest_asyncio
nest_asyncio.apply()
import dotenv
dotenv.load_dotenv()
from rich import print as rprint

In [3]:
## llama parse to parse RAG files into documents


from llama_cloud_services import LlamaParse
import os
import asyncio

parser = LlamaParse(
    verbose=True,
    language="en"
)

# Directory containing RAG files
RAG_DIR = "data"

# Supported file extensions
SUPPORTED_EXTENSIONS = [".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".txt", ".md"]

async def parse_all_rag_files():
    """Parse all RAG files in the specified directory."""
    all_files = [
        os.path.join(RAG_DIR, f)
        for f in os.listdir(RAG_DIR)
        if os.path.splitext(f)[1].lower() in SUPPORTED_EXTENSIONS
    ]

    documents = []
    tasks = []

    for rag_doc in all_files:
        print(f"Queuing: {rag_doc}")
        tasks.append(parser.aparse(rag_doc)) # aysync parse

    # Run all parsing tasks concurrently
    results = await asyncio.gather(*tasks, return_exceptions=True)

    for rag_doc, job in zip(all_files, results):
        if isinstance(job, Exception):
            print(f"❌ Error parsing {rag_doc}: {job}")
            continue

        docs = job.get_markdown_documents(split_by_page=True) # From Job task get markdown docs
        print(f"✅ Parsed {rag_doc} -> {len(docs)} docs")

        for i, doc in enumerate(docs):
           
            doc.metadata["page_links"] = job.pages[i].links # Add page links metadata
            doc.metadata["file_name"] = os.path.basename(rag_doc) # Add original file

        documents.extend(docs)

    return documents

all_docs = asyncio.run(parse_all_rag_files())


Queuing: data/DoorDash Content Guidelines.pdf
Queuing: data/Partner Code of Conduct - United States.pdf
Queuing: data/Viewing your Canceled Orders on DoorDash for Merchants.pdf
Queuing: data/How do I cancel my order_.pdf
Queuing: data/Independent Contractor Agreement for USA.pdf
Queuing: data/Credits and Refunds.pdf
Queuing: data/How to appeal Dasher account deactivations_.pdf
Queuing: data/Retail Returns Policy.pdf
Queuing: data/Independent Contractor Agreement for NEW ZEALAND.pdf
Queuing: data/Independent Contractor Agreement for AUSTRALIA.pdf
Queuing: data/Understanding Auto Insurance Maintained by DoorDash.pdf
Queuing: data/Partner Code of Conduct - Canada.pdf
Queuing: data/cert_SEA_DoorDash_ Inc._3824483_7.pdf
Queuing: data/Independent Contractor Agreement for CANADA.pdf
Queuing: data/DoorDash Ads and Promotions Policies .pdf
Queuing: data/DoorDash Community Guidelines.pdf
Queuing: data/Consumer Terms and Conditions.pdf
Queuing: data/DoorDash Sexual Harassment Policy.pdf
Queuing: 

Retrying llama_cloud_services.parse.utils.make_api_request.<locals>._make_request in 4.0 seconds as it raised HTTPStatusError: Client error '429 Too Many Requests' for url 'https://api.cloud.llamaindex.ai/api/parsing/upload'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429.
Retrying llama_cloud_services.parse.utils.make_api_request.<locals>._make_request in 4.0 seconds as it raised HTTPStatusError: Client error '429 Too Many Requests' for url 'https://api.cloud.llamaindex.ai/api/parsing/upload'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429.
Retrying llama_cloud_services.parse.utils.make_api_request.<locals>._make_request in 4.0 seconds as it raised HTTPStatusError: Client error '429 Too Many Requests' for url 'https://api.cloud.llamaindex.ai/api/parsing/upload'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429.
Retrying llama_cloud_services.parse.utils.make_api_requ

Started parsing the file under job_id f51a8c06-2b6a-4b97-942b-bb94799b7c0c
Started parsing the file under job_id 4af5e115-9be1-4857-b5cc-b2ffa332af9b
Started parsing the file under job_id d0469f3d-a2d5-4246-ae87-66161a871771
Started parsing the file under job_id d8d19c3c-96e6-4c46-8628-9e2079324683
Started parsing the file under job_id d7dde8bd-f845-464a-857a-6cdd1c61d874
Started parsing the file under job_id 871de713-6ff1-403b-998c-916f478f03a1
Started parsing the file under job_id cf70b01f-b66e-41fb-b49e-263ab7e0b167
Started parsing the file under job_id 3036d6f6-e555-41f7-9ee0-3b12d141f40c
Started parsing the file under job_id e76c856b-e5de-49a8-b367-6278a04b4a20
Started parsing the file under job_id a437db3b-b5b8-452f-a11d-e11b7f591fa8
✅ Parsed data/DoorDash Content Guidelines.pdf -> 5 docs
✅ Parsed data/Partner Code of Conduct - United States.pdf -> 3 docs
✅ Parsed data/Viewing your Canceled Orders on DoorDash for Merchants.pdf -> 8 docs
✅ Parsed data/How do I cancel my order_.pdf

In [None]:
rprint(len(all_docs))

import pickle


Saved 292 documents ✅
Loaded 292 documents ✅


In [None]:
from rich import print as rprint

# rprint(docs)
import pandas as pd

# df = pd.read_csv("pdf_sources.csv")

# url_map = dict(zip(df['filename'], df['source_link']))

# for doc in all_docs:
#     filename = doc.metadata.get('file_name')
#     if filename in url_map:
#         doc.metadata['source_link'] = url_map[filename]
#         # Ensure uniqueness by using sets
#         exclude_keys = {"file_name", "page_number"}

# # Exclude certain metadata keys from being embedded or sent to LLM
#         doc.excluded_embed_metadata_keys = list(
#             set(doc.excluded_embed_metadata_keys) | exclude_keys
#         )
#         doc.excluded_llm_metadata_keys = list(
#             set(doc.excluded_llm_metadata_keys) | exclude_keys
#         )


rprint(all_docs[10])
with open("parsed_docs.pkl", "wb") as f:
    pickle.dump(all_docs, f)

print(f"Saved {len(all_docs)} documents ✅")

# Later, load them back
with open("parsed_docs.pkl", "rb") as f:
    loaded_docs = pickle.load(f)

print(f"Loaded {len(loaded_docs)} documents ✅")

In [9]:
# Spliting documents into chunks for better embedding and retrieval

# Groq LLM setup
from llama_index.llms.groq import Groq
import os
import getpass

# os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

if os.environ.get("GROQ_API_KEY"):
    pass
else:
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

llm_transformations = Groq(model="llama-3.1-8b-instant", temperature=0)


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
out = llm_transformations.complete("Hello, world!")
rprint(out)

In [10]:

BATCH_SIZE = 5       # Number of documents per batch
DOCS_PER_CALL = 3    # How many docs you send to pipeline in one call
DELAY_SEC = 20.0

async def run_with_throttle(pipeline, documents):
    all_nodes = []
    for i in range(0, len(documents), BATCH_SIZE):
        batch_docs = documents[i:i + BATCH_SIZE]
        print(f"Processing batch {i // BATCH_SIZE + 1} ({len(batch_docs)} docs)...")

        # Split batch into smaller calls
        for j in range(0, len(batch_docs), DOCS_PER_CALL):
            call_docs = batch_docs[j:j + DOCS_PER_CALL]
            nodes = await pipeline.arun(
                documents=call_docs,
                show_progress=True,
                in_place=True,
                num_workers=4
            )
            all_nodes.extend(nodes)
            print(f"Processed {len(call_docs)} docs, waiting {DELAY_SEC} seconds...")
            await asyncio.sleep(DELAY_SEC)

        print(f"Batch {i // BATCH_SIZE + 1} completed.\n")

    print("All Documents processed Successfully")
    return all_nodes


In [None]:
from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter

# Text splitting for better embedding and retrieval using SentenceSplitter
text_splitter = SentenceSplitter(
    separator=" ",
    chunk_size=1500,
    chunk_overlap=150,
)

# # Metadata 
# title_extractor = TitleExtractor(llm=llm_transformations, nodes=2, num_workers=1)

# Metadata with Questions and Answers for better context of the document
question_answer_extractor = QuestionsAnsweredExtractor(llm=llm_transformations, questions=1, num_workers=1)



pipeline = IngestionPipeline(
    transformations=[
        text_splitter,
        question_answer_extractor,
    ],
   
)

nodes = asyncio.run(run_with_throttle(pipeline, all_docs))


In [14]:
len(nodes)
with open("parsed_nodes.pkl", "wb") as f:
    pickle.dump(nodes, f)

print(f"Saved {len(nodes)} documents ✅")

# Later, load them back
with open("parsed_nodes.pkl", "rb") as f:
    loaded_nodes = pickle.load(f)

print(f"Loaded {len(loaded_nodes)} documents ✅")

Saved 296 documents ✅
Loaded 296 documents ✅


In [None]:
from llama_index.core.schema import MetadataMode
# rprint(nodes[150].get_content(metadata_mode=MetadataMode.EMBED))
rprint(nodes[4])

In [29]:
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding

if os.environ.get("GOOGLE_API_KEY"):
    pass
else:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

# Embedding model setup with Google Gemini Embeddings
embed_model = GoogleGenAIEmbedding(
    model_name="gemini-embedding-001",
    embed_batch_size=20,
    embedding_config={
        "output_dimensionality": 1536 # Vector dimensionality
    }

)

In [30]:
async def embed_nodes_with_backoff(nodes, embed_model, initial_delay=5):
    embedded_nodes = []
    delay = initial_delay

    for idx, node in enumerate(nodes):
        while True:
            try:
                # ✅ Use the public API, not private `_aget_text_embedding`
                embedding = await embed_model.aget_text_embedding(node.get_content())
                node.embedding = embedding  # store embedding in the node
                embedded_nodes.append(node)
                
                print(f"✅ Embedded {idx+1}/{len(nodes)}")
                delay = initial_delay  # reset delay on success
                break

            except Exception as e:
                if "429" in str(e) or "rate limit" in str(e).lower():
                    print(f"⚠️ Rate limit hit. Retrying in {delay}s...")
                    await asyncio.sleep(delay)
                    delay = min(delay * 2, 60)  # exponential backoff
                else:
                    raise e

    return embedded_nodes

In [31]:
embeded_nodes = asyncio.run(embed_nodes_with_backoff(nodes, embed_model))

✅ Embedded 1/296
✅ Embedded 2/296
✅ Embedded 3/296
✅ Embedded 4/296
✅ Embedded 5/296
✅ Embedded 6/296
✅ Embedded 7/296
✅ Embedded 8/296
✅ Embedded 9/296
✅ Embedded 10/296
✅ Embedded 11/296
✅ Embedded 12/296
✅ Embedded 13/296
✅ Embedded 14/296
✅ Embedded 15/296
✅ Embedded 16/296
✅ Embedded 17/296
✅ Embedded 18/296
✅ Embedded 19/296
✅ Embedded 20/296
✅ Embedded 21/296
✅ Embedded 22/296
✅ Embedded 23/296
✅ Embedded 24/296
✅ Embedded 25/296
✅ Embedded 26/296
✅ Embedded 27/296
✅ Embedded 28/296
✅ Embedded 29/296
✅ Embedded 30/296
✅ Embedded 31/296
✅ Embedded 32/296
✅ Embedded 33/296
✅ Embedded 34/296
✅ Embedded 35/296
✅ Embedded 36/296
✅ Embedded 37/296
✅ Embedded 38/296
✅ Embedded 39/296
✅ Embedded 40/296
✅ Embedded 41/296
✅ Embedded 42/296
✅ Embedded 43/296
✅ Embedded 44/296
✅ Embedded 45/296
✅ Embedded 46/296
✅ Embedded 47/296
✅ Embedded 48/296
✅ Embedded 49/296
✅ Embedded 50/296
✅ Embedded 51/296
✅ Embedded 52/296
✅ Embedded 53/296
✅ Embedded 54/296
✅ Embedded 55/296
✅ Embedded 56/296
✅

In [33]:

with open("embeded_nodes.pkl", "wb") as f:
    pickle.dump(embeded_nodes, f)


In [None]:
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore

connection_string = ""
if os.environ.get("POSTGRES_CONNECTION_STRING"):
    connection_string = os.environ.get("POSTGRES_CONNECTION_STRING")
else:
    connection_string = getpass.getpass("Enter your Postgres connection string: ")
db_name = ""
if os.environ.get("POSTGRES_DB_NAME"):
    db_name = os.environ.get("POSTGRES_DB_NAME")
else:
    db_name = getpass.getpass("Enter your Postgres database name: ")

In [119]:
from llama_index.core import ChatPromptTemplate, PromptTemplate

chat_template = ChatPromptTemplate.from_messages([
    ("system", """You are a helpful customer support assistant with a vast knowledge of DoorDash policies and procedures. 
     Use the following context to answer the question at the end. If you don't know the answer, just say that you don't know, 
    don't try to make up an answer. Be concise.
      and show citations with links where possible and a source link at the end if provided by the document."""),
    ("user", "{query_str}")
])

system_instructions = """
Add happy dasher at the end of each answer to the user
"""

# Build your QA prompt
qa_prompt = PromptTemplate(
    system_instructions +
    "\nContext:\n{context_str}\n\nQuestion:\n{query_str}\nAnswer:"
)


llm = Groq(model="openai/gpt-oss-20b", temperature=0.2)

In [None]:
from sqlalchemy import make_url

url = make_url(connection_string)
vector_store = PGVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="door_dash_rag",
    embed_dim=1536,  # embedding dimension
    hnsw_kwargs={
        "hnsw_m": 16,
        "hnsw_ef_construction": 64,
        "hnsw_ef_search": 40,
        "hnsw_dist_method": "vector_cosine_ops",
    },
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
    embeded_nodes, storage_context=storage_context, embed_model=embed_model, show_progress=True
)



Generating embeddings: 0it [00:00, ?it/s]


In [None]:
query_engine = index.as_chat_engine(chatMode="BEST", llm=llm, similarity_top_k=5,  )

In [117]:


out = query_engine.chat("i live in australia and im an independent contractor for doordash, i really need to know and understand the payment dispute process where i reside only. provide all links available espcially the source link for the document the answer was retrieved from so the user can know the source of the documents if they want to check it themself")

In [106]:
rprint(out)

In [None]:
from IPython.display import display, Markdown

Markdown(out.response)

In [3]:
from IPython import get_ipython
get_ipython().run_line_magic('reset', '-sf')
