In [1]:
import os
import json

from langchain_community.document_loaders import PDFPlumberLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_xai import ChatXAI
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import RetrievalQA

In [3]:
def load_all_documents(pdf_folder):
    """
    Loads all PDF documents from the specified folder using PDFPlumberLoader.
    Returns a list of document objects.
    """
    all_docs = []
    if not os.path.exists(pdf_folder):
        print(f"PDF folder '{pdf_folder}' not found. Make sure your PDFs are in the folder.")
        return all_docs

    for filename in os.listdir(pdf_folder):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            print("Loading:", pdf_path)
            loader = PDFPlumberLoader(pdf_path)
            all_docs.extend(loader.load())
    return all_docs

In [5]:
def get_vector_store(documents, embedder, index_dir):
    """
    Creates or loads a FAISS vector store from the provided documents using the specified embedder.
    """
    if os.path.exists(index_dir):
        vector = FAISS.load_local(index_dir, embedder, allow_dangerous_deserialization=True)
        print("Loaded vector store from disk.")
    else:
        # Split documents into chunks using SemanticChunker.
        text_splitter = SemanticChunker(HuggingFaceEmbeddings())
        docs_chunks = text_splitter.split_documents(documents)
        vector = FAISS.from_documents(docs_chunks, embedder)
        vector.save_local(index_dir)
        print("Created and saved new vector store.")
    return vector

In [7]:
def build_llm_chain(llm):
    """
    Creates an LLMChain using the provided LLM and a QA prompt.
    """
    prompt = """
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.
3. Keep the answer crisp and limited to 3-4 sentences.

Context: {context}

Question: {question}

Helpful Answer:"""
    QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt)
    llm_chain = LLMChain(llm=llm, prompt=QA_CHAIN_PROMPT, verbose=True)
    return llm_chain


def build_combined_documents_chain(llm_chain):
    """
    Creates a StuffDocumentsChain for combining retrieved documents using a custom document prompt.
    """
    document_prompt = PromptTemplate(
        input_variables=["page_content", "source"],
        template="Context:\ncontent: {page_content}\nsource: {source}",
    )
    combined_chain = StuffDocumentsChain(
        llm_chain=llm_chain,
        document_variable_name="context",
        document_prompt=document_prompt,
    )
    return combined_chain

In [9]:
def build_qa_chain(retriever, combined_documents_chain):
    """
    Builds and returns a RetrievalQA chain using the provided retriever and combined documents chain.
    """
    qa_chain = RetrievalQA(
        combine_documents_chain=combined_documents_chain,
        retriever=retriever,
        return_source_documents=True,
        verbose=True,
    )
    return qa_chain

In [19]:
def initialize_chain():
    """
    Initializes the full RetrievalQA chain by:
      1. Loading PDF documents from 'static/uploads'.
      2. Creating or loading a FAISS vector store.
      3. Building the LLM and associated chains.
    Returns the initialized QA chain.
    """
    pdf_folder = os.path.join("static", "uploads")
    all_docs = load_all_documents(pdf_folder)
    if not all_docs:
        print("No PDF documents loaded.")
        return None

    index_dir = "faiss_index"
    embedder = HuggingFaceEmbeddings()
    vector = get_vector_store(all_docs, embedder, index_dir)
    retriever = vector.as_retriever(search_type="similarity", search_kwargs={"k": 1})

    # Define the LLM 
    X_API="API KEY"
    llm= llm = ChatXAI(
        model="grok-2-latest",
        temperature=0.01,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        api_key=X_API,
        
    )
    llm_chain = build_llm_chain(llm)
    combined_documents_chain = build_combined_documents_chain(llm_chain)
    qa_chain = build_qa_chain(retriever, combined_documents_chain)
    return qa_chain

In [13]:
def get_response(qa_chain, question):
    """
    Runs the QA chain with the provided question.
    Returns the answer and a relative URL for the source PDF (if available).
    """
    response = qa_chain(question)
    answer_text = response.get("result", "I don't know")
    pdf_url = None

    if response.get("source_documents") and len(response["source_documents"]) > 0:
        doc = response["source_documents"][0]
        metadata = doc.metadata
        source_doc = metadata.get("source", "")
        page_num = metadata.get("page", 0)
        # Normalize path: replace backslashes with forward slashes.
        normalized_source = source_doc.replace("\\", "/")
        # Remove any leading "static/" so we can build a relative URL.
        if normalized_source.lower().startswith("static/"):
            normalized_source = normalized_source[len("static/"):]
        pdf_url = f"/static/{normalized_source}#page={page_num+1}"
    return answer_text, pdf_url

In [21]:
qa_chain = initialize_chain()

Loading: static\uploads\CELEX_02006R1907-20231201_EN_TXT-Registration, Evaluation, Authorisation and Restriction of Chemicals (REACH).pdf
Loading: static\uploads\CELEX_31994L0062_EN_TXT-packaging and packaging waste.pdf


  embedder = HuggingFaceEmbeddings()


Loaded vector store from disk.


  llm_chain = LLMChain(llm=llm, prompt=QA_CHAIN_PROMPT, verbose=True)
  combined_chain = StuffDocumentsChain(
  qa_chain = RetrievalQA(


In [22]:
sample_question = " ESSENTAIL REQUIREMENTS ON THE COMPOSITION AND THE REUSABLE AND RECOVERABLE, INCLUDINGRECYCLABLE,NATUREOFPACKAGING"


In [25]:
answer, pdf_url = get_response(qa_chain, sample_question)

  response = qa_chain(question)




[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.
3. Keep the answer crisp and limited to 3-4 sentences.

Context: Context:
content: 12. 94 Official Journal of the European Communities No L 365/19
ANNEX II
ESSENTAIL REQUIREMENTS ON THE COMPOSITION AND THE REUSABLE AND
RECOVERABLE, INCLUDING RECYCLABLE, NATURE OF PACKAGING
1. Requirements specific to the manufacturing and composition of packaging
— Packaging shall be so manufactured that the packaging volume and weight be limited to the
minimum adequate amount to maintain the necessary level ofsafety, hygiene and acceptance for the
packed product and for the consumer. — Packaging shall be designed, produced and commercialized in such a way as to permit its reuse or
recovery, including r

In [27]:
print("Question:", sample_question)
print("Answer:", answer)
print("PDF URL:", pdf_url)

Question:  ESSENTAIL REQUIREMENTS ON THE COMPOSITION AND THE REUSABLE AND RECOVERABLE, INCLUDINGRECYCLABLE,NATUREOFPACKAGING
Answer: The essential requirements for packaging composition and its reusable and recoverable nature, as outlined in the Official Journal of the European Communities, focus on minimizing the environmental impact of packaging. Packaging must be manufactured to limit its volume and weight to the minimum necessary while ensuring safety, hygiene, and consumer acceptance. It should be designed for reuse or recovery, including recycling, and minimize the presence of hazardous substances. Additionally, packaging must meet specific criteria for being reusable and recoverable, such as enabling multiple uses, being processable for health and safety, and being recyclable or suitable for energy recovery or composting.
PDF URL: /static/uploads/CELEX_31994L0062_EN_TXT-packaging and packaging waste.pdf#page=10
