## 1. Data Ingestion

In [None]:

import os

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Resolve the sample PDF under the "data" folder of the current working directory.
working_dir = os.getcwd()
file_path = os.path.join(working_dir, "data", "sample.pdf")

# Each page of the PDF is loaded as a separate document.
pdf_loader = PyPDFLoader(file_path)
documents = pdf_loader.load()

print(f"Number of pages in the document: {len(documents)}")




In [None]:
import json
from langchain.schema import Document

def documents_to_json(docs: list[Document]):
    """Serialize documents to a pretty-printed JSON array string.

    Each document becomes one JSON object holding its ``page_content`` and
    its ``metadata``; the array is indented by two spaces.
    """
    serializable_docs = [
        {"page_content": document.page_content, "metadata": document.metadata}
        for document in docs
    ]
    return json.dumps(serializable_docs, indent=2)
    
# Dump every loaded document (page content plus metadata) as JSON for a quick visual check.
print(documents_to_json(documents))

In [None]:
# Chunking settings: chunk size 500 and overlap 150, both measured with len().
chunking_options = {
    "chunk_size": 500,
    "chunk_overlap": 150,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**chunking_options)

# Split the loaded pages into chunks.
splitted_doc = text_splitter.split_documents(documents)

# Report the chunk count, then show the first chunk's metadata and content.
print(f"Number of chunks after splitting: {len(splitted_doc)}")
print(f"Metadata of first chunk: {splitted_doc[0].metadata}")
print(f"Content of first chunk: {splitted_doc[0].page_content}")

In [None]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from dotenv import load_dotenv

# Called before the client is built -- presumably so the Azure OpenAI endpoint and
# key can be picked up from a local .env file (TODO confirm against project setup).
load_dotenv()

# Embedding client; the model name and API version are pinned here.
azOpenAIembeddings = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_version="2023-05-15",
)
# Sample embedding call (looks like a connectivity smoke test); as the final
# expression of the cell, its result is what the cell displays.
azOpenAIembeddings.embed_query("What is the capital of France?")

In [None]:
from langchain.vectorstores import FAISS

# FAISS is an in-memory vector store, so the index will not persist across sessions.
# The index is built from the split chunks using the Azure OpenAI embedding client.
vectorstore = FAISS.from_documents(
    documents=splitted_doc,
    embedding=azOpenAIembeddings
)

## 2. Retrieval Process

In [None]:

# Direct similarity search against the vector store.
# To inspect the hits, run: print(documents_to_json(relavant_docs))
relavant_docs = vectorstore.similarity_search("who prepared the document?")

# Retriever view over the same store, configured with k=2.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
result = retriever.invoke("which are the sample package contains?")

# Show the text of the first retrieved document.
top_hit = result[0]
print(f"Result: {top_hit.page_content}")

In [None]:
# Chat model client bound to the "gpt-4o-mini" Azure deployment.
azOpenAIllm = AzureChatOpenAI(
    azure_deployment="gpt-4o-mini",
    api_version="2025-01-01-preview",
    temperature=0,
    max_tokens=None,  # presumably "no explicit cap" -- confirm against the client docs
    timeout=None,  # presumably "no explicit timeout" -- confirm against the client docs
    max_retries=2,
)
# Sample call; only the reply's `.content` is evaluated, and as the final
# expression of the cell it is what the cell displays.
azOpenAIllm.invoke("What is the capital of France?").content

In [None]:
# Prompt text for the RAG chain; {context} and {question} are the two placeholders
# declared as input_variables when the PromptTemplate is built below.
prmpt_template = """
    Answer the question based on the context below. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Context: {context}
    Question: {question}
"""

# NOTE(review): user_question is not referenced by any other line visible here.
user_question = "who prepared the document?"


from langchain.prompts import PromptTemplate
# NOTE(review): RetrievalQA is imported but not used in the visible code.
from langchain.chains import RetrievalQA

# Template object used as the prompt step of rag_chain.
prompt = PromptTemplate(template=prmpt_template, input_variables=["context", "question"])

In [None]:
def format_docs(docs: list[Document]) -> str:
    """Concatenate the page contents of ``docs``, separated by one blank line.

    Returns an empty string when ``docs`` is empty.
    """
    page_texts = (document.page_content for document in docs)
    return "\n\n".join(page_texts)

In [None]:
# Chat model instance used as the LLM step of rag_chain.
# NOTE(review): the arguments are identical to those used for azOpenAIllm earlier in the file.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o-mini",
    api_version="2025-01-01-preview",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_core.runnables import RunnableLambda

# LCEL: LangChain Expression Language.
# The chain takes the user's question as its only input and then:
#   1. builds a dict from it: "context" is produced by running the retriever on the
#      question and joining the retrieved documents with format_docs, while
#      "question" passes the input through unchanged;
#   2. fills the prompt template with that dict;
#   3. sends the rendered prompt to the LLM;
#   4. parses the model output into a plain string (the answer).

rag_chain = (
    {
        # The retriever has to be run on the incoming question first. Handing the
        # retriever object itself to format_docs fails, because format_docs expects
        # the list of retrieved documents.
        "context": retriever | RunnableLambda(format_docs),
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("when is the sample dated?")