In [2]:
## Importing the required libraries

from langchain_community.document_loaders import PyPDFLoader
from docx import Document
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
import numpy as np



In [28]:
from langchain_text_splitters import CharacterTextSplitter

def read_pdf(file_path, chunk_size=1000, chunk_overlap=200):
    """Load a PDF and split it into chunks ready for embedding.

    Args:
        file_path: Path of the PDF file to read.
        chunk_size: Forwarded to ``CharacterTextSplitter`` as ``chunk_size``
            (defaults to the previously hard-coded 1000).
        chunk_overlap: Forwarded to ``CharacterTextSplitter`` as
            ``chunk_overlap`` (defaults to the previously hard-coded 200).

    Returns:
        The list of document chunks produced by
        ``CharacterTextSplitter.split_documents``.
    """
    # Use load() rather than load_and_split(): the latter already applies a
    # default text splitter, so splitting again below chunked the text twice
    # with two different configurations.
    loaded_docs = PyPDFLoader(file_path).load()

    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(loaded_docs)

# Try the loader on a sample PDF; the returned chunks are displayed as the cell output.
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
read_pdf("C:/Users/JineshPatel/Downloads/Get_Started_With_Smallpdf.pdf")

[Document(page_content='Welcome to Smallpdf\nDigital Documents—All In One Place\nAccess Files Anytime, Anywhere Enhance Documents in One Click \nCollaborate With Others With the new Smallpdf experience, you can \nfreely upload, organize, and share digital \ndocuments. When you enable the ‘Storage’ \noption, we’ll also store all processed files here. \nYou can access files stored on Smallpdf from \nyour computer, phone, or tablet. We’ll also \nsync files from the Smallpdf Mobile App to our \nonline portalWhen you right-click on a file, we’ll present \nyou with an array of options to convert, \ncompress, or modify it. \nForget mundane administrative tasks. With \nSmallpdf, you can request e-signatures, send \nlarge files, or even enable the Smallpdf G Suite \nApp for your entire organization. Ready to take document management to the next level?', metadata={'source': 'C:/Users/JineshPatel/Downloads/Get_Started_With_Smallpdf.pdf', 'page': 0})]

In [4]:
from langchain_core.documents import Document as customDocument
def read_doc(file_path):
    """Read a .docx file and wrap its paragraph text in a single document.

    Each paragraph's text is followed by a newline; the combined string
    becomes ``page_content`` and ``file_path`` is stored under the
    ``"source"`` metadata key.

    Returns:
        A one-element list, so the result has the same list shape as the
        value returned by ``read_pdf``.
    """
    paragraphs = Document(file_path).paragraphs

    # Terminate every paragraph with a newline so paragraph boundaries survive.
    combined_text = "".join(para.text + "\n" for para in paragraphs)

    wrapped = customDocument(page_content=combined_text, metadata={"source": file_path})
    return [wrapped]
# Try the reader on a sample .docx; the resulting one-element list is displayed as the cell output.
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
read_doc("C:/Users/JineshPatel/Downloads/Untitled document.docx")

[Document(page_content="| **Compensation**          | **Monthly** | **Yearly**   |\n|---------------------------|-------------|--------------|\n| **Basic Salary**          | ₹24,000     | ₹288,000     |\n| **House Rent Allowance**  | ₹9,600      | ₹115,200     |\n| **Standard Allowance**    | ₹6,000      | ₹72,000      |\n| **Special Allowance**     | ₹16,580     | ₹198,960     |\n| **Total Gross Pay (A)**   | ₹56,180     | ₹674,160     |\n| **Employee's contribution in PF** | ₹2,520 | ₹30,240    |\n| **Professional Tax**      | ₹200        | ₹2,400       |\n| **Total Deductions (B)**  | ₹2,700      | ₹32,640      |\n| **Employer's contribution in PF** | ₹2,520 | ₹30,240    |\n| **EPF Admin Charges**     | ₹150        | ₹1,800       |\n| **Gratuity**              | ₹1,150      | ₹13,800      |\n| **Total Employer's Contributions (C)** | ₹3,820 | ₹45,840 |\n| **Total CTC (A+C)**       | ₹60,000     | ₹720,000     |\n| **Net Take Home**         | ₹53,480     | ₹641,760     |\n\n\n\n\n\n\

In [5]:
# Uncomment to install the sentence-transformers package needed by the embedding cell below:
# %pip install sentence-transformers

In [45]:
def generate_embeddings_and_store_chromadb(docs, model_name="all-MiniLM-L6-v2"):
    """Embed documents with a sentence-transformers model and index them in Chroma.

    Args:
        docs: Documents to embed, e.g. the list returned by ``read_pdf``.
        model_name: Model name passed to ``SentenceTransformerEmbeddings``.
            Defaults to the model this function previously hard-coded.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # Create the open-source embedding function.
    embedding_function = SentenceTransformerEmbeddings(model_name=model_name)

    # Embed the documents and load them into Chroma.
    return Chroma.from_documents(docs, embedding_function)
# Build the vector store that the retriever cell below is created from.
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
vector_store = generate_embeddings_and_store_chromadb(read_pdf("C:/Users/JineshPatel/Downloads/Budget_2024.pdf"))



In [46]:
# Retriever over the vector store: similarity search configured with k=3.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

from transformers import pipeline

# Hugging Face "question-answering" pipeline. Despite the name `llm`, CustomChain.invoke
# calls it with question= and context= keyword arguments rather than a free-form prompt.
llm = pipeline("question-answering", model="deepset/roberta-base-squad2")


In [47]:
from langchain_core.prompts import ChatPromptTemplate

# Human-message template; the {question} and {context} placeholders are filled at format time.
message = """
Please provide a detailed answer to the following question using the provided context.

{question}

Context:
{context}
"""

# Chat prompt consisting of a single human message built from the template above.
prompt = ChatPromptTemplate.from_messages([("human", message)])


class CustomChain:
    """Minimal retrieve-then-answer chain around a question-answering callable.

    Constructor arguments (stored as attributes of the same name):
        retriever: Object whose ``invoke(question)`` returns a sequence of
            documents exposing ``page_content``.
        prompt: Prompt template. Kept for interface compatibility; ``invoke``
            does not use it because ``llm`` receives the question and the
            context as separate arguments.
        llm: Callable accepting ``question=`` and ``context=`` keyword
            arguments and returning the response.
    """

    def __init__(self, retriever, prompt, llm):
        self.retriever = retriever
        self.prompt = prompt
        self.llm = llm

    def invoke(self, question):
        """Answer ``question`` using the documents returned by the retriever.

        Args:
            question: Question text, used for both retrieval and the QA call.

        Returns:
            Whatever ``self.llm`` returns. If retrieval yields no text, a
            dict with an empty ``"answer"`` (plus zeroed ``"score"``,
            ``"start"`` and ``"end"``) is returned without calling ``llm``.
        """
        context_docs = self.retriever.invoke(question)

        # Use every retrieved document, not only the first one, so the extra
        # results requested from the retriever actually reach the QA model.
        context = "\n\n".join(
            doc.page_content for doc in (context_docs or []) if doc.page_content
        )

        if not context:
            # The Hugging Face question-answering pipeline rejects an empty
            # context, so answer with an empty result instead of failing.
            return {"score": 0.0, "start": 0, "end": 0, "answer": ""}

        # The QA callable takes the question and context directly.
        return self.llm(question=question, context=context)
# Create the chain instance from the retriever, prompt template and QA pipeline defined in earlier cells.
rag_chain = CustomChain(retriever=retriever, prompt=prompt, llm=llm)

In [48]:
# Ask a sample question through the chain; the response is read via its 'answer' key below.
question = "what is bill?  "
response = rag_chain.invoke(question)

print("Answer is : ",response['answer'])


Answer is :  The Medium-Term Fiscal Policy cum Fiscal Policy Strategy Statement


In [1]:

%pip install flask

Collecting flask
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug>=3.0.0 (from flask)
  Downloading werkzeug-3.0.3-py3-none-any.whl.metadata (3.7 kB)
Collecting itsdangerous>=2.1.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Downloading flask-3.0.3-py3-none-any.whl (101 kB)
   ---------------------------------------- 0.0/101.7 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/101.7 kB ? eta -:--:--
   --------------- ----------------------- 41.0/101.7 kB 487.6 kB/s eta 0:00:01
   ----------------------------------- --- 92.2/101.7 kB 744.7 kB/s eta 0:00:01
   -------------------------------------- 101.7/101.7 kB 648.9 kB/s eta 0:00:00
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Downloading werkzeug-3.0.3-py3-none-any.whl (227 kB)
   ---------------------------------------- 0.0/227.3 kB ? eta -:--:--
   ---------------------------------- ----- 194.6/227.3 kB 5.9 MB/s eta 0:00:01
   ----------