In [None]:
!pip install langchain-pinecone langchain langchain-community langchain-google-genai pymupdf
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.3.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.3.0-py3-none-any.whl (300 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.7/300.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.3.0


# Without Streamlit

In [None]:
# Install necessary packages.
!pip install langchain-pinecone langchain langchain-community langchain-google-genai pymupdf
!pip install pypdf


# Import necessary modules.
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory
from langchain_pinecone import PineconeVectorStore
import os

# Source document and chunking parameters, named so they are easy to tune.
PDF_PATH = '/content/Sample Financial Statement.pdf'
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

# Load the PDF; `documents` is the list of Document objects returned by the loader.
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

# Cut the loaded documents into chunks of at most CHUNK_SIZE with no overlap
# between neighbouring chunks; `texts` is what gets embedded and indexed later.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
texts = text_splitter.split_documents(documents)

# Resolve the Google API key without ever storing it in the notebook:
# use GOOGLE_API_KEY from the environment if present, otherwise prompt for it
# (getpass does not echo the value, so it cannot end up in a saved cell output).
from getpass import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")
google_api_key = os.environ["GOOGLE_API_KEY"]

# Initialize embeddings.
# NOTE(review): task_type is fixed to "retrieval_query", and this same object is
# later used to embed the document chunks as well as the queries - confirm whether
# the chunks should be embedded with a document-oriented task type instead.
embeddings = GoogleGenerativeAIEmbeddings(
    model='models/embedding-001',
    google_api_key=google_api_key,
    task_type="retrieval_query"
)

# Define safety settings: every listed harm category is blocked at the
# BLOCK_LOW_AND_ABOVE threshold.
safety_settings = {
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
}

# Initialize the chat model (used both to answer and to generate query variants).
chat_model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=google_api_key,
    temperature=0.3,
    safety_settings=safety_settings
)

# Set up Pinecone.
# The API key must never be committed with the notebook: take it from the
# PINECONE_API_KEY environment variable, or prompt for it once and export it so
# the Pinecone integration (which is not given the key explicitly below) can use it.
from getpass import getpass

if not os.environ.get('PINECONE_API_KEY'):
    os.environ['PINECONE_API_KEY'] = getpass('Pinecone API key: ')
pinecone_key = os.environ['PINECONE_API_KEY']
index_name = 'assignment-index1'

# Embed the chunked documents and insert them into the Pinecone index.
# NOTE(review): this runs on every execution of the cell, so re-running it
# presumably inserts the same chunks again - confirm and clear the index if needed.
docsearch = PineconeVectorStore.from_documents(texts, embeddings, index_name=index_name)

# Prompt text for the answering step. It has two placeholders:
#   {context}  - the retrieved document chunks
#   {question} - the user's query
prompt_template = """
You are a financial AI assistant specializing in analyzing Profit & Loss (P&L) statements. Your task is to answer user queries based on the provided financial data.

### Context:
{context}

### User Question:
{question}

### Guidelines:
1. *Extract only relevant information* from the context.
2. *Provide structured responses* (e.g., bullet points, tables, or calculations).
3. *Include necessary calculations* and financial insights where applicable.
4. *Avoid assumptions*—state explicitly if data is missing.
5. *Ensure clarity and conciseness* while maintaining informative depth.

### Answer:
"""

# Wrap the text in a PromptTemplate, declaring the two placeholders it expects.
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=prompt_template,
)

# Base retriever over the Pinecone index: top 5 matches per query.
base_retriever = docsearch.as_retriever(search_kwargs={"k": 5})

# Wrap it in a MultiQueryRetriever, which is given the chat model as its LLM.
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=chat_model,
    retriever=base_retriever,
)

# Assemble the question-answering chain: the 'stuff' chain type with the custom
# prompt, returning the source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    chain_type='stuff',
    chain_type_kwargs={"prompt": prompt},
    llm=chat_model,
    retriever=retriever_from_llm,
    return_source_documents=True,
)

# Ask a question; the chain takes it under the "query" key and the answer text
# is read back from the "result" key.
question = "what is the gross profit for Q3 2024?"
response = qa_chain.invoke({"query": question})
print(response['result'])

The provided text shows the gross profit for the year ended March 31, 2024 (which includes Q3 2024) as ₹46,257 crore.  The data does *not* provide a breakdown of gross profit by quarter.  Therefore, the gross profit for Q3 2024 specifically cannot be determined from this information.


In [None]:
# Show the answer text again; `response` is the value returned by
# qa_chain.invoke(...) in the previous cell, so that cell must be run first.
print(response['result'])

The provided text shows the gross profit for the year ended March 31, 2024 (which includes Q3 2024) as ₹46,257 crore.  The data does *not* provide a breakdown of gross profit by quarter.  Therefore, the gross profit for Q3 2024 specifically cannot be determined from this information.


In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.42.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.2-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[

# With Streamlit

In [None]:
!pip install -U langchain-community
! pip install langchain_google_genai
! pip install langchain_pinecone
! pip install pypdf
! pip install streamlit

Collecting pypdf
  Downloading pypdf-5.3.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.3.0-py3-none-any.whl (300 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.7/300.7 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.3.0


In [None]:
%%writefile app.py

pinecone_key = 'pcsk_6UC13T_F7ykvzzEouavESB8Da8LzQbT3Td1T7m2X2T28qGohvowrtRc23hxERhvP4rcV7i'

import os
import streamlit as st

os.environ['PINECONE_API_KEY'] = pinecone_key

google_api_key = "AIzaSyDdK8v_PZEOVrvCWj7YFkzzZ0W8bl-eaXU"

st.title('AI Document Assistant - RAG Model')
st.subheader("Upload any document and ask questions related to its content")

from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredWordDocumentLoader

uploaded_file = st.file_uploader("Upload a document", type=["pdf", "txt", "docx"])

# Initialize documents as an empty list
documents = []

if uploaded_file is not None:
    file_path = f'temp_{uploaded_file.name}'
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getvalue())

    if uploaded_file.name.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif uploaded_file.name.endswith(".txt"):
        loader = TextLoader(file_path)
    elif uploaded_file.name.endswith(".docx"):
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        st.error("Unsupported file format.")
        st.stop()

    documents = loader.load()  # Now documents will always be defined

from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Ensure we only process documents if they exist
texts = text_splitter.split_documents(documents) if documents else []

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_pinecone import PineconeVectorStore

embeddings = GoogleGenerativeAIEmbeddings(
    model='models/embedding-001',
    google_api_key=google_api_key,
    task_type='retrieval_query'
)

index_name = 'langchain-test-index'
docsearch = PineconeVectorStore.from_documents(texts, embeddings, index_name=index_name) if texts else None

from langchain.prompts import PromptTemplate

prompt_template = """You are an AI assistant specializing in analyzing and answering questions about uploaded documents.
Your task is to provide clear and accurate answers based on the content of the uploaded file.

### **Context:**
{context}

### **User Question:**
{question}

### **Guidelines for Response:**
1. **Extract only relevant information** from the document.
2. **Provide structured responses** (bullet points, tables, summaries, or calculations when needed).
3. **Clarify missing data**—if the document lacks the necessary details, state it explicitly.
4. **Avoid assumptions** and rely strictly on document content.
5. **Ensure clarity and conciseness** while maintaining informative depth.
6. **Adapt response format** based on file type:
   - **For PDFs, Word Docs:** Summarize sections or extract key points.
   - **For CSV/Excel Files:** Analyze numerical trends, patterns, and insights.
   - **For Text Files:** Extract key details and highlight main ideas.
   - **For Code Files:** Explain functions, structure, and purpose.

### **Answer:**
"""

prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.retrievers.multi_query import MultiQueryRetriever

chat_model = ChatGoogleGenerativeAI(
    model='gemini-1.5-flash',
    google_api_key=google_api_key, temperature=0.3
)

retriever_from_llm = None
if docsearch:
    retriever_from_llm = MultiQueryRetriever.from_llm(
        retriever=docsearch.as_retriever(search_kwargs={'k': 5}),
        llm=chat_model
    )

qa_chain = None
if retriever_from_llm:
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        retriever=retriever_from_llm,
        return_source_documents=True,
        chain_type='stuff',
        chain_type_kwargs={'prompt': prompt}
    )

user_question = st.text_input('Enter your question')

if st.button("Get Response"):
    if user_question:
        if qa_chain:
            response = qa_chain.invoke({"query": user_question})  # Fixed query key
            st.subheader("Answer")

            # Fix dictionary key access
            if isinstance(response, dict):
                if "result" in response:
                    st.write(response["result"])
                elif "Response" in response:
                    st.write(response["Response"])
                else:
                    st.write("No relevant information found.")
            else:
                st.write("Unexpected response format.")

        else:
            st.error("Please upload a document first.")
    else:
        st.warning("Please enter a question")


Overwriting app.py


In [None]:
!pip install pyngrok



In [None]:
from pyngrok import ngrok
ngrok.kill()
outh_token="2tV16YrLyTi19bdvVt3LwyfyGFx_8mCCsoDZw3NafVXwXYCd"
ngrok.set_auth_token(outh_token)

#create the tunnel
ngrok_tunnel=ngrok.connect(addr="5000",proto="http")
print("Tricking url :",ngrok_tunnel.public_url)

!streamlit run --server.port 5000 app.py

Tricking url : https://90a5-35-204-32-161.ngrok-free.app

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:5000[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:5000[0m
[34m  External URL: [0m[1mhttp://35.204.32.161:5000[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m
E0000 00:00:1740460064.493686   12953 init.cc:232] grpc_wait_for_shutdown_with_timeout() timed out.
