## PART - 1

In [287]:
# Installing Required Libraries
%pip install python-docx
%pip install python-pptx
%pip install PyPDF2
%pip install langchain
%pip install langchain_community
%pip install langchain_google_genai
%pip install langchain_text_splitters
%pip install sentence-transformers
%pip install faiss-cpu
%pip install cohere
%pip install gr

Collecting gr
  Downloading gr-1.27.0.tar.gz (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.4/130.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: gr
  Building wheel for gr (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gr: filename=gr-1.27.0-cp311-cp311-linux_x86_64.whl size=42632731 sha256=93c2d6218495a66a7f1cc7d98a0877b19043d56e5aee6e494cada9a1f8b3a0e1
  Stored in directory: /root/.cache/pip/wheels/f8/a2/16/86ad3021ec907b77415c370d331a0065e6b4d873d477b4a0eb
Successfully built gr
Installing collected packages: gr
Successfully installed gr-1.27.0


In [288]:
# necessary Imports
from docx import Document
from PyPDF2 import PdfReader
from pptx import Presentation
from langchain_community.llms import Cohere
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import AIMessage, HumanMessage
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts  import PromptTemplate, ChatPromptTemplate, MessagesPlaceholder
import gradio as gr

In [289]:
# Example PDF Path
pdf_file = "Sample Financial Statement.pdf"

In [290]:
# extracting pdf data
pdf_text = ""
pdf_reader = PdfReader(pdf_file)
for page in pdf_reader.pages:
    pdf_text += page.extract_text()


In [291]:
# merging all the text

all_text = pdf_text
len(all_text)

213951

In [292]:
# splitting the text into chunks for embeddings creation

text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 200, # This is helpul to handle the data loss while chunking.
        length_function = len,
        separators=['\n', '\n\n', ' ', '']
    )

chunks = text_splitter.split_text(text = all_text)
len(chunks)

273

In [293]:
import os
os.environ['HuggingFaceHub_API_Token']= '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'
os.environ['GOOGLE_API_KEY']= "AIzaSyCoGAkfKk2JcAUS829HSSEo-Tnz72yP0fo"
os.environ['cohere_api_key'] = "jTIqO0PIhrTAV6OalQjp8U6MvhvrbAu6Y6aeZ67K"

In [294]:
# Initializing embeddings model

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [295]:
# Indexing the data using FAISS
vectorstore = FAISS.from_texts(chunks, embedding = embeddings)

In [296]:
# creating retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [297]:
# Prompt template for financial metrics
prompt_template = """You are a financial assistant specialized in Profit & Loss (P&L) statements.
Answer the question as precise as possible using the provided context. If the answer is
not contained in the context, say "answer not available in context" \n\n
Context: \n {context}?\n
Question: \n {question} \n
Answer:"""

In [298]:
prompt = PromptTemplate.from_template(template=prompt_template)

In [299]:
# function to create a single string of relevant documents given by Faiss.
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [300]:
# RAG Chain

def generate_answer(question):
    cohere_llm = Cohere(model="command", temperature=0.1, cohere_api_key=os.getenv('cohere_api_key'))

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | cohere_llm
        | StrOutputParser()
    )

    return rag_chain.invoke(question)


## EXAMPLE 1 (irrelevant question response)

In [301]:
ans = generate_answer("Show the operating margin for the last 6 months.")
print(ans)

 The answer is not available in the context provided. 
Operating margin is a crucial metric that indicates the profitability of a company's core business operations, excluding interest and taxes. 
It is calculated by dividing operating profit by total revenue or sales, and it is typically expressed as a percentage. 
However, in order to calculate the operating margin for the last six months, the figures for those specific months are required, along with the revenue and expense components that make up the operating profit and the total revenue. 

The provided financial statement does not contain the necessary data to compute the operating margin for the last six months specifically, and it appears to be an annual statement. 
If you have the required monthly data, please provide it and I will be happy to assist you in calculating the operating margin for the last six months using the provided context and any other relevant information. Failing that, you may need to refer to the original 

EXAMPLE 2 (direct answer)

In [309]:
ans = generate_answer("What are the total expenses for Q2 2023?")
print(ans)

 answer not available in context


EXAMPLE 3 (circling around and answering)

In [303]:
ans = generate_answer("What is the net profit for the last quarter? if not available, then calulate")
print(ans)

 The net profit for the last quarter is not explicitly given in the provided context. However, I can calculate it based on the information provided. 

For the quarter ended March 31, 2024, the profit for the period is given as 26,248 USD. 

To calculate the net profit, we need to subtract the total comprehensive income attributable to the period from the profit for the period. 

The total comprehensive income attributable to the period is calculated as:
Profit for the period (net of tax) + Other comprehensive income - Items that will be reclassified to profit or loss + Tax expense. 

We can calculate the net profit as:
26,248 + (14 + 25 + 120 - 8 - 12 - 15 + 139 + 1) - (1,173 + 2,260 + 8,390 + 1,350 + 166 + 139 + 381 + 513) 

Which simplifies to:
26,248 + 148 - 9,287 

Resulting in an approximate net profit for the last quarter of 16,869 USD. 


In [304]:
ans = generate_answer("Show the revenue growth for the past year.")
print(ans)

 The provided context contains data for the three months and year ended March 31, 2024, and March 31, 2023, respectively.

To calculate the percentage revenue growth between these two periods, you should use the following formula:
Revenue growth (%) = ((Rev_2024 - Rev_2023) / Rev_2023) * 100

In this case, Rev_2024 is the revenue for the three months and year ended March 31, 2024, and Rev_2023 is the revenue for the same periods in 2023.

However, the provided context does not explicitly contain the revenue for the year ended March 31, 2023, so I cannot calculate the revenue growth rate without incomplete information.

The revenue for the three months and year ended March 31, 2024, is given in the table below. The revenue is presented disaggregated by geography and contract type, which is how the group believes it best depicts the nature, amount, timing, and uncertainty of revenues and cash flows.

North America	Europe	All other segments
22,606 10,861	12,645

I cannot calculate the rev

## PART - 2

In [305]:
def process_pdf(pdf_file):
    """Extracts text from a user-uploaded PDF."""
    pdf_text = ""
    pdf_reader = PdfReader(pdf_file.name)
    for page in pdf_reader.pages:
        pdf_text += page.extract_text() + "\n"

    # Splitting text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, separators=['\n', '\n\n', ' ', '']
    )
    chunks = text_splitter.split_text(text=pdf_text)

    # Indexing the data using FAISS
    vectorstore = FAISS.from_texts(chunks, embedding=embeddings)
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

    return retriever

retriever = None  # Global retriever variable

In [306]:
def upload_pdf(pdf_file):
    global retriever
    retriever = process_pdf(pdf_file)
    return "PDF uploaded and processed successfully! You can now ask financial questions."

# Define the prompt template
prompt_template = """You are a financial assistant specialized in Profit & Loss (P&L) statements.
Answer the question as precisely as possible using the provided context. If the answer is
not contained in the context, say "answer not available in context".

Context: \n{context}\n
Question: \n{question}\n
Answer:"""

prompt = PromptTemplate.from_template(template=prompt_template)

In [307]:
def generate_answer(question):
    if retriever is None:
        return "Please upload a PDF first."

    cohere_llm = Cohere(model="command", temperature=0.1, cohere_api_key=os.getenv('cohere_api_key'))

    rag_chain = (
        {"context": retriever | (lambda docs: "\n\n".join(doc.page_content for doc in docs)), "question": RunnablePassthrough()}
        | prompt
        | cohere_llm
        | StrOutputParser()
    )

    return rag_chain.invoke(question)

In [308]:
# Gradio UI
demo = gr.Blocks()

with demo:
    gr.Markdown("""# Financial QA Chatbot
    Upload a **Profit & Loss (P&L) Statement PDF**, and ask financial questions in real-time.
    """)

    with gr.Row():
        pdf_input = gr.File(label="Upload P&L PDF")
        upload_button = gr.Button("Process PDF")

    upload_status = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        question_input = gr.Textbox(label="Ask a question about the financial data")
        submit_button = gr.Button("Get Answer")

    answer_output = gr.Textbox(label="Answer")

    upload_button.click(upload_pdf, inputs=[pdf_input], outputs=[upload_status])
    submit_button.click(generate_answer, inputs=[question_input], outputs=[answer_output])

demo.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dab6bdf0a91ca1f67a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


