In [1]:
! pip install chromadb pypdf ollama



In [2]:
import chromadb
import os
from pypdf import PdfReader

In [3]:
# extract the text from pdf
def extract_text_from_pdf(pdf_path):
    pdf_texts=[]
    for filename in os.listdir(pdf_path):
        pdf_path= os.path.join(pdf_path, filename)
        if filename.endswith('.pdf'):
            reader= PdfReader(pdf_path)
            text=''
            for page in reader.pages:
                text+=page.extract_text().strip()
            pdf_texts.append(text.strip())
    return pdf_texts

In [4]:
extract_text_from_pdf= extract_text_from_pdf('google_pdfs')

split it using Recursive character Splitter.

Then with sentence transformer splitter

In [5]:
import langchain
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter)
# split it into characters first and then into tokens
# helps in efficiency

def character_splitting(texts):
    character_splitter= RecursiveCharacterTextSplitter(
        chunk_size= 250,
        chunk_overlap= 20,
        length_function= len,
        is_separator_regex=False,
    )
    character_split=character_splitter.split_text(texts)
    return character_split

In [31]:
extract_text_from_pdf



In [6]:
# make sure to convert the text into string before passing into character split
# takes in only string
extract_text_from_pdf= '\n'.join(extract_text_from_pdf)

In [7]:
character_split_texts= character_splitting(extract_text_from_pdf)

In [8]:
# doing the sentence transformer token split
def sentence_token_split(texts):
    """Splits text into smaller chunks using SentenceTransformersTokenTextSplitter."""
    token_split_text=[]
    sentence_token_splitter= SentenceTransformersTokenTextSplitter(
        tokens_per_chunk=250, # for llm's accuracy
        chunk_overlap=20, # for llm's accuracy
    )
    for text in texts:
        token_split_text.extend(sentence_token_splitter.split_text(text))
    return token_split_text

In [9]:
token_split_texts= sentence_token_split(character_split_texts)

In [10]:
# build the embedding model
from sentence_transformers import SentenceTransformer
def build_embedding_model():
    """Builds and returns a SentenceTransformer model."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model

In [11]:
# build your chroma db vector database
def build_embeddings_and_store_in_vector_db(token_splitted_texts):
    
    # get the embedding model
    embedding_model= build_embedding_model()

    # craete the chroma client
    client = chromadb.PersistentClient(
        path="google_pdf_embeddings"
    )

    # define the collection [like a table]
    collection= client.get_or_create_collection(
        name="google_collection",
    ) 

    # get the embedding of each chunk
    embedding= embedding_model.encode(token_splitted_texts)

    # add this to the collection
    collection.add(
        documents=token_splitted_texts,
        ids=[str(i) for i in range(len(token_splitted_texts))],
        embeddings= embedding,
        metadatas=[{"source": "google_pdfs"}] * len(token_splitted_texts),
    )
    return collection


In [12]:
collection_built= build_embeddings_and_store_in_vector_db(token_split_texts)

In [13]:
import ollama

In [14]:
query="How did the google's revenue change from 2022 to 2023? And Why?"

In [15]:
def send_query_to_llm_for_more_queries(query, model_name="gemma3n:latest"):
    prompt = f"""   You are a knowledgeable financial research assistant. 
    Your users are inquiring about an annual report. 
    For the given question, propose up to five related questions to assist them in finding the information they need. 
    Provide concise, single-topic questions (withouth compounding sentences) that cover various aspects of the topic. 
    Ensure each question is complete and directly related to the original inquiry. 
    List each question on a separate line without numbering.
    

    Question:
    {query}

    Answer:"""
    

    try:
        response = ollama.chat(
            model=model_name,
            messages=[
                {
                    'role': 'user',
                    'content': prompt,
                }
            ],
            options={
                'temperature': 0.5, #[ closer to 0: Deterministic]
                'max_tokens': 250,  # length of response
                'top_p': 0.7, # Filters the model's vocabulary to the top probable tokens.
            }                 # Range: 0.1 (strict) to 1.0 (broad).
        )
        return response['message']['content']
    except Exception as e:
        return f"Error generating response: {str(e)}"


In [16]:
similar_queries= send_query_to_llm_for_more_queries(query)

In [17]:
similar_queries

"What was Google's total revenue in 2022?\nWhat was Google's total revenue in 2023?\nWhat were the primary drivers of Google's revenue growth or decline between 2022 and 2023?\nHow did revenue from Google's advertising business change between 2022 and 2023?\nWhat was the impact of economic conditions on Google's revenue in 2023?\n\n\n\n"

In [18]:
# now make an augmented query
def augment_query_with_similar_queries(original_query, similar_queries):
    return f"{original_query}/n {similar_queries}"

In [19]:
augmented_query= augment_query_with_similar_queries(query, similar_queries)

In [26]:
# query the database over this
def query_vectorDb(query, collection):
    results= collection.query(
        query_texts=[query], # form the embedding of query
        n_results=7,
        include=["documents", "embeddings"],
    )
    return results

In [24]:
retrieved_response_from_vector_db= query_vectorDb(augmented_query, collection_built)

In [25]:
retrieved_response_from_vector_db

{'ids': [['2894', '2851', '1904', '3419', '3366', '2845', '2033']],
 'embeddings': None,
 'documents': [['2 2 m i l l i o n, $ 3. 0 b i l l i o n,',
   't o t a l $ 8 2, 2 7 5',
   '$ 2. 0 b i l l i o n a n d $ 2 3 6 m i l l i o n',
   't a x b e n e f i t.',
   '$ 2 1. 7 b i l l i o n, r e s p e c t i v e l y.',
   '3. 7 b i l l i o n',
   'a n d $ 6. 0 b i l l i o n']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'source': 'google_pdfs'},
   {'source': 'google_pdfs'},
   {'source': 'google_pdfs'},
   {'source': 'google_pdfs'},
   {'source': 'google_pdfs'},
   {'source': 'google_pdfs'},
   {'source': 'google_pdfs'}]],
 'distances': [[1.7227498292922974,
   1.742136001586914,
   1.7941402196884155,
   1.808369517326355,
   1.8084187507629395,
   1.8224856853485107,
   1.8316559791564941]]}

In [27]:
# send the response to the LLM for summarization
def send_retrieved_response_llm(retrieved_response, model_name="gemma3n:latest"):
    prompt = f"""You are a financial research assistant. 
    Also let the users know what all information [line by line] was provided to you.
    Your users have retrieved information from an annual report. 
    Summarize the key points from the provided text, focusing on the main findings and insights. 
    Ensure the summary is concise and captures the essence of the information.

    Retrieved Information:
    {retrieved_response}

    Summary:"""
    
    try:
        response = ollama.chat(
            model=model_name,
            messages=[
                {
                    'role': 'user',
                    'content': prompt,
                }
            ],
            options={
                'temperature': 0.5, #[ closer to 0: Deterministic]
                'max_tokens': 300,  # length of response
                'top_p': 0.7, # Filters the model's vocabulary to the top probable tokens.
            }                 # Range: 0.1 (strict) to 1.0 (broad).
        )
        return response['message']['content']
    except Exception as e:
        return f"Error generating response: {str(e)}"


In [28]:
final_answer=send_retrieved_response_llm(retrieved_response_from_vector_db)

In [29]:
final_answer

'Okay, I\'ve analyzed the provided information from the annual report. Here\'s a concise summary of the key points:\n\n**Information Provided:**\n\nThe data consists of a list of document IDs (`2894`, `2851`, `1904`, `3419`, `3366`, `2845`, `2033`), snippets of text extracted from the documents, and source information indicating these snippets were sourced from "google_pdfs". The snippets contain financial figures, including totals like "$2.0 billion" and "$3.0 billion", and specific amounts such as "$2.1 billion", "$3.7 billion", and "$6.0 billion".  The data also includes distance measurements associated with the document IDs.\n\n**Key Findings & Insights:**\n\nThe text snippets reveal significant financial figures within the annual report.  Specifically, the report mentions totals of $2.0 billion and $3.0 billion, alongside other figures of $2.1 billion, $3.7 billion, and $6.0 billion.  The presence of a "tax benefit" of $2.1 billion is also noted. These figures likely represent key

'Okay, I\'ve analyzed the provided information from the annual report. Here\'s a concise summary of the key points:\n\n**Information Provided:**\n\nThe data consists of a list of document IDs (`2894`, `2851`, `1904`, `3419`, `3366`, `2845`, `2033`), snippets of text extracted from the documents, and source information indicating these snippets were sourced from "google_pdfs". The snippets contain financial figures, including totals like "$2.0 billion" and "$3.0 billion", and specific amounts such as "$2.1 billion", "$3.7 billion", and "$6.0 billion".  The data also includes distance measurements associated with the document IDs.\n\n**Key Findings & Insights:**\n\nThe text snippets reveal significant financial figures within the annual report.  Specifically, the report mentions totals of $2.0 billion and $3.0 billion, alongside other figures of $2.1 billion, $3.7 billion, and $6.0 billion.  The presence of a "tax benefit" of $2.1 billion is also noted. These figures likely represent key financial performance indicators for the company during the reporting period.  Further analysis of the full documents (identified by the IDs) would be needed to understand the context of these numbers (e.g., revenue, expenses, profit, assets, etc.) and their implications for the company\'s financial health.\n\n\n\n'

In [32]:
# i am not really satisfied with the answer given by the LLM.
# I will be changing the embedding model for better results