In [None]:
# !pip install PyPDF2

In [2]:
import torch
print(torch.__version__)
torch.cuda.is_available()

2.3.0+cu118


True

In [3]:
# import tabula
import PyPDF2 
# from tabulate import tabulate 
# import fitz 
# import io 
from typing import List 
# from pdf2docx import Converter 



class Reader:
    """Extract plain text from a range of pages of a PDF using PyPDF2.

    Args:
        file_path: Path of the PDF file to open.
        end_page: Last page to read (1-based, inclusive). ``None`` means
            "through the final page of the document". Values larger than
            the document are clamped to the page count.
        start_page: First page to read (1-based).
        verbose: When true ``self.verboseprint`` is ``print``; otherwise it
            is a callable that does nothing.

    Attributes:
        texts: Per-page strings produced by the most recent
            ``extract_text`` call.
        tables: Empty list; nothing in this class populates it.
    """

    def __init__(self, file_path: str, end_page: int = None, start_page: int = 1, verbose: bool = True):
        self.verboseprint = print if verbose else lambda *a: None
        self.filepath = file_path
        self.start_page = start_page
        self.texts = []
        self.tables = []

        self.parsed_pdf = PyPDF2.PdfReader(self.filepath)
        total_pages = len(self.parsed_pdf.pages)
        # Respect the caller's end_page. It used to be overwritten with the
        # page count unconditionally, which made the argument a no-op.
        self.end_page = total_pages if end_page is None else min(end_page, total_pages)

    def extract_text(self) -> str:
        """Return the text of pages ``start_page``..``end_page`` as one string.

        The per-page strings are also stored in ``self.texts``. The list is
        rebuilt on every call, so calling this method repeatedly does not
        duplicate content.

        Returns:
            The page texts joined with no separator.
        """
        pages = self.parsed_pdf.pages
        # Convert the 1-based start page to a 0-based index. Clamp at zero so
        # a start_page below 1 cannot turn into a negative index that would
        # read from the end of the document.
        first = max(self.start_page - 1, 0)
        last = min(self.end_page, len(pages))
        # `or ""` keeps the join safe if a page yields no text object.
        self.texts = [pages[index].extract_text() or "" for index in range(first, last)]
        return ''.join(self.texts)
    
    # def extract_tables(self) -> List[str]: 
    #     dfs = tabula.io.read_pdf(self.filepath, pages = "all")
    #     print(dfs[0])
    #     print(dfs[0].to_html())


In [4]:
# Open the report PDF and pull the text of every page into a single string.
reader = Reader(file_path='./financial-reports/eicher.pdf')
text = reader.extract_text()
# Display only a short preview: the full extracted text can be very large and
# would otherwise be written verbatim into the saved notebook output.
text[:1000]

'CONTENTS\nCORPORATE \nOVERVIEW\n06-63\n66-113\n17INTEGRATED REPORTOnwards and Upwards /06\nHighlights of FY 2022-23 /08\nEngineered By Excellence,  \nPowered by Creativity /10\nMessage to the Shareholders /14\nPure Motorcycling with Royal Enfield /18\nRoyal Enfield History /20Quarterly Highlights /22\nHunter 350 /26Super Meteor 650 /30\nElevating Experiences  \nwith Motorcycle Upgrades /36\nElevating Pure Motorcycling  Experiences /40\nThe Great Himalayan Exploration /44\nCollaborations /48\nWidening Reach and Expanding  \nGlobal Footprint /50\nCreating Brand-led Customer  \nExperiences /52\nIgniting the Pure Motorcyling Spirit /56\nNurturing the Motorcycling  \nSubcultures /60\nUnlocking Creativity with  \nCustomisation /62At Royal Enfield, \nwe have a huge opportunity to redefine motorcycling in this new era as we continue to maintain razor-sharp focus on what is important to us - ensuring joy and excitement each time someone experiences our motorcycles. Business Model /66\nOperatin

In [5]:
len(text)

1175663

In [6]:
def chunk_text(text, chunk_size=512, overlap=0):
    """Split ``text`` into consecutive character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous
            one. The default of 0 yields non-overlapping chunks. A positive
            overlap keeps a sentence that straddles a cut fully present in at
            least one chunk, provided it is no longer than ``overlap``.

    Returns:
        A list of substrings covering ``text`` in order. The final chunk may
        be shorter than ``chunk_size``. An empty ``text`` gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # Each window starts `stride` characters after the previous one.
    stride = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), stride)]

chunks = chunk_text(text)

In [7]:
len(chunks)

2297

In [None]:
# !pip install sentence-transformers

In [8]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

from sentence_transformers import SentenceTransformer

# Pass `device` to the constructor so the choice made above is actually
# applied; previously the variable was computed but never used.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

# One embedding row per chunk, in the same order as `chunks`.
chunk_embeddings = embedding_model.encode(chunks)
print(chunk_embeddings)

[[-0.07719564  0.08590856  0.02712858 ... -0.04713814 -0.04810253
  -0.04145248]
 [ 0.06992162  0.10352073  0.00223129 ... -0.03263901 -0.0153402
  -0.03915515]
 [ 0.09092546 -0.01099225 -0.05107707 ... -0.05702045  0.0722733
  -0.06603406]
 ...
 [-0.07137544 -0.01139496 -0.07930989 ... -0.06162428 -0.04293181
   0.01977128]
 [-0.04882916 -0.04517161 -0.07059311 ... -0.08919964 -0.05466387
   0.02149768]
 [-0.04439776 -0.02990265  0.01132493 ... -0.06577379 -0.0529393
   0.08253177]]


In [9]:
len(chunk_embeddings), len(chunks)

(2297, 2297)

In [10]:
# The user question to answer from the document.
prompt = """How many hunter sold 350 in six months?"""

# Embed the question with the same model that embedded the chunks so the two
# sets of vectors can be compared directly.
# NOTE(review): a single string is passed here rather than a list, so this is
# presumably one vector rather than a batch — confirm against the encoder docs.
prompt_embeddings = embedding_model.encode(prompt)
# print(prompt_embeddings)

In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_closest_chunk(prompt_embedding, chunk_embeddings):
    """Pick the chunk embedding most similar to the prompt by cosine similarity.

    Args:
        prompt_embedding: Embedding vector of the query.
        chunk_embeddings: Indexable collection of chunk embedding vectors.

    Returns:
        A tuple ``(index, similarity, embedding)`` describing the entry of
        ``chunk_embeddings`` with the highest similarity score.
    """
    # The prompt vector is wrapped in a list to form a one-row batch; row 0 of
    # the result then holds its score against every chunk.
    scores = cosine_similarity([prompt_embedding], chunk_embeddings)[0]
    best = np.argmax(scores)
    return best, scores[best], chunk_embeddings[best]


idx, score, embedding = find_closest_chunk(prompt_embeddings, chunk_embeddings)

In [12]:
idx

460

In [13]:
chunks[idx]

'es\nThe Company has established world-class product \ndevelopment capabilities spanning across its three major manufacturing facilities at - Vallam Vadagal, Oragadam and Tiruvottiyur. Royal Enfield added another model to its J-series platform, Hunter 350. Since its launch in August 2022, Hunter 350 received an overwhelming response from consumers. The production capacity for Hunter 350 was ramped up and more than 1 lakh motorcycles were produced within 6 months of launch. RE launched its premium cruiser, Supe'

In [14]:
import tqdm as notebook_tqdm

In [15]:
# Build the text sent to the language model: a fixed instruction, then the
# single retrieved chunk as context, then the user's question.
input_text = f"""
You are a helpful AI assistant. Help in answering the following query based on the given contexts.

Context: {chunks[idx]}

Query: {prompt}

"""

In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal language model for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")

# NOTE(review): generation is pinned to the CPU here although an earlier cell
# selects CUDA when available — confirm this is intentional.
device = "cpu"
# The tokenizer output is unpacked as keyword arguments to generate() below.
input_ids = tokenizer(input_text, return_tensors="pt").to(device)
model = model.to(device)
# Limit the number of newly generated tokens. `max_length` counts the prompt
# tokens as well, so a longer retrieved context would shrink (or exhaust) the
# room left for the answer.
outputs = model.generate(**input_ids, max_new_tokens=512)
# The decoded sequence contains the prompt followed by the model's answer.
print(tokenizer.decode(outputs[0]))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<bos>
You are a helpful AI assistant. Help in answering the following query based on the given contexts.

Context: es
The Company has established world-class product 
development capabilities spanning across its three major manufacturing facilities at - Vallam Vadagal, Oragadam and Tiruvottiyur. Royal Enfield added another model to its J-series platform, Hunter 350. Since its launch in August 2022, Hunter 350 received an overwhelming response from consumers. The production capacity for Hunter 350 was ramped up and more than 1 lakh motorcycles were produced within 6 months of launch. RE launched its premium cruiser, Supe

Query: How many hunter sold 350 in six months?

The context does not specify the number of Hunter 350 motorcycles sold in six months, so I cannot answer this question from the context.<eos>
