In [1]:
import ollama
import PyPDF2
import torch

from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, SimilarityFunction

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Path to the PDF that serves as the knowledge base; it is passed to extract_text() below
pdf_file = "./documents/epix_(Gen_2)_Series_OM_EN-US.pdf"

In [3]:
# Set similarity function (handed to the encoder below via similarity_fn_name)
similarity_fn_name = SimilarityFunction.COSINE
# Alternative similarity functions; uncomment one to use it instead of COSINE
#similarity_fn_name = SimilarityFunction.DOT_PRODUCT
#similarity_fn_name = SimilarityFunction.EUCLIDEAN

# Initialize encoding model; this same `model` object is later used for encode() and similarity()
model = SentenceTransformer("all-mpnet-base-v2", similarity_fn_name=similarity_fn_name)



In [4]:
# Read a PDF and return its text as a flat list of sentences
def extract_text(pdf_path):
	"""Extract the text of every page in a PDF and split it into sentences.

	Args:
		pdf_path: Path of the PDF file; it is opened in binary mode.

	Returns:
		A single list holding the sentences of all pages, in page order,
		as produced by `sent_tokenize` on each cleaned page.
	"""
	# (old, new) substitutions applied to each page, in this order:
	#   newline              -> space
	#   non-breaking space   -> removed
	#   soft hyphen + space  -> removed (after the newline substitution this also
	#                           covers a soft hyphen that sat at a line break)
	substitutions = (
		("\n", " "),
		("\xa0", ""),
		("\xad ", ""),
	)

	all_sentences = []

	with open(pdf_path, "rb") as stream:
		for page in PyPDF2.PdfReader(stream).pages:
			# Round-trip through UTF-8; anything that cannot be encoded is
			# replaced by an XML character reference instead of raising
			raw_bytes = page.extract_text().encode("utf-8", "xmlcharrefreplace")
			cleaned = raw_bytes.decode("utf-8")

			for old, new in substitutions:
				cleaned = cleaned.replace(old, new)

			# Sentence-split the page and append to the running list
			all_sentences += sent_tokenize(cleaned)

	return all_sentences

In [5]:
# Extract text from PDF file (one flat list of sentences for the whole document)
extracted_text = extract_text(pdf_file)
print(f"Number of sentences: {len(extracted_text)}")
# Debug aid: uncomment to dump every extracted sentence (very long output)
#print(extracted_text)

Number of sentences: 3899


In [6]:
def vectorize_text(sentences, model):
	"""Encode `sentences` with `model` and return them alongside their embeddings.

	Args:
		sentences: The texts to embed; passed unchanged to `model.encode`.
		model: Any object exposing an `encode(sentences)` method.

	Returns:
		A `(sentences, embeddings)` tuple, where `sentences` is the very same
		object that was passed in and `embeddings` is what `model.encode` returned.
	"""
	return sentences, model.encode(sentences)

In [7]:
# Vectorize text: embed every extracted sentence with the encoder
sentences, sentence_embeddings = vectorize_text(extracted_text, model)

# Sanity check on the embeddings' shape
print(sentence_embeddings.shape)

(3899, 768)


In [8]:
# Ask the LLM to answer a question from retrieved context
def generate_response(query_text, context):
	"""Send the question and its supporting context to the chat model.

	Args:
		query_text: The user's question; sent as the "user" message.
		context: Supporting material interpolated into the system prompt via
			an f-string, so a non-string value (e.g. a list) appears as its
			string form.

	Returns:
		The `["message"]["content"]` field of the `ollama.chat` response.
	"""
	# System prompt: grounding rules followed by the context itself
	system_message = f"""
		You are a knowledgeable assistant tasked with answering questions based solely on the provided context.
		Your responses should be strictly based on the information contained in the context.
		If the answer is not clear from the context, you can extrapolate from the information available in the context.
		If the answer is not in the context, respond with "The answer is not available in the provided context."
		Here is the context you should use:
		{context}
		"""

	# Two-turn conversation: instructions first, then the actual question
	system_turn = {"role": "system", "content": system_message}
	user_turn = {"role": "user", "content": f"{query_text}"}

	reply = ollama.chat(model="llama3.1", messages=[system_turn, user_turn])

	return reply["message"]["content"]

In [9]:
# Query the knowledge base
def query_kb(query_text, sentence_embeddings, model, show_context=False, corpus=None, k=25):
	"""Answer `query_text` using the sentences most similar to it as LLM context.

	Args:
		query_text: The question to answer.
		sentence_embeddings: Embeddings of the knowledge-base sentences, in a
			form accepted by `model.similarity`.
		model: Object exposing `encode(text)` and `similarity(a, b)`; the
			latter must return a torch tensor whose last dimension holds one
			score per knowledge-base sentence.
		show_context: When True, print the retrieved sentences (after the
			response has been generated).
		corpus: Sentences aligned index-for-index with `sentence_embeddings`.
			When omitted, the notebook-level `sentences` variable is used so
			that existing calls keep working.
		k: Maximum number of sentences to retrieve. It is capped at the number
			of available similarity scores.

	Returns:
		Whatever `generate_response` returns for the query and the retrieved
		sentences.
	"""
	# Backward compatibility: earlier callers relied on the global `sentences`
	if corpus is None:
		corpus = sentences

	# Generate embeddings for the query text
	query_embeddings = model.encode(query_text)

	# Generate tensor of similarities between query and sentences
	similarities = model.similarity(query_embeddings, sentence_embeddings)

	# torch.topk raises when k exceeds the size of the dimension it searches,
	# so cap k for documents with fewer sentences than requested
	top_k = min(k, similarities.shape[-1])

	# Indices of the best-scoring sentences, flattened to a plain list of ints
	top_k_indices = torch.topk(similarities, top_k).indices.flatten().tolist()
	relevant_sentences = [corpus[i] for i in top_k_indices]

	# Generate LLM response using similar sentences as context
	response = generate_response(query_text, relevant_sentences)

	# DEBUG
	if show_context:
		print("#### CONTEXT ####")
		print("\n".join(relevant_sentences))
		print("####################\n")

	return response

In [13]:
# Epix Questions
# Sample questions for the Epix PDF; the uncommented assignment is the query that gets sent to query_kb

#query_text = "How deep can I swim while wearing the watch?"
#query_text = "How much water pressure can the watch withstand?"
#query_text = "How do I navigate back home when I'm on a run?"
#query_text = "How do I get a SpO2 measurement?"
query_text = "How do I set an alarm?"

In [14]:
# Retrieve the most similar sentences for the query and print the LLM's answer
# (pass show_context=True to also print the retrieved sentences)
response = query_kb(query_text, sentence_embeddings, model, show_context=False)
print(response)

To set an alarm, follow these steps: 

1. Select Clocks.
2. Select ALARMS.
3. Select Add Alarm.

You can also select Clocks > ALARMS to view and manage existing alarms.

Additionally, you can set multiple alarms by selecting Clocks and then following the on-screen instructions to add each individual alarm.
