### Importing Prerequisite Libraries

In [1]:
import os
from dotenv import load_dotenv
import openai

# Pull settings from a local .env file into the process environment
load_dotenv()

# Chat model used for every completion request in this notebook
MODEL = "gpt-3.5-turbo"

# The API key is read from the environment rather than being hard-coded
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# OpenAI client shared by the cells below
openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)

### Set Up the Bagel Client

In [2]:
import bagel
from bagel.config import Settings

# Re-export the Bagel credentials into the process environment (optional).
# os.environ only accepts str values, so assigning the None that os.getenv
# returns for an unset variable would raise TypeError; unset variables are
# reported and skipped instead.
for _var_name in ("BAGEL_API_KEY", "BAGEL_USER_ID"):
    _var_value = os.getenv(_var_name)
    if _var_value is None:
        print(f"Warning: {_var_name} is not set; authenticated Bagel calls may fail.")
    else:
        os.environ[_var_name] = _var_value

# Configure Bagel server settings for production
server_settings = Settings(
    bagel_api_impl="rest",
    bagel_server_host="api.bageldb.ai",
)

# Initialize the Bagel client
client = bagel.Client(server_settings)

# Test server connectivity
print(client.ping())

# Retrieve server version information
print(client.get_version())

1711349683647990288000
0.3.26


### Load PDF Document(s)

In [3]:
from langchain_community.document_loaders import PyPDFLoader

# Path to the source PDF; give the file a meaningful name because its base name
# is reused as the cluster name and as the topic in the prompts.
filepath = "./pdfs/coding_patterns.pdf"
# Strip only the final extension. str.replace(".pdf", "") would also remove
# ".pdf" occurring elsewhere in the name and would miss an upper-case ".PDF".
filename = os.path.splitext(os.path.basename(filepath))[0]

# Load the PDF and split it into Document chunks
loader = PyPDFLoader(filepath)
pages = loader.load_and_split()
pages

[Document(page_content='20 Coding Patterns \nto master\nMAANG Interviews\nDesignGurus.org', metadata={'source': '/home/kaiser/Documents/bagel/pdfs/coding_patterns.pdf', 'page': 0}),
 Document(page_content='➡ What if you don’t like to solve\n100s of coding questions before\nyour interview?\n➡ Don’t just LeetCode; follow these\n20 Coding Patterns instead.\nDesignGurus.org', metadata={'source': '/home/kaiser/Documents/bagel/pdfs/coding_patterns.pdf', 'page': 1}),
 Document(page_content="Usage : This algorithmic technique is used when\nwe need to handle the input data in a specific\nwindow size.Sliding Window1.\nDS Involved : Array, String, HashTable\nLongest Substring with 'K' Distinct Characters\nFruits into BasketsSample Problems : \nDesignGurus.org", metadata={'source': '/home/kaiser/Documents/bagel/pdfs/coding_patterns.pdf', 'page': 2}),
 Document(page_content='Usage : This pattern describes all the efficient\nways of traversing a matrix (or 2D array).2. Islands (Matrix Traversal)\nDS

### Create Cluster

In [4]:
# Obtain the cluster named after the PDF via the client's get-or-create call
cluster = client.get_or_create_cluster(
    name=filename,
    embedding_model="bagel-text",
)
print(f"Cluster created: {cluster}")

Cluster created: name='coding_patterns' id=UUID('e2a5e264-6f43-4cd3-84bb-19cdacf6569b') cluster_size=2.0 embedding_size=None metadata={}


### Add Documents to the Cluster

In [5]:
from langchain.vectorstores import Bagel

# Upload each chunk individually: its text, its metadata, and an id derived
# from the page number stored in the chunk's metadata.
# NOTE(review): if load_and_split() ever yields several chunks for the same
# page, they would share an id here - confirm how Bagel treats duplicate ids.
for page in pages:
    page_id = str(page.metadata["page"])
    cluster.add(
        documents=[page.page_content],
        metadatas=[page.metadata],
        ids=[page_id],
    )

In [6]:
import tiktoken

# Human-readable topic derived from the file name (underscores become spaces)
TOPIC = " ".join(filename.split("_"))

def num_tokens(text: str, model: str = MODEL) -> int:
    """Return the number of tokens `text` occupies under the tokenizer for `model`.

    Args:
        text: The string to tokenize.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The length of the encoded token sequence.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it has no mapping for;
        # fall back to the cl100k_base encoding so counting still works.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def query_message(query: str, model: str, token_budget: int, print_message: bool = False, n_results: int = 5) -> str:
    """Build the user prompt for the chat model from Bagel search results.

    Searches the module-level `cluster` for texts related to `query`, then
    appends them one by one to an instruction header until adding the next
    text (plus the question) would exceed `token_budget`.

    Args:
        query: The user's question; also used as the search text.
        model: Model name passed to `num_tokens` to pick the tokenizer.
        token_budget: Maximum token count allowed for the assembled message
            when deciding whether another text still fits.
        print_message: When True, print the metadata of every search hit.
        n_results: Number of results requested from the cluster search.

    Returns:
        The instruction header, the texts that fit, and the question,
        concatenated into a single string.
    """
    # Search the cluster for documents related to the query
    response = cluster.find(query_texts=[query], n_results=n_results)

    def _flatten(nested):
        # The result fields are treated as lists of lists and merged into one
        # flat list; a missing (None) field or group is treated as empty.
        return [item for group in (nested or []) for item in (group or [])]

    documents = _flatten(response['documents'])
    metadatas = _flatten(response['metadatas'])

    if print_message:
        print("Found following documents:")
        # Iterate over the metadata alone: pairing it with the distances
        # (which were never printed) could silently truncate this listing
        # whenever the two lists differ in length.
        for metadata in metadatas:
            print(f"Metadata: {metadata}\n" + "-" * 50)

    introduction = f'Use the below articles on the {TOPIC}. If the answer cannot be found in the articles, write "Sorry, I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in documents:
        next_article = f'\n\n{TOPIC}:\n"""\n{string}\n"""'
        # Stop at the first text that would push the prompt over the budget
        if num_tokens(message + next_article + question, model=model) > token_budget:
            break
        message += next_article
    return message + question

def ask(query: str, model: str = MODEL, token_budget: int = 3596, print_message: bool = False) -> str:
    """Answer `query` with the chat model, grounded in texts retrieved from Bagel.

    The user prompt is assembled by `query_message` within `token_budget`,
    sent together with a topic-specific system prompt at temperature 0, and
    the content of the first returned choice is handed back to the caller.
    """
    user_prompt = query_message(query, model=model, token_budget=token_budget, print_message=print_message)
    system_prompt = f"You answer questions about the {TOPIC}."

    completion = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
    )

    print("Response from agent:")
    first_choice = completion.choices[0]
    return first_choice.message.content

In [7]:
# Ask a question about the topic; print_message=True also prints the metadata of the search hits
ask('Tell me about Two Pointers.', print_message=True)

Found following documents:
Response from agent:


'Two Pointers is a coding pattern that involves using two pointers that can move towards each other or in the same direction to solve problems efficiently. This pattern is often used to optimize solutions for problems that involve arrays or linked lists. By using two pointers, you can iterate through the data structure in a more efficient way, often reducing the time complexity of the algorithm. The two pointers can be used to solve problems like finding pairs that meet certain conditions, finding subarrays with a specific sum, or detecting cycles in a linked list.'

In [8]:
# Ask another question about the topic, again printing the metadata of the search hits
ask('What is Depth First Search?', print_message=True)

Found following documents:
Response from agent:


'Depth First Search (DFS) is a graph traversal algorithm that explores as far as possible along each branch before backtracking. It starts at the root node and explores as far as possible along each branch before backtracking. This algorithm can be implemented using recursion or a stack data structure.'

In [9]:
# Ask another question about the topic, again printing the metadata of the search hits
ask('Describe Fibonacci Numbers', print_message=True)

Found following documents:
Response from agent:


'Fibonacci numbers are a sequence of numbers in which each number is the sum of the two preceding ones, usually starting with 0 and 1. The sequence goes like this: 0, 1, 1, 2, 3, 5, 8, 13, 21, and so on. This pattern can be implemented in various programming languages using different approaches such as recursion, iteration, or dynamic programming. It is a common example used to demonstrate coding patterns like recursion and dynamic programming.'

In [10]:
# Ask a question outside the topic; the prompt tells the model to reply
# "Sorry, I could not find an answer." when the articles do not contain the answer
ask('Tell me about Bagel')

Response from agent:


'Sorry, I could not find an answer.'

In [11]:
# Ask a second question outside the topic; the same fallback sentence is the expected reply
ask('What is Machine Learning?')

Response from agent:


'Sorry, I could not find an answer.'

### The END