In [1]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


Load Env Variables and Secrets

In [2]:
import os
from dotenv import load_dotenv
load_dotenv('../../../azure.env')
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-06-01"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = 'gpt-4o-mini'
os.environ["AZURE_OPENAI_MODEL_VERSION"] = '2024-06-01'


Import packages

In [3]:
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_core.vectorstores import InMemoryVectorStore

Initialize the Model

In [4]:
model = AzureChatOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    model_version=os.environ['AZURE_OPENAI_MODEL_VERSION']
)

In [5]:
from langchain_openai import AzureOpenAIEmbeddings

embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    # dimensions: Optional[int] = None, # Can specify dimensions with new text-embedding-3 models
    # azure_endpoint="https://<your-endpoint>.openai.azure.com/", If not provided, will read env variable AZURE_OPENAI_ENDPOINT
    # api_key=... # Can provide an API key directly. If missing read env variable AZURE_OPENAI_API_KEY
    # openai_api_version=..., # If not provided, will read env variable AZURE_OPENAI_API_VERSION
)

In [6]:
# Define the directory containing the text file and the persistent directory
current_dir = os.path.dirname(os.path.commonpath('.'))
books_dir = os.path.join(current_dir, "books_small")
print(f"Books directory: {books_dir}")

Books directory: books_small


In [7]:
# Ensure the books directory exists
if not os.path.exists(books_dir):
    raise FileNotFoundError(
        f"The directory {books_dir} does not exist. Please check the path."
    )

# List all text files in the directory
book_files = [f for f in os.listdir(books_dir) if f.endswith(".txt")]

# Read the text content from each file and store it with metadata
documents = []
for book_file in book_files:
    file_path = os.path.join(books_dir, book_file)
    loader = TextLoader(file_path)
    book_docs = loader.load()
    for doc in book_docs:
        # Add metadata to each document indicating its source
        doc.metadata = {"source": book_file}
        documents.append(doc)

In [8]:
# Function to create and persist vector store
def create_vector_store(docs, store_name):
    # Create the vector store and persist it automatically
    print(f"\n--- Creating vector store {store_name} ---")
    vectorstore = InMemoryVectorStore.from_documents(
        documents=docs,
        embedding=embeddings,
    )
    print("\n--- Finished creating vector store ---")
    return vectorstore

In [9]:
# Function to query a vector store
def query_vector_store(vcs,store_name, query,search_kwargs,search_type):
    print(f"\n--- Querying the Vector Store {store_name} ---")
    # Use the vectorstore as a retriever
    retriever = vcs.as_retriever(
        search_type=search_type,
        search_kwargs=search_kwargs)

    # Retrieve the most similar text
    retrieved_documents = retriever.invoke(query)

    # Display the relevant results with metadata
    print("\n--- Relevant Documents ---")
    for i, doc in enumerate(retrieved_documents, 1):
        print(f"Document {i}:\n{doc.page_content}\n")
        if doc.metadata:
            print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")

In [12]:
# Define the user's question
query = "How did Juliet die?"
# 4. Recursive Character-based Splitting
# Attempts to split text at natural boundaries (sentences, paragraphs) within character limit.
# Balances between maintaining coherence and adhering to character limits.
print("\n--- Using Recursive Character-based Splitting ---")
rec_char_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100)
rec_char_docs = rec_char_splitter.split_documents(documents)
vcs = create_vector_store(rec_char_docs, "db_rec_char")


--- Using Recursive Character-based Splitting ---

--- Creating vector store db_rec_char ---

--- Finished creating vector store ---


In [14]:
# 1. Similarity Search
# This method retrieves documents based on vector similarity.
# It finds the most similar documents to the query vector based on cosine similarity.
# Use this when you want to retrieve the top k most similar documents.
print("\n--- Using Similarity Search ---")
query_vector_store(vcs,"chroma_db_with_metadata", query,
                    search_type="similarity", search_kwargs={"k": 3})


--- Using Similarity Search ---

--- Querying the Vector Store chroma_db_with_metadata ---

--- Relevant Documents ---
Document 1:
NURSE.
I saw the wound, I saw it with mine eyes,
God save the mark!—here on his manly breast.
A piteous corse, a bloody piteous corse;
Pale, pale as ashes, all bedaub’d in blood,
All in gore-blood. I swounded at the sight.

JULIET.
O, break, my heart. Poor bankrout, break at once.
To prison, eyes; ne’er look on liberty.
Vile earth to earth resign; end motion here,
And thou and Romeo press one heavy bier.

NURSE.
O Tybalt, Tybalt, the best friend I had.
O courteous Tybalt, honest gentleman!
That ever I should live to see thee dead.

JULIET.
What storm is this that blows so contrary?
Is Romeo slaughter’d and is Tybalt dead?
My dearest cousin, and my dearer lord?
Then dreadful trumpet sound the general doom,
For who is living, if those two are gone?

NURSE.
Tybalt is gone, and Romeo banished,
Romeo that kill’d him, he is banished.

JULIET.
O God! Did Romeo’s 

In [15]:
# 2. Max Marginal Relevance (MMR)
# This method balances between selecting documents that are relevant to the query and diverse among themselves.
# 'fetch_k' specifies the number of documents to initially fetch based on similarity.
# 'lambda_mult' controls the diversity of the results: 1 for minimum diversity, 0 for maximum.
# Use this when you want to avoid redundancy and retrieve diverse yet relevant documents.
# Note: Relevance measures how closely documents match the query.
# Note: Diversity ensures that the retrieved documents are not too similar to each other,
#       providing a broader range of information.
print("\n--- Using Max Marginal Relevance (MMR) ---")
query_vector_store(vcs,"chroma_db_with_metadata", query,
                    search_type="mmr", 
                    search_kwargs={"k": 3, "fetch_k": 20, "lambda_mult": 0.5})


--- Using Max Marginal Relevance (MMR) ---

--- Querying the Vector Store chroma_db_with_metadata ---

--- Relevant Documents ---
Document 1:
NURSE.
I saw the wound, I saw it with mine eyes,
God save the mark!—here on his manly breast.
A piteous corse, a bloody piteous corse;
Pale, pale as ashes, all bedaub’d in blood,
All in gore-blood. I swounded at the sight.

JULIET.
O, break, my heart. Poor bankrout, break at once.
To prison, eyes; ne’er look on liberty.
Vile earth to earth resign; end motion here,
And thou and Romeo press one heavy bier.

NURSE.
O Tybalt, Tybalt, the best friend I had.
O courteous Tybalt, honest gentleman!
That ever I should live to see thee dead.

JULIET.
What storm is this that blows so contrary?
Is Romeo slaughter’d and is Tybalt dead?
My dearest cousin, and my dearer lord?
Then dreadful trumpet sound the general doom,
For who is living, if those two are gone?

NURSE.
Tybalt is gone, and Romeo banished,
Romeo that kill’d him, he is banished.

JULIET.
O God! D