In [1]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


Load Env Variables and Secrets

In [2]:
import os
from dotenv import load_dotenv
load_dotenv('../../../azure.env')
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-06-01"
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = 'gpt-4o-mini'
os.environ["AZURE_OPENAI_MODEL_VERSION"] = '2024-06-01'


Import packages

In [3]:
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TextSplitter,
    TokenTextSplitter,
)
from langchain_community.document_loaders import TextLoader
from langchain_core.vectorstores import InMemoryVectorStore

Initialize the Model

In [4]:
model = AzureChatOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    model_version=os.environ['AZURE_OPENAI_MODEL_VERSION']
)

In [5]:
from langchain_openai import AzureOpenAIEmbeddings

embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    # dimensions: Optional[int] = None, # Can specify dimensions with new text-embedding-3 models
    # azure_endpoint="https://<your-endpoint>.openai.azure.com/", If not provided, will read env variable AZURE_OPENAI_ENDPOINT
    # api_key=... # Can provide an API key directly. If missing read env variable AZURE_OPENAI_API_KEY
    # openai_api_version=..., # If not provided, will read env variable AZURE_OPENAI_API_VERSION
)

In [6]:
# Define the directory containing the text file and the persistent directory
current_dir = os.path.dirname(os.path.commonpath('.'))
books_dir = os.path.join(current_dir, "books_small")
print(f"Books directory: {books_dir}")

Books directory: books_small


In [7]:
# Ensure the books directory exists
if not os.path.exists(books_dir):
    raise FileNotFoundError(
        f"The directory {books_dir} does not exist. Please check the path."
    )

# List all text files in the directory
book_files = [f for f in os.listdir(books_dir) if f.endswith(".txt")]

# Read the text content from each file and store it with metadata
documents = []
for book_file in book_files:
    file_path = os.path.join(books_dir, book_file)
    loader = TextLoader(file_path)
    book_docs = loader.load()
    for doc in book_docs:
        # Add metadata to each document indicating its source
        doc.metadata = {"source": book_file}
        documents.append(doc)

In [8]:
# Function to create and persist vector store
def create_vector_store(docs, store_name):
    # Create the vector store and persist it automatically
    print(f"\n--- Creating vector store {store_name} ---")
    vectorstore = InMemoryVectorStore.from_documents(
        documents=docs,
        embedding=embeddings,
    )
    print("\n--- Finished creating vector store ---")
    return vectorstore

In [9]:
# Function to query a vector store
def query_vector_store(vcs,store_name, query):
    print(f"\n--- Querying the Vector Store {store_name} ---")
    # Use the vectorstore as a retriever
    retriever = vcs.as_retriever(
        search_kwargs={'k': 1,'score_threshold': 0.8})

    # Retrieve the most similar text
    retrieved_documents = retriever.invoke(query)

    # Display the relevant results with metadata
    print("\n--- Relevant Documents ---")
    for i, doc in enumerate(retrieved_documents, 1):
        print(f"Document {i}:\n{doc.page_content}\n")
        if doc.metadata:
            print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")

In [10]:
# Define the user's question
query = "How did Juliet die?"

In [11]:
# 1. Character-based Splitting
# Splits text into chunks based on a specified number of characters.
# Useful for consistent chunk sizes regardless of content structure.
print("\n--- Using Character-based Splitting ---")
char_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
char_docs = char_splitter.split_documents(documents)
vcs = create_vector_store(char_docs, "db_char")
query_vector_store(vcs,"db_char", query)

Created a chunk of size 1141, which is longer than the specified 1000
Created a chunk of size 2086, which is longer than the specified 1000
Created a chunk of size 1121, which is longer than the specified 1000
Created a chunk of size 1366, which is longer than the specified 1000
Created a chunk of size 1011, which is longer than the specified 1000
Created a chunk of size 1639, which is longer than the specified 1000
Created a chunk of size 1219, which is longer than the specified 1000
Created a chunk of size 1875, which is longer than the specified 1000
Created a chunk of size 1307, which is longer than the specified 1000
Created a chunk of size 2271, which is longer than the specified 1000
Created a chunk of size 1430, which is longer than the specified 1000
Created a chunk of size 1763, which is longer than the specified 1000
Created a chunk of size 1575, which is longer than the specified 1000
Created a chunk of size 1028, which is longer than the specified 1000
Created a chunk of s


--- Using Character-based Splitting ---

--- Creating vector store db_char ---

--- Finished creating vector store ---

--- Querying the Vector Store db_char ---

--- Relevant Documents ---
Document 1:
JULIET.
Shall I speak ill of him that is my husband?
Ah, poor my lord, what tongue shall smooth thy name,
When I thy three-hours’ wife have mangled it?
But wherefore, villain, didst thou kill my cousin?
That villain cousin would have kill’d my husband.
Back, foolish tears, back to your native spring,
Your tributary drops belong to woe,
Which you mistaking offer up to joy.
My husband lives, that Tybalt would have slain,
And Tybalt’s dead, that would have slain my husband.
All this is comfort; wherefore weep I then?
Some word there was, worser than Tybalt’s death,
That murder’d me. I would forget it fain,
But O, it presses to my memory
Like damned guilty deeds to sinners’ minds.
Tybalt is dead, and Romeo banished.
That ‘banished,’ that one word ‘banished,’
Hath slain ten thousand Tybalts.

In [12]:
# 2. Sentence-based Splitting
# Splits text into chunks based on sentences, ensuring chunks end at sentence boundaries.
# Ideal for maintaining semantic coherence within chunks.
print("\n--- Using Sentence-based Splitting ---")
sent_splitter = SentenceTransformersTokenTextSplitter(chunk_size=1000)
sent_docs = sent_splitter.split_documents(documents)
vcs = create_vector_store(sent_docs, "db_sent")
query_vector_store(vcs,"db_sent", query)


--- Using Sentence-based Splitting ---

--- Creating vector store db_sent ---

--- Finished creating vector store ---

--- Querying the Vector Store db_sent ---

--- Relevant Documents ---
Document 1:
am. where is my romeo? [ _ noise within. _ ] friar lawrence. i hear some noise. lady, come from that nest of death, contagion, and unnatural sleep. a greater power than we can contradict hath thwarted our intents. come, come away. thy husband in thy bosom there lies dead ; and paris too. come, i ’ ll dispose of thee among a sisterhood of holy nuns. stay not to question, for the watch is coming. come, go, good juliet. i dare no longer stay. juliet. go, get thee hence, for i will not away. [ _ exit friar lawrence. _ ] what ’ s here? a cup clos ’ d in my true love ’ s hand? poison, i see, hath been his timeless end. o churl. drink all, and left no friendly drop to help me after? i will kiss thy lips. haply some poison yet doth hang on them, to make me die with a restorative. [ _ kisses him.

  from tqdm.autonotebook import tqdm, trange


In [13]:
# 3. Token-based Splitting
# Splits text into chunks based on tokens (words or subwords), using tokenizers like GPT-2.
# Useful for transformer models with strict token limits.
print("\n--- Using Token-based Splitting ---")
token_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=512)
token_docs = token_splitter.split_documents(documents)
vcs = create_vector_store(token_docs, "db_token")
query_vector_store(vcs,"db_token", query)


--- Using Token-based Splitting ---

--- Creating vector store db_token ---

--- Finished creating vector store ---

--- Querying the Vector Store db_token ---

--- Relevant Documents ---
Document 1:
The cords that Romeo bid thee fetch?

NURSE.
Ay, ay, the cords.

 [_Throws them down._]

JULIET.
Ay me, what news? Why dost thou wring thy hands?

NURSE.
Ah, well-a-day, he’s dead, he’s dead, he’s dead!
We are undone, lady, we are undone.
Alack the day, he’s gone, he’s kill’d, he’s dead.

JULIET.
Can heaven be so envious?

NURSE.
Romeo can,
Though heaven cannot. O Romeo, Romeo.
Who ever would have thought it? Romeo!

JULIET.
What devil art thou, that dost torment me thus?
This torture should be roar’d in dismal hell.
Hath Romeo slain himself? Say thou but Ay,
And that bare vowel I shall poison more
Than the death-darting eye of cockatrice.
I am not I if there be such an I;
Or those eyes shut that make thee answer Ay.
If he be slain, say Ay; or if not, No.
Brief sounds determine of my weal

In [14]:
# 4. Recursive Character-based Splitting
# Attempts to split text at natural boundaries (sentences, paragraphs) within character limit.
# Balances between maintaining coherence and adhering to character limits.
print("\n--- Using Recursive Character-based Splitting ---")
rec_char_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100)
rec_char_docs = rec_char_splitter.split_documents(documents)
vcs = create_vector_store(rec_char_docs, "db_rec_char")
query_vector_store(vcs,"db_rec_char", query)


--- Using Recursive Character-based Splitting ---

--- Creating vector store db_rec_char ---

--- Finished creating vector store ---

--- Querying the Vector Store db_rec_char ---

--- Relevant Documents ---
Document 1:
NURSE.
I saw the wound, I saw it with mine eyes,
God save the mark!—here on his manly breast.
A piteous corse, a bloody piteous corse;
Pale, pale as ashes, all bedaub’d in blood,
All in gore-blood. I swounded at the sight.

JULIET.
O, break, my heart. Poor bankrout, break at once.
To prison, eyes; ne’er look on liberty.
Vile earth to earth resign; end motion here,
And thou and Romeo press one heavy bier.

NURSE.
O Tybalt, Tybalt, the best friend I had.
O courteous Tybalt, honest gentleman!
That ever I should live to see thee dead.

JULIET.
What storm is this that blows so contrary?
Is Romeo slaughter’d and is Tybalt dead?
My dearest cousin, and my dearer lord?
Then dreadful trumpet sound the general doom,
For who is living, if those two are gone?

NURSE.
Tybalt is gone

In [15]:
# 5. Custom Splitting
# Allows creating custom splitting logic based on specific requirements.
# Useful for documents with unique structure that standard splitters can't handle.
print("\n--- Using Custom Splitting ---")


class CustomTextSplitter(TextSplitter):
    def split_text(self, text):
        # Custom logic for splitting text
        return text.split("\n\n")  # Example: split by paragraphs


custom_splitter = CustomTextSplitter()
custom_docs = custom_splitter.split_documents(documents)
vcs = create_vector_store(custom_docs, "db_custom")
query_vector_store(vcs,"db_custom", query)


--- Using Custom Splitting ---

--- Creating vector store db_custom ---
