In [2]:
!pip install --upgrade pip
!pip install transformers sentence-transformers faiss-cpu accelerate bitsandbytes


Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinu

In [11]:
import json

with open("./index/shakespeare_chunked.json", "r", encoding="utf-8") as f:
    raw_chunks = json.load(f)

print(f"Total chunks: {len(raw_chunks)}")
print("Sample entry:", raw_chunks[0])

Total chunks: 5621
Sample entry: {'Play': 'Henry IV', 'PlayerLine': "ACT I\nSCENE I. London. The palace.\nEnter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others\n\nKING HENRY IV: So shaken as we are, so wan with care,\nFind we a time for frighted peace to pant,\nAnd breathe short-winded accents of new broils\nTo be commenced in strands afar remote.\nNo more the thirsty entrance of this soil\nShall daub her lips with her own children's blood,\nNor more shall trenching war channel her fields,\nNor bruise her flowerets with the armed hoofs\nOf hostile paces: those opposed eyes,\nWhich, like the meteors of a troubled heaven,\nAll of one nature, of one substance bred,\nDid lately meet in the intestine shock\nAnd furious close of civil butchery\nShall now, in mutual well-beseeming ranks,\nMarch all one way and be no more opposed\nAgainst acquaintance, kindred and allies:", 'Act': '1', 'Scene': '1', 'Speakers': ['KING HENRY IV'], 'firstLine': '1', 'la

In [48]:
def normalize(text: str) -> str:
    return "\n".join([line.strip() for line in text.strip().splitlines() if line.strip()])

documents = []
for idx, chunk in enumerate(raw_chunks):
    # doc_id = f"play: {chunk.get("Play", "")}, act: {chunk.get("Act", "")}, scene: {chunk.get("Scene", "")}, lines: {chunk.get("firstLine", "")} - {chunk.get("lastLine", "")}"
    
    # if not doc_id:
    doc_id = f"chunk_{idx}"
        
    documents.append({
        "id": doc_id,
        "text": normalize(chunk.get("PlayerLine", "")),
        "metadata": {
            "details": f"This play is: {chunk.get("Play", "")}, This section is Act {chunk.get("Act", "")}, Scene {chunk.get("Scene", "")}, from line {chunk.get("firstLine", "")} to line {chunk.get("lastLine", "")}",
            "play": chunk.get("Play", ""),
            "act": chunk.get("Act", ""),
            "scene": chunk.get("Scene", ""),
            "firstLine": chunk.get("firstLine", ""),
            "lastLine": chunk.get("lastLine", ""),
            "characters present": "The characters present on set are " + ", ".join(chunk.get("CharactersPresent", [])),
            "speakers": "The characters speaking in these lines are " + ", ".join(chunk.get("Speakers", []))
        }
    })

print(f"Built {len(documents)} documents.")
print("Example:", json.dumps(documents[0], indent=4, ensure_ascii=False))


Built 5621 documents.
Example: {
    "id": "chunk_0",
    "text": "ACT I\nSCENE I. London. The palace.\nEnter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others\nKING HENRY IV: So shaken as we are, so wan with care,\nFind we a time for frighted peace to pant,\nAnd breathe short-winded accents of new broils\nTo be commenced in strands afar remote.\nNo more the thirsty entrance of this soil\nShall daub her lips with her own children's blood,\nNor more shall trenching war channel her fields,\nNor bruise her flowerets with the armed hoofs\nOf hostile paces: those opposed eyes,\nWhich, like the meteors of a troubled heaven,\nAll of one nature, of one substance bred,\nDid lately meet in the intestine shock\nAnd furious close of civil butchery\nShall now, in mutual well-beseeming ranks,\nMarch all one way and be no more opposed\nAgainst acquaintance, kindred and allies:",
    "metadata": {
        "details": "This play is: Henry IV, This section is Act 

In [49]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json

embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

texts = [doc["text"] for doc in documents]
embeddings = embedder.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True
)

print("Embeddings shape:", embeddings.shape)

# Save to disk (these will be gone when session is reseted)
np.save("./index/shakespeare_chunked_embeddings.npy", embeddings)
with open("./index/shakespeare_chunked_emb.json", "w", encoding="utf-8") as f:
    json.dump(documents, f, ensure_ascii=False, indent=2)


Batches: 100%|██████████| 176/176 [00:24<00:00,  7.07it/s]


Embeddings shape: (5621, 768)


In [50]:
import faiss
import numpy as np

embeddings = np.load("./index/shakespeare_chunked_embeddings.npy")
D = embeddings.shape[1]
index = faiss.IndexFlatL2(D)
index.add(embeddings.astype("float32"))

print("Indexed vectors count:", index.ntotal)

faiss.write_index(index, "./index/shakespeare_chunked_index.bin")
print("FAISS index saved.")


Indexed vectors count: 5621
FAISS index saved.


In [42]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# model_id = "Qwen/Qwen1.5-7B-Instruct"
model_id = "Qwen/Qwen2.5-7B-Instruct"
# Configure 4-bit quantization via bitsandbytes for free gpu size
# todo: i think i can increase it a bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True

    # load_in_8bit=True,
    # bnb_8bit_compute_dtype=torch.float16,
    # bnb_8bit_quant_type="nf8",
    # bnb_8bit_use_double_quant=True

    # load_in_16bit=True,
    # bnb_16bit_compute_dtype=torch.float32,
    # bnb_16bit_quant_type="nf16",
    # bnb_16bit_use_double_quant=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

print("Model loaded with 4-bit quantization. Device(s):", model.device)


Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.34s/it]


Model loaded with 4-bit quantization. Device(s): cuda:0


In [51]:
import faiss
import json
import numpy as np

# same here, file will be removed when session resets
index = faiss.read_index("./index/shakespeare_chunked_index.bin")
with open("./index/shakespeare_chunked_emb.json", "r", encoding="utf-8") as f:
    documents = json.load(f)
embeddings = np.load("./index/shakespeare_chunked_embeddings.npy")

print("Index, metadata, and embeddings reloaded. Total vectors:", index.ntotal)


Index, metadata, and embeddings reloaded. Total vectors: 5621


In [None]:
import torch

def embed_query(query: str):
    vec = embedder.encode([query], convert_to_numpy=True)
    return vec.astype("float32")

# def retrieve_top_k(query: str, k: int = 5):
#     q_vec = embed_query(query)
#     distances, indices = index.search(q_vec, k)
#     print(distances, indices)
#     results = []
#     for dist, idx in zip(distances[0], indices[0]):
#         doc = documents[idx]
#         results.append({
#             "id": doc["id"],
#             "text": doc["text"],
#             "metadata": doc["metadata"],
#             "score": float(dist)
#         })
#     return results

def retrieve_top_k(query: str, k: int = 5, act: str = None, scene: str = None, play: str = None,  documents: list = []):
    q_vec = embed_query(query)  # shape (1, D)

    # Filter documents by Act and Scene if specified
    if act or scene:
        filtered_docs = []
        filtered_embeddings = []
        for i, doc in enumerate(documents):
            metadata = doc.get("metadata", {})
            # if play and metadata.get("play") != str(play):
            #     continue
            if act and metadata.get("act") != str(act):
                continue
            if scene and metadata.get("scene") != str(scene):
                continue
            filtered_docs.append((i, doc))
            filtered_embeddings.append(embeddings[i])
        
        if not filtered_docs:
            return [{"error": "No documents match the Act and Scene filter"}]
        
        # Convert to np array for FAISS-like search
        filtered_embeddings_np = np.vstack(filtered_embeddings)
        
        # Search using cosine similarity
        from sklearn.metrics.pairwise import cosine_similarity
        sims = cosine_similarity(q_vec, filtered_embeddings_np)[0]
        top_k_indices = np.argsort(sims)[-k:][::-1]

        results = []
        for i in top_k_indices:
            idx, doc = filtered_docs[i]
            results.append({
                # "id": doc.get("id", idx),
                "text": doc["text"],
                "metadata": doc["metadata"],
                "score": float(sims[i])
            })
        return results

    # Fallback: No filtering, use entire index
    distances, indices = index.search(q_vec, k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        doc = documents[idx]
        results.append({
            "id": doc.get("id", idx),
            "text": doc["text"],
            "metadata": doc["metadata"],
            "score": float(dist)
        })
    return results

def build_prompt(retrieved_chunks, user_question, style="shake"):
    if style == "shake":
        tone_instruction = "Respond in Shakespearean English, quoting from these passages."
    else:
        tone_instruction = "Respond clearly as a Shakespeare expert, quoting from these passages."

    header = f"You are a Shakespeare expert. {tone_instruction}\n\n"
    passages = ""
    for i, chunk in enumerate(retrieved_chunks, start=1):
        passages += f"[Passage {i}]\n{chunk['text']}\n\n"
    question_block = f"### Question:\n{user_question}\n\n### Answer:\n"
    return header + passages + question_block

def generate_answer(prompt: str, max_new_tokens: int = 300, temperature: float = 0.8):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    generated = output_ids[0][ inputs["input_ids"].shape[1]: ]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


In [None]:

def user_query(user_question: str, documents: list=[]):
  top_chunks = retrieve_top_k(user_question, k=5, act=3, scene=3, documents=documents)
  print("Retrieved passages (top 5):")
  for i, c in enumerate(top_chunks, start=1):
    print(i)
    print(c)
      # snippet = c["text"].replace("\n", " ")[:200]
      # print(f" {i}. [score {c['score']:.2f}] {snippet}...")

  prompt = build_prompt(top_chunks, user_question, style="no-shake")

  print("\nGenerating answer...\n")
  answer = generate_answer(prompt, max_new_tokens=200, temperature=0.8)
  print("=== ShakespeareBot Answer ===")
  print(answer)
  print("#"*50)

# user_query("Why does Hamlet kill Polonius?")
with open('./index/shakespeare_chunked_emb.json', 'r') as f:
  document = json.load(f)
    
print(document[0]['metadata'])
user_query("Why does Hamlet not kill Claudius. Please use only sourced form Play Hamlet, Act 3, Scene 3?", documents=document)

# user_query("How and why does Hamlet get his friends Rosencrantz and Guildenstern killed?", documents=document)
# user_query("How did Ophelia die?", documents=document)
user_query("Why was Hamlet angry with his mother Gertrude?" , documents=document)

{'details': 'This play is: Henry IV, This section is Act 1, Scene 1, from line 1 to line 16', 'play': 'Henry IV', 'act': '1', 'scene': '1', 'firstLine': '1', 'lastLine': '16', 'characters present': 'The characters present on set are LORD JOHN OF LANCASTER, EARL WESTMORELAND, SIR WALTER BLUNT, KING HENRY', 'speakers': 'The characters speaking in these lines are KING HENRY IV'}
Retrieved passages (top 5):
1
{'text': "HAMLET: SCENE III. A room in the castle.\nEnter KING CLAUDIUS, ROSENCRANTZ, and GUILDENSTERN\nKING CLAUDIUS: I like him not, nor stands it safe with us\nTo let his madness range. Therefore prepare you,\nI your commission will forthwith dispatch,\nAnd he to England shall along with you:\nThe terms of our estate may not endure\nHazard so dangerous as doth hourly grow\nOut of his lunacies.\nGUILDENSTERN: We will ourselves provide:\nMost holy and religious fear it is\nTo keep those many many bodies safe\nThat live and feed upon your majesty.\nROSENCRANTZ: The single and peculiar