In [65]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss

In [66]:
import requests

# Plain-text edition of "The Adventures of Sherlock Holmes" on Project Gutenberg.
url = "https://www.gutenberg.org/files/1661/1661-0.txt"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being treated as the book.
book_response = requests.get(url, timeout=30)
book_response.raise_for_status()

# No explicit BOM ("\ufeff") stripping here: everything before the Gutenberg
# start marker is discarded when the text is trimmed in the chunking step.
text = book_response.text


**CHUNKING**

In [67]:
import warnings

start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES ***"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES ***"

# Keep only the book body: everything after the last start marker and before
# the first end marker. When a marker is absent the text passes through
# unchanged, which also makes re-running this cell harmless.
text = text.rpartition(start_marker)[2]
text = text.partition(end_marker)[0]

# The trimming above is an exact string match, so a marker with slightly
# different wording would be missed silently and the surrounding boilerplate
# would be chunked and indexed. Surface that case instead of hiding it.
leftover_markers = [m for m in ("*** START OF", "*** END OF") if m in text]
if leftover_markers:
    warnings.warn(
        "Gutenberg boilerplate marker(s) still present after trimming: "
        f"{leftover_markers}. The start/end marker strings may not match this "
        "download, so licence text could end up in the chunks."
    )

# Recursive splitter: tries the separators in order (paragraph, line, sentence,
# word, character) to produce chunks of at most chunk_size with chunk_overlap
# shared between neighbouring chunks.
# NOTE(review): if the download uses CRLF line endings, the "\n\n" separator
# would never match and splitting would start at "\n" — confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", " ", ""],
)

# Convert the long document into a list of chunk strings.
chunks = text_splitter.split_text(text)

**EMBEDDINGS**

In [68]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model; the same instance is reused later to embed the query.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every chunk in a single call, asking for a NumPy array back and a
# progress bar while the batches run.
encode_options = {"convert_to_numpy": True, "show_progress_bar": True}
chunk_embeddings = embedding_model.encode(chunks, **encode_options)

print("Embeddings shape:", chunk_embeddings.shape)

Batches: 100%|█████████████████████████████████████████████████████████████████████████| 47/47 [00:40<00:00,  1.16it/s]

Embeddings shape: (1488, 384)





**VECTOR DATABASE - FAISS**

In [97]:
import faiss
# L2-normalise the chunk embeddings in place (the call's return value is not
# used; chunk_embeddings itself is what gets added to the index afterwards).
# With unit-length vectors, the inner-product index built next reports cosine
# similarity.
faiss.normalize_L2(chunk_embeddings)


In [98]:
# Width of one embedding vector (384 in the recorded run).
dimension = chunk_embeddings.shape[1]

# Flat inner-product index over the chunk embeddings; vector i in the index
# corresponds to chunks[i] because the embeddings are added in chunk order.
index = faiss.IndexFlatIP(dimension)
index.add(chunk_embeddings)

print("Total vectors in FAISS index:", index.ntotal)


Total vectors in FAISS index: 1488


**Similarity search (retrieval): embed the question and search the index**

In [99]:
query = "How is Sherlock Holmes described?"

# Embed the question with the same model as the chunks. Passing a one-element
# list keeps the result two-dimensional (one row per query) for index.search.
query_embedding = embedding_model.encode(
    [query],
    convert_to_numpy=True
)

# Normalise in place so the inner product against the normalised chunk vectors
# is a cosine similarity.
faiss.normalize_L2(query_embedding)

k = 3
# Top-k inner-product search (not an argmax): for each query row it returns the
# k best scores and the positions of the matching vectors in the index.
scores, indices = index.search(query_embedding, k)

print("Cosine similarity scores:", scores)

for i, idx in enumerate(indices[0]):
    if idx < 0:
        # FAISS fills missing results with -1 when fewer than k hits exist;
        # chunks[-1] would silently print the last chunk instead.
        continue
    print(f"\n--- Retrieved Chunk {i+1} ---")
    print(chunks[idx][:300])

Cosine similarity scores: [[0.682717  0.6724535 0.6599988]]

--- Retrieved Chunk 1 ---
opposing windows loomed like dark, shapeless blurs through the heavy
yellow wreaths. Our gas was lit and shone on the white cloth and
glimmer of china and metal, for the table had not been cleared yet.
Sherlock Holmes had been silent all the morning, dipping continuously
into the advertisement c

--- Retrieved Chunk 2 ---
He had hardly spoken before there rushed into the room one of the most
lovely young women that I have ever seen in my life. Her violet eyes
shining, her lips parted, a pink flush upon her cheeks, all thought of
her natural reserve lost in her overpowering excitement and concern.

“Oh, Mr. Sherl

--- Retrieved Chunk 3 ---
I wished to be absolutely clear. We shall now have a little supper and
then retire, for we may have a very busy day to-morrow.”

A large and comfortable double-bedded room had been placed at our
disposal, and I was quickly between the sheets, for I was weary after
m

**Context Construction**

In [100]:
# Build the context from the retrieved chunks, best match first. FAISS pads
# missing results with -1, and chunks[-1] would silently pull in the last chunk
# of the book, so negative positions are dropped.
retrieved_chunks = [chunks[idx] for idx in indices[0] if idx >= 0]
context = "\n\n".join(retrieved_chunks)

# Grounded-answer prompt: the model is told to rely on the context only and to
# say so when the context does not contain the answer.
prompt = f"""
Answer the question ONLY using the given context.
If the answer is not clearly stated, say "Not explicitly mentioned in the context."

Context:
{context}

Question:
{query}

Answer:
"""

**GENERATION**

In [104]:
from transformers import pipeline

# Text-to-text generation pipeline backed by Flan-T5 base; generation is capped
# at 150 newly generated tokens.
generator = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=150)

# The pipeline returns a list with one result dict per input; keep the text of
# the first (and only) result.
generation_outputs = generator(prompt)
response = generation_outputs[0]["generated_text"]
print(response)

Device set to use cpu


A man, however, who, when he had an unsolved problem upon his mind, would go for days, and even for a week, without rest, turning it over, rearranging his facts, looking


In [106]:
# Hand-written reference ("gold") answer that the generated response is scored
# against in the ROUGE evaluation.
reference_answer = (
    "Sherlock Holmes is described as a man who becomes deeply absorbed in "
    "unsolved problems and spends long periods analyzing facts."
)

**EVALUATION METRICS - ROUGE**

In [102]:
from rouge_score import rouge_scorer

# ROUGE-1 and ROUGE-L, with stemming applied before tokens are compared.
scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

# Argument order is (target, prediction): the reference answer comes first.
scores = scorer.score(reference_answer, response)

# Each entry is a Score tuple carrying precision, recall and F-measure.
for label, metric in (("ROUGE-1:", "rouge1"), ("ROUGE-L:", "rougeL")):
    print(label, scores[metric])


ROUGE-1: Score(precision=0.03225806451612903, recall=0.09090909090909091, fmeasure=0.047619047619047616)
ROUGE-L: Score(precision=0.03225806451612903, recall=0.09090909090909091, fmeasure=0.047619047619047616)


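The ROUGE scores are low because ROUGE measures lexical overlap between stemmed tokens (shared unigrams for ROUGE-1, the longest common subsequence for ROUGE-L), while the generated answer is worded differently from the reference answer rather than matching it word for word. Note that the ROUGE cell above was last executed (In [102]) before the generation cell (In [104]) and the reference-answer cell (In [106]), so the scores shown may not correspond to the response and reference displayed; re-run the notebook from top to bottom (Restart & Run All) before relying on them.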