In [1]:
import fitz  # pymupdf
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Load model + tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
# Function to embed text
def embed(texts):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        return F.normalize(embedding, p=2, dim=1)

In [None]:
def embed(texts, batch_size=32):
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings = F.normalize(embeddings, p=2, dim=1)

        all_embeddings.append(embeddings)

    return torch.cat(all_embeddings, dim=0)

In [None]:
from urllib import request

# Step 1: Download the book
url = "https://www.gutenberg.org/files/2554/2554-0.txt"
raw = request.urlopen(url).read().decode("utf8")

# Step 2: Strip the Project Gutenberg boilerplate
start = raw.find("PART I")
end = raw.rfind("End of the Project Gutenberg EBook")
text = raw[start:end]

# Step 3: Count paragraphs (split by double newlines)
paragraphs = [p for p in text.split("\n\n") if p.strip()]
print(f"Total paragraphs: {len(paragraphs)}")

Total paragraphs: 3953


In [16]:
paragraph_embeddings = embed(paragraphs)

In [None]:
paragraph_data = [
    {
        "paragraph": para,
        "embedding": embedding.tolist()  # Convert torch.Tensor to Python list
    }
    for para, embedding in zip(paragraphs, paragraph_embeddings)
]

In [None]:
query = "when is he finally caught"
query_embedding = embed([query])[0]  # Shape: (384,)

# Find the smallest distance between the query embed and the embeds in the list.


# Compute similarity scores between query and all paragraphs
scores = torch.matmul(paragraph_embeddings, query_embedding)  # Shape: (N,)


top_k = 5
top_indices = torch.topk(scores, k=top_k).indices  # Top-k most similar

results = [
    {
        "score": scores[i].item(),
        "paragraph": paragraphs[i],
        "index": i
    }
    for i in top_indices
]

for result in results:
    print(f"Score: {result['score']:.4f}")
    print(f"Paragraph: {result['paragraph']}\n")

Score: 0.8912
Paragraph: “And what if we do catch him?”

Score: 0.8773
Paragraph: “Well, they will catch him.”

Score: 0.8521
Paragraph: “Nothing easier. It is in just such stupid things clever people are most
easily caught. The more cunning a man is, the less he suspects that he
will be caught in a simple thing. The more cunning a man is, the simpler
the trap he must be caught in. Porfiry is not such a fool as you
think....”

Score: 0.8340
Paragraph: It was only in that that he recognised his criminality, only in the fact
that he had been unsuccessful and had confessed it.

Score: 0.8316
Paragraph: At first--long before indeed--he had been much occupied with one
question; why almost all crimes are so badly concealed and so easily
detected, and why almost all criminals leave such obvious traces? He
had come gradually to many different and curious conclusions, and in his
opinion the chief reason lay not so much in the material impossibility
of concealing the crime, as in the criminal hi