# Embedding

This notebook demonstrates how to create vector embeddings for a collection of source documents and how to search them with an embedding-based retriever.



In [None]:
import dspy
from sentence_transformers import SentenceTransformer

# Local, CPU-only sentence-transformers model used for retrieval embeddings
model_name = "sentence-transformers/static-retrieval-mrl-en-v1"
model = SentenceTransformer(model_name, device="cpu")

# Hand the model's encode function to DSPy so it can be called as an embedder
embedder = dspy.Embedder(model.encode)

# Sanity check: embed two sample words, one item per batch
sample_words = ["hello", "world"]
embeddings = embedder(sample_words, batch_size=1)

# Expect one 1024-dimensional vector per input word
assert embeddings.shape == (2, 1024)

In [None]:
# Read every .html file directly inside a directory (no recursion) and extract its plain text
import os
from bs4 import BeautifulSoup
def read_html_files(directory):
    """Return the plain text of every ``.html`` file directly inside *directory*.

    Only the top level of *directory* is scanned (no recursion), and files are
    selected by a case-sensitive ``.html`` suffix. Each file is parsed with
    BeautifulSoup's ``html.parser`` and reduced to text via ``get_text()``.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A list with one text string per HTML file, ordered by filename.
        The list is empty when the directory contains no ``.html`` files.

    Raises:
        OSError: If *directory* cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    # os.listdir yields entries in arbitrary order; sorting keeps document
    # indices reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".html"):
            continue
        path = os.path.join(directory, filename)
        with open(path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            texts.append(soup.get_text())
    return texts


In [None]:
# Load the plain text of every HTML page in the Zelda source directory.
zelda = read_html_files("../PragmatiCQA-sources/The Legend of Zelda")

# Use an f-string so the count and the message are separated by exactly one space.
print(f"{len(zelda)} documents loaded from The Legend of Zelda")

In [None]:
# Preview the first document, capped at the first 10000 characters
preview_limit = 10000
first_document = zelda[0]
print(first_document[:preview_limit])

In [None]:
max_characters = 10000  # for truncating >99th percentile of documents
topk_docs_to_retrieve = 5  # number of documents to retrieve per search query

# Cap every document at max_characters so that a few very long pages do not
# dominate the index or the passages handed back by the retriever.
corpus = [
    document[:max_characters]
    for document in read_html_files("../PragmatiCQA-sources/The Legend of Zelda")
]
print(f"Loaded {len(corpus)} documents. Will encode them below.")

# To use a hosted embedding model instead of the local one, swap in e.g.:
# embedder = dspy.Embedder('openai/text-embedding-3-small', dimensions=512)

# Build the retriever from the embedder and the truncated corpus; it is
# configured with k=topk_docs_to_retrieve results per query.
search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve)


In [None]:
# Ask the retriever for the documents most relevant to a sample question.
# With k set to 5 when the retriever was built, this returns the top 5 documents
# related to the query about the main quest in The Legend of Zelda.
# Change the question to test different aspects of the corpus.
question = "What is the main quest in The Legend of Zelda?"
search(question)
