In [None]:
%pip install openai python-dotenv --quiet

In [1]:
import os
from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain.document_loaders import WebBaseLoader
from langchain.prompts import PromptTemplate

# Load variables from a local .env file into the process environment; the Azure OpenAI
# settings used by later cells are read with os.getenv.
load_dotenv()

True

In [2]:
# Chat model backed by an Azure OpenAI deployment; connection settings are taken from the environment.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4",
    temperature=0,
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    openai_api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    openai_api_version=os.environ.get("OPENAI_API_VERSION"),
)

# Fetch a single web page and load it as a list of documents
loader = WebBaseLoader("http://www.paulgraham.com/superlinear.html")
superlinear = loader.load()

# Report the size of the first document; the token count is a rough estimate at 4 characters per token
print(f"You have {len(superlinear)} document with length {len(superlinear[0].page_content)} characters or roughly {len(superlinear[0].page_content) / 4} tokens")


You have 1 document with length 24854 characters or roughly 6213.5 tokens


In [3]:
# Parent splitter: cuts the page into large chunks (1000 * 4 characters) with no overlap.
parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000 * 4,
    chunk_overlap=0,
)

# Child splitter: produces the small chunks (125 * 4 characters) created from each parent.
# chunk_overlap is not passed here, so the splitter's default overlap applies.
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=125 * 4,
)

In [5]:
# Embedding model served from a separate Azure OpenAI deployment (its own endpoint, key and API version)
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-3-small",
    azure_endpoint=os.environ.get("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
    openai_api_key=os.environ.get("AZURE_OPENAI_EMBEDDING_API_KEY"),
    openai_api_version=os.environ.get("AZURE_OPENAI_EMBEDDING_API_VERSION"),
)

# Vector store that indexes the small child chunks
vectorstore = Chroma(
    embedding_function=embeddings,
    collection_name="parent_document_splits",
)

# In-memory key/value store that holds the large parent chunks
docstore = InMemoryStore()

In [6]:
# Index one large document using the parent/child scheme:
#   1. the document is split into large "parent" chunks (parent_splitter)
#   2. each parent chunk is assigned an id
#   3. each parent is split again into small "child" chunks (child_splitter), and the id of the
#      parent a child was split from is assigned to that child
retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    docstore=docstore,
    vectorstore=vectorstore,
)

# Run the split-and-store pipeline on the loaded page
retriever.add_documents(superlinear)

In [7]:
# Parent chunks are the entries of the docstore's underlying mapping
num_parent_docs = len(retriever.docstore.store)
# Child chunks are the texts held by the vectorstore; set() collapses identical texts so only distinct ones are counted
num_child_docs = len(set(retriever.vectorstore.get()['documents']))
print(f"You have {num_parent_docs} parent docs and {num_child_docs} child docs")

You have 8 parent docs and 82 child docs


In [8]:
# Query the vectorstore directly: this searches the small child chunks only
child_docs = retriever.vectorstore.similarity_search(
    "what is some investing advice?"
)

print(f"{len(child_docs)} child docs were found")
# Show the best-matching child chunk as the cell output
child_docs[0]

4 child docs were found


Document(page_content="as true in investing, for example. It's only useful to believe that\na company will do well if most other investors don't; if everyone\nelse thinks the company will do well, then its stock price will\nalready reflect that, and there's no room to make money.What else can we learn from these fields? In all of them you have\nto put in the initial effort. Superlinear returns seem small at\nfirst. At this rate, you find yourself thinking, I'll never get", metadata={'doc_id': '4533d7c0-11c8-4107-aea0-cf62b0b34e1e', 'language': 'No language found.', 'source': 'http://www.paulgraham.com/superlinear.html', 'title': 'Superlinear Returns'})

In [9]:
# Follow the top child's doc_id back to its parent chunk and preview the first 500 characters
parent_id = child_docs[0].metadata['doc_id']
retriever.docstore.store.get(parent_id).page_content[:500]

"science. It has exponential growth, in the form of learning, combined\nwith thresholds at the extreme edge of performance — literally at\nthe limits of knowledge.The result has been a level of inequality in scientific discovery\nthat makes the wealth inequality of even the most stratified societies\nseem mild by comparison. Newton's discoveries were arguably greater\nthan all his contemporaries' combined.\n[11]This point may seem obvious, but it might be just as well to spell\nit out. Superlinear retur"

In [10]:
# Query through the retriever itself rather than the raw vectorstore, so the results are the
# documents the retriever resolves for the query (compare the count with the child search above).
# `invoke` is the Runnable entry point that replaces the deprecated `get_relevant_documents`.
retrieved_docs = retriever.invoke("what is some investing advice?")
print(f"{len(retrieved_docs)} retrieved docs were found")

2 retrieved docs were found
