In [1]:
%pip install openai python-dotenv --quiet

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from dotenv import load_dotenv
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.schema.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
import uuid

load_dotenv()

True

In [2]:
# Get your LLM and summarize chain going

llm = AzureChatOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
    azure_deployment="gpt-4",
    temperature=0,
)

# Loading a single website
loader = WebBaseLoader("http://www.paulgraham.com/superlinear.html")
docs = loader.load()

# Split your website into big chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7000, chunk_overlap=0)
chunks = text_splitter.split_documents(docs)

print (f"Your {len(docs)} documents have been split into {len(chunks)} chunks")

Your 1 documents have been split into 5 chunks


In [6]:
chain = load_summarize_chain(llm)

id_key = "doc_id" # This is the key that we will tell the retriever to connect the summaries and original docs on

summaries = [] # To hold our summaries

for chunk in chunks:
    # First let's get an ID that we'll assign to the chunk and summary. You don't need a UUID here, use whatever you want
    unique_id = str(uuid.uuid4())

    # Then let's get the summary
    chunk_summary = chain.invoke([chunk])
    chunk_summary_document = Document(page_content=chunk_summary["output_text"], metadata={id_key: unique_id}) # Give the ID to the summary
    summaries.append(chunk_summary_document)

    # Then finnally add that same id to your chunk
    chunk.metadata[id_key] = unique_id

print (f"You have {len(summaries)} summaries to go along with your {len(chunks)} chunks")

You have 5 summaries to go along with your 5 chunks


In [7]:
# Embed and store your docs/vectors
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
    openai_api_key=os.getenv("AZURE_OPENAI_EMBEDDING_API_KEY"),
    openai_api_version=os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION"),
    azure_deployment="text-embedding-3-small",
)

# The vectorstore to use to index the summary chunks
vectorstore = Chroma(collection_name="summaries", embedding_function=embeddings)

# The storage layer for the parent documents
docstore = InMemoryStore()

In [8]:
# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    id_key=id_key, # "Hey, what should we join on?"
)

# Add your summary docs (with their ids) to the vectorstore. I'm unsure why a list of a uuid is returned, might be a bug.
retriever.vectorstore.add_documents(summaries)

['77655c25-f65d-441a-bd84-827c342c8918',
 'ae4bbe5d-5b4e-4b0e-a3c0-fb58a35b7e60',
 'b7ce35a0-3aa7-4ba9-acfd-4eb4968831e2',
 'fedc8195-5beb-4a3d-9112-5b26a5ef071d',
 'b96a04cf-f19b-4271-a655-7a30b24827fb']

In [9]:
_similar_docs = retriever.vectorstore.similarity_search("What is is the influence of organizations on equity?")
_similar_docs[0]

Document(page_content='The text discusses the concept of gradual improvements in technique over time, rather than breakthroughs by a few individuals. It explains that a step function can represent the reward curve for effort, where the rewards increase significantly after a certain point, encouraging rational actors to strive for that level. Competition is seen as a motivating factor and an indicator of valuable problems, but it\'s not always reliable as popularity can be driven by factors like monopoly, regulation, or poor consumer choice. The author reflects on their own motivation to become an artist due to the perceived independence from organizations.\n\nThe text also touches on the idea that while everyone has the potential for superlinear returns through compounding learning, few people push themselves to the point where they see significant benefits. There\'s a discussion on the ambiguous meaning of "equity" and how it contrasts with a world where a few outliers succeed signifi

In [10]:
# This will give each of your splits the ID you made earlier
retriever.docstore.mset([(x.metadata[id_key], x) for x in chunks])
retrieved_docs = retriever.get_relevant_documents("What is is the influence of organizations on equity?")
print (retrieved_docs[0].page_content[:500])
print (retrieved_docs[0].metadata)

gradual improvements in technique, not the discoveries of a few
exceptionally learned people.[3]
It's not mathematically correct to describe a step function as
superlinear, but a step function starting from zero works like a
superlinear function when it describes the reward curve for effort
by a rational actor. If it starts at zero then the part before the
step is below any linearly increasing return, and the part after
the step must be above the necessary return at that point or no one
would bo
{'source': 'http://www.paulgraham.com/superlinear.html', 'title': 'Superlinear Returns', 'language': 'No language found.', 'doc_id': '63c5fca7-93fa-4718-96c3-51c0395359bc'}
