In [5]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [6]:
from langchain.document_loaders import PyPDFLoader

# Load PDF
loaders = [
    PyPDFLoader("data/getting-real/getting-real-01-introduction.pdf"),
    PyPDFLoader("data/getting-real/getting-real-02-starting-line.pdf"),
    PyPDFLoader("data/getting-real/getting-real-03-stay-lean.pdf"),  # Duplicate documents on purpose
    PyPDFLoader("data/getting-real/getting-real-03-stay-lean.pdf"),  # Duplicate documents on purpose
    PyPDFLoader("data/getting-real/getting-real-04-priorities.pdf"),
    #PyPDFLoader("data/getting-real/getting-real-05-feature-selection.pdf"),
]

docs = []
for loader in loaders:
    docs.extend(loader.load())

# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 100
)

splits = text_splitter.split_documents(docs)

len(splits)

180

In [7]:
splits[1]

Document(page_content='that’s not essential (and most of what you think is essential actually isn’t).\nGetting Real is staying small and being agile.\nGetting Real starts with the interface, the real screens that people are going to use. It begins\nwith what the customer actually experiences and builds backwards from there. This lets you\nget the interface right before you get the software wrong.\nGetting Real is about iterations and lowering the cost of change. Getting Real is all about', metadata={'source': 'data/getting-real/getting-real-01-introduction.pdf', 'page': 0})

In [8]:
# embeddings
# https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
# By default, the length of the embedding vector will be 1536 for text-embedding-3-small or 
# 3072 for text-embedding-3-large. 
# text-embedding-3-small	$0.02 / 1M tokens
# text-embedding-3-large	$0.13 / 1M tokens

from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

In [9]:
from langchain.vectorstores import Chroma
persist_directory = './data/chroma/'

# database is created and data embedded and loaded
bookdb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [10]:
bookdb._collection.count()

259

In [11]:
bookdb.persist()

  warn_deprecated(


In [12]:
question = "what means staying lean?"
docs = bookdb.similarity_search(question, k=3)
docs  # docs[0] and docs[1] are the same document

[Document(page_content='10: Less Mass\nThe leaner you are, the easier it is to change\nThe more massive an object, the more energy is required to change its direction. It’s as true in\nthe business world as it is in the physical world.\nWhen it comes to web technology, change must be easy and cheap. If you can’t change on the\nfly, you’ll lose ground to someone who can. That’s why you need to shoot for less mass.\nMass is increased by…\nLong term contracts\nExcess staff\nPermanent decisions\nMeetings about other meetings', metadata={'page': 0, 'source': 'data/getting-real/getting-real-03-stay-lean.pdf'}),
 Document(page_content='10: Less Mass\nThe leaner you are, the easier it is to change\nThe more massive an object, the more energy is required to change its direction. It’s as true in\nthe business world as it is in the physical world.\nWhen it comes to web technology, change must be easy and cheap. If you can’t change on the\nfly, you’ll lose ground to someone who can. That’s why you

In [13]:
# Maximum marginal relevance - duplicate is gone
bookdb.max_marginal_relevance_search(question, k=3)

[Document(page_content='10: Less Mass\nThe leaner you are, the easier it is to change\nThe more massive an object, the more energy is required to change its direction. It’s as true in\nthe business world as it is in the physical world.\nWhen it comes to web technology, change must be easy and cheap. If you can’t change on the\nfly, you’ll lose ground to someone who can. That’s why you need to shoot for less mass.\nMass is increased by…\nLong term contracts\nExcess staff\nPermanent decisions\nMeetings about other meetings', metadata={'page': 0, 'source': 'data/getting-real/getting-real-03-stay-lean.pdf'}),
 Document(page_content='the big advantages of staying small.\nBe proudly, defiantly truthful\nThough you may think that a customer can be f ooled by exaggerations on the number of staffers\nin your company or the breadth of your offerings, the smart ones, the ones you really want, will\nalways learn the truth – whether through intuition or deduction. Embarrassingly, I’ve been a\npart 

In [14]:
# cosine distance, short the better
bookdb.similarity_search_with_score(question, k=3)

[(Document(page_content='10: Less Mass\nThe leaner you are, the easier it is to change\nThe more massive an object, the more energy is required to change its direction. It’s as true in\nthe business world as it is in the physical world.\nWhen it comes to web technology, change must be easy and cheap. If you can’t change on the\nfly, you’ll lose ground to someone who can. That’s why you need to shoot for less mass.\nMass is increased by…\nLong term contracts\nExcess staff\nPermanent decisions\nMeetings about other meetings', metadata={'page': 0, 'source': 'data/getting-real/getting-real-03-stay-lean.pdf'}),
  1.1880751848220825),
 (Document(page_content='10: Less Mass\nThe leaner you are, the easier it is to change\nThe more massive an object, the more energy is required to change its direction. It’s as true in\nthe business world as it is in the physical world.\nWhen it comes to web technology, change must be easy and cheap. If you can’t change on the\nfly, you’ll lose ground to someon

In [15]:
### Addressing Specificity: working with metadata

docs = bookdb.similarity_search(
    "what's about building software",
    k=3,
    #filter={"source":"data/getting-real/getting-real-02-starting-line.pdf"}
)

for d in docs:
    print(d.metadata)

{'page': 1, 'source': 'data/getting-real/getting-real-02-starting-line.pdf'}
{'page': 1, 'source': 'data/getting-real/getting-real-02-starting-line.pdf'}
{'page': 5, 'source': 'data/getting-real/getting-real-05-feature-selection.pdf'}


In [16]:
### Addressing Specificity: working with metadata

docs = bookdb.similarity_search(
    "what's about building software",
    k=3,
    filter={"source":"data/getting-real/getting-real-02-starting-line.pdf"}
)

for d in docs:
    print(d.metadata)

{'page': 1, 'source': 'data/getting-real/getting-real-02-starting-line.pdf'}
{'page': 1, 'source': 'data/getting-real/getting-real-02-starting-line.pdf'}
{'page': 1, 'source': 'data/getting-real/getting-real-02-starting-line.pdf'}


In [17]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [18]:
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

In [19]:
# pip install lark
document_content_description = "book chapters"

llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)

retriever = SelfQueryRetriever.from_llm(
    llm,
    bookdb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

  warn_deprecated(


In [20]:
question = "what's about building software"
docs = retriever.get_relevant_documents(question)
for d in docs:
    print(d.metadata)

  warn_deprecated(


{'page': 0, 'source': 'data/getting-real/getting-real-01-introduction.pdf'}
{'page': 1, 'source': 'data/getting-real/getting-real-02-starting-line.pdf'}
{'page': 1, 'source': 'data/getting-real/getting-real-01-introduction.pdf'}
{'page': 3, 'source': 'data/getting-real/getting-real-01-introduction.pdf'}


In [21]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [22]:
# Load PDF
loader = PyPDFLoader("data/getting-real/getting-real-02-starting-line.pdf")
pages = loader.load()
all_page_text=[p.page_content for p in pages]
joined_page_text=" ".join(all_page_text)

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)
splits = text_splitter.split_text(joined_page_text)

In [23]:
# Retrieve
from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

svm_retriever = SVMRetriever.from_texts(splits, embedding)
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [24]:
question = "What are topics for this book?"
docs_svm=svm_retriever.get_relevant_documents(question)
docs_svm[0]



Document(page_content='My idea was: Enjoy baking, sell your bread, people like it, sell more. Keep the bakery going\nbecause you’re making good food and people are happy.\n—Ian MacKaye, member of Fugazi and co-owner of Dischord Records \n(from Salon.com People | Ian MacKaye)')

In [25]:
question = "what did they say about starting line?"
docs_tfidf=tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

Document(page_content='hundreds of thousands of others are in the same boat. There’s your market. Wasn’t that easy?\nBasecamp originated in a problem: As a design firm we needed a simple way to communicate\nwith our clients about projects. We started out doing this via client extranets which we would\nupdate manually. But changing the html by hand every time a project needed to be updated just\nwasn’t working. These project sites always seemed to go stale and eventually were abandoned. It\nwas frustrating because it left us disorganized and left clients in the dark.\nSo we started looking at other options. Yet every tool we found either 1) didn’t do what we\nneeded or 2) was bloated with features we didn’t need — like billing, strict access controls,\ncharts, graphs, etc. We knew there had to be a better way so we decided to build our own.\nWhen you solve your own problem, you create a tool that you’re passionate about. And passion\nis key. Passion means you’ll truly use it and care ab