In [2]:
import os
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

In [1]:
import os
from getpass import getpass

# Keep the secret out of the saved notebook. A key that is already exported in
# the environment is reused untouched; only when none is set do we prompt for
# one (getpass does not echo the typed value). Assigning a literal here would
# both leak the key into version control and clobber a key set in the shell.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

### Load Data

In [3]:
# The path is relative, so the PDF is expected to sit in the directory the
# notebook kernel is running from.
pdf_path = "./stripe-2022-update.pdf"
loader = UnstructuredPDFLoader(pdf_path)

In [4]:
# Run the loader. The result is a sequence of documents: the next cell takes
# len() of it and reads `.page_content` from its first element.
data = loader.load()

In [5]:
# Sanity check on what was loaded: number of documents, then the character
# count of the first document's text.
doc_count_msg = f'There are {len(data)} docs in the file'
print(doc_count_msg)
char_count_msg = f'There are {len(data[0].page_content)} chars in the doc'
print(char_count_msg)

There are 1 docs in the file
There are 21882 chars in the doc


### Chunk Data

In [6]:
# Configure the splitter with a chunk size of 1000 and no overlap between
# consecutive chunks, then split the loaded documents into smaller ones.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [7]:
# Report how many chunks the splitter produced.
chunk_count_msg = f'There are now {len(texts)} docs'
print(chunk_count_msg)

There are now 30 docs


### Create Embeddings

In [8]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [9]:
# Instantiate the HuggingFace embedding wrapper with its library defaults
# (no model name or other options are passed here).
embeddings = HuggingFaceEmbeddings()

In [10]:
# Build the Chroma vector store from the chunks, embedding them with the
# model created above. No persistence options are given.
db = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
)

Using embedded DuckDB without persistence: data will be transient


In [11]:
# Expose the vector store through the retriever interface, using the default
# search settings (no arguments are passed).
retriever = db.as_retriever()

In [12]:
# Chat model used to answer questions, with a low sampling temperature.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.2,
)

# Question-answering chain that combines the chat model with the retriever,
# using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [13]:
# Ask the chain for a high-level summary. The call is left as the bare last
# expression so the notebook displays the returned answer string.
query = "What are 5 key takeaways?"
qa.run(query)

'1. Stripe is trying to play a role in the world by helping make it richer in every sense of the word with the help of the dedication of the doers. \n\n2. Stripe is optimistic about the future and believes that greater rates of change will allow returns to agility in every pursuit.\n\n3. There are more breakout companies headquartered outside a top 20 startup hub than there are in the Bay Area.\n\n4. Stripe\'s first operating principle is "Users First." \n\n5. Stripe listens to customer feedback and shipped 244 new user-facing features and 336 API updates last year.'

In [14]:
# Follow-up question on one specific topic. As in the previous cell, the bare
# last expression makes the notebook display the returned answer.
query2 = "What points were made about breakout companies?"
qa.run(query2)

'According to the context, the following points were made about breakout companies:\n\n- There are more breakout companies headquartered outside the top 20 startup hubs than in the Silicon Valley.\n- The technology community has been successful in disseminating the knowledge required to build successful startups.\n- Investment scenes outside of Silicon Valley are considerably more robust than they were just five years ago.\n- The trends in the dispersion of startups strike the authors as interesting, and they looked at their data on "breakout startups", which are new companies with unusually rapid revenue ramps.\n- In the three years before the pandemic, a high percentage of new breakouts were based in the city of San Francisco itself, but since then, only a low percentage have been.\n- Around 25% of the US-based breakout companies that the authors see are based in California, but three-quarters are not based in California.\n- California\'s share of breakout companies appears to have b