In [1]:
# pip install langchain --upgrade
# Tested with langchain version 0.0.164

# !pip install pypdf

In [1]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load your data

In [7]:
# Point PyPDFLoader at the sample PDF (given here as a URL).
loader = PyPDFLoader(
    "https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf"
)

## Alternative loaders (uncomment one of these to use it instead)
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [8]:
# Run the loader and keep the resulting documents in `data`
# (126 documents for the sample PDF, per the output of the next cell).
data = loader.load()

In [9]:
# Note: If you're using PyPDFLoader then it will split by page for you already,
# so `data` holds one document per page.
print(f'You have {len(data)} document(s) in your data')

# Report the size of a single sample document. Index 30 is an arbitrary pick;
# it is clamped to the last document so that PDFs with fewer than 31 pages do
# not raise IndexError, and skipped entirely when nothing was loaded.
if data:
    sample_doc = data[min(30, len(data) - 1)]
    print(f'There are {len(sample_doc.page_content)} characters in your document')

You have 126 document(s) in your data
There are 2812 characters in your document


### Chunk your data up into smaller documents

In [10]:
# PyPDFLoader has already produced one document per page, so this is a second,
# optional split. Experiment with it on your own data.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [11]:
# Report how many documents exist after the second split.
print (f'Now you have {len(texts)} documents')

Now you have 258 documents


In [12]:
# Display the first split document (content and metadata) as a sanity check.
texts[0]

Document(page_content='DATA SCIENCEtoTHE\nFIEL D GUIDE\n    \n \nSECOND  \nEDITION\n© COPYRIGHT 2015 BOOZ ALLEN HAMILTON INC. ALL RIGHTS RESERVED.', metadata={'source': '/var/folders/1r/rhq_4g0968d5kvn584v1md400000gn/T/tmpztu3ulsx', 'page': 1})

### Create embeddings of your documents to get ready for semantic search

In [25]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [26]:
# Read the API credentials from environment variables; when a variable is not
# set, the placeholder value given as the second argument is used instead.
# OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', 'YourAPIKey')

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY', 'YourAPIKey')
# The default environment below may not match your Pinecone project; change it if needed.
PINECONE_API_ENV = os.getenv('PINECONE_API_ENV', 'us-central1-gcp')

In [27]:
# Embedding model that is handed to the Pinecone vector store further down.
# NOTE(review): no API key is passed here, so it presumably comes from the
# OPENAI_API_KEY environment variable — confirm before running.
embeddings = OpenAIEmbeddings()

In [34]:
# Initialize the Pinecone client with the credentials read earlier.
# The API key is found at app.pinecone.io; the environment is listed next to
# the API key in the console.
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

In [35]:
# Pinecone index that the embedded chunks are written to.
# NOTE(review): presumably this index must already exist in your Pinecone project — confirm.
index_name = "langchaintest" # put in the name of your pinecone index here

In [36]:
# Build the Pinecone-backed vector store from the text of every chunk.
# Only page_content is passed, so the chunks' metadata (source, page) is not
# sent along with the texts.
docsearch = Pinecone.from_texts(
    [chunk.page_content for chunk in texts],
    embeddings,
    index_name=index_name,
)

In [39]:
# Ask a natural-language question and run a similarity search for it
# against the vector store.
query = "What are examples of good data science teams?"
docs = docsearch.similarity_search(query)

In [40]:
# Display the documents returned by the search.
docs

[Document(page_content='tested, updated and improved until better models are found. \n››Data Science is necessary for companies to stay with the \npack and compete in the future.   \nOrganizations are constantly making decisions based on gut \ninstinct, loudest voice and best argument – sometimes they are \neven informed by real information. The winners and the losers in \nthe emerging data economy are going to be determined by their \nData Science teams. \n››Data Science capabilities can be built over time.  \nOrganizations mature through a series of stages – Collect, \nDescribe, Discover, Predict, Advise – as they move from data \ndeluge to full Data Science maturity. At each stage, they can \ntackle increasingly complex analytic goals with a wider breadth \nof analytic capabilities. However, organizations need not reach \nmaximum Data Science maturity to achieve success. Significant \ngains can be found in every stage.\n››Data Science is a different kind of team sport.', metadata={}

In [41]:
# Preview the first returned document, truncated to its first 450 characters.
print(docs[0].page_content[:450])

tested, updated and improved until better models are found. 
››Data Science is necessary for companies to stay with the 
pack and compete in the future.   
Organizations are constantly making decisions based on gut 
instinct, loudest voice and best argument – sometimes they are 
even informed by real information. The winners and the losers in 
the emerging data economy are going to be determined by their 
Data Science teams. 
››Data Science capab


### Query those docs to get your answer back

In [42]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [43]:
# Create the OpenAI LLM with temperature 0, then build a question-answering
# chain on top of it using the "stuff" chain type.
llm = OpenAI(temperature=0)
chain = load_qa_chain(
    llm,
    chain_type="stuff",
)

In [44]:
# Same question and similarity search as in the embeddings section; this
# rebinds `query` and `docs`, which the QA chain call below consumes.
query = "What are examples of good data science teams?"
docs = docsearch.similarity_search(query)

In [45]:
# Run the QA chain with the retrieved documents as input and the query as the
# question; the returned answer string is displayed as the cell output.
chain.run(input_documents=docs, question=query)

' Good data science teams are multidisciplinary, with computer scientists, mathematicians, and domain experts working together. They should also have a broad view of the organization, with leaders who are key advocates and meet with stakeholders to identify challenges and gain buy-in.'