In [1]:
# pip install langchain --upgrade
# Version: 0.0.164

# !pip install pypdf

In [1]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load your data

In [2]:
loader = PyPDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

## Other options for loaders 
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [3]:
data = loader.load()

In [4]:
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[30].page_content)} characters in your document')

You have 126 document(s) in your data
There are 2812 characters in your document


### Chunk your data up into smaller documents

In [5]:
# Note: If you're using PyPDFLoader then we'll be splitting for the 2nd time.
# This is optional, test out on your own data.

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [6]:
print (f'Now you have {len(texts)} documents')

Now you have 258 documents


In [7]:
texts[0]

Document(page_content='DATA SCIENCEtoTHE\nFIEL D GUIDE\n    \n \nSECOND  \nEDITION\n© COPYRIGHT 2015 BOOZ ALLEN HAMILTON INC. ALL RIGHTS RESERVED.', metadata={'source': '/var/folders/1r/rhq_4g0968d5kvn584v1md400000gn/T/tmp1txgiesn', 'page': 1})

### Create embeddings of your documents to get ready for semantic search

In [8]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [18]:
# Check to see if there is an environment variable with you API keys, if not, use what you put below
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YourAPIKey')

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'YourAPIKey')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-central1-gcp') # You may need to switch with your env

In [19]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [20]:
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None)

In [21]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "langchaintest" # put in the name of your pinecone index here

In [22]:
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [23]:
query = "What are examples of good data science teams?"
docs = docsearch.similarity_search(query)

In [24]:
# Here's an example of the first document that was returned
print(docs[0].page_content[:450])

tested, updated and improved until better models are found. 
››Data Science is necessary for companies to stay with the 
pack and compete in the future.   
Organizations are constantly making decisions based on gut 
instinct, loudest voice and best argument – sometimes they are 
even informed by real information. The winners and the losers in 
the emerging data economy are going to be determined by their 
Data Science teams. 
››Data Science capab


### Query those docs to get your answer back

In [25]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [26]:
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [27]:
query = "What is the collect stage of data maturity?"
docs = docsearch.similarity_search(query)

In [28]:
chain.run(input_documents=docs, question=query)

' The collect stage of data maturity focuses on collecting internal or external datasets. An example of this is gathering sales records and corresponding weather data.'