In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai

### Load your data

In [3]:
loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [4]:
data = loader.load()

In [8]:
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 1 document(s) in your data
There are 176584 characters in your document


### Chunk your data up into smaller documents

In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [10]:
print (f'Now you have {len(texts)} documents')

Now you have 240 documents


### Create embeddings of your documents to get ready for semantic search

In [12]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [68]:
OPENAI_API_KEY = ''
PINECONE_API_KEY = ''
PINECONE_API_ENV = 'eu-west1-gcp'
import openai
openai.api_base = 'https://openai.imchao.top/v1'

In [69]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [64]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "langchain-openai"

In [31]:

docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [47]:
query = "How to explain AI to a 5 years old?"
docs = docsearch.similarity_search(query, include_metadata=True)

### Query those docs to get your answer back

In [70]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [71]:
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [72]:
query = "什么是Execution Models?"
docs = docsearch.similarity_search(query, include_metadata=True)

In [73]:
chain.run(input_documents=docs, question=query)

' Execution models描述了如何操纵数据来执行分析功能。它们可以按照多个维度分类，包括批处理和流处理执行模型。批处理执行模型意味着数据以大块的形式分析，分析在运行和不运行之间有一个状态，并且在执行之间在内存中保持少量状态。流处理执行模型意味着在正常操作下'