In [28]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader

In [None]:
# Load the textbook PDF from the local filesystem with langchain's UnstructuredPDFLoader
# (OnlinePDFLoader could be used instead to load from a URL).
# The location can be overridden through the ALGORITHMS_PDF_PATH environment variable so
# the notebook is not tied to a single machine; the fallback is the original local path.
import os

PDF_PATH = os.environ.get(
    "ALGORITHMS_PDF_PATH",
    "/Users/jj/Documents/dev/vector-search/vector-search-algorithms-book/data/algorithms_text_book.pdf",
)
loader = UnstructuredPDFLoader(PDF_PATH)
data = loader.load()
# Cell output: number of characters in the first loaded document
len(data[0].page_content)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters: chunk size of 1000 with an overlap of 200,
# both measured in characters (length_function is len).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

# Split the loaded document(s) into chunks and print the second chunk as a spot check
texts = text_splitter.split_documents(data)
print(texts[1])

In [None]:
import os
import getpass

# Ask the user for credentials via getpass prompts.
# The two Pinecone values stay in notebook variables; only the OpenAI key
# is written into the process environment.
PINECONE_API_KEY = getpass.getpass("Pinecone API Key:")
PINECONE_ENVIRONMENT = getpass.getpass("Pinecone API Environment:")
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [16]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone

# Name of the Pinecone index used by the cells below
index_name = "pdf-algorithms-text-book"

# Initialise the pinecone client with the values entered above
# (API key: find at app.pinecone.io; environment: next to the API key in the console)
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

In [19]:
# Embedding model, constructed with its default arguments
embeddings = OpenAIEmbeddings()

# Build the vector store from the chunk texts and the embeddings under index_name.
# NOTE(review): only page_content is passed, so chunk metadata is not forwarded here.
chunk_texts = [chunk.page_content for chunk in texts]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)

In [61]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

# Instantiate the OpenAI LLM with temperature 0, reading the key from the environment,
# then build a question-answering chain of type "stuff" around it
llm = OpenAI(
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)
chain = load_qa_chain(llm, chain_type="stuff")

In [31]:
# Question for the textbook; the second sentence asks the model for a JSON-formatted reply
query = "What is big o notation? Format response as json object."

# Look up the stored chunks most similar to the question in the Pinecone-backed store
docs = docsearch.similarity_search(query)

In [32]:
# Pass the retrieved chunks and the question to the QA chain
response = chain.run(
    question=query,
    input_documents=docs,
)
# Bare expression so the notebook displays the answer string
response

' {\n"definition": "Big-O notation is a way of expressing the complexity of an algorithm. It is used to describe the rate of growth of a function, and is expressed as O(f(n)), where f(n) is the function that describes the rate of growth. It is used to compare the relative complexity of different algorithms, and is often used to simplify complicated functions by omitting dominated terms."\n}'