In [4]:
import hashlib
import os

from dotenv import load_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_pinecone import PineconeVectorStore
# Pull OPENAI_API_KEY / PINECONE_API_KEY from a local .env file into os.environ.
load_dotenv()

openai_api_key = os.getenv('OPENAI_API_KEY')

# --- Configuration: everything a reader might want to tune -------------------
# Point PDF_PATH at a different file to run the same pipeline on another book.
PDF_PATH = './charlie-and-the-chocolate-factory-by-roald-dahl.pdf'
CHUNK_SIZE = 10000     # maximum characters per chunk
CHUNK_OVERLAP = 3000   # characters shared between consecutive chunks

# --- Load --------------------------------------------------------------------
# Note: PyPDFLoader already splits by page, so `pages` holds one Document per PDF page.
loader = PyPDFLoader(PDF_PATH)
pages = loader.load()

# Re-assemble the book into one string. Pages are joined with a newline so the
# last word of one page is not glued to the first word of the next, and the
# tab characters this PDF uses between words are normalised to spaces.
text = "\n".join(page.page_content for page in pages).replace('\t', ' ')

print(pages[1])
print(f'You have {len(pages)} document(s) in your data')

# --- Split -------------------------------------------------------------------
# Split into chunks of at most CHUNK_SIZE characters with CHUNK_OVERLAP characters
# of overlap. Tabs were replaced by spaces above, so the fallback separators are
# " " and "" (a "\t" separator could never match and would leave over-long lines
# unsplit).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = text_splitter.create_documents([text])
# Let's see how many chunks we have
print(f'Now you have {len(docs)} documents')

# Embedding client used later to vectorise the chunks and the query.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

page_content='Annotation\nThis\tbook\tis\tfantastic\tit\tis\tabout\ta\tvery\tpoor\tboy\tnamed\tCharlie\tBucket.\tHe\talways\ngoes\tto\tschool\twith\tout\ta\tjacket\tbecause\tthey\tdon’t\thave\tmoney\tto\tbuy\tCharlie\tthings.\tThe\nsetting\tof\tthe\tbook\tis\tan\tunnamed\tcity;\tsmall\twooden\thouse\ton\tthe\tedge\tof\ta\tgreat\tcity,a\nfabled\tchocolate\tfactory.\tThe\tconflict\tis\tfive\tchildren\twho\thave\tfound\tgolden\ttickets\ncompete\tto\tsee\twho\twill\ttake\tover\tMr.\tWonka’s\tchocolate\tfactory.\tIt\tall\tstarted\twhen\tthe\nnewpaper\tannounces\tthat\tthe\tWonka\tchocolate\tfactory\twill\thide\tfive\tgolden\ttickets\tin\tthe\nWonka\tchocolate\tbars.\tCharlie\tdesperately\thopes\the\twill\tfind\ta\tgolden\tticket.\tThe\tproblem\nis\tthat\teach\tyear\the\tgets\ta\tchocolate\ton\this\tbirthday,\tand\the\tdoesn’t\thave\tmoney\tto\tbuy\none.\tCharlie\tfather\tloses\this\tjob\tand\tthe\tpoor\tfamily\tis\ton\tbrink\tof\tstarvation.\tCharlie\tfinds\na\tdollar\tbill\ton\tthe\tstreet

In [5]:

# The Pinecone integration picks its API key up from the environment, which
# load_dotenv() has already populated. Fail early with a readable message rather
# than the TypeError that assigning None into os.environ would raise.
if not os.getenv('PINECONE_API_KEY'):
    raise RuntimeError('PINECONE_API_KEY is not set; add it to your environment or .env file')

index_name = 'hkpineconeindex1'

# --- Index -------------------------------------------------------------------
chunk_texts = [chunk.page_content for chunk in docs]
# Content-derived ids make the upsert idempotent: re-running this cell overwrites
# the same vectors instead of adding a duplicate copy of every chunk to the index.
chunk_ids = [hashlib.sha256(chunk_text.encode('utf-8')).hexdigest() for chunk_text in chunk_texts]
vectorstore_from_texts = PineconeVectorStore.from_texts(
    chunk_texts,
    index_name=index_name,
    embedding=embeddings,
    ids=chunk_ids,
)

# --- Retrieve and answer -----------------------------------------------------
query = 'how many people appeared in this book?'
# Keep the search hits under their own name so `docs` (the full chunk list from
# the previous cell) is not clobbered and this cell can safely be re-run.
relevant_docs = vectorstore_from_texts.similarity_search(query)
llm = ChatOpenAI(temperature=0, openai_api_key=os.getenv('OPENAI_API_KEY'))
# "stuff" puts all retrieved chunks into a single prompt for the model.
chain = load_qa_chain(llm, chain_type="stuff")
chain.run(input_documents=relevant_docs, question=query)

'In the provided context, there are several characters mentioned in the book:\n1. Charlie Bucket\n2. Grandpa Joe\n3. Grandma Josephine\n4. Grandpa George\n5. Grandma Georgina\n6. Mr. Bucket\n7. Mrs. Bucket\n8. Augustus Gloop\n9. Veruca Salt\n10. Violet Beauregarde\n11. Mike Teavee\n12. Mr. Willy Wonka\n\nThese are the main characters mentioned in the context provided.'