#### Installs & Imports

In [None]:
# Install libraries
%pip install langchain
%pip install openai
%pip install PyPDF2
%pip install pinecone-client
%pip install tiktoken

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
import os
import tqdm
import pinecone

#### Enter API KEYS

In [None]:
# Pinecone credentials: the API key and the environment (region) of your project.
# Both are shown in the Pinecone console: https://app.pinecone.io/
# NOTE: replace the placeholders with your own values and avoid sharing or
# committing the notebook with real keys in it.
PINECONE_API_KEY = "API KEY"
PINECONE_API_ENV = "REGION"

# OpenAI credentials: the key is exported through the process environment,
# where the cells further down read it back via os.environ.
# Keys are managed at: https://platform.openai.com/account/api-keys
os.environ["OPENAI_API_KEY"] = "API KEY"

#### Mount Google Drive

In [None]:
# Mount Google Drive at /content/gdrive/ (forcing a remount) so the notebook
# can read files stored there. Only available inside Google Colab.
from google.colab import drive

drive.mount(
    '/content/gdrive/',
    force_remount=True,
)

#### Load PDF documents

In [3]:
# Open the PDF that will be indexed; the path lives on the mounted Google Drive.
# Point it at your own file to work with a different document.
reader = PdfReader(
    '/content/gdrive/MyDrive/Notebooks/chatgpt/financial_report_2016.pdf'
)

In [None]:
# Extract the text of every PDF page and concatenate it into raw_text.
# Pages for which extract_text() yields nothing (empty string or None) are
# skipped. Page texts are joined without a separator, exactly as before.
page_texts = (page.extract_text() for page in reader.pages)
raw_text = ''.join(text for text in page_texts if text)

In [None]:
# Preview the first 200 characters to confirm the extraction produced text
raw_text[0:200]

In [None]:
# Configure the text splitter (RecursiveCharacterTextSplitter).
# Chunk length is measured with len, i.e. in characters: chunks target a size
# of 1000 and neighbouring chunks overlap by 100 so that context is carried
# across chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=100,
    chunk_size=1000,
)

# Break the extracted text into chunks ("documents") for embedding
documents = text_splitter.split_text(raw_text)

In [None]:
# Sanity-check the split: number of chunks and the length of the first chunk.
# (The author's run reported 266 documents; other PDFs or splitter settings
# will give different counts.)
print(f'You have {len(documents)} document(s) loaded')
if documents:
    print(f'There are {len(documents[0])} characters in the first document')
else:
    # No chunks means no text was extracted (e.g. a scanned, image-only PDF);
    # indexing documents[0] would raise IndexError here.
    print('No text was extracted from the PDF - there is nothing to embed')

#### Create embeddings for storing vectors in Pinecone

In [None]:
# Create the OpenAI embeddings client, authenticated with the key exported
# earlier. Nothing is embedded at this point; the object is handed to
# Pinecone.from_texts later in the notebook.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

In [20]:
# Name of the target index - it must match an index already created in Pinecone
index_name = "financial"
# Namespace under which this report's vectors are stored
namespace = "FR_2016"

# Open the Pinecone connection using the credentials entered above
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

In [21]:
# Embed the chunks and load the vectors into the Pinecone index / namespace.
# You can verify the upload in the Pinecone console. The author noted 247
# vectors for their run; presumably the count should equal len(documents)
# for a fresh namespace - verify.
docsearch = Pinecone.from_texts(
    documents,
    embeddings,
    namespace=namespace,
    index_name=index_name,
)

#### Query 'documents'

In [22]:
# Set up the question-answering chain used to query the document.
# temperature=0 keeps the model's answers terse (less waffle).
llm = OpenAI(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    temperature=0,
)
# The "stuff" chain type is requested from load_qa_chain
chain = load_qa_chain(llm, chain_type="stuff")

In [23]:
# Build the query and run a similarity search against the vectors in Pinecone.
# The matching chunks come back as documents for the QA chain.
# NOTE: no include_metadata kwarg is passed - that is an option of Pinecone's
# raw index.query call, not of similarity_search; the LangChain wrapper
# requests metadata on its own, and some releases appear to reject the stray
# keyword (verify against the installed version).
query = "break the revenue figure down"
docs = docsearch.similarity_search(query)

In [24]:
# Ask the QA chain the question, supplying the retrieved documents as context
chain.run(question=query, input_documents=docs)