#### Installs & Imports

In [11]:
# Install libraries
%pip install langchain
%pip install openai
%pip install PyPDF2
%pip install pinecone-client
%pip install tiktoken

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
import os
import tqdm
import pinecone

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langchain
  Downloading langchain-0.0.163-py3-none-any.whl (781 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m782.0/782.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

  from tqdm.autonotebook import tqdm


#### Enter API KEYS

In [19]:
# Pinecone API key & region (environment).
# Keys are shown in the Pinecone console: https://app.pinecone.io
# Values are read from the process environment when present, so real secrets
# do not have to be typed into (and saved with) the notebook; the placeholders
# are only a fallback and must be replaced if the variables are not set.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'API KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'REGION')

# OpenAI API key.
# Link to keys: https://platform.openai.com/account/api-keys
# setdefault() keeps a key that is already exported in the environment instead
# of overwriting it with the placeholder.
os.environ.setdefault('OPENAI_API_KEY', 'API KEY')

#### Mount Google Drive

In [8]:
# Give the Colab runtime access to Google Drive, where the PDF is stored.
# force_remount=True is passed so the mount is redone even if one already exists.
from google.colab import drive

gdrive_mount_point = '/content/gdrive/'
drive.mount(gdrive_mount_point, force_remount=True)

Mounted at /content/gdrive/


#### Load PDF documents

In [13]:
# Path of the PDF to analyse (on the mounted Google Drive).
pdf_path = '/content/gdrive/MyDrive/Notebooks/chatgpt/financial_report_2016.pdf'
# Open the PDF; `reader.pages` is iterated in the next cell.
reader = PdfReader(pdf_path)

In [14]:
# Extract the text of every PDF page and concatenate it into raw_text.
# Pages for which extract_text() returns nothing (None or '') are skipped.
# The string is built with a single join rather than repeated `+=`, and the
# unused enumerate() index is gone; the resulting text is unchanged.
page_texts = (page.extract_text() for page in reader.pages)
raw_text = ''.join(text for text in page_texts if text)

In [15]:
# Sanity check: display the first 200 characters of the extracted text.
preview_chars = 200
raw_text[:preview_chars]

'Annual financial report \nand financial statements\nYear to December 31, 2016 \n  \n \n \n \nWORLD INTELLECTUAL P ROPERTY ORGANIZATION  \n \n \nANNUAL FINANCIAL REPORT  \nAND FINANCIAL STATEMENTS  \n \nYEAR TO DEC'

In [16]:
# Parameters for the text splitter (RecursiveCharacterTextSplitter):
# a chunk size of 1000 measured with len(), and an overlap of 100 between
# consecutive documents so that context carries over from one to the next.
splitter_params = dict(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_params)

# Split the raw text into documents based on words, sentences, paragraphs.
documents = text_splitter.split_text(raw_text)

In [17]:
# Sanity check on the split: the recorded run reports 266 documents,
# the first of which is 975 characters long.
document_count = len(documents)
first_document_length = len(documents[0])
print(f'You have {document_count} document(s) loaded')
print(f'There are {first_document_length} characters in the first document')

You have 266 document(s) loaded
There are 975 characters in the first document


#### Create embeddings for storing vectors in Pinecone

In [20]:
# Create the OpenAI embeddings object, authenticated with the key stored in
# the environment. Nothing is embedded yet: the object is handed to
# Pinecone.from_texts further down, together with the documents.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

In [21]:
# Index name: must match the index created in Pinecone.
index_name = 'financial'
# Namespace under which this report's vectors are stored, to make semantic
# searching easier.
namespace = 'FR_2016'

# Connect to Pinecone with the API key and region entered earlier.
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

In [None]:
# Optional: create the Pinecone index from code instead of the Pinecone console.
# Not applicable to the free tier in Pinecone, so this cell is inert by default:
# "Run All" skips the call unless CREATE_INDEX is deliberately set to True.
# https://docs.pinecone.io/docs
# The index is created without a metadata configuration.
# By default, Pinecone indexes all metadata.
CREATE_INDEX = False

if CREATE_INDEX:
    pinecone.create_index(
        index_name,  # reuse the configured name so it cannot drift from the upload cell
        dimension=1536,  # presumably the length of the OpenAI embedding vectors - confirm for the model in use
        metric='cosine',
        pods=1,
        replicas=1,
        pod_type='p1.x1',
    )

In [22]:
# Embed the documents and load the vectors into the Pinecone index, under the
# chosen namespace.
# Check in Pinecone afterwards: the expectation is one vector per document.
# NOTE(review): this comment used to say 247 vectors, which disagrees with the
# 266 documents reported by the split - confirm the count in the console.
docsearch = Pinecone.from_texts(
    documents,
    embeddings,
    index_name=index_name,
    namespace=namespace,
)

#### Query 'documents'

In [23]:
# Build the LLM and the question-answering chain used to query the document.
# temperature=0 to cut down waffle.
llm = OpenAI(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    temperature=0,
)
# chain_type="stuff" selects the "stuff" variant of the QA chain.
chain = load_qa_chain(llm, chain_type="stuff")

In [24]:
# The question to ask about the report.
# NOTE(review): "summerize" is misspelled; left untouched because the recorded
# answer in this notebook was produced with exactly this query text.
query = "summerize the report"

# Similarity search of the query against the vectors stored in Pinecone;
# the matching documents are passed to the QA chain in the next cell.
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [25]:
# Run the question through the chain, using the retrieved documents as input,
# and display the answer as the cell output.
answer = chain.run(question=query, input_documents=docs)
answer

" The report is an annual financial report and financial statements for the World Intellectual Property Organization for the year ending December 31, 2016. It includes an overview of the organization's operations and environment, financial objectives and strategies, risk management strategy, financial performance and financial position during the year, financial statement discussion and analysis, and the financial statements themselves. The report also includes an independent auditor's report, which states that the financial statements present fairly, in all material respects, the financial position of WIPO as of December 31, 2016, and its financial performance and cash flows for the year then ended in accordance with International Public Sector Accounting Standards (IPSAS)."