## Installs & Imports

In [None]:
%pip install langchain openai unstructured python-magic chromadb pinecone-client tiktoken -q
%pip install "unstructured[local-inference]" -q
%pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2" -q
%pip install layoutparser[layoutmodels,tesseract] -q
#!pip uninstall PIL
#!pip uninstall Pillow
#!pip install Pillow
%pip install --upgrade Pillow -q # there's a version 9.5 bug with Python 3.10
!apt-get install -q libmagic-dev
!apt-get install -q poppler-utils
!apt-get install -q tesseract-ocr

In [13]:
import os
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

import magic
import nltk
nltk.download('punkt')

import pinecone
PINECONE_API_KEY = 'PINECONE API KEY'
PINECONE_API_ENV = 'REGION'

os.environ["OPENAI_API_KEY"] = "OPENAI API KEY"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load documents

In [3]:
# Update with the drive path with the documents
loader = DirectoryLoader('/content/drive/MyDrive/Notebooks/chatgpt/', glob='**/*.pdf')

In [5]:
# takes about 14 mins to load .. the loader runs through the directory for any type of unstructured data
documents = loader.load()

Downloading model_final.pth:   0%|          | 0.00/330M [00:00<?, ?B/s]

Downloading (…)50_FPN_3x/config.yml:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

In [6]:
# should have 1 document
print (f'You have {len(documents)} document(s) loaded')
print (f'There are {len(documents[0].page_content)} characters in your document')

You have 1 document(s) in your PDF
There are 198759 characters in your document


In [8]:
# check document
documents[:1]

[Document(page_content='Annual financial report and financial statements\n\nYear to December 31, 2016\n\nINTELLECTUAL PROPERTY ORGANIZATION\n\nANNUAL FINANCIAL REPORT AND FINANCIAL STATEMENTS\n\nINTRODUCTION FINANCIAL STATEMENT DISCUSSION AND ANALYSIS\n\nINDEPENDENT AUDITOR’S REPORT\n\nSTATEMENT I - Statement of Financial Position STATEMENT II – Statement of Financial Performance STATEMENT III – Statement of Changes in Net Assets STATEMENT IV – Statement of Cash Flow STATEMENT V – Statement of Comparison of Budget and NOTES TO THE FINANCIAL STATEMENTS\n\nI - Statement of Financial Position II – Statement of Financial Performance III – Statement of Changes in Net Assets IV – Statement of Cash Flow V – Statement of Comparison of Budget and Actual Amounts TO THE FINANCIAL STATEMENTS Note\n\n: Objectives and Budget of the Organization Note\n\n: Significant Accounting Policies Note\n\n: Cash and Cash Equivalents Note\n\n: Investments Note\n\n: Receivables Note\n\n: Inventories Note\n\n: Equ

In [20]:
# set the params for text spitter - RecursiveCharacterTextSplitter with an overlap of 100 between the documents to ensure context
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [21]:
# split the document(s) into words, sentences, paragraphs
texts = text_splitter.split_documents(documents)

## Get embeddings for semantic search

In [22]:
# embed - convert to vectors
# should have 247 vectors
embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])

In [23]:
# connect to Pinecone and set namespace
pinecone.init(
    api_key=PINECONE_API_KEY,  
    environment=PINECONE_API_ENV  
)
# index name must match index created in Pinecone
index_name = "financial"
# to make semantic searching easier add namespace
namespace = "FR_2016"

In [24]:
# load up the embeddings into Pinecone index - namespace
# check in Pinecone - you should have 247 vectors
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name, namespace=namespace)

## Query documents

In [25]:
# using llm.chain you can now query the document
# chain_type=stuff 
# temperature=0 to cut down waffle
llm = OpenAI(temperature=0, openai_api_key=os.environ['OPENAI_API_KEY'])
chain = load_qa_chain(llm, chain_type="stuff")

In [29]:
# build query
# conduct docsearch against vectors for similarity
query = "what is the total revenue broken down"
docs = docsearch.similarity_search(query, include_metadata=True)

In [30]:
# run question / query
chain.run(input_documents=docs, question=query)

' The total revenue for 2016 was 387.7 million Swiss francs, which was broken down as follows: Assessed contributions 17.3 million Swiss francs, Voluntary contributions 10.2 million Swiss francs, Publications revenue 0.5 million Swiss francs, Investment revenue 0 million Swiss francs, Fees PCT system 290.7 million Swiss francs, Madrid system 59.6 million Swiss francs, Hague system 5.0 million Swiss francs, Sub-total fees 355.3 million Swiss francs, Arbitration and Mediation 1.6 million Swiss francs, and Other/miscellaneous revenue 2.8 million Swiss francs.'