In [14]:
import os
import logging
import sys
from pprint import pprint

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
with open(os.path.join(os.getcwd(), 'secrets', 'google_api_key.txt')) as f:
    api_key = f.read()

In [3]:
from llama_index.llms.palm import PaLM

llm = PaLM(api_key=api_key)

In [4]:
llm.complete('What is a Form 10K ?')

CompletionResponse(text="A Form 10K is an annual report that publicly traded companies are required to file with the Securities and Exchange Commission (SEC). It includes detailed information about the company's financial condition, operations, and management. Form 10Ks are important because they provide investors with a comprehensive overview of a company's business and its prospects for future growth.", additional_kwargs={}, raw={'output': "A Form 10K is an annual report that publicly traded companies are required to file with the Securities and Exchange Commission (SEC). It includes detailed information about the company's financial condition, operations, and management. Form 10Ks are important because they provide investors with a comprehensive overview of a company's business and its prospects for future growth.", 'safety_ratings': [{'category': <HarmCategory.HARM_CATEGORY_DEROGATORY: 1>, 'probability': <HarmProbability.NEGLIGIBLE: 1>}, {'category': <HarmCategory.HARM_CATEGORY_TOX

In [5]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-small-en
Load pretrained SentenceTransformer: BAAI/bge-small-en


In [6]:
from llama_index import LangchainEmbedding, ServiceContext

embed_model = LangchainEmbedding(
  hf
)

In [7]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext

In [9]:
documents = SimpleDirectoryReader(input_files=['./tmp/kyc_docs/amazon_10k.pdf']).load_data()

In [10]:
# Length of documents is number of pages in PDF
len(documents)

194

In [12]:
documents[0].metadata

{'page_label': '1',
 'file_name': 'amazon_10k.pdf',
 'file_path': 'tmp\\kyc_docs\\amazon_10k.pdf',
 'creation_date': '2023-11-04',
 'last_modified_date': '2023-11-04',
 'last_accessed_date': '2023-11-06'}

In [15]:
pprint(documents[0].get_content())

('Table of ContentsUNITED STATES\n'
 'SECURITIES AND EXCHANGE COMMISSION\n'
 'Washington, D.C. 20549\n'
 ' \n'
 '____________________________________FORM 10-K\n'
 '____________________________________ \n'
 '(Mark One)\n'
 '☒\n'
 'ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT '
 'OF 1934 For the fiscal year ended December 31, 2021\n'
 'or\n'
 '☐\n'
 'TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE '
 'ACT OF 1934 For the transition period from            to             .\n'
 'Commission File No. 000-22513\n'
 '____________________________________\n'
 'AMAZON.COM, INC.\n'
 '(Exact name of registrant as specified in its charter)\n'
 'Delaware\n'
 ' 91-1646860 (State or other jurisdiction of \n'
 'incorporation or organization)\n'
 ' (I.R.S. Employer Identification No.)\n'
 '410 Terry Avenue North\n'
 'Seattle, Washington 98109-5210\n'
 '(206) 266-1000\n'
 '(Address and telephone number, including area code, of registran\n'
 't’s 

In [16]:
service_context = ServiceContext.from_defaults(
    chunk_size=600,
    chunk_overlap=50,
    llm=llm,
    embed_model=embed_model
)

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


In [17]:
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Table of ContentsUNITED STATES
SECURITIES AND E...
> Adding chunk: Table of ContentsUNITED STATES
SECURITIES AND E...
> Adding chunk: Table of ContentsUNITED STATES
SECURITIES AND E...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Yes  
☒     No  ☐Indicate by check mark whether...
> Adding chunk: Yes  
☒     No  ☐Indicate by check mark whether...
> Adding chunk: Yes  
☒     No  ☐Indicate by check mark whether...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Table of ContentsAMAZON.COM, INC.
FORM 10-K
For...
> Adding chunk: Table of ContentsAMAZON.COM, INC.
FORM 10-K
For...
> Adding chunk: Table of ContentsAMAZON.COM, INC.
FORM 10-K
For...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Table of ContentsAMAZON.COM, INC.
PART I
Item 1...
> Adding chunk: Table of ContentsAMAZON.COM, INC.
PART I
Item 1...
> Adding chunk: Table of ContentsAMAZON.COM, INC.
PART I
Item 1...
DEBUG:llama_index.node_parser.no

DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Table of ContentsAvailable Information
Our inve...
> Adding chunk: Table of ContentsAvailable Information
Our inve...
> Adding chunk: Table of ContentsAvailable Information
Our inve...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Ms. Reynolds has served as Vice President, Worl...
> Adding chunk: Ms. Reynolds has served as Vice President, Worl...
> Adding chunk: Ms. Reynolds has served as Vice President, Worl...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Table of ContentsItem 1A.
Risk Factors Please c...
> Adding chunk: Table of ContentsItem 1A.
Risk Factors Please c...
> Adding chunk: Table of ContentsItem 1A.
Risk Factors Please c...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: The Intern
et facilitates competitive entry and...
> Adding chunk: The Intern
et facilitates competitive entry and...
> Adding chunk: The Intern
et facilitates competitive entry and...
DEBUG:llama_index.node_parser.no

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
from llama_index import StorageContext, load_index_from_storage

index_file_path = os.path.join(os.getcwd(), 'tmp', 'kyc_index')

# Persist index to disk
index.storage_context.persist(index_file_path)

# Rebuild storage context
#storage_context = StorageContext.from_defaults(persist_dir=index_file_path)

# Load index from the storage context
#new_index = load_index_from_storage(storage_context)

DEBUG:fsspec.local:open file: c:/Users/knallathambi_dev/workspace/python-notebooks/llm-document-qna/tmp/kyc_index/docstore.json
open file: c:/Users/knallathambi_dev/workspace/python-notebooks/llm-document-qna/tmp/kyc_index/docstore.json
open file: c:/Users/knallathambi_dev/workspace/python-notebooks/llm-document-qna/tmp/kyc_index/docstore.json
DEBUG:fsspec.local:open file: c:/Users/knallathambi_dev/workspace/python-notebooks/llm-document-qna/tmp/kyc_index/index_store.json
open file: c:/Users/knallathambi_dev/workspace/python-notebooks/llm-document-qna/tmp/kyc_index/index_store.json
open file: c:/Users/knallathambi_dev/workspace/python-notebooks/llm-document-qna/tmp/kyc_index/index_store.json
DEBUG:fsspec.local:open file: c:/Users/knallathambi_dev/workspace/python-notebooks/llm-document-qna/tmp/kyc_index/vector_store.json
open file: c:/Users/knallathambi_dev/workspace/python-notebooks/llm-document-qna/tmp/kyc_index/vector_store.json
open file: c:/Users/knallathambi_dev/workspace/python-

In [19]:
from IPython.display import Markdown, display

# define prompt viewing function
def display_prompt_dict(prompts_dict):
    for k, p in prompts_dict.items():
        text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
        display(Markdown(text_md))
        print(p.get_template())
        display(Markdown("<br><br>"))

In [20]:
query_engine = index.as_query_engine()

In [15]:
display_prompt_dict(query_engine.get_prompts())

**Prompt Key**: response_synthesizer:text_qa_template<br>**Text:** <br>

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: 


<br><br>

**Prompt Key**: response_synthesizer:refine_template<br>**Text:** <br>

The original query is as follows: {query_str}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context_msg}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: 


<br><br>

In [23]:
response = query_engine.query("Give me detailed overview of the document ?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.indices.utils:> Top 2 nodes:
> [Node 3f068f44-75b7-4425-9767-8826f7c96fea] [Similarity score:             0.866272] ARTICLE XI. MISCELLANEOUS11.01    Amendments, Etc
66 11.02    Notices; Effectiveness; Electronic ...
> [Node 70ad03ed-61c8-4af3-8090-80ecdc12acbe] [Similarity score:             0.855469] ACCEPTANCE AND ACKNOWLEDGMENTIf the Company requests that you
r acceptance of this Agreement be e...
> Top 2 nodes:
> [Node 3f068f44-75b7-4425-9767-8826f7c96fea] [Similarity score:             0.866272] ARTICLE XI. MISCELLANEOUS11.01    Amendments, Etc
66 11.02    Notices; Effectiveness; Electronic ...
> [Node 70ad03ed-61c8-4af3-8090-80ecdc12acbe] [Similarity score:             0.855469] ACCEPTANCE AND ACKNOWLEDGMENTIf the Company requests that you
r acceptance of this Agreement be e...
> Top 2 nodes:
> [Node 3f068f44-75b7-4425-9767-8826f7c96fea] [Similarity score:             0.866272] ARTICLE XI. MISCELLANEOUS11.01    Amendments, Etc
66 11.02    Notices; Effectiveness

In [24]:
print(response)

This is a restricted stock unit award agreement between amazon.com, inc and the participant.


In [27]:
### Customize prompts

from llama_index.prompts import PromptTemplate

# New Table
new_summary_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query. "
    "Do not make up answers.\n"
    "Query: {query_str}\n"
    "Answer: "
)
new_summary_tmpl = PromptTemplate(new_summary_tmpl_str)

query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": new_summary_tmpl}
)

In [28]:
display_prompt_dict(query_engine.get_prompts())

**Prompt Key**: response_synthesizer:text_qa_template<br>**Text:** <br>

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query. Do not make up answers.
Query: {query_str}
Answer: 


<br><br>

**Prompt Key**: response_synthesizer:refine_template<br>**Text:** <br>

The original query is as follows: {query_str}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context_msg}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: 


<br><br>

In [29]:
while True:
  query=input()
  print(f'Input ==> {query}')
  if query == 'exit':
    break
  response = query_engine.query(query)
  print(f'Response ==> {str(response)}')

Input ==> what is the trading symbol of amazon


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.indices.utils:> Top 2 nodes:
> [Node 0d10ec91-722f-4fd8-89c8-87f1dbcb5a73] [Similarity score:             0.875206] Exhibit 32.1Certification Pursuant to 18 U.S.C. Section 1350
In connection with the Annual Report...
> [Node 64bf2cfa-bc62-419a-9627-a0f92d4634af] [Similarity score:             0.867533] Table of ContentsUNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
 
______...
> Top 2 nodes:
> [Node 0d10ec91-722f-4fd8-89c8-87f1dbcb5a73] [Similarity score:             0.875206] Exhibit 32.1Certification Pursuant to 18 U.S.C. Section 1350
In connection with the Annual Report...
> [Node 64bf2cfa-bc62-419a-9627-a0f92d4634af] [Similarity score:             0.867533] Table of ContentsUNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
 
______...
> Top 2 nodes:
> [Node 0d10ec91-722f-4fd8-89c8-87f1dbcb5a73] [Similarity score:             0.875206] Exhibit 32.1Certification Pursuant to 18 U.S.C. Section 1350
In connection with the 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.indices.utils:> Top 2 nodes:
> [Node b38ca4e9-872d-4044-9e31-348484b283e5] [Similarity score:             0.887033] Table of ContentsAMAZON.COM, INC.
PART I
Item 1.
Business This Annual Report on Form 10-K and the...
> [Node 62c89f8b-193d-46f3-ac57-d9fd324cb724] [Similarity score:             0.88363] We operate customer service centers globally, which are supplementedby co-sourced arrangemen
ts. ...
> Top 2 nodes:
> [Node b38ca4e9-872d-4044-9e31-348484b283e5] [Similarity score:             0.887033] Table of ContentsAMAZON.COM, INC.
PART I
Item 1.
Business This Annual Report on Form 10-K and the...
> [Node 62c89f8b-193d-46f3-ac57-d9fd324cb724] [Similarity score:             0.88363] We operate customer service centers globally, which are supplementedby co-sourced arrangemen
ts. ...
> Top 2 nodes:
> [Node b38ca4e9-872d-4044-9e31-348484b283e5] [Similarity score:             0.887033] Table of ContentsAMAZON.COM, INC.
PART I
Item 1.
Business This Annual Report on Form 1

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:llama_index.indices.utils:> Top 2 nodes:
> [Node b38ca4e9-872d-4044-9e31-348484b283e5] [Similarity score:             0.85363] Table of ContentsAMAZON.COM, INC.
PART I
Item 1.
Business This Annual Report on Form 10-K and the...
> [Node 884925c1-3452-4fc6-be14-03a4ac803eeb] [Similarity score:             0.849568] For example, in order to meet local ownership, regulatory licensing, and cybersecurity requiremen...
> Top 2 nodes:
> [Node b38ca4e9-872d-4044-9e31-348484b283e5] [Similarity score:             0.85363] Table of ContentsAMAZON.COM, INC.
PART I
Item 1.
Business This Annual Report on Form 10-K and the...
> [Node 884925c1-3452-4fc6-be14-03a4ac803eeb] [Similarity score:             0.849568] For example, in order to meet local ownership, regulatory licensing, and cybersecurity requiremen...
> Top 2 nodes:
> [Node b38ca4e9-872d-4044-9e31-348484b283e5] [Similarity score:             0.85363] Table of ContentsAMAZON.COM, INC.
PART I
Item 1.
Business This Annual Report on Form 10