In [10]:
!pip3 install -U -q langchain opensearch-py regex requests_aws4auth PyPDF2 boto3 sagemaker pypdf

[0m

In [30]:
!pip3 install -U -q "ai21[AWS]"

[0m

In [31]:
import io
import json
import os
import urllib
import urllib.request

import ai21
import boto3
import sagemaker
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import OpenSearchVectorSearch
from PyPDF2 import PdfReader

from load_embeddings import create_sagemaker_embeddings_from_js_model, encoder_endpoint_name, encoder_name
from opensearch import get_stack_details, get_credentials, opensearch_index_name

In [4]:
# Create a boto3 session; its region_name is passed to the helper calls in this and later cells.
boto3_session = boto3.session.Session()
# Look up the stack details for that region; later cells read the keys
# 'opensearch_domain_endpoint', 'opensearch_domain_name' and 'opensearch_secretid' from the result.
results = get_stack_details(boto3_session.region_name)
# Bare last expression so the notebook displays the returned value.
# NOTE(review): the displayed value contains ARNs including the AWS account id; consider clearing the output before sharing.
results

opensearch-embedding-stack stack found: True


{'opensearch_domain_endpoint': 'https://search-opensearchservi-ttpuslrpxaz9-exltly5br64gbamsfduci2bwsy.us-west-2.es.amazonaws.com',
 'opensearch_domain_name': 'arn:aws:es:us-west-2:102048127330:domain/opensearchservi-ttpuslrpxaz9',
 'opensearch_secretid': 'arn:aws:secretsmanager:us-west-2:102048127330:secret:OpenSearchSecret-opensearch-embedding-stack-Go1IVo',
 'os_creds_secretid_in_secrets_manager': 'OpenSearchSecret-opensearch-embedding-stack'}

In [5]:
# Echo the stack values that later cells rely on (domain name, endpoint, secret id).
print(f"""
    Opensearch_domain_name={results['opensearch_domain_name']},
    Opensearch_domain_endpoint={results['opensearch_domain_endpoint']}
    opensearch_secretid={results['opensearch_secretid']}
""")


    Opensearch_domain_name=arn:aws:es:us-west-2:102048127330:domain/opensearchservi-ttpuslrpxaz9,
    Opensearch_domain_endpoint=https://search-opensearchservi-ttpuslrpxaz9-exltly5br64gbamsfduci2bwsy.us-west-2.es.amazonaws.com
    opensearch_secretid=arn:aws:secretsmanager:us-west-2:102048127330:secret:OpenSearchSecret-opensearch-embedding-stack-Go1IVo



In [8]:
## Retrieve the PDF artifacts from the internet over HTTPS (here: two annual shareholder letters).
doc_urls = [
  'https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/2022-Shareholder-Letter.pdf',
  'https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/2021-Shareholder-Letter.pdf'
]

## Download each file into docs/ so it can be loaded with a PyPDF loader afterwards.
# The target directory must exist before open(..., 'wb'); create it so a fresh checkout works.
os.makedirs('docs', exist_ok=True)

pdf_local_paths = []
for doc_url in doc_urls:
  # The local file name is the last path segment of the URL.
  local_path = f"""docs/{doc_url.split('/')[-1]}"""
  # Context managers close both the HTTP response and the local file, even if the copy fails.
  with urllib.request.urlopen(doc_url) as pdf, open(local_path, 'wb') as output:
    output.write(pdf.read())
  print(f'Downloaded {doc_url} and stored locally at {local_path}')
  pdf_local_paths.append(local_path)

Downloaded https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/2022-Shareholder-Letter.pdf and stored locally at docs/2022-Shareholder-Letter.pdf
Downloaded https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/2021-Shareholder-Letter.pdf and stored locally at docs/2021-Shareholder-Letter.pdf


In [11]:
# Load every PDF found under docs/ (note: this reads the directory, not the `pdf_local_paths` list).
loader = PyPDFDirectoryLoader('docs/')
data = loader.load()
# NOTE(review): judging by the 'page' metadata printed for the chunks in a later cell,
# each loaded item appears to be one PDF page rather than a whole file — confirm.
print (f'You have {len(data)} document(s) in your data')
# Only the first loaded item is measured here; data[0] raises IndexError if nothing was loaded.
print (f'There are {len(data[0].page_content)} characters in your document')

You have 19 document(s) in your data
There are 4328 characters in your document


In [13]:
## Chunk size and overlap are starting values and should be fine-tuned for the corpus.
CHUNK_SIZE=1024
CHUNK_OVERLAP=64
# length_function=len measures chunk length in characters; add_start_index=True records each
# chunk's offset as 'start_index' in its metadata (visible in the printed chunks).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,
                                               chunk_overlap=CHUNK_OVERLAP,
                                               length_function = len,
                                               add_start_index = True,)
docs = text_splitter.split_documents(data)
print (f'Now you have {len(docs)} documents')
# Preview the first chunks; slicing avoids an IndexError when fewer than three chunks exist.
for preview_doc in docs[:3]:
    print (f'{preview_doc}')

Now you have 112 documents
page_content='Dear shareholders:\nOver the past 25 years at Amazon, I’ve had the opportunity to write many narratives, emails, letters, and\nkeynotes for employees, customers, and partners. But, this is the first time I’ve had the honor of writing ourannual shareholder letter as CEO of Amazon. Jeff set the bar high on these letters, and I will try to keepthem worth reading.\nWhen the pandemic started in early 2020, few people thought it would be as expansive or long-running as' metadata={'source': 'docs/2021-Shareholder-Letter.pdf', 'page': 0, 'start_index': 0}
page_content='it’s been. Whatever role Amazon played in the world up to that point became further magnified as mostphysical venues shut down for long periods of time and people spent their days at home. This meant thathundreds of millions of people relied on Amazon for PPE, food, clothing, and various other items thathelped them navigate this unprecedented time. Businesses and governments also had to s

In [18]:
# Helper function to clean up document

import regex as re

def postproc(s):
    """Return `s` with every no-break space (U+00A0) and newline replaced by a plain space."""
    return s.replace('\xa0', ' ').replace('\n', ' ')

# Clean the text of every chunk in place. postproc only swaps no-break spaces and newlines
# for plain spaces, so running this cell again leaves the text unchanged.
for doc in docs:
    doc.page_content = postproc(doc.page_content)

In [20]:
# Build the embeddings object from the endpoint name and the session's region
# (factory and endpoint name are both imported from `load_embeddings`).
# NOTE(review): presumably this wraps an already-deployed SageMaker endpoint — confirm in load_embeddings.
embeddings = create_sagemaker_embeddings_from_js_model(encoder_endpoint_name, 
                                                       boto3_session.region_name)

In [24]:
# Fetch the OpenSearch credentials for the stack's secret id via the imported get_credentials helper.
# NOTE(review): presumably read from AWS Secrets Manager — confirm in the helper. Do not display `creds`.
creds = get_credentials(results['opensearch_secretid'], boto3_session.region_name)
# (username, password) tuple handed to the vector store client as http_auth.
http_auth = (creds['username'], creds['password'])

In [25]:
# Vector store client configured with the index name, the embedding function,
# the stack's domain endpoint and the basic-auth credentials.
docsearch = OpenSearchVectorSearch(index_name=opensearch_index_name,
                                       embedding_function=embeddings,
                                       opensearch_url=results['opensearch_domain_endpoint'],
                                       http_auth=http_auth) 

In [26]:
# Send all chunks to the vector store; the displayed return value is a list of id strings.
# NOTE(review): re-running this cell presumably indexes the same chunks again under new ids — confirm before re-executing.
docsearch.add_documents(documents=docs)

['78f19356-85a7-457d-a143-2dd4334a0a2d',
 'fbec7e02-5485-41bd-abfa-a40a8f5a1dea',
 '32f32650-436e-4194-9fc0-ddc6bc6a82a8',
 '6bc9bf8c-7bff-42a5-a394-bb16601ecbaf',
 'fbfb4544-2767-4f6a-bedd-966929860469',
 'e1c74bf3-30ac-4851-8ef3-5574d8536033',
 'f919f8c4-35bd-4ac5-a75d-9ac9868ea972',
 '2bfb44f6-fbe3-4f42-b24a-0c671c4d6e65',
 'f838da99-354e-49a3-8fcf-224bd6c54790',
 'affbb984-cb30-442d-9dc7-e222012b06d2',
 '4de09c64-00c9-4b1c-bed4-9a0b52f16b15',
 '430e6d9e-e33a-4ee9-abae-0c478c9a30bd',
 'c7ca6dbe-3293-41e5-b58e-2ff4e8d3b342',
 'c6bb1700-7419-4f63-82a1-e56e352a5a72',
 '7a72a0e1-3f7c-4c89-8c55-7f3f27166e8d',
 'f01a31b2-7352-476d-847d-2346c1e0db57',
 '8bd7393e-0e2b-4211-ae5b-9fe3332fca9e',
 '2aa65678-f411-46c6-9347-d91f49f556f7',
 '8b5ba917-8d01-4464-9b77-a0fe826e70ab',
 '02508f76-192a-4153-8d77-c98d66c26ce0',
 '1479bb8e-6162-44f7-bd40-be9e7b6b8161',
 '8cad90b2-6951-43a7-996c-ef54161d81cf',
 '1f955357-8bc8-48f8-85a0-2a7e6df7e026',
 '1331e192-502e-457c-bf2b-07eea34a6c57',
 '278bff51-7a8e-

In [27]:
# Sanity check: number of chunks that were passed to add_documents.
len(docs)

112

In [28]:
# Natural-language question; used for retrieval here and as the question for the answer model later.
query = "Tell me about the return of office policy"
# This repeats the constructor call from the earlier docsearch cell with identical arguments.
docsearch = OpenSearchVectorSearch(index_name=opensearch_index_name,
                                   embedding_function=embeddings,
                                   opensearch_url=results['opensearch_domain_endpoint'],
                                   http_auth=http_auth)
# Ask for the 5 most similar chunks.
# NOTE(review): confirm that `include_metadata` is an argument this similarity_search actually honours.
similar_docs = docsearch.similarity_search(query, k=5, include_metadata=True)
print(f'Found : {len(similar_docs)} docs')
# Print each hit (text plus source/page/start_index metadata).
for doc in similar_docs:
    print(doc)

Found : 5 docs
page_content='We also looked hard at how we were working together as a team and asked our corporate employees to come back to the office at least three days a week , beginning in May. During the pandemic, our employees rallied to' metadata={'source': 'docs/2022-Shareholder-Letter.pdf', 'page': 1, 'start_index': 1111}
page_content='commitment and effort from our employees all over the world. I’m not sure any of us would have gotten' metadata={'source': 'docs/2021-Shareholder-Letter.pdf', 'page': 0, 'start_index': 4227}
page_content='Ironically, just before COVID started, we’d made the decision to invest billions of incremental dollars over' metadata={'source': 'docs/2021-Shareholder-Letter.pdf', 'page': 1, 'start_index': 3252}
page_content='the surface of what’s possible to date, and plan to keep building the features ourbusiness customers tell us they need and want.' metadata={'source': 'docs/2022-Shareholder-Letter.pdf', 'page': 4, 'start_index': 2879}
page_content='the

In [32]:

# The endpoint name is derived from the model name by appending '-endpoint'.
model_name = "contextual-answers"
endpoint_name = f'{model_name}-endpoint'

# Only the first returned chunk is used as context; the other hits are ignored.
# similar_docs[0] raises IndexError if the search returned nothing.
context = similar_docs[0].page_content
print(context)


We also looked hard at how we were working together as a team and asked our corporate employees to come back to the office at least three days a week , beginning in May. During the pandemic, our employees rallied to


In [34]:
# Call the AI21 Answer API with the retrieved chunk as context and the original query as the
# question, targeting the SageMaker endpoint named by `endpoint_name`.
# NOTE(review): assumes that endpoint is already deployed in the session's account/region — confirm.
response = ai21.Answer.execute(
    context=context,
    question=query,
    destination=ai21.SageMakerDestination(endpoint_name)
)

# Show only the answer text from the response.
print(response.answer)


We will require corporate employees to be in the office at least three days a week, beginning in May.
