## [Demo 1]: Read data on S3 and Talk

### 1. RAG
- Load a document (PDF, DOCX, or plain text) from S3
- Embed the document splits and store them in a vector store
- Answer questions with an LLM using the RetrievalQA chain provided by LangChain

Ref: https://python.langchain.com/docs/use_cases/question_answering/

In [None]:


# Fetch the source document from S3 and break it into splits.
# `all_splits` is the list consumed by the vector-store step that follows.
from langchain.document_loaders import S3FileLoader

loader = S3FileLoader(bucket="<bucket-name>", key="<file-name>")
all_splits = loader.load_and_split()
print(f"Original: Number of document splits = {len(all_splits)}")


# Embed the document splits and store them in a vector store.
#
# Exactly one backend is built. Building a Chroma store first and then
# rebinding `vectorstore` to an OpenSearch store would embed every split twice
# (two rounds of OpenAI embedding calls) and throw the first result away.
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores import OpenSearchVectorSearch
from opensearchpy import RequestsHttpConnection

# True  -> index into Amazon OpenSearch Service (AOS); this is the store that
#          `vectorstore` is bound to for the rest of the notebook.
# False -> use a local Chroma store instead (no OpenSearch domain required).
USE_OPENSEARCH = True

embeddings = OpenAIEmbeddings()

# Only needed for the optional SigV4 (IAM) authentication sketched below; the
# basic-auth call further down does not read them.
# NOTE(review): 'es' is the signing name for a managed OpenSearch domain and
# 'aoss' the one for OpenSearch Serverless -- confirm against the target cluster.
service = 'es'
region = 'us-east-1'
# credentials = boto3.Session(aws_access_key_id='xxxxxx',aws_secret_access_key='xxxxx').get_credentials()
# awsauth = AWS4Auth('xxxxx', 'xxxxxx', region,service, session_token=credentials.token)

if USE_OPENSEARCH:
    vectorstore = OpenSearchVectorSearch.from_documents(
        all_splits,
        embeddings,
        opensearch_url="https://<opensearch-domain-name>-<region-name>.<service-name>.amazonaws.com",
        http_auth=("<username>", "<password>"),
        timeout=300,  # generous timeout: bulk-indexing many splits can be slow
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        index_name="<index-name>",
        engine="faiss",
    )
else:
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)




In [None]:
# Candidate questions -- keep exactly one of them uncommented.
# question = "Why need a Modern Data architecture? summarize in 100 words"
# question = "What are the pillars of a Modern Data architecture? summarize in 100 words"
question = "Explain about Modern Data Architecture with a 5 year-old kid. summarize in 100 words"

# Sanity-check retrieval: ask the vector store for up to 10 splits that are
# similar to the question and report how many came back.
docs = vectorstore.similarity_search(
    query=question,
    k=10,
)
print(f"Vector search: Number of document related to the question = {len(docs)}")

# Answer the question over the indexed content with LangChain's RetrievalQA
# chain: the retriever supplies context from the vector store, the chat model
# writes the answer.
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# as_retriever() is called without arguments, so the chain retrieves with the
# vector store's default search settings.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
)

# The chain returns a dict; the generated answer is stored under "result".
output = qa_chain({"query": question})
print(output["result"])