Building a simple RAG chatbot with LangChain, Hugging Face, FAISS, Amazon SageMaker and Amazon Textract

In [10]:
%%sh
pip install sagemaker langchain langchain-community amazon-textract-caller amazon-textract-textractor sentence-transformers pypdf faiss-cpu==1.8.0 -qU

Imports required for RAG Implementation

In [2]:
import boto3, json, sagemaker
import os

from sagemaker.jumpstart.model import JumpStartModel
from transformers import AutoConfig
from typing import Dict

from langchain import LLMChain
from langchain.llms import SagemakerEndpoint
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from langchain.prompts import PromptTemplate

from langchain_community.document_loaders import AmazonTextractPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [12]:
#SageMaker JumpStart provides APIs as part of SageMaker SDK that allow you to deploy and fine-tune models in network isolation using scripts that SageMaker maintains.

model = JumpStartModel(model_id="huggingface-llm-mistral-7b-instruct-v3", instance_type='ml.g5.2xlarge',model_version='1.1.1')
example_payloads = model.retrieve_all_examples()

predictor = model.deploy()

-----------!

 Configure the LangChain input and output handlers for our LLM

In [13]:
model_kwargs = {"max_new_tokens": 512, "top_p": 0.8, "temperature": 0.8}

class ContentHandler(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        input_str = json.dumps(
            # Mistral prompt, see https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
            {"inputs": f"<s>[INST] {prompt} [/INST]", "parameters": {**model_kwargs}}
        )
        return input_str.encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        response_json = json.loads(output.read().decode("utf-8"))
        splits = response_json[0]["generated_text"].split("[/INST] ")
        return splits[1]

content_handler = ContentHandler()

In [14]:
sm_client = boto3.client('sagemaker')
smrt_client = boto3.client("sagemaker-runtime")

llm = SagemakerEndpoint(
    endpoint_name=predictor.endpoint_name,
    model_kwargs=model_kwargs,
    content_handler=content_handler,
    client=smrt_client,
)

In [15]:
print(llm, llm.endpoint_name)

[1mSagemakerEndpoint[0m
Params: {'endpoint_name': 'hf-llm-mistral-7b-instruct-v3-2024-11-03-11-11-44-874', 'model_kwargs': {'max_new_tokens': 512, 'top_p': 0.8, 'temperature': 0.8}} hf-llm-mistral-7b-instruct-v3-2024-11-03-11-11-44-874


RAG example with PDF files

Upload local PDF files to S3
Sources:
https://www.iea.org/reports/world-energy-investment-2024
Feel free to use your own files, the code below should work without any change.

In [11]:
# Define S3 bucket and prefix for PDF storage

bucket = "aits-mr-tankwar-chatbot-730335476518-us-east-1"
prefix = "RAG"

In [12]:
%%sh -s $bucket $prefix

aws s3 ls s3://$1/$2/
aws s3 cp --recursive pdfs s3://$1/$2/

2024-11-03 02:41:29          0 
2024-11-03 11:21:50     178490 Coverage_Medical.pdf
upload: pdfs/.ipynb_checkpoints/Coverage_Medical-checkpoint.pdf to s3://aits-mr-tankwar-chatbot-730335476518-us-east-1/RAG/.ipynb_checkpoints/Coverage_Medical-checkpoint.pdf
upload: pdfs/Coverage_Medical.pdf to s3://aits-mr-tankwar-chatbot-730335476518-us-east-1/RAG/Coverage_Medical.pdf


In [13]:
# Build list of S3 URIs

s3 = boto3.client("s3")
objs = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)

# Extract 'Contents' if any objects are found
if 'Contents' in objs:
    objs = objs['Contents']
    uris = [f's3://{bucket}/{obj["Key"]}' for obj in objs if not obj["Key"].endswith('/')]
else:
    uris = []

uris

['s3://aits-mr-tankwar-chatbot-730335476518-us-east-1/RAG/.ipynb_checkpoints/Coverage_Medical-checkpoint.pdf',
 's3://aits-mr-tankwar-chatbot-730335476518-us-east-1/RAG/Coverage_Medical.pdf']

In [24]:
from langchain_community.document_loaders import PyPDFLoader
splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)
all_chunks = []

loader = PyPDFLoader("./pdfs/Coverage_Medical.pdf")
documents = loader.load()
chunks = splitter.split_documents(documents)
all_chunks += chunks
print(f"Loaded {len(documents)} pages, {len(chunks)} chunks")

Loaded 7 pages, 67 chunks


In [14]:
#not working
%%time
textract_client = boto3.client('textract', region_name="us-east-1")
splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=0)

all_chunks = []
for uri in uris:
    try:
        loader = AmazonTextractPDFLoader(uri, client=textract_client)
        print(f"{uri} started")
        documents = loader.load()
        print(len(documents))
    except Exception as e:
        print(f"An error occurred: {e}")

s3://aits-mr-tankwar-chatbot-730335476518-us-east-1/RAG/.ipynb_checkpoints/Coverage_Medical-checkpoint.pdf started
An error occurred: Read timeout on endpoint URL: "https://textract.us-east-1.amazonaws.com/"
s3://aits-mr-tankwar-chatbot-730335476518-us-east-1/RAG/Coverage_Medical.pdf started
An error occurred: Read timeout on endpoint URL: "https://textract.us-east-1.amazonaws.com/"
CPU times: user 149 ms, sys: 23.9 ms, total: 173 ms
Wall time: 10min 9s


Embed document chunks and store them in FAISS
https://github.com/facebookresearch/faiss

In [None]:
%%time
# Embed chunks
embeddings_db = FAISS.from_documents(all_chunks, embeddings)

In [None]:
# Save database
embeddings_db.save_local("faiss_index")

Shortcut : load existing embedding database

In [None]:
embeddings_db = FAISS.load_local("faiss_index", embeddings)