In [19]:
!pip install -qU \
    boto3 \
    sagemaker \
    pinecone-client \
    langchain \
    flask \
    "pandas<2.0.0"

10369.16s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [20]:
PINECONE_API_KEY="pineconekey"
BUCKET_NAME="assetsbucket"
DOCUMENTATION_FOLDER_NAME="sagemaker_documentation/"

In [21]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from typing import List
import json
import numpy as np

class ModelEncoder:
    def __init__(self):
        self.endpoint_name = "minilm-embedding"
        self.model_id = "sentence-transformers/all-MiniLM-L6-v2"
        self.embedding_dimension = 384
        self.encoder = Predictor(endpoint_name=self.endpoint_name, serializer=JSONSerializer())
    
    def embed_docs(self, docs) -> List[List[float]]:
        out = self.encoder.predict({"inputs": docs})
        decoded_string = out.decode('utf-8')
        array = json.loads(decoded_string)
        embeddings = np.mean(np.array(array), axis=1)
        return embeddings.tolist()

# Create LLM Generator endpoint

In [22]:
import sagemaker
import boto3
from sagemaker.jumpstart.model import JumpStartModel

#role_arn = "arn:aws:iam::${AWS::AccountId}:role/${SageMakerExecutionRole}"
role = sagemaker.get_execution_role()
generator_endpoint_name = "llama-2-generator"
model_id, model_version = "meta-textgeneration-llama-2-7b-f", "2.*"

sagemaker_client = boto3.client('sagemaker')

endpoint_configs = sagemaker_client.list_endpoint_configs()['EndpointConfigs']
config_names = [cfg['EndpointConfigName'] for cfg in endpoint_configs]
endpoints = sagemaker_client.list_endpoints()['Endpoints']
endpoint_names = [ep['EndpointName'] for ep in endpoints]

if generator_endpoint_name in config_names and generator_endpoint_name not in endpoint_names:
    # Delete config and endpoint and recreate all
    sagemaker_client.delete_endpoint_config(EndpointConfigName=generator_endpoint_name)
    model = JumpStartModel(model_id=model_id, model_version=model_version)
    model.deploy(initial_instance_count=1, instance_type="ml.g5.2xlarge", endpoint_name=generator_endpoint_name)

elif generator_endpoint_name not in endpoint_names:
    model = JumpStartModel(model_id=model_id, model_version=model_version)
    model.deploy(initial_instance_count=1, instance_type="ml.g5.2xlarge", endpoint_name=generator_endpoint_name)

# Create Embedding Model Endpoint

In [23]:
import sagemaker
import boto3
from sagemaker.jumpstart.model import JumpStartModel
from sagemaker.huggingface import HuggingFaceModel

#role_arn = "arn:aws:iam::${AWS::AccountId}:role/${SageMakerExecutionRole}"
role = sagemaker.get_execution_role()

embedding_endpoint_name = "minilm-embedding"

hub_config = {
    "HF_MODEL_ID": "sentence-transformers/all-MiniLM-L6-v2",
    "HF_TASK": "feature-extraction"
}

huggingface_model = HuggingFaceModel(
    env=hub_config,
    #role=role_arn,
    role=role,
    transformers_version= "4.6",
    pytorch_version = "1.7",
    py_version = "py36"
)

sagemaker_client = boto3.client('sagemaker')

endpoint_configs = sagemaker_client.list_endpoint_configs()['EndpointConfigs']
config_names = [cfg['EndpointConfigName'] for cfg in endpoint_configs]
endpoints = sagemaker_client.list_endpoints()['Endpoints']
endpoint_names = [ep['EndpointName'] for ep in endpoints]

if embedding_endpoint_name in config_names and embedding_endpoint_name not in endpoint_names:
    # Delete config and endpoint and recreate all
    sagemaker_client.delete_endpoint_config(EndpointConfigName=embedding_endpoint_name)
    encoder = huggingface_model.deploy(
            initial_instance_count=1,
            instance_type="ml.m5.large",
            endpoint_name=embedding_endpoint_name
        )

elif embedding_endpoint_name not in endpoint_names:
    encoder = huggingface_model.deploy(
            initial_instance_count=1,
            instance_type="ml.m5.large",
            endpoint_name=embedding_endpoint_name
        )

# ModelEncoder Class

In [24]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from typing import List
import json
import numpy as np

class ModelEncoder:
    def __init__(self):
        self.endpoint_name = "minilm-embedding"
        self.model_id = "sentence-transformers/all-MiniLM-L6-v2"
        self.embedding_dimension = 384
        self.encoder = Predictor(endpoint_name=self.endpoint_name, serializer=JSONSerializer())
    
    def embed_docs(self, docs) -> List[List[float]]:
        out = self.encoder.predict({"inputs": docs})
        decoded_string = out.decode('utf-8')
        array = json.loads(decoded_string)
        embeddings = np.mean(np.array(array), axis=1)
        return embeddings.tolist()

# Vector Class

In [25]:
import pinecone
import time
from typing import List
from tqdm.auto import tqdm
import os
#from model_enc import ModelEncoder

class Vector:
    def __init__(self, api_key, environment, index_name):
        #os.getenv('PINECONE_API_KEY') 
        self.api_key = api_key
        self.environment = environment # "gcp-starter"
        self.index_name = index_name #"rag-aws-poc-index"
        self.k = 4
        self.embedding_dimension = 384 #must match ModelEncoder embeding dimension
        self.batch_size = 1
        self.model_encoder = ModelEncoder()
        print(api_key)
        pinecone.init(api_key=api_key, environment=environment)
        self.index = pinecone.Index(index_name)

    
    def query(self, query_vec):
        top_k_results = self.index.query(query_vec, top_k=self.k, include_metadata=True)
        return top_k_results
        
    def create_vector_store(self):
        if self.index_name in pinecone.list_indexes():
            pinecone.delete_index(self.index_name)
        pinecone.create_index(name=self.index_name, dimension=self.embedding_dimension, metric='cosine')
        while not pinecone.describe_index(self.index_name).status['ready']:
            time.sleep(1)
        return "Index updated"
    
    def embeed_chunks(self, chunks):
        for i in tqdm(range(0, len(chunks), self.batch_size)):
            i_end = min(i+ self.batch_size, len(chunks))
            ids = [str(x) for x in range(i, i_end)]
            metas = [{"text": text, "category": metadata} for text, metadata in zip(chunks['page_content'][i:i_end], chunks['metadata'][i:i_end])]
            texts = [text for text in chunks['page_content'][i:i_end].tolist()]
            embeddings = self.model_encoder.embed_docs(texts)
            records = zip(ids, embeddings, metas)
            self.index.upsert(vectors=records)

        print("Chunks embedded to vector store Succesfully")    
        print(self.index.describe_index_stats())

    def chunks_to_vector_store(self, chunks):
        self.create_vector_store()
        self.embeed_chunks(chunks)
        print("Index updated")

# ModelGen Class

In [26]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from typing import List
from flask import jsonify
#from model_enc import ModelEncoder
#from vector import Vector
import os
import json

class ModelGenerator:
    def __init__(self):
        self.endpoint_name = "llama-2-generator"
        self.predictor = Predictor(endpoint_name=self.endpoint_name, serializer=JSONSerializer())
        self.model_encoder = ModelEncoder()
        self.vector_store = Vector(
            api_key=PINECONE_API_KEY,
            environment="gcp-starter",
            index_name="rag-aws-poc-index"
        )
        self.max_section_len = 2000
        self.separator = "\n"

    def construct_unified_context(self, contexts:List[str]) -> str:
        chosen_sections = []
        chosen_sections_len = 0
        for text in contexts:
            text = text.strip()
            chosen_sections_len+=len(text)+2
            if chosen_sections_len >self.max_section_len:
                break
            chosen_sections.append(text)
        concatenated_doc = self.separator.join(chosen_sections)
        print(f"Chunks used {len(chosen_sections)}, chunks: \n {concatenated_doc}")
        return concatenated_doc

    def create_payload(self, question, context_str) -> dict:
        #llama2 compatible prompt
        prompt_template = """Answer the following QUESTION based on the CONTEXT given. If you do not know the answer and the CONTEXT doesn't
        contain the answer truthfully say "I don't know".

        CONTEXT:
        {context}

        ANSWER:
        """
        text_input = prompt_template.replace("{context}", context_str).replace("{question}", question)
        payload = {
            "inputs":
            [
                [
                {"role": "system", "content": text_input},
                {"role": "user", "content": question},
                ]
            ],
            "parameters": {"max_new_tokens":256, "top_p": 0.9, "temperature":0.6, "return_full_text":False}
        }
        return payload
    
    def predict_with_enriched_context(self, question):
        query_vec = self.model_encoder.embed_docs(question)[0]
        print("queryvec")
        print(query_vec)
        top_k_results = self.vector_store.query(query_vec)
        print("topkresults")
        print(top_k_results)
        contexts = [match.metadata["text"] for match in top_k_results.matches]
        contexts_str = self.construct_unified_context(contexts)
        payload = self.create_payload(question, contexts_str)
        out = self.predictor.predict(payload, custom_attributes="accept_eula=true")
        decoded_string = out.decode('utf-8')
        json_response = json.loads(decoded_string)
        return json_response[0]['generation']['content'], payload

# Chunk Class

In [27]:
import pandas as pd
import boto3
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from io import StringIO
import os

class Chunks:
    def __init__(self):
        self.s3_bucket_name = BUCKET_NAME
        self.s3_folder_path = DOCUMENTATION_FOLDER_NAME
        self.s3_file_key = 'chunks/pdf_docs_chunks.csv'
        self.s3 = boto3.client('s3')
        self.chunk_size = 250
        self.chunk_overlap = 30

    def chunk_markdown_file_by_headers(self, markdown_text):
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        md_header_splits = markdown_splitter.split_text(markdown_text)
        
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )
        
        splits = text_splitter.split_documents(md_header_splits)
        return splits

    def read_s3_md_files_to_chunks(self, bucket_name, folder_path):
        chunks_all_files = []
        objects = self.s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_path)

        if 'Contents' in objects:
            for obj in objects['Contents']:
                file_key = obj['Key']
                if file_key.endswith('.md'):
                    file_obj = self.s3.get_object(Bucket=bucket_name, Key=file_key)
                    md_content = file_obj['Body'].read().decode('utf-8')
                    chunks = self.chunk_markdown_file_by_headers(md_content)
                    chunks_all_files.extend(chunks)

        return chunks_all_files
    
    def consolidate_all_chunks_to_csv_and_upload_to_s3(self):
        # Read chunks from the specified folder in the S3 bucket
        all_chunks = self.read_s3_md_files_to_chunks(self.s3_bucket_name, self.s3_folder_path)
        data = [{'page_content': doc.page_content, 'metadata': doc.metadata} for doc in all_chunks]
        pdf_docs_chunks = pd.DataFrame(data)
        pdf_docs_chunks.to_csv('pdf_docs_chunks.csv', index=False)
        self.s3.upload_file('pdf_docs_chunks.csv', self.s3_bucket_name, self.s3_file_key)
    
    def download_all_csv_chunks_from_s3(self):
        file_obj = self.s3.get_object(Bucket=self.s3_bucket_name, Key=self.s3_file_key)
        file_content = file_obj['Body'].read().decode('utf-8')
        csv_file = StringIO(file_content)
        pdf_docs_chunks = pd.read_csv(csv_file)
        pdf_docs_chunks.head()
        self.s3.download_file(self.s3_bucket_name, self.s3_file_key, 'pdf_docs_chunks.csv')
        return pd.read_csv('pdf_docs_chunks.csv')

# Inference Test

In [28]:
from flask import Flask, request, jsonify, render_template
#from chunks import Chunks
#from model_gen import ModelGenerator
#from vector import Vector
import os
#app = Flask(__name__)

model_gen = ModelGenerator()
chunk_manager = Chunks()

vector_store = Vector(
    api_key=PINECONE_API_KEY,
    environment="gcp-starter",
    index_name="rag-aws-poc-index"
)
#data = request.get_json(force=True)
question = "What are SageMaker Geospatial capabilities?"
#if not question:
#    return jsonify({"error": "No question provided"})
out, payload = model_gen.predict_with_enriched_context(question)
print(out)
print("payload")
print(payload)

daa40622-3b52-4ed0-9601-97e2b659dcb1
daa40622-3b52-4ed0-9601-97e2b659dcb1
queryvec
[-0.24050272814929485, -0.34249835337201756, -0.01771362265571952, 0.018120714463293552, -0.1402135300450027, -0.48117613171537715, -0.28832402794311446, -0.008354688373704752, -0.44838136713951826, -0.043116857297718525, -0.10974471007163326, 0.0672216285020113, -0.2832355760037899, 0.12407688323097925, 0.23758704401552677, 0.27310478314757347, -0.13266861718147993, -0.2280914324025313, 0.05726583036206042, -0.10050984813521306, -0.18836369194711247, -0.431924885759751, -0.07180790537192176, 0.35127414266268414, 0.3615265501042207, -0.24020902424429855, -0.1477752561137701, 0.13350998734434447, 0.2778825505326192, 0.027802172194545467, 0.2937460858374834, 0.259618762259682, 0.1474143092830976, 0.16202358823890486, -0.40236254936705035, 0.6843451193223397, -0.3196535559448724, 0.2189745387683312, 0.07986460765823722, 0.1896475109582146, 0.020299059338867664, -0.4585521768312901, -0.01953426011217137, -0.