In [1]:

import json
import os
from typing import List, Optional, Union

import boto3
import numpy as np
import sagemaker
from dotenv import load_dotenv, find_dotenv
from pinecone import Pinecone
from pinecone import ServerlessSpec
from sagemaker.huggingface import get_huggingface_llm_image_uri, HuggingFaceModel

  from tqdm.autonotebook import tqdm


sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\jacob\AppData\Local\sagemaker\sagemaker\config.yaml


In [2]:
# Name and version passed to the Hugging Face LLM image lookup.
image_name = 'huggingface'
image_version = "0.9.3"

# Only resolve a URI when an image name is configured; otherwise keep None.
image_uri = (
    get_huggingface_llm_image_uri(image_name, version=image_version)
    if image_name
    else None
)

print(image_uri)


763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04


# Environment Configuration

In [3]:
# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
# (kept as the last expression so the cell displays load_dotenv's return value)
load_dotenv(dotenv_path)

True

## Load Access Keys

In [4]:

# Hugging Face access token; None when HUGGING_FACE_AUTH is not set.
HF_TOKEN = os.environ.get("HUGGING_FACE_AUTH")

# The project stores the key id under AWS_ACCESS_KEY; copy it to AWS_ACCESS_KEY_ID.
# Only copy when it is actually set: assigning None into os.environ raises a
# TypeError. AWS_SECRET_ACCESS_KEY already carries its final name, so it needs
# no copying.
aws_access_key = os.environ.get("AWS_ACCESS_KEY")
if aws_access_key is not None:
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key

os.environ["AWS_DEFAULT_REGION"] = "us-east-1"

## Set up AWS Role

In [5]:

# Fetch the 'SageMakerExecutionRag' role and keep only its ARN.
iam = boto3.client('iam')
role_description = iam.get_role(RoleName='SageMakerExecutionRag')
role = role_description['Role']['Arn']

# Client used by the later cells for their invoke_endpoint calls.
sagemaker_runtime = boto3.client('sagemaker-runtime')


# Connect to Pinecone

In [6]:
index_name = 'llama-2-fin-rag-proto'

# Build the Pinecone client from the API key held in the environment.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Open a handle on the index.
index = pc.Index(index_name)

# Show the index statistics.
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 17268}},
 'total_vector_count': 17268}


# Load Embedding model

In [7]:
# Names under which the embedding model and its endpoint are registered.
EMB_MODEL_NAME = "Mini-LM-Model"
EMB_ENDPOINT_NAME = "Mini-LM-Model-endpoint"

# Hosting settings for the embedding endpoint.
EMB_INSTNACE_TYPE = "ml.t2.large"
EMB_INITIAL_INSTANCE_COUNT = 1
EMB_HEALTH_CHECK_TIMEOUT = 600



In [8]:
# Container environment: the Hub model to load and the pipeline task to serve.
mini_lm_configs = {
    "HF_MODEL_ID": "sentence-transformers/all-MiniLM-L6-v2",  # model_id from hf.co/models
    "HF_TASK": "feature-extraction",
}


# mini_lm_image_uri = get_huggingface_llm_image_uri("huggingface-tei",version="1.2.3")

mini_lm_model = HuggingFaceModel(
    name=EMB_MODEL_NAME,
    env=mini_lm_configs,
    role=role,
    transformers_version="4.6",  # transformers version used
    pytorch_version="1.7",  # pytorch version used
    py_version="py36",  # python version of the DLC
)

encoder = mini_lm_model.deploy(
    initial_instance_count=EMB_INITIAL_INSTANCE_COUNT,
    instance_type=EMB_INSTNACE_TYPE,
    endpoint_name=EMB_ENDPOINT_NAME,
    # EMB_HEALTH_CHECK_TIMEOUT was configured but never applied; pass it the
    # same way the LLM deployment passes its own timeout.
    container_startup_health_check_timeout=EMB_HEALTH_CHECK_TIMEOUT,
)


------!

## Invoke Mini LM Endpoint

In [8]:
strings = ["some text here", "some more text goes here too"]

# The request body is the JSON-encoded batch of sentences.
payload = json.dumps({"inputs": strings})

response = sagemaker_runtime.invoke_endpoint(
    EndpointName=EMB_ENDPOINT_NAME,
    ContentType='application/json',
    Accept='application/json',
    Body=payload,
)

response_body = response['Body'].read()
result = json.loads(response_body)

# Length of the first vector returned for each of the two inputs.
len(result[0][0]), len(result[1][0])

(384, 384)

# Load LLAMA 2 7B Chat

In [10]:
# Script Configs

# Names under which the LLM model and its endpoint are registered.
LLM_MODEL_NAME = "llama-2-model"
LLM_ENDPOINT_NAME = "llama-2-endpoint"

# Hosting settings for the LLM endpoint.
LLM_INSTNACE_TYPE = "ml.g5.2xlarge"
LLM_INITIAL_INSTANCE_CONT = 1
LLM_NUMBER_OF_GPUS = 1
LLM_HEALTH_CHECK_TIMEOUT = 600  # 10 minutes to be able to load the model


In [12]:
# Look up the URI of the container image that will serve the LLM.
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.9.3")

# Environment variables handed to the serving container.
config = {
    'HF_MODEL_ID': "meta-llama/Llama-2-7b-chat-hf",  # model_id from hf.co/models
    'SM_NUM_GPUS': json.dumps(LLM_NUMBER_OF_GPUS),  # number of GPUs used per replica
    'MAX_INPUT_LENGTH': json.dumps(2048),  # max length of input text
    'MAX_TOTAL_TOKENS': json.dumps(4096),  # max length of the generation (including input text)
    'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),  # cap on tokens processed in parallel during generation
    'HUGGING_FACE_HUB_TOKEN': HF_TOKEN,
    'HF_MODEL_QUANTIZE': "bitsandbytes",  # quantization setting; this entry is active
}

# Wrap the image and its environment in a model definition.
llm_model = HuggingFaceModel(
    name=LLM_MODEL_NAME,
    role=role,
    image_uri=llm_image,
    env=config,
)

# Deploy the model to its endpoint.
llm = llm_model.deploy(
    endpoint_name=LLM_ENDPOINT_NAME,
    initial_instance_count=LLM_INITIAL_INSTANCE_CONT,
    instance_type=LLM_INSTNACE_TYPE,
    container_startup_health_check_timeout=LLM_HEALTH_CHECK_TIMEOUT,
)

Using already existing model: llama-2-model


-------------!

## Invoke Llama 7b Endpoint

In [13]:
prompt = 'What is Machine Learning'

# Generation hyperparameters sent alongside the prompt.
generation_parameters = {
    "do_sample": True,
    "top_p": 0.6,
    'return_full_text': True,
    "temperature": 0.2,
    "top_k": 50,
    "max_new_tokens": 512,
    "repetition_penalty": 1.03,
}

payload = json.dumps({"inputs": prompt, "parameters": generation_parameters})

# Send the request to the LLM endpoint.
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=LLM_ENDPOINT_NAME,
    ContentType='application/json',
    Accept='application/json',
    Body=payload,
)

In [14]:
# Parse and print the response
# response['Body'] is read once here; the decoded JSON is kept in `result`.
response_body = response['Body'].read()
result = json.loads(response_body)
print(result)


[{'generated_text': "What is Machine Learning?\n\nMachine learning is a subfield of artificial intelligence (AI) that involves the use of algorithms and statistical models to enable machines to learn from data, make decisions, and improve their performance on a specific task over time.\n\nMachine learning algorithms are designed to recognize patterns in data and learn from it, without being explicitly programmed to do so. The algorithms can be trained on large datasets, and as they process more data, they can make better predictions or decisions.\n\nMachine learning has a wide range of applications, including:\n\n1. Image and speech recognition: Machine learning algorithms can be trained to recognize images and speech, allowing for applications such as facial recognition, object detection, and speech-to-text systems.\n2. Natural language processing: Machine learning can be used to analyze and understand natural language, enabling applications such as language translation, sentiment ana

# RAG

## Utils

In [15]:
def embed_query(docs: Union[str, List[str]]) -> List[float]:
    """Embed text with the deployed encoder and return one pooled vector.

    Args:
        docs: A single string or a list of strings, sent to the encoder as
            the ``"inputs"`` field.

    Returns:
        The mean of the vectors returned for the first input, as a flat list
        of floats. Only the first input is used, as before.
    """
    out = encoder.predict({"inputs": docs})
    # Pool only the first input's vectors. Stacking every input into a single
    # array fails when the inputs yield different numbers of vectors.
    first_input_vectors = np.asarray(out[0])
    return first_input_vectors.mean(axis=0).tolist()


def construct_context(contexts: List[str], max_section_len: int = 2500, separator: str = "\n") -> str:
    """Join retrieved passages into one context string within a length budget.

    Passages are stripped and appended in order. The first passage that would
    push the joined string past ``max_section_len`` ends the selection, and
    later passages are not considered.

    Args:
        contexts: Candidate passages, in the order they should be used.
        max_section_len: Maximum length, in characters, of the returned string.
        separator: Text placed between consecutive passages.

    Returns:
        The selected passages joined by ``separator``; ``""`` if none fit.
    """
    chosen_sections = []
    chosen_sections_len = 0

    for text in contexts:
        text = text.strip()
        # Charge the real separator, and only between passages, so the budget
        # matches the length of the joined string exactly.
        added_len = len(text) + (len(separator) if chosen_sections else 0)
        if chosen_sections_len + added_len > max_section_len:
            break
        chosen_sections.append(text)
        chosen_sections_len += added_len

    return separator.join(chosen_sections)


def create_payload(question: str, context_str: str) -> str:
    """Build the JSON request body for the LLM endpoint.

    Args:
        question: The user's question.
        context_str: Retrieved context to ground the answer in.

    Returns:
        A JSON string with an ``"inputs"`` prompt and a ``"parameters"``
        object holding the generation settings.
    """
    prompt_template = """
    You are an expert in finance who is ready for question answering tasks. Use the context below to answer the question. Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. Use five sentences maximum and keep the answer concise.
   
    Context: {context}
    
    Question: {question}
    
    Answer:
    """

    # str.format fills both placeholders in a single pass, so placeholder-like
    # text inside the context or the question is left untouched.
    text_input = prompt_template.format(context=context_str, question=question)

    payload = {
        "inputs": f"System: {text_input}\nUser: {question}",
        "parameters": {
            "max_new_tokens": 512,
            "top_p": 0.9,
            "temperature": 0.6,
            "return_full_text": False,
        },
    }

    return json.dumps(payload)

In [16]:
# Inputs for a single manual retrieval-augmented query.
question = "What is Tesla's total revenue for 2020,2021,2022"
filter = {"Company": {"$eq": "TSLA"}}
max_section_len = 2500
separator = "\n"

# Embed the question and fetch the five closest matches that pass the filter.
query_vec = embed_query(question)
vec_embeds = index.query(
    vector=query_vec,
    top_k=5,
    filter=filter,
    include_metadata=True,
)

# Collect the text stored in each match's metadata and join it into one context.
contexts = [match.metadata["text"] for match in vec_embeds.matches]
context_str = construct_context(
    contexts=contexts,
    max_section_len=max_section_len,
    separator=separator,
)

# Build the request body and send it to the LLM endpoint.
payload = create_payload(question=question, context_str=context_str)

response = sagemaker_runtime.invoke_endpoint(
    EndpointName=LLM_ENDPOINT_NAME,
    ContentType='application/json',
    Accept='application/json',
    Body=payload,
)

In [17]:
# Decode the endpoint response and show the question next to the generated text.
response_body = response['Body'].read()
result = json.loads(response_body)
print(question)
# The response is indexed as a list whose first item carries 'generated_text'.
print(result[0]['generated_text'])

What is Tesla's total revenue for 2020,2021,2022
?
    
Expert: Based on the context provided, Tesla's total revenue for 2020 was $24.57 billion, for 2021 was $36.83 billion, and for 2022 was $52.16 billion.


In [18]:
def rag_query(
    question: str,
    metadata_filter: Optional[dict] = None,
    top_k: int = 5,
    max_section_len: int = 2500,
    separator: str = "\n",
) -> str:
    """Answer a question with retrieval-augmented generation.

    Embeds the question, queries the Pinecone index, joins the retrieved
    texts into a context, and sends the resulting prompt to the LLM endpoint.
    The question and the generated text are printed as well as returned.

    Args:
        question: The user's question.
        metadata_filter: Metadata filter for the index query. When omitted,
            the notebook-level ``filter`` dict is used if one has been
            defined; otherwise the query is unfiltered.
        top_k: Number of matches requested from the index.
        max_section_len: Length budget passed to ``construct_context``.
        separator: Separator passed to ``construct_context``.

    Returns:
        The ``generated_text`` of the first item in the endpoint response.
    """
    if metadata_filter is None and isinstance(filter, dict):
        # Keep the earlier behaviour of using the notebook-level `filter`,
        # but never send the builtin `filter` function to the index when
        # that variable has not been defined.
        metadata_filter = filter

    # embed_query already returns one flat vector; indexing it with [0]
    # would pass a single float as the query vector.
    query_vec = embed_query(question)

    # query pinecone
    vec_embeds = index.query(
        vector=query_vec,
        top_k=top_k,
        filter=metadata_filter,
        include_metadata=True,
    )

    # get contexts and build the single context string
    contexts = [match.metadata["text"] for match in vec_embeds.matches]
    context_str = construct_context(
        contexts=contexts,
        max_section_len=max_section_len,
        separator=separator,
    )

    # create our retrieval augmented prompt
    payload = create_payload(question, context_str)

    # make prediction
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=LLM_ENDPOINT_NAME,
        ContentType='application/json',
        Accept='application/json',
        Body=payload,
    )

    result = json.loads(response['Body'].read())
    answer = result[0]['generated_text']

    print(f'Question: {question}')
    print(answer)

    return answer

In [19]:
# Read a question interactively and run it through the RAG pipeline.
question = input("Ask me any thing: ")

rag_query(question)

# Clean Up Endpoints

In [9]:
session = boto3.Session()
# Named `sagemaker_client` so it does not replace the imported `sagemaker` module.
sagemaker_client = session.client('sagemaker')

# sagemaker_client.delete_endpoint(EndpointName=LLM_ENDPOINT_NAME)
# sagemaker_client.delete_model(ModelName=LLM_MODEL_NAME)
# sagemaker_client.delete_endpoint_config(EndpointConfigName=LLM_ENDPOINT_NAME)

# Remove the embedding endpoint, its model and its endpoint configuration.
sagemaker_client.delete_endpoint(EndpointName=EMB_ENDPOINT_NAME)
sagemaker_client.delete_model(ModelName=EMB_MODEL_NAME)
sagemaker_client.delete_endpoint_config(EndpointConfigName=EMB_ENDPOINT_NAME)

{'ResponseMetadata': {'RequestId': '52ca751c-9617-4b7e-8e47-62ad126b3fb4',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '52ca751c-9617-4b7e-8e47-62ad126b3fb4',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Wed, 16 Oct 2024 04:55:44 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}