In [None]:
!pip install sagemaker python-dotenv --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.6/147.6 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.2/82.2 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
import sagemaker
import boto3

import os
from dotenv import load_dotenv

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [None]:
# Environment variables: the AWS credentials are read further down from
# os.environ["aws_access_key_id"] and os.environ["aws_secret_access_key"].
# Option 1: set them by hand for this kernel session (placeholders shown; never commit real values)
# os.environ["aws_access_key_id"]='aws_access_key_id'
# os.environ["aws_secret_access_key"]='aws_secret_access_key'

# Option 2: load them from a .env file; the call's boolean result is shown as the cell output
load_dotenv()

True

In [None]:
# Notebook-wide settings: target AWS region and the name of the IAM role to look up
REGION_NAME = "us-east-1"
ROLE_NAME = 'Sagemaker-ExecutionRole'

# Export the region as the process-wide default as well
os.environ["AWS_DEFAULT_REGION"] = REGION_NAME

# Keyword arguments reused for every boto3 client/session created below.
# Indexing os.environ raises KeyError when a credential variable is missing.
auth_arguments = dict(
    aws_access_key_id=os.environ["aws_access_key_id"],
    aws_secret_access_key=os.environ["aws_secret_access_key"],
    region_name=REGION_NAME,
)


In [None]:
# Look up the ARN of the execution role by its name
iam = boto3.client('iam', **auth_arguments)
role_description = iam.get_role(RoleName=ROLE_NAME)
role = role_description['Role']['Arn']

# SageMaker session built on a boto3 session that carries the same credentials and region
boto_session = boto3.Session(**auth_arguments)
session = sagemaker.Session(boto_session)


In [None]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Image URI of the Hugging Face LLM (TGI) inference container.
# - version is pinned to the release this notebook was recorded with (the
#   original output shows "...tgi1.4.2..."), so a re-run deploys the same
#   container instead of whatever the installed SDK currently defaults to.
# - region is passed explicitly so the lookup uses the notebook's REGION_NAME
#   rather than an implicitly created default session.
llm_image = get_huggingface_llm_image_uri(
    "huggingface",
    region=REGION_NAME,
    version="1.4.2",
)

print(f"image uri: {llm_image}")

image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.4.2-gpu-py310-cu121-ubuntu22.04


In [None]:
from sagemaker.huggingface import HuggingFaceModel

# Environment passed to the container: the model id of Falcon 7b
hub = dict(HF_MODEL_ID='tiiuae/falcon-7b')

# Model object combining the container image, its environment,
# the IAM role from AWS and the authenticated SageMaker session
huggingface_model = HuggingFaceModel(
    image_uri=llm_image,
    env=hub,
    role=role,
    sagemaker_session=session,
)

In [None]:
# Deploy the model to SageMaker; the returned predictor is used for all requests below
predictor = huggingface_model.deploy(
    instance_type='ml.g5.16xlarge',  # alternative noted by the author: 'ml.g5.4xlarge'
    initial_instance_count=1,  # number of instances
    container_startup_health_check_timeout=300,
)

-----------!

In [None]:
# Placeholder for conversation history; in a real application, this would be more dynamically managed
conversation_history = [{"role": "system", "content": "You are a helpful assistant."}]

# Stop sequences sent with every request.
# - The prompt labels turns with lowercase role names, so "\nuser:" is the
#   marker that signals the model has started inventing the next user turn
#   ("\nUser:" is kept from the original list).
# - The original list also held two empty strings (probably special tokens lost
#   in copy/paste). NOTE(review): an empty stop string can match immediately and
#   end generation after the first token, which is consistent with the one-word
#   replies recorded in this notebook -- confirm against the TGI version in use.
# - "<|endoftext|>" is presumably Falcon's end-of-text token; verify against the
#   model's tokenizer config.
STOP_SEQUENCES = ["\nuser:", "\nUser:", "<|endoftext|>"]

# Markers at which a returned completion is cut when extracting the reply:
# every stop sequence plus the other role prefixes used by the prompt format.
_REPLY_TERMINATORS = tuple(STOP_SEQUENCES) + ("\nassistant:", "\nsystem:")


def _build_prompt(history):
    """Render the turns as "role: content" lines and end with the assistant cue."""
    rendered_turns = [f"{entry['role']}: {entry['content']}\n" for entry in history]
    return "".join(rendered_turns) + "assistant:"


def _extract_reply(generated_text, prompt):
    """Return only the assistant's first reply from the text the endpoint sent back."""
    reply = generated_text
    # Some configurations echo the prompt in front of the completion; drop it.
    if reply.startswith(prompt):
        reply = reply[len(prompt):]
    # Cut at the earliest terminator so an echoed stop sequence or a
    # hallucinated follow-up turn never ends up in the stored history.
    cut_positions = [reply.find(marker) for marker in _REPLY_TERMINATORS]
    cut_positions = [position for position in cut_positions if position != -1]
    if cut_positions:
        reply = reply[:min(cut_positions)]
    return reply.strip()


def generate_response(user_input, history=None, client=None):
    """
    Generate a response from the model based on the user's input and the conversation history.

    Args:
        user_input: Text of the new user turn.
        history: Optional list of {"role": ..., "content": ...} dicts to use and
            update instead of the module-level ``conversation_history``.
        client: Optional object with a ``predict(request)`` method to call
            instead of the module-level ``predictor``.

    Returns:
        The assistant's reply as a stripped string.

    Raises:
        Whatever ``predict`` raises. In that case the history is left exactly
        as it was, so the call can simply be retried.
    """
    turns = conversation_history if history is None else history
    endpoint = predictor if client is None else client

    # Build the prompt from a copy: the new turn is only committed to the
    # history once the endpoint has answered.
    user_turn = {"role": "user", "content": user_input}
    prompt_with_history = _build_prompt([*turns, user_turn])

    # Hyperparameters for the LLM request, with the updated prompt
    request = {
        "inputs": prompt_with_history,
        "parameters": {
            "do_sample": True,
            "top_p": 0.9,
            "temperature": 0.7,
            "max_new_tokens": 512,
            "stop": list(STOP_SEQUENCES)
        }
    }

    # Request to the endpoint
    response = endpoint.predict(request)

    # Extracting model response
    model_response = _extract_reply(response[0]["generated_text"], prompt_with_history)

    # Record both sides of the exchange now that the request succeeded
    turns.extend([user_turn, {"role": "assistant", "content": model_response}])

    # Return the model's response
    return model_response

# Example usage: first turn of the conversation. Besides returning the reply,
# the call records the question and the reply in conversation_history.
user_input = "What is the capital of Spain?"
assistant_response = generate_response(user_input)

In [None]:
# Show the reply to the first question
print(assistant_response)

Madrid


In [None]:
# Second turn: the prompt sent to the endpoint also contains the first question and its answer
user_input = "What is the most famous street in Madrid?"
assistant_response = generate_response(user_input)
print(assistant_response)

Gran


In [None]:
# Third turn: again sent together with every earlier turn stored in conversation_history
user_input = "What is the most famous house in Gran Street in Madrid?"
assistant_response = generate_response(user_input)
print(assistant_response)

Casa


In [None]:
# Final turn: the question refers back to the first one, so a correct answer
# depends on the earlier turns that generate_response includes in the prompt
user_input = "What country did I ask about above?"
assistant_response = generate_response(user_input)
print(assistant_response)

Spain


In [None]:
# DELETE ENDPOINT to avoid unnecessary expenses
# NOTE(review): the model is deleted before the endpoint; presumably delete_model()
# still needs the endpoint's configuration to find the model -- keep this order
# unless the SDK documentation confirms otherwise.
predictor.delete_model()
predictor.delete_endpoint()