In [None]:
import sagemaker
from sagemaker.utils import name_from_base
import boto3
import json
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
import time

# Notebook-wide SageMaker context. `region` is passed to the image-URI lookup and
# `role` to the HuggingFaceModel constructor in later cells.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()


In [None]:
# Look up the container image URI for the "huggingface" LLM backend in `region`.
# NOTE(review): no image version is passed here, so presumably the SDK default is
# used and may change between SDK releases — confirm and pin if reproducibility matters.
image_uri = get_huggingface_llm_image_uri(backend="huggingface", region=region)

print(image_uri)

In [None]:
# NOTE(review): the model configuration cell sets SM_NUM_GPUS to 4 — confirm this
# instance type provides that many GPUs before changing either value.
instance_type = "ml.g4dn.12xlarge" # instance type to use for deployment

In [None]:
import json
from sagemaker.huggingface import HuggingFaceModel

# Define Model and Endpoint configuration parameters
hf_model_id = "tiiuae/falcon-40b" # model id from huggingface.co/models
number_of_gpu = 4 # number of gpus to use for inference and tensor parallelism
health_check_timeout = 600 # container startup health-check timeout: 600 seconds = 10 minutes, to leave time for downloading the model
falcon_model_name = name_from_base(hf_model_id.split("/")[-1].replace(".", "-"))

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
    role=role,
    name=falcon_model_name,
    image_uri=image_uri,
    env={
        'HF_MODEL_ID': hf_model_id,
        'SM_NUM_GPUS': json.dumps(number_of_gpu),
        'MAX_INPUT_LENGTH': json.dumps(1024),  # Max length of input text
        # Optional: uncomment to pin a specific model revision.
        #'HF_MODEL_REVISION': 'eb410fb6ffa9028e97adb801f0d6ec46d02f8b07',
        # Quantization is enabled; "bitsandbytes" was the previously tried alternative value.
        'HF_MODEL_QUANTIZE': 'bitsandbytes-nf4',
        'HUGGINGFACE_HUB_CACHE': "/tmp/huggingface",
        'SAGEMAKER_CONTAINER_LOG_LEVEL': "20"
    }
)
# By default the endpoint shares the generated model name; the bare expression
# below displays it as the cell output.
falcon_endpoint_name = falcon_model_name
falcon_endpoint_name

In [None]:
# NOTE(review): this replaces the endpoint name generated in the model configuration
# cell with a fixed, timestamped name. The deploy cell and every later invocation use
# this value. Presumably it exists to reuse an endpoint that is already deployed — in
# that case skip the deploy cell; for a fresh deployment skip this cell instead.
# The name also mentions "instruct-bf16" while hf_model_id is "tiiuae/falcon-40b"
# with nf4 quantization — TODO confirm which endpoint is intended.
falcon_endpoint_name = 'hf-llm-falcon-40b-instruct-bf16-2024-03-11-04-39-29-535'

In [None]:
# Deploy `llm_model` behind the endpoint named `falcon_endpoint_name` and keep the
# object returned by `deploy` in `llm`.
llm = llm_model.deploy(
    endpoint_name=falcon_endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=health_check_timeout,
)

In [None]:
from datetime import datetime

# Low-level SageMaker client; `wait_for_endpoint` uses it to poll the endpoint status.
sm_client = boto3.client("sagemaker")

In [None]:
# wait for the endpoint to be deployed successfully
def wait_for_endpoint(endpoint_name=None, client=None, poll_seconds=15):
    """Block until the endpoint leaves the "Creating" state, then verify it is usable.

    The status is printed after every poll. Unlike a plain "wait until not Creating"
    loop, a terminal status other than "InService" (for example "Failed") is reported
    as an error instead of being announced as a successful deployment.

    Args:
        endpoint_name: Name of the SageMaker endpoint to poll.
        client: Object exposing ``describe_endpoint(EndpointName=...)``. Defaults to
            the notebook-level ``sm_client``.
        poll_seconds: Seconds to sleep between two status checks.

    Returns:
        None.

    Raises:
        RuntimeError: If the endpoint stops creating in any status other than
            "InService". The message includes the reported ``FailureReason`` if any.
    """
    if client is None:
        client = sm_client

    description = client.describe_endpoint(EndpointName=endpoint_name)
    status = description["EndpointStatus"]

    while status == "Creating":
        # Sleep before re-checking so no extra wait happens after the final status.
        time.sleep(poll_seconds)
        description = client.describe_endpoint(EndpointName=endpoint_name)
        status = description["EndpointStatus"]
        print(status)

    if status != "InService":
        reason = description.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"endpoint {endpoint_name} is {status}, not InService: {reason}"
        )

    print(f"endpoint {endpoint_name} is in service now.")
    return

In [None]:
# Wait for every endpoint in the list (currently only the Falcon endpoint)
# before any cell tries to invoke it.
for ep_name in [falcon_endpoint_name]:
    wait_for_endpoint(ep_name)

In [None]:
from sagemaker.predictor import Predictor
import boto3
import json

def query_endpoint(payload, endpoint_name=None, region_name=None):
    """Send `payload` as JSON to a SageMaker endpoint and print the decoded response.

    Args:
        payload: JSON-serializable request body.
        endpoint_name: Endpoint to invoke. Defaults to the notebook-level
            ``falcon_endpoint_name``.
        region_name: AWS region of the endpoint. Defaults to the notebook-level
            ``region`` (taken from the SageMaker session) rather than a hard-coded
            region, so the call targets the region the notebook deploys into.

    Returns:
        None. The response body is printed.
    """
    if endpoint_name is None:
        endpoint_name = falcon_endpoint_name
    if region_name is None:
        region_name = region

    # Create a low-level client representing Amazon SageMaker Runtime
    sagemaker_runtime = boto3.client("sagemaker-runtime", region_name=region_name)

    # Gets inference from the model hosted at the specified endpoint:
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType="application/json",
    )

    # Decodes and prints the response body:
    print(response['Body'].read().decode('utf-8'))

In [None]:
# Summarization request: the article text and the trailing "Summarize the article
# above:" instruction are sent as a single prompt, with max_new_tokens set to 200.
# NOTE(review): the prompt text contains typos (e.g. "trainsthe"); it is runtime
# input to the model and is intentionally left unchanged here.
payload = {
    "inputs": """Starting today, the state-of-the-art Falcon 40B foundation model from Technology
    Innovation Institute (TII) is available on Amazon SageMaker JumpStart, SageMaker's machine learning (ML) hub
    that offers pre-trained models, built-in algorithms, and pre-built solution templates to help you quickly get
    started with ML. You can deploy and use this Falcon LLM with a few clicks in SageMaker Studio or
    programmatically through the SageMaker Python SDK.
    Falcon 40B is a 40-billion-parameter large language model (LLM) available under the Apache 2.0 license that
    ranked #1 in Hugging Face Open LLM leaderboard, which tracks, ranks, and evaluates LLMs across multiple
    benchmarks to identify top performing models. Since its release in May 2023, Falcon 40B has demonstrated
    exceptional performance without specialized fine-tuning. To make it easier for customers to access this
    state-of-the-art model, AWS has made Falcon 40B available to customers via Amazon SageMaker JumpStart.
    Now customers can quickly and easily deploy their own Falcon 40B model and customize it to fit their specific
    needs for applications such as translation, question answering, and summarizing information.
    Falcon 40B are generally available today through Amazon SageMaker JumpStart in US East (Ohio),
    US East (N. Virginia), US West (Oregon), Asia Pacific (Tokyo), Asia Pacific (Seoul), Asia Pacific (Mumbai),
    Europe (London), Europe (Frankfurt), Europe (Ireland), and Canada (Central),
    with availability in additional AWS Regions coming soon. To learn how to use this new feature,
    please see SageMaker JumpStart documentation, the Introduction to SageMaker JumpStart –
    Text Generation with Falcon LLMs example notebook, and the blog Technology Innovation Institute trainsthe
    state-of-the-art Falcon LLM 40B foundation model on Amazon SageMaker. Summarize the article above:""",
    "parameters": {"max_new_tokens": 200},
}
query_endpoint(payload)

In [None]:
# Free-form generation request: the "parameters" block asks for sampling
# (do_sample / top_p / temperature), up to 1024 new tokens, and two stop sequences.
prompt = "Tell me about Amazon SageMaker."

payload = {
    "inputs": prompt,
    "parameters": {
        "do_sample": True,
        "top_p": 0.9,
        "temperature": 0.8,
        "max_new_tokens": 1024,
        "stop": ["<|endoftext|>", "</s>"],
    },
}

query_endpoint(payload)

In [None]:
!pip install langchain

In [None]:
# Load the document to summarize from doc.txt (path is relative to the working directory).
# NOTE(review): no encoding is specified, so the platform default is used —
# confirm doc.txt is stored in that encoding.
with open("doc.txt") as f:
    text_to_summarize = f.read()

In [None]:
import langchain
from langchain import SagemakerEndpoint, PromptTemplate
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Splitter for the map_reduce example: splits on single spaces, targets chunks of
# 500 with an overlap of 20, measuring length with the built-in `len`.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[" "],
    chunk_size=500,
    chunk_overlap=20,
    length_function=len,
)
input_documents = text_splitter.create_documents([text_to_summarize])

In [None]:
class ContentHandlerTextSummarization(LLMContentHandler):
    """Serialize prompts to, and extract generated text from, JSON endpoint payloads."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs=None) -> bytes:
        """Build the UTF-8 encoded JSON request body for `prompt`.

        Args:
            prompt: Prompt text, sent under the "inputs" key.
            model_kwargs: Optional extra entries merged into the top level of the
                request. ``None`` is treated as an empty dict.

        Returns:
            The JSON document encoded as UTF-8 bytes.
        """
        # NOTE(review): the direct-invocation cells of this notebook send generation
        # settings nested under a "parameters" key, whereas these kwargs are merged at
        # the top level — confirm the serving container reads them here.
        request = {"inputs": prompt, **(model_kwargs or {})}
        return json.dumps(request).encode("utf-8")

    def transform_output(self, output) -> str:
        """Return the text after the last "summary:" marker in the first generation.

        Args:
            output: Readable stream whose ``read()`` returns the UTF-8 encoded JSON
                response; the response is indexed as a list of objects that carry a
                "generated_text" field.

        Returns:
            The generated text with everything up to and including the last
            "summary:" occurrence removed, or the whole text if the marker is absent.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        generated_text = response_json[0]['generated_text']
        return generated_text.split("summary:")[-1]

content_handler = ContentHandlerTextSummarization()

In [None]:
# Prompt applied to each individual chunk in the map step.
map_prompt = """Write a concise summary of this text in a few complete sentences:

{text}

Concise summary:"""

map_prompt_template = PromptTemplate(input_variables=["text"], template=map_prompt)


# Prompt applied to the collected chunk summaries in the combine step.
combine_prompt = """Combine all these following summaries and generate a final summary of them in a few complete sentences:

{text}

Final summary:"""

combine_prompt_template = PromptTemplate(input_variables=["text"], template=combine_prompt)

In [None]:
# LangChain wrapper around the deployed endpoint. The region comes from the
# SageMaker session (`region`) instead of a hard-coded value, so the wrapper targets
# the region this notebook deploys into.
summary_model = SagemakerEndpoint(
    endpoint_name=falcon_endpoint_name,
    region_name=region,
    model_kwargs={},
    content_handler=content_handler,
)

In [None]:
# Map-reduce summarization over `input_documents`, using the per-chunk and combine
# prompt templates defined above; only the chain outputs are kept in `summary`.
summary_chain = load_summarize_chain(
    llm=summary_model,
    chain_type="map_reduce",
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    verbose=True,
)
summary = summary_chain(
    {"input_documents": input_documents, 'token_max': 700},
    return_only_outputs=True,
)
print(summary["output_text"])

In [None]:
# Re-split the document with a chunk size of 1000 and no overlap, then wrap the
# first three pieces as Document objects for the "stuff" example.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
texts = text_splitter.split_text(text_to_summarize)
print('texts[0]: ', texts[0])

docs = [Document(page_content=piece) for piece in texts[:3]]

In [None]:
# Prompt text for the "stuff" chain; `{text}` is the only placeholder.
# The indentation inside the literal is part of the prompt that gets sent.
prompt_template = """Write a concise summary of the following:

        {text}
        
        CONCISE SUMMARY """

In [None]:
# Summarize `docs` in a single "stuff" call using the prompt defined above.
PROMPT = PromptTemplate(input_variables=["text"], template=prompt_template)
chain = load_summarize_chain(llm=summary_model, chain_type="stuff", prompt=PROMPT)
summary = chain.run(docs)

In [None]:
# Display the result of the "stuff" chain run from the previous cell.
summary

### Helper functions (conceptually the same as the steps above)

In [None]:
# Prompt text used by long_call_analysis when the input is short enough to be
# summarized in a single call.
stuff_prompt_template = """
Please provide a summary of the following text.
TEXT: {text}
SUMMARY:
"""

# Per-chunk prompt text (the variable name is spelled "chuck"; it is only used to
# build `chunk_prompt` right below).
chuck_prompt_template = """
Please provide a summary of the following text.
Please answer in one sentence.
TEXT: {text}
SUMMARY:
"""

chunk_prompt = PromptTemplate(
    template=chuck_prompt_template,
    input_variables=["text"]
)

# NOTE(review): from here on `combine_prompt_template` is a plain string and
# `combine_prompt` is a PromptTemplate — the reverse of how the same two names are
# bound in the earlier map_reduce cell. Re-running that earlier cell after this one
# would therefore pass a string where a PromptTemplate was passed before.
combine_prompt_template = """
Write a concise summary of the following text.
Return your response in bullet points which covers the key points of the text.
TEXT: {text}
SUMMARY:
"""

combine_prompt = PromptTemplate(
    template=combine_prompt_template,
    input_variables=["text"]
)


In [None]:
# Build a "refine" chain from the helper prompts; this rebinds `chain`, which
# previously held the "stuff" chain.
chain = load_summarize_chain(
    llm=summary_model,
    chain_type="refine",
    question_prompt=chunk_prompt,
    refine_prompt=combine_prompt,
    verbose=True,
    return_intermediate_steps=True,
)

In [None]:
def summary_chain_init(chain_type, llm):
    """Build a summarize chain for the requested strategy.

    Args:
        chain_type: One of "STUFF", "MAP_REDUCE" or "REFINE" (matched
            case-insensitively).
        llm: Language model handed to ``load_summarize_chain``.

    Returns:
        The chain object created by ``load_summarize_chain``. The MAP_REDUCE and
        REFINE chains use the notebook-level ``chunk_prompt`` / ``combine_prompt``
        and are configured with ``return_intermediate_steps=True``; the STUFF chain
        is not.

    Raises:
        ValueError: If ``chain_type`` is not a supported strategy. (Previously this
            case fell through every branch and failed with an UnboundLocalError.)
    """
    strategy = str(chain_type).upper()

    if strategy == "STUFF":
        return load_summarize_chain(
            llm,
            chain_type="stuff",
            verbose=True
        )

    if strategy == "MAP_REDUCE":
        return load_summarize_chain(
            llm,
            chain_type="map_reduce",
            map_prompt=chunk_prompt,
            combine_prompt=combine_prompt,
            return_intermediate_steps=True,
            verbose=True
        )

    if strategy == "REFINE":
        # NOTE(review): `combine_prompt` only declares the {text} variable; presumably
        # a refine prompt should also reference the running summary — confirm.
        return load_summarize_chain(
            llm,
            chain_type="refine",
            question_prompt=chunk_prompt,
            refine_prompt=combine_prompt,
            return_intermediate_steps=True,
            verbose=True
        )

    raise ValueError(
        f"Unsupported chain_type {chain_type!r}; expected 'STUFF', 'MAP_REDUCE' or 'REFINE'."
    )


In [None]:
def long_call_analysis(llm, transcript, params, template="", chain_type="MAP_REDUCE", max_tokens=50):
    """Summarize `transcript` with `llm`, chunking it when it exceeds `max_tokens`.

    Args:
        llm: Model object. It must expose ``get_num_tokens(text)``, be callable with
            a prompt string, and accept assignment to ``model_kwargs``.
        transcript: Text to summarize.
        params: Generation settings. They are assigned to ``llm.model_kwargs``, so
            the passed-in model object keeps them after this call returns.
        template: Accepted for interface compatibility but currently not used; the
            short-input path always uses the notebook-level ``stuff_prompt_template``.
        chain_type: Strategy name forwarded to ``summary_chain_init`` for long inputs.
        max_tokens: Inputs with more tokens than this are split and summarized with
            a chain; shorter inputs are summarized with one direct model call.

    Returns:
        The chain's "output_text" for long inputs, or the direct model response for
        short inputs.
    """

    def _green(text):
        # ANSI green. The notebook never imports a `colored` helper, so calling one
        # here raised NameError; this keeps the green output without a new dependency.
        return f"\033[32m{text}\033[0m"

    llm.model_kwargs = params
    num_tokens = llm.get_num_tokens(transcript)  # may emit a tokenizer warning

    if num_tokens <= max_tokens:
        # Short input: format the single-call prompt and query the model directly.
        prompt = PromptTemplate(template=stuff_prompt_template, input_variables=["text"])
        analysis_prompt = prompt.format(text=transcript)
        print(_green(analysis_prompt))

        return llm(analysis_prompt)

    # Long input: split on triple newlines and summarize chunk by chunk.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n\n"],
        chunk_size=500,
        chunk_overlap=100
    )
    docs = text_splitter.create_documents([transcript])
    num_docs = len(docs)
    num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)

    print(f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

    summary_chain = summary_chain_init(
        chain_type=chain_type,
        llm=llm
    )
    response = summary_chain(
        {"input_documents": docs}
    )

    # The STUFF chain is built without return_intermediate_steps, so this key may be
    # absent; indexing it unconditionally raised KeyError.
    intermediate_steps = response.get("intermediate_steps", [])
    if intermediate_steps:
        print("Intermediate_steps: \n")
        for idx, step in enumerate(intermediate_steps):
            print(_green(f'step {idx}: \n'))
            print(_green(f'{step}\n'))

    return response["output_text"]
        

In [None]:
# Generation settings per model; long_call_analysis receives one entry and assigns
# it to the model's `model_kwargs`.
PARAMS = {
    "FALCON-40B": dict(
        max_new_tokens=2048,
        max_length=2048,
        top_p=0.95,
        do_sample=False,
        temperature=0.2,
        return_full_text=False,
        include_prompt_in_result=False,
    ),
    "LLAMA2-7B": dict(
        max_new_tokens=128,
        top_p=0.9,
        temperature=0.1,
        return_full_text=False,
    ),
}

In [None]:
# Key into PARAMS selecting which generation settings the analysis cell uses.
MODEL_NAME = "FALCON-40B"

In [None]:
# Prompt text for call-transcript summaries; `{transcript}` is its only placeholder.
summary_template = (
    "\n"
    "Analyze the retail support call transcript below. Provide a detail summary of the conversation in complete sentence:\n"
    "\n"
    "context: {transcript}\n"
    "\n"
    "summary:"
)

In [None]:
!pip install transformers

In [None]:
# Run the helper on the loaded document with the settings selected by MODEL_NAME.
res = long_call_analysis(
    summary_model,
    text_to_summarize,
    PARAMS[MODEL_NAME],
    template=summary_template,
    chain_type="MAP_REDUCE",  # REFINE, MAP_REDUCE
)

print("Results: \n")
print(res)

## Using Amazon Bedrock (Claude 3 Sonnet)

In [None]:
import boto3
import json
import base64

# Client for the Bedrock runtime API
bedrock_runtime = boto3.client("bedrock-runtime")

# Request description: model id, content negotiation headers, and a messages-style
# body whose single user message carries the document text.
payload = dict(
    modelId="anthropic.claude-3-sonnet-20240229-v1:0",
    contentType="application/json",
    accept="application/json",
    body=dict(
        anthropic_version="bedrock-2023-05-31",
        max_tokens=2048,
        messages=[
            dict(
                role="user",
                content=[dict(type="text", text=text_to_summarize)],
            )
        ],
    ),
)

# Serialize only the body portion of the payload to UTF-8 JSON bytes
body_bytes = json.dumps(payload["body"]).encode("utf-8")

# Call the model, passing the remaining payload entries as request arguments
response = bedrock_runtime.invoke_model(
    modelId=payload["modelId"],
    accept=payload["accept"],
    contentType=payload["contentType"],
    body=body_bytes,
)

# Read, decode and print the raw response body
response_body = response["body"].read().decode("utf-8")
print(response_body)