In [None]:
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

# Start the provisioned throughput for the finetuned model and make inferences


In [None]:
! pip install -r ../requirements.txt

### import packages

In [None]:
%pylab inline
import os
import tarfile
import json
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
import boto3
from botocore.config import Config
import logging
from enum import Enum
import ast
import boto3
from botocore.config import Config

### Define our tools

We will need these to test out our endpoint!

To properly train our model on tool usage, we need tool definitions. We create them by defining functions with explicitly typed inputs and structured docstrings.

We are going to define 8 tools:
- weather_api_call
- stat_pull
- text_to_sql
- terminal
- wikipedia
- duckduckgo_results_json
- youtube_search
- pubmed_search

While we are defining 8 tools, we are only going to train our model on 7 of them. This is so that we can test out our performance on unseen tools after training.

In [None]:
import weather_api_call, stat_pull,terminal,text_to_sql,wikipidea,youtube_search, pubmed_search, duckduckgo_results_json


### Deploy the fine-tuned model using Amazon Bedrock

Retrieve the fine-tuned model ID from the fine tuning job’s output, and create a Provisioned Throughput model instance with the desired model units.

In [None]:
import time

PROVISIONED_MODEL_NAME = ""  # change accordingly
FT_MODEL_ARN = ""  # check the FT model ARN from Amazon Bedrock after FT finishes

# Shared client configuration: pin the region and enable standard-mode retries.
my_config = Config(
    region_name='us-east-1',
    retries={
        'max_attempts': 5,
        'mode': 'standard'
    }
)

# `bedrock` manages the provisioned throughput; `bedrock_runtime` runs inference.
# Both clients get the same config so they always talk to the same region.
bedrock = boto3.client(service_name="bedrock", config=my_config)
bedrock_runtime = boto3.client(service_name="bedrock-runtime", config=my_config)

# create provisioned model for the ft model (remember to stop it later)
provisioned_model_id = bedrock.create_provisioned_model_throughput(
    modelUnits=1,
    provisionedModelName=PROVISIONED_MODEL_NAME,
    modelId=FT_MODEL_ARN,
)
provisioned_model_arn = provisioned_model_id['provisionedModelArn']
print(provisioned_model_arn)

# Provisioning is asynchronous: poll until the throughput leaves its transitional
# states so that the inference cells below do not run against a model that is
# still being created.
while True:
    provisioning_status = bedrock.get_provisioned_model_throughput(
        provisionedModelId=provisioned_model_arn
    )['status']
    if provisioning_status not in ('Creating', 'Updating'):
        break
    time.sleep(60)

if provisioning_status != 'InService':
    raise RuntimeError(
        f"Provisioned throughput {provisioned_model_arn} ended in status {provisioning_status!r}"
    )


### FT Amazon Nova model id

In [None]:
# The ARN returned by the provisioning call is used as the model ID for inference
# and, at the end of the notebook, for deleting the provisioned throughput.
model_id = provisioned_model_id["provisionedModelArn"]

### Define the system prompt and messages

In [None]:

# System prompt, wrapped in the list-of-text-blocks shape used for the "system"
# field of the request body.
sys_msg = """You are a bot that can handle different requests with tools."""
system_prompt = [{"text": sys_msg}]

# Collect the specification of every tool module, in a fixed order.
tool_config = {
    "tools": [
        tool_module.get_tool_spec()
        for tool_module in (
            weather_api_call,
            stat_pull,
            terminal,
            text_to_sql,
            wikipidea,
            youtube_search,
            pubmed_search,
            duckduckgo_results_json,
        )
    ]
}

# Prompt template for tool calling. The {tool_config} and {question} placeholders
# are filled with str.replace (not str.format) because the template itself
# contains literal braces.
promt_template = """
Given the following functions within <tools>, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.
Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.Do not use variables. Donot give any explanations. 
ONLY output the resulting JSON structure and nothing else.Donot use the word 'json' anywhere in the result.

<tools>{tool_config}</tools>

Generate answer for the following question.
<question>{question}</question>
"""

# Serialize the tool specifications so they can be embedded in the prompt text.
formatted_tool_config = json.dumps(tool_config, indent=2)

### Test single question

Let's run a single question through our endpoint.

In [None]:
# define a single question to check inference
question = "What research is available on the effects of music therapy for autism spectrum disorders?"

# Fill the template placeholders with the question and the serialized tool specs.
prompt = promt_template.replace("{question}", question)
prompt = prompt.replace("{tool_config}", formatted_tool_config)

# A single user turn carrying the rendered prompt.
messages = [
    {
        "role": "user",
        "content": [{"text": prompt}],
    }
]

max_tokens = 4096
temperature = 0.2
inferenceConfig = {
    "max_new_tokens": max_tokens,
    "temperature": temperature,
    # "top_p": float,
    # "top_k": 1
}

# Prepare request body
model_kwargs = {
    "system": system_prompt,
    "messages": messages,
    "inferenceConfig": inferenceConfig,
}
body = json.dumps(model_kwargs)

accept = "application/json"
contentType = "application/json"

# invoke the model to make inference
response = bedrock_runtime.invoke_model(
    body=body,
    modelId=model_id,
    accept=accept,
    contentType=contentType,
)

# Parse response
ft_response_body = json.loads(response.get("body").read())
ft_response_text = ft_response_body['output']['message']['content'][0]['text']

# The prompt asks for JSON, so parse it as JSON first (this accepts true/false/null,
# which ast.literal_eval rejects). Fall back to literal_eval for responses written
# as a Python literal, e.g. with single-quoted strings.
try:
    parsed_tool_call = json.loads(ft_response_text)
except json.JSONDecodeError:
    parsed_tool_call = ast.literal_eval(ft_response_text)
parsed_tool_call


In [None]:
# Print the `messages` payload currently in scope to inspect the fully rendered prompt.
print(messages)

### Load our test set

Let's load our full test set that we can run inference against.

In [None]:
test_question_bank_path = "../assets/bedrock_nova_ft/test_ft.jsonl"

# Read the JSONL test set: one JSON object per non-empty line.
test_question_list_jsonl = []
with open(test_question_bank_path, 'r', encoding='utf-8') as file:
    for raw_line in file:
        stripped_line = raw_line.strip()
        if not stripped_line:
            # Tolerate blank lines (e.g. extra newlines at the end of the file)
            # instead of failing in json.loads("").
            continue
        test_question_list_jsonl.append(json.loads(stripped_line))

### Run test set

In [None]:
import time

# Request settings are the same for every question, so build them once.
max_tokens = 4096
temperature = 0.2
inferenceConfig = {
    "max_new_tokens": max_tokens,
    "temperature": temperature,
    # "top_p": float,
    # "top_k": 1
}
accept = "application/json"
contentType = "application/json"

output_list = []     # raw response text per question (consumed by the grading cells)
data = []            # one record per question: text, token usage and latency
total_latency = 0.0  # sum of per-request latencies, used for the average below

for question_dict in tqdm(test_question_list_jsonl):
    # Render the prompt for this question.
    question = question_dict['messages'][0]['content']
    prompt = promt_template.replace("{question}", question)
    prompt = prompt.replace("{tool_config}", formatted_tool_config)

    messages = [
        {
            "role": "user",
            "content": [{"text": prompt}],
        }
    ]

    # Prepare request body
    model_kwargs = {
        "system": system_prompt,
        "messages": messages,
        "inferenceConfig": inferenceConfig,
    }
    body = json.dumps(model_kwargs)

    # invoke the model to make inference, timing only the request itself
    start_time = time.perf_counter()
    response = bedrock_runtime.invoke_model(
        body=body,
        modelId=model_id,
        accept=accept,
        contentType=contentType,
    )
    elapsed_time = time.perf_counter() - start_time
    total_latency += elapsed_time

    # Parse response
    ft_response_body = json.loads(response.get("body").read())
    usage = ft_response_body['usage']
    in_tok = usage['inputTokens']
    out_tok = usage['outputTokens']
    tot_tok = usage['totalTokens']
    ft_response_text = ft_response_body['output']['message']['content'][0]['text']

    print(f"{in_tok}, {out_tok}, {tot_tok},{elapsed_time}\n")

    data.append({
        'user_question': question,
        'response': ft_response_text,
        'input_tokens': in_tok,
        'output_tokens': out_tok,
        'total_tokens': tot_tok,
        'latency': elapsed_time,
    })
    output_list.append(ft_response_text)
    print(ft_response_text)

    # Pause between requests (presumably to stay clear of throttling - TODO confirm).
    time.sleep(5)

# Average over all requests; the accumulated total is required here because
# `elapsed_time` only holds the latency of the last request.
if data:
    print(f" avg latency {round(total_latency / len(data), 2)} \n")
else:
    print("No questions were processed; average latency is undefined.\n")
data

In [None]:
import csv
import os

output_file = './results/micro_output.csv'

if not data:
    # Fail with an actionable message instead of an IndexError on data[0].
    raise ValueError("No inference results to write; run the test-set cell first.")

# Get the keys for the CSV header (assuming all dictionaries have the same keys)
fieldnames = list(data[0].keys())

# Make sure the target directory exists before opening the file for writing.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write one row per question. UTF-8 is set explicitly so that non-ASCII text in
# questions or responses does not depend on the platform's default encoding.
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print(f"Data has been written to {output_file} \n")

In [None]:
# Preview how the first model response parses: the whole call, then its tool name
# and its arguments.
# NOTE(security): eval() executes whatever text the model returned. It is kept here
# because the grading cell relies on the same parsing; only run this on output you trust.
first_tool_call = eval(output_list[0])
print(first_tool_call)
print(first_tool_call['name'])
print(first_tool_call['parameters'])


### Assess tool calling accuracy

Let's now grade our model on its performance.

## Load the ground-truth test questions

In [None]:
test_question_bank_path = "../assets/test_data.txt"

# Each line of the file is evaluated as a Python expression and collected in order.
# NOTE(security): eval() runs the file content as code; only use a test file you trust.
with open(test_question_bank_path) as ground_truth_file:
    test_question_list = [eval(raw_line) for raw_line in ground_truth_file]

In [None]:
# Display the first ground-truth entry to inspect its structure.
test_question_list[0]

In [None]:
import re

import numpy as np

# Grade every model response against the ground truth:
#   correct_tool[i] is 1 when the predicted tool name matches the expected tool,
#   correct_args[i] is the fraction of expected arguments that were matched.
verbose = True
# need to have a way to evaluate both regex patterns and inclusives
# maybe you have the regex have an index it corresponds to?

# Alternative names for the tools. A prediction counts as correct when either the
# predicted name itself or its alias equals the ground-truth answer.
# NOTE(review): assumes the ground-truth file may use these alias names - confirm
# against ../assets/test_data.txt.
remap_dict = {
    "terminal":"shell_tool",
    "wikipedia":"wiki_tool",
    "youtube_search":"youtube_tool",
    "pubmed_search":"pubmed_tool",
    "stat_pull":"stat_pull",
    "text_to_sql":"text_to_sql",
    "create_plan":"create_plan",
    "duckduckgo_results_json":"internet_search_tool",
    "weather_api_call":"weather_api_call",
}

# need to have set numbers of correct arguments/tool calls
correct_tool = [0] * len(test_question_list)
correct_args = [0] * len(test_question_list)

# enumerate(tqdm(...)) rather than tqdm(enumerate(...)) so the bar knows the total.
for i, question_dict in enumerate(tqdm(test_question_list)):
    question = question_dict['question']
    answer = question_dict['answer']
    args = question_dict["args"]
    tool_keys = list(args.keys())
    num_tools = len(tool_keys)  # number of expected arguments for this question
    print(f"User question: {question}\n")

    try:
        # NOTE(security): eval() executes the model's output as code. It is kept
        # because it lets a response that names a tool without quotes resolve to the
        # imported tool module (credited below); only grade output you trust.
        out_dict = eval(output_list[i])  # llm output list

        # give credit to printing out a function instead of function string:
        # objects exposing __name__ contribute that name, plain strings are used as-is
        raw_name = out_dict['name']
        name = str(getattr(raw_name, '__name__', raw_name))
        tool_calls = out_dict['parameters']

        print(f"GT tool: {answer}   LLM output tool: {name} \n")

        if name == answer or remap_dict.get(name) == answer:
            correct_tool[i] = 1
        else:
            print("TOOL FAIL")
            correct_tool[i] = 0

        for tool_key in tool_keys:  # need to loop through the tool arguments
            predicted_value = tool_calls[tool_key]
            if isinstance(predicted_value, list) and len(args[tool_key]) > 1:
                # if multiple arguments, join them together, this is for things like
                # code where a list of args is passed back
                pred_args = [" && ".join(predicted_value)]
                if verbose:
                    print("new args: ", pred_args)
            elif isinstance(predicted_value, str):
                # add list around tool calls if string
                pred_args = [predicted_value]
            else:
                pred_args = predicted_value
            if verbose:
                print("pred args: ", pred_args)

            arg_patterns = question_dict["arg_pattern"]
            if arg_patterns:
                # if there are regex patterns, the first one that matches earns the credit
                for gt in arg_patterns:
                    if verbose:
                        print("ground truth pattern:",gt)
                    if re.match(gt, pred_args[0].lower()) is not None or gt == pred_args[0]:
                        correct_args[i] += 1/num_tools
                        if verbose:
                            print("regex match")
                            print("arg score", correct_args[i])
                        break
                    elif verbose:
                        print("Failed regex match")
            else:
                # no patterns: case-insensitive substring match against the first
                # expected value of this argument
                gt_arg = args[tool_key][0].lower()
                if gt_arg in pred_args[0].lower():
                    correct_args[i] += 1/num_tools
                    if verbose:
                        print("straight match")
                        print("arg score", correct_args[i])
                elif verbose:
                    print("Failed straight match")

    except Exception as e:
        # Any failure (unparseable output, missing keys, non-string arguments, ...)
        # counts as a complete miss for this question.
        # NOTE(review): this also discards a tool match recorded before the failure.
        print("exception")
        correct_tool[i] = 0
        correct_args[i] = 0
        print(e)
    print("--------")

# Overall tool-selection accuracy and mean argument score.
print(np.mean(correct_tool))
print(np.mean(correct_args))

### Cleanup and delete the provisioned throughput model you created

In [None]:

# Tear down the provisioned throughput created earlier; `model_id` holds its ARN.
bedrock.delete_provisioned_model_throughput(provisionedModelId=model_id)