## After training the model, we can use this notebook to test the model. 

### 1. Download the model directly and test it. Run this notebook on an A100 GPU machine (NC24adsA100 compute instance)

In [None]:
# import required libraries
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
import time
from azure.ai.ml import MLClient, Input
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import load_component
# Credential object handed to the Azure ML client below.
credential = DefaultAzureCredential()

# Identifiers of the target Azure ML workspace; fill these in before running.
subscription_id = ""  # your subscription id
resource_group = ""  # your resource group
workspace = ""  # your workspace name

# Client handle used by the later cells to download registered models.
workspace_ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

#### For regular model 

In [None]:
# Registered model to fetch and the local directory it is downloaded into.
model_name = "llama2_13b_fine_tuned"
model_path = "./"
workspace_ml_client.models.download(
    name=model_name,
    version="2",
    download_path=model_path,
)
# After this step, remove the redundant parent folder named "llama2_13b_fine_tuned".
# NOTE(review): the original note is cut off; presumably the goal is that only one
# folder level with that name remains - confirm against the downloaded layout.

In [None]:
import mlflow
import pandas as pd
# Load the downloaded model through MLflow's generic pyfunc interface;
# the folder named by `model_name` is used as the model location.
model = mlflow.pyfunc.load_model(model_name)

# Instruction-style prompt: an instruction section followed by an input section.
prompt = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate a chessboard with the given size and with pieces at the specified positions.
### Input:
Size: 8x8 \nPiece positions:\nWhite Rook at H8\nWhite Pawn at B3\nBlack Pawn at A3\nBlack Rook at H2

"""

# One prompt per request, together with the "max_length" setting.
model_input = dict(text=[prompt], max_length=100)

# Last expression of the cell, so the prediction is displayed as the cell output.
model.predict(model_input)


#### For Chat Model

In [None]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Download the chat model produced by the fine-tuning pipeline.
model_name = "llama2_13b_chat_sql_tuned"
model_path = "."
workspace_ml_client.models.download(
    name=model_name,
    version="2",
    download_path=model_path,
)
# After this step, remove the redundant parent folder named "llama2_13b_chat_sql_tuned".

In [None]:

# Directory holding the fine-tuned weights and tokenizer files.
# NOTE(review): this path keeps the doubled "<model_name>/<model_name>" prefix, while the
# download cell says to remove the redundant parent folder - confirm which layout is on disk.
trained_model_dir = f"{model_name}/{model_name}/artifacts/trained_model"

# An empty-string key maps the whole model onto device 0.
device_map = {"": 0}

# Load the causal LM from local files only, in half precision.
chat_model = AutoModelForCausalLM.from_pretrained(
    trained_model_dir,
    device_map=device_map,
    torch_dtype=torch.float16,
    return_dict=True,
    low_cpu_mem_usage=True,
    local_files_only=True,
)

# Tokenizer is read from the same directory as the weights.
tokenizer = AutoTokenizer.from_pretrained(
    trained_model_dir,
    device_map=device_map,
    local_files_only=True,
)


In [None]:
from transformers.pipelines import ConversationalPipeline, Conversation

def predict(data, model, tokenizer, **kwargs):
    """Generate the assistant's next reply for a chat conversation.

    Args:
        data: The conversation so far, as a list of dicts with "role"
            ("system", "user" or "assistant") and "content" keys, or a pandas
            DataFrame whose first column holds those dicts. The last turn must
            come from the user.
        model: Model handed to ``ConversationalPipeline``; ``model.eval()`` is
            called before generation.
        tokenizer: Tokenizer handed to ``ConversationalPipeline``.
        **kwargs: May contain ``addn_args``, a dict of generation arguments
            forwarded to the pipeline call. ``max_gen_len`` is consumed and used
            as the default for ``max_new_tokens`` (256 if absent). Other
            defaults: ``max_length=4096``, ``temperature=0.9``, ``top_p=0.6``,
            ``do_sample=True``. The caller's dict is not modified.

    Returns:
        dict: ``{"output": <text of the last generated response>}``.

    Raises:
        AssertionError: If the conversation is empty, does not end with a user
            turn, has a system prompt anywhere but at the start, or does not
            alternate between user and assistant turns.
    """
    import pandas as pd

    TEMPERATURE_KEY = "temperature"
    MAX_GEN_LEN_KEY = "max_gen_len"
    DO_SAMPLE_KEY = "do_sample"
    MAX_NEW_TOKENS_KEY = "max_new_tokens"
    MAX_LENGTH_KEY = "max_length"
    TOP_P_KEY = "top_p"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    if isinstance(data, pd.DataFrame):
        data = data[data.columns[0]].tolist()

    # Work on a private copy: the defaults below must not leak into the dict the
    # caller passed in (and a missing or None value simply means "no overrides").
    addn_args = dict(kwargs.get("addn_args") or {})
    max_gen_len = addn_args.pop(MAX_GEN_LEN_KEY, 256)
    addn_args.setdefault(MAX_NEW_TOKENS_KEY, max_gen_len)
    addn_args.setdefault(MAX_LENGTH_KEY, 4096)
    addn_args.setdefault(TEMPERATURE_KEY, 0.9)
    addn_args.setdefault(TOP_P_KEY, 0.6)
    addn_args.setdefault(DO_SAMPLE_KEY, True)

    model.eval()
    conv_arr = data
    # validations
    assert len(conv_arr) > 0
    assert conv_arr[-1]["role"] == "user"
    next_turn = "system" if conv_arr[0]["role"] == "system" else "user"

    # Build conversation
    # NOTE(review): Conversation / ConversationalPipeline may be unavailable in recent
    # transformers releases - confirm the installed version still provides them.
    conversation = Conversation()
    conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer)
    last_index = len(conv_arr) - 1
    for i, conv in enumerate(conv_arr):
        role = conv["role"]
        if role == "system":
            assert next_turn == "system", "System prompts can only be set at the start of the conversation"
            next_turn = "user"
            # The system prompt is wrapped in <<SYS>> markers and stored as an
            # already-processed user input so it is not answered on its own.
            conversation.add_user_input(B_SYS + conv["content"].strip() + E_SYS)
            conversation.mark_processed()
        elif role == "assistant":
            assert next_turn == "assistant", "Invalid Turn. Expected user input"
            next_turn = "user"
            conversation.append_response(conv["content"].strip())
        elif role == "user":
            assert next_turn == "user", "Invalid Turn. Expected assistant input"
            next_turn = "assistant"
            conversation.add_user_input(conv["content"].strip())
            # Only the final user turn stays unprocessed; it is the one to answer.
            if i != last_index:
                conversation.mark_processed()
    result = conversation_agent(conversation, use_cache=True, **addn_args)
    return {'output': result.generated_responses[-1]}



In [None]:
import pandas as pd
# Showcase that the model has learned the schema of the database.
instruction = "You are querying the sales database"
# Named `user_input` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
user_input = "What are columns of Products table?"
content = f"<s>[INST]\n{instruction}\n\n### Input:\n{user_input}\n[/INST]"

# predict() accepts a DataFrame whose first column holds the conversation turns;
# here the conversation is a single user turn.
model_input = pd.DataFrame({"input":[
                    {
                        "role": "user",
                        "content": content,
                    },
                ],})
predict(model_input, chat_model, tokenizer)

### 2. Deploy to a managed online endpoint and test

1. Create online endpoint: ```az ml online-endpoint create -f deployment/endpoint.yml```
2. Create the deployment: ```az ml online-deployment create -f deployment/deployment.yml```

In [2]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    """Turn off client-side verification of the server certificate when requested.

    Nothing is changed when ``allowed`` is falsy, when the PYTHONHTTPSVERIFY
    environment variable is set to a non-empty value, or when the ``ssl``
    module does not provide ``_create_unverified_context``.
    """
    if not allowed:
        return
    if os.environ.get('PYTHONHTTPSVERIFY', ''):
        return
    unverified_factory = getattr(ssl, '_create_unverified_context', None)
    if unverified_factory:
        # Make unverified contexts the default for HTTPS connections.
        ssl._create_default_https_context = unverified_factory


# This call is needed if you use a self-signed certificate in your scoring service.
allowSelfSignedHttps(True)

# Build the request data.
# The payload below is JSON; adjust its shape to whatever format your
# endpoint expects. More information can be found here:
# https://docs.microsoft.com/azure/machine-learning/how-to-deploy-advanced-entry-script
prompt = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the following input to less than 30 words .
### Input:
In general, perplexity is a measurement of how well a probability model predicts a sample. In the context of Natural Language Processing, perplexity is one way to evaluate language models.
A language model is a probability distribution over sentences: it’s both able to generate plausible human-written sentences (if it’s a good language model) and to evaluate the goodness of already written sentences. Presented with a well-written document, a good language model should be able to give it a higher probability than a badly written document, i.e. it should not be “perplexed” when presented with a well-written document.
Thus, the perplexity metric in NLP is a way to capture the degree of ‘uncertainty’ a model has in predicting (i.e. assigning probabilities to) text."""

# The prompt list and the "max_length" setting are nested under a top-level "data" key.
data = {
    "data": {
        "text": [prompt],
        "max_length": 100,
    }
}

# Serialize the payload to JSON and encode it to bytes for the HTTP request body.
body = json.dumps(data).encode()

url = 'https://llma2-fine-tuning.westus2.inference.ml.azure.com/score'
# The primary/secondary key or AMLToken for the endpoint is read from the
# AZUREML_ENDPOINT_KEY environment variable so that the secret is never stored
# in (or committed with) the notebook.
api_key = os.environ.get('AZUREML_ENDPOINT_KEY', '')
if not api_key:
    raise Exception("A key should be provided to invoke the endpoint")

# The azureml-model-deployment header will force the request to go to a specific deployment.
# Remove this header to have the request observe the endpoint traffic rules
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key), 'azureml-model-deployment': 'blue' }

req = urllib.request.Request(url, body, headers)

try:
    # The context manager closes the HTTP response even if reading it fails.
    with urllib.request.urlopen(req) as response:
        result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    print(error.read().decode("utf8", 'ignore'))

b'[[{"generated_text": "\\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\\n\\n### Instruction:\\nSummarize the following input to less than 30 words .\\n### Input:\\nIn general, perplexity is a measurement of how well a probability model predicts a sample. In the context of Natural Language Processing, perplexity is one way to evaluate language models.\\nA language model is a probability distribution over sentences: it\\u2019s both able to generate plausible human-written sentences (if it\\u2019s a good language model) and to evaluate the goodness of already written sentences. Presented with a well-written document, a good language model should be able to give it a higher probability than a badly written document, i.e. it should not be \\u201cperplexed\\u201d when presented with a well-written document.\\nThus, the perplexity metric in NLP is a way to capture the degree of \\

: 

Perplexity is a measure of how well a probability model predicts a sample. It is used to evaluate language models in NLP. A good language model should assign higher probabilities to well-written documents than to badly written ones. Perplexity is a way to capture the degree of uncertainty a model has in predicting text

In [7]:
# NOTE(review): the only `data` defined in this notebook is the request dict built for the
# endpoint call, which has a single "data" key, so data[1] would raise KeyError on a fresh run.
# The saved output looks like a record from an instruction dataset, so this cell presumably
# relies on a `data` variable from a cell that is no longer present - TODO confirm, then
# restore that cell or remove this one.
print(data[1])

{'instruction': 'What are the three primary colors?', 'input': '', 'output': 'The three primary colors are red, blue, and yellow.'}
