<a href="https://colab.research.google.com/github/imywong/starter_templates/blob/main/%5BTemplate%5D_Llama2_%2B_Gradio_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Initial setup

## Install Huggingface Hub library

In [None]:
!pip install huggingface_hub

## Import modules

In [None]:
import requests
import json
import os

## Apply your Huggingface token

Note: Follow the instructions [here](https://huggingface.co/docs/api-inference/quicktour#get-your-api-token) to get a token



In [None]:
# Paste in your secret key when prompted
from getpass import getpass

# getpass() hides the token while it is typed or pasted, so it never appears
# in the notebook output. An explicit prompt tells the user what is being asked
# for, and strip() removes whitespace that often comes along with a copy-paste
# and would otherwise end up inside the Authorization header.
HUGGINGFACEHUB_API_TOKEN = getpass("Hugging Face API token: ").strip()

In [None]:
# Set token to environment variable
# Exports the token entered above under the name HUGGINGFACEHUB_API_TOKEN,
# presumably for libraries that look the token up in the process environment.
# The request helpers in this notebook do not read it back: they use the
# Python variable directly when building the Authorization header.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

# Step 2 : Create a helper function to call the Llama 2 70B chat model

## Define model API URL

In [None]:
# Endpoint that the helper functions POST to: the meta-llama/Llama-2-70b-chat-hf
# model behind the api-inference.huggingface.co host.
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-70b-chat-hf"
# Request headers carrying the token as a Bearer credential. The f-string is
# evaluated once, here, so re-run this cell after entering a different token.
headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}

## Define the system prompt

In [None]:
# System instruction that the helpers place between the <<SYS>> and <</SYS>>
# markers of the prompt; edit this string to change the instruction the model
# receives.
system_prompt = "You are a helpful AI Assistant"

## Define the model parameters

In [None]:
# Generation settings. All of them except repetition_penalty are forwarded in
# the "parameters" section of each request built by the helper functions.

# Top-k sampling: at every step only the k most probable tokens are
# candidates. Smaller values give more conservative, predictable text.
top_k = 10

# Top-p (nucleus) sampling: candidates are taken from most to least probable
# until their cumulative probability reaches p. Larger values admit more
# candidates and give more diverse, creative text.
top_p = 0.9

# Sampling temperature: higher values give more diverse and creative text,
# lower values give more predictable and conservative text.
temperature = 0.7

# Upper bound on the number of tokens generated for one reply.
max_new_tokens = 500

# Time budget passed to the API for one generation (presumably in seconds;
# confirm against the Inference API documentation).
max_time = 120

# False asks the API for the newly generated text only, rather than the
# prompt followed by the completion.
return_full_text = False

# Kept for experimentation: the payloads built by the helpers have this entry
# commented out, so the value is currently not sent to the API.
repetition_penalty = 50.0

## Create the helper function

### Option 1 : With memory

In [None]:
# This stores the conversation history
# It is the running prompt for the "with memory" helper: it starts with the
# opening <s>[INST] tag and the system prompt wrapped in <<SYS>> ... <</SYS>>,
# and the helper appends each user turn and each model reply to it.
# NOTE: the leading spaces on the continuation lines are inside the string
# literal, so they are part of the text sent to the model.
# Re-run this cell to reset the conversation.

messages = f"""<s>[INST] <<SYS>>
              {system_prompt}
              <</SYS>>"""

In [None]:
def text_generation(prompt, history):
    """Send ``prompt`` to the model together with the stored conversation.

    The module-level ``messages`` string holds the whole dialogue so far. The
    new user turn is appended to it, the result is sent to the Inference API,
    and the model's reply is appended afterwards so that the next call has the
    full context.

    Args:
        prompt: The user's message for this turn.
        history: Unused. Accepted so the function can be handed to
            ``gr.ChatInterface``; context comes from ``messages`` instead.

    Returns:
        The text generated by the model for this turn.

    Raises:
        RuntimeError: If the API response is not a non-empty list of
            generations (for example an ``{"error": ...}`` payload).
    """
    global messages

    # Build the request prompt in a local first. ``messages`` is only updated
    # once a reply has arrived, so a failed call cannot leave a dangling user
    # turn in the stored history.
    pending = f"""{messages}{prompt}[/INST]"""

    payload = {
        "inputs": pending,
        "parameters": {
            "top_k": top_k,
            "top_p": top_p,
            "temperature": temperature,
            "max_new_tokens": max_new_tokens,
            "max_time": max_time,
            "return_full_text": return_full_text,
            # "repetition_penalty": repetition_penalty,  # currently not sent
        },
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    result_json = response.json()

    # The code below expects a list such as [{"generated_text": ...}]. Any
    # other shape (typically a dict describing an error) is reported with its
    # content instead of failing later with an opaque KeyError.
    if not isinstance(result_json, list) or not result_json:
        raise RuntimeError(
            f"Inference API request failed (HTTP {response.status_code}): {result_json!r}"
        )
    result = result_json[0]['generated_text']

    # Commit the finished exchange and open the next [INST] block so that the
    # context is available on the next request.
    messages = f"""{pending}{result}</s><s>[INST]"""

    return result

### Option 2 : Without memory

In [None]:
def text_generation(prompt, history):
    """Send a single, context-free ``prompt`` to the model.

    Every call builds a fresh prompt from ``system_prompt`` and ``prompt``;
    nothing is remembered between calls.

    Args:
        prompt: The user's message.
        history: Unused. Accepted so the function can be handed to
            ``gr.ChatInterface``.

    Returns:
        The text generated by the model.

    Raises:
        RuntimeError: If the API response is not a non-empty list of
            generations (for example an ``{"error": ...}`` payload).
    """
    # Llama 2 chat template written with explicit newlines, so that no
    # source-code indentation leaks into the text sent to the model.
    messages = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]"

    payload = {
        "inputs": messages,
        "parameters": {
            "top_k": top_k,
            "top_p": top_p,
            "temperature": temperature,
            "max_new_tokens": max_new_tokens,
            "max_time": max_time,
            "return_full_text": return_full_text,
            # "repetition_penalty": repetition_penalty,  # currently not sent
        },
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    result_json = response.json()

    # The code below expects a list such as [{"generated_text": ...}]. Any
    # other shape (typically a dict describing an error) is reported with its
    # content instead of failing later with an opaque KeyError.
    if not isinstance(result_json, list) or not result_json:
        raise RuntimeError(
            f"Inference API request failed (HTTP {response.status_code}): {result_json!r}"
        )

    return result_json[0]['generated_text']

# Step 3 : Test the helper function

## Define the question

In [None]:
# Sample question used to try out the helper in the next cell. The trailing
# "@param" comment appears to be a Colab form annotation; keep it on this line.
prompt = "What is one plus one?"  # @param {type:"string"}

## Generate a response

In [None]:
# Smoke test: call the helper directly, outside the chat UI. `history` is
# passed as None because neither helper variant reads it.
# Note: with the "with memory" variant this exchange is recorded in the global
# `messages` string, so it becomes part of the context of later requests.
response = text_generation(prompt,None)
print(response) # Verify what was returned

# Step 4 : Generate a simple chat UI using Gradio

A UI makes it easier to demonstrate the capability to non-coders or less technical stakeholders who might feel overwhelmed when looking at a notebook.




## Install Gradio

In [None]:
!pip install gradio

In [None]:
import gradio as gr

## Run Gradio

In [None]:
# Wrap whichever text_generation variant was defined last in Gradio's chat UI
# and start the app.
# NOTE(review): per the Gradio docs, share=True requests a public share link
# and debug=True keeps the cell running so errors are shown; confirm for the
# installed Gradio version.
# With the "with memory" variant the conversation lives in one global string,
# so everyone using the app reads and writes the same history.
gr.ChatInterface(text_generation).launch(share=True,debug=True)