<a href="https://colab.research.google.com/github/kyledinh/gpt-prive/blob/main/notebooks/runpod/falcon_tgi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References: 

- https://github.com/huggingface/text-generation-inference
- https://vilsonrodrigues.medium.com/serving-falcon-models-with-text-generation-inference-tgi-5f32005c663b

### WORKING NOTES:

- https://github.com/huggingface/text-generation-inference#api-documentation

```
model=meta-llama/Llama-2-7b-chat-hf
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
token=<your cli READ token>

docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:0.9.3 --model-id $model
```

## On-premise

In [None]:
"""
docker run --gpus all --shm-size 1g -p 8080:80 -v $PWD/data:/data 
     ghcr.io/huggingface/text-generation-inference:0.8 \ 
     --model-id tiiuae/falcon-7b-instruct \ 
     --num-shard 1  \ 
     --quantize bitsandbytes     
"""     

Bash

In [None]:
"""
curl 127.0.0.1:8080/generate \
     -X POST \
     -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \
     -H 'Content-Type: application/json'

curl 127.0.0.1:8080/generate_stream \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \
    -H 'Content-Type: application/json'

curl 127.0.0.1:8080/ \
    -X POST \
    -d '{"inputs":"What is Deep Learning?",
          "parameters":{"max_new_tokens":17},
          "stream": True}' \
    -H 'Content-Type: application/json'
"""

TGI Client

In [None]:
%pip install text-generation

In [None]:
from text_generation import Client

# Generate
client = Client("http://127.0.0.1:8080")
print(client.generate("What is Deep Learning?", max_new_tokens=17).generated_text)

# Generate stream
text = ""
for response in client.generate_stream("What is Deep Learning?", max_new_tokens=17):
    if not response.token.special:
        text += response.token.text
print(text)

LangChain

In [None]:
%pip install langchain transformers

In [None]:
# Wrapper to TGI client with langchain

from langchain.llms import HuggingFaceTextGenInference

inference_server_url_local = "http://127.0.0.1:8080"

llm_local = HuggingFaceTextGenInference(
    inference_server_url=inference_server_url_local,
    max_new_tokens=1000,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.7,
    repetition_penalty=1.03,
)

In [None]:
from langchain import PromptTemplate, LLMChain

template = """Question: {question}
Answer: Let's think step by step."""

prompt = PromptTemplate(
    template=template, 
    input_variables= ["question"]
)

llm_chain = LLMChain(prompt=prompt, llm=llm_local)

In [None]:
llm_chain("your question")

## Run Pod

In [None]:
%pip install runpod python-dotenv

In [3]:
import runpod
# Load .env variables
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

HF_ACCESS_TOKEN = os.getenv("HF_ACCESS_TOKEN", "add-here-if-not-set-in-env-file")
RUNPOD_API_KEY = os.getenv("RUNPOD_API_KEY", "add-here-if-not-set-in-env-file")
 
assert HF_ACCESS_TOKEN.startswith("hf_"), "This doesn't look like a valid Hugging Face Token"
print("HF_ACCESS_TOKEN configured")
assert not RUNPOD_API_KEY.startswith("add-here"), "This doesn't look like a valid Runpod API Key"
print("RUNPOD_API_KEY configured")

# your key
runpod.api_key = RUNPOD_API_KEY 

  from .autonotebook import tqdm as notebook_tqdm


HF_ACCESS_TOKEN configured
RUNPOD_API_KEY configured


In [4]:
num_shard = 1
model_id = "tiiuae/falcon-7b-instruct"
quantize = "bitsandbytes"


# https://docs.runpod.io/docs/create-pod (instruction in curl)
# https://vilsonrodrigues.medium.com/serving-falcon-models-with-text-generation-inference-tgi-5f32005c663b (python)
pod = runpod.create_pod(
    name="Falcon-7B-Instruct-POD",
    image_name="ghcr.io/huggingface/text-generation-inference:0.8",
    gpu_type_id="NVIDIA GeForce RTX 4080",
    cloud_type="COMMUNITY",
    docker_args=f"--model-id {model_id} --num-shard {num_shard} --quantize {quantize}",
    gpu_count=num_shard,
    volume_in_gb=50,
    container_disk_in_gb=5,
    ports="80/http",
    volume_mount_path="/data",
)

In [5]:
from langchain.llms import HuggingFaceTextGenInference

inference_server_url_cloud = f"https://{pod["id"]}-80.proxy.runpod.net"

llm_cloud = HuggingFaceTextGenInference(
    inference_server_url=inference_server_url_cloud,
    max_new_tokens=1000,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.3,
    repetition_penalty=1.03,
)

SyntaxError: f-string: unmatched '[' (1963392774.py, line 3)

In [None]:
llm_chain_cloud = LLMChain(prompt=prompt, llm=llm_cloud)

In [None]:
llm_chain_cloud("your new question to falcon")

In [None]:
# stop pod
runpod.stop_pod(pod["id"])

In [None]:
# terminate
runpod.terminate_pod(pod["id"])

## AWS Support

https://aws.amazon.com/pt/blogs/machine-learning/announcing-the-launch-of-new-hugging-face-llm-inference-containers-on-amazon-sagemaker/