In [5]:
import os

from dotenv import load_dotenv

# Run load_dotenv() before the lookups below so that any variables it
# provides are visible to them.
load_dotenv()

# HF_HOME and RUNPOD_API_KEY are required: indexing os.environ raises
# KeyError when either is missing. HF_TOKEN is optional and is None when unset.
HF_HOME = os.environ["HF_HOME"]
HF_TOKEN = os.getenv("HF_TOKEN")
RUNPOD_API_KEY = os.environ["RUNPOD_API_KEY"]

In [6]:
import runpod

# Configure the runpod module with the API key read from the environment,
# so the runpod calls in later cells are authenticated.
runpod.api_key = RUNPOD_API_KEY

In [None]:
# List the GPU types RunPod reports; the result is displayed as the cell output.
# Presumably these entries are where valid `gpu_type_id` values come from -- verify.
runpod.get_gpus()

## Create Template


In [8]:
# ID of an existing RunPod template (judging by the name, a Python 3.10 /
# CUDA 12.1.1 template -- TODO confirm). It is currently only referenced by the
# commented-out `template_id` argument of the create_pod call.
python3_10__cuda_12_1_1_template_id = "rpecke8s9s"

In [None]:
# The API key can be set globally with `runpod config`, or by uncommenting the line below:
# runpod.api_key = "YOUR_RUNPOD_API_KEY"

try:
    new_template = runpod.create_template(name="test", image_name="runpod/base:0.1.0")
except runpod.error.QueryError as query_error:
    # Report the failure instead of raising: first the error itself, then the
    # `query` attribute attached to it.
    print(query_error)
    print(query_error.query)
else:
    # Only reached when the template was created without a QueryError.
    print(new_template)

## Create Pod


In [9]:
# Create the pod (data_center_id is optional).
# Pick the model to serve: the active configuration below runs TinyLlama on a
# single RTX 4090; a larger alternative is kept commented out for reference.

# gpu_count = 2
# gpu_type_id = "NVIDIA A100 80GB PCIe"
# model = "mistralai/Mixtral-8x7B-Instruct-v0.1"

gpu_count = 1
gpu_type_id = "NVIDIA GeForce RTX 4090"
model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# model = "mistralai/Mistral-7B-Instruct-v0.2"

# Name under which the model is served: the repo id without its organisation
# prefix. rsplit(...)[-1] also copes with an id that contains no "/".
served_model_name = model.rsplit("/", 1)[-1]

# Launch arguments, joined into a single space-separated line (the same shape
# as the earlier one-line form) so no raw newlines end up in the request.
# The host must be 0.0.0.0 to listen on all interfaces; "0.0.0" is not a
# valid address.
# NOTE(review): the image selected below is a generic PyTorch image -- confirm
# that its entrypoint actually starts a server that accepts these arguments.
docker_args = " ".join(
    [
        "--host 0.0.0.0",
        "--port 8000",
        # One tensor-parallel shard per requested GPU.
        f"--tensor-parallel-size {gpu_count}",
        "--gpu-memory-utilization 0.9",
        f"--model {model}",
        f"--served-model-name {served_model_name}",
    ]
)

# Environment variables for the container. HF_TOKEN is optional (None when it
# is not set locally), so forward it only when present rather than sending a
# meaningless value to the pod.
pod_env = {"HF_HOME": HF_HOME}
if HF_TOKEN:
    pod_env["HF_TOKEN"] = HF_TOKEN

pod = runpod.create_pod(
    name="API_Mixtral-VLLM",
    # image_name="ghcr.io/huggingface/text-generation-inference:0.8",
    # image_name="ghcr.io/mistralai/mistral-src/vllm:latest",
    image_name="runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04",
    gpu_type_id=gpu_type_id,
    cloud_type="SECURE",
    # data_center_id="US-KS-1",
    docker_args=docker_args,
    env=pod_env,
    gpu_count=gpu_count,
    container_disk_in_gb=10,
    volume_in_gb=100,
    # Port 8000 matches the --port argument above and is the one used to
    # build the inference URL in the next cell.
    ports="80/http,8000/http,8080/http,22/tcp",
    volume_mount_path="/workspace",
    # template_id=python3_10__cuda_12_1_1_template_id,
)

# Keep the pod ID: later cells need it to build the proxy URL and to
# terminate the pod.
pod_id = pod["id"]

In [10]:
# Proxy URL for the pod: the hostname embeds the pod ID and port 8000, the
# port given to the server via `--port` and listed in the pod's `ports`.
inference_server_url = f"https://{pod_id}-8000.proxy.runpod.net"

# Bare expression so the notebook shows the URL as the cell output.
inference_server_url

'https://7a3497thj0fahy-8000.proxy.runpod.net'

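### Test Inference URL

Replace the pod ID in the URL below with your own `pod_id`, and set `"model"` to the `served_model_name` used when the pod was created.
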
```bash
curl --request POST \
    --url https://rw4roeeh8ywsze-8000.proxy.runpod.net/v1/chat/completions \
    --header "Content-Type: application/json" \
    --data '{
  "model": "Mixtral-8x7B-Instruct-v0.1",
  "messages": [
    {
      "role": "user",
      "content": "Say this is a test!"

    }
  ],
  "temperature": 0.8,
  "stream": false
}'
```


## Terminate Pod


In [None]:
# Terminate the pod created earlier, identified by its ID.
# NOTE(review): presumably this removes the pod for good (not just a stop) -- confirm before running.
runpod.terminate_pod(pod_id)