In [1]:
import os

from dotenv import load_dotenv

# load_dotenv() (python-dotenv) runs before any variable is read below.
load_dotenv()

# Required settings: indexing os.environ raises KeyError if one is missing,
# so a misconfigured environment fails here rather than further down.
HF_HOME = os.environ["HF_HOME"]
# Optional setting: evaluates to None when HF_TOKEN is not defined.
HF_TOKEN = os.environ.get("HF_TOKEN")
RUNPOD_API_KEY = os.environ["RUNPOD_API_KEY"]

In [2]:
import runpod

# Configure the runpod module with the API key read from the environment.
runpod.api_key = RUNPOD_API_KEY

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
## show all possible available GPUs

# runpod.get_gpus()

## Create Template


In [None]:
# try:
#     new_template = runpod.create_template(name="test", image_name="runpod/base:0.1.0")

#     print(new_template)

# except runpod.error.QueryError as err:
#     print(err)
#     print(err.query)

## Create Pod


In [3]:
import random

# --- Placement -----------------------------------------------------------
# None means no data center is pinned; uncomment one of these to pin it.
# data_center_id = "US-KS-1"
# data_center_id = "CA-MTL-1"

# A random suffix keeps repeated launches distinguishable by name.
pod_name = f"API_Mixtral-VLLM_{random.randint(0, 100)}"
data_center_id = None

# --- Container image -----------------------------------------------------
# image_name = "runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04"
# image_name = "ghcr.io/mistralai/mistral-src/vllm:latest"
image_name = "ghcr.io/substratusai/vllm"

# --- Hardware and model --------------------------------------------------
gpu_count = 2
gpu_type_id = "NVIDIA A100 80GB PCIe"
model = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Smaller single-GPU alternatives for quick experiments:
# gpu_count = 1
# gpu_type_id = "NVIDIA GeForce RTX 4090"
# model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# model = "mistralai/Mistral-7B-Instruct-v0.2"

# Serve the model under its repository name without the organisation prefix.
# Taking the last path segment also works for ids that have no "/" at all
# (the previous `[1]` index raised IndexError in that case).
served_model_name = model.split("/")[-1]

port = 8000

# --- Container arguments -------------------------------------------------
# Empty: the server is configured through the environment variables below.
docker_args = ""
# docker_args = f"--host 0.0.0.0 --model {model}"
# docker_args = f"""
# --host 0.0.0.0 \
# --port {port} \
# --tensor-parallel-size {gpu_count} \
# --gpu-memory-utilization 0.9 \
# --model {model} \
# --served-model-name {served_model_name}
# """

# --- Container environment -----------------------------------------------
env = {
    "HF_HOME": HF_HOME,
    "HF_TOKEN": HF_TOKEN,
    ###################
    "PORT": port,
    "MODEL": model,
    "SERVED_MODEL_NAME": served_model_name,
}
# HF_TOKEN is optional and may be None. Passing None through would hand the
# container the literal text "None" as its token, so unset entries are
# dropped instead. Remaining values are made explicit strings.
env = {key: str(value) for key, value in env.items() if value is not None}


# --- Launch --------------------------------------------------------------
pod = runpod.create_pod(
    name=pod_name,
    image_name=image_name,
    gpu_type_id=gpu_type_id,
    cloud_type="SECURE",
    data_center_id=data_center_id,
    docker_args=docker_args,
    env=env,
    gpu_count=gpu_count,
    container_disk_in_gb=10,
    volume_in_gb=100,
    # Expose the inference port over HTTP alongside the fixed ports.
    ports=f"80/http,8080/http,22/tcp,{port}/http",
    volume_mount_path="/workspace",
    # template_id=python3_10__cuda_12_1_1_template_id,
)

# Later cells use the pod id to build the proxy URL and to terminate the pod.
pod_id = pod["id"]
print(pod_id)

k7vk14rau9dnpd


In [5]:
# Displays whether the create_pod response reports "RUNNING" as desiredStatus.
# NOTE(review): presumably this is the requested state, not proof that the
# server inside the container is already accepting requests — confirm.
pod["desiredStatus"] == "RUNNING"

True

In [6]:
# Base URL of the inference server, built from the pod id and the port
# that was exposed over HTTP when the pod was created.
inference_server_url = f"https://{pod_id}-{port}.proxy.runpod.net"

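### Test Inference URL

Set `POD_ID` to the id printed by the pod-creation cell, and make sure `"model"` matches the served model name of the pod you launched.

```bash
curl --request POST \
  --url "https://${POD_ID}-8000.proxy.runpod.net/v1/chat/completions" \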
  --header "Content-Type: application/json" \
  --data '{
  "model": "TinyLlama-1.1B-Chat-v1.0",
  "messages": [
    {
      "role": "user",
      "content": "Say this is a test!"

    }
  ],
  "temperature": 0.2,
  "stream": false
}'
```


In [23]:
import json

import requests

# Chat-completions route on the pod's proxied inference port.
url = f"{inference_server_url}/v1/chat/completions"
headers = {"Content-Type": "application/json"}
data = {
    "model": served_model_name,
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.2,
    "stream": False,
}

# requests applies no timeout by default, so a pod that is still starting
# could block this cell forever. Allow 10 s to connect and 300 s for the reply.
response = requests.post(
    url,
    headers=headers,
    data=json.dumps(data),
    timeout=(10, 300),
)
# Raise a clear HTTPError on 4xx/5xx (e.g. the proxy answering before the
# server is up) instead of failing later while decoding a non-JSON body.
response.raise_for_status()

print(response.json())

{'id': 'cmpl-43a0d0fa170a406c90f901289319d7a0', 'object': 'chat.completion', 'created': 1184415, 'model': 'TinyLlama-1.1B-Chat-v1.0', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': "Sure, here's a test:\n\nThis is a test!"}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 24, 'total_tokens': 41, 'completion_tokens': 17}}


## Terminate Pod


In [None]:
# Terminate the pod identified by pod_id.
# NOTE(review): presumably termination is irreversible and discards the pod's
# storage — confirm before running on a pod with data you need.
runpod.terminate_pod(pod_id)