In [26]:
# install kamiwaza client sdk
! cd ../../kamiwaza-sdk && pip install -e .

Obtaining file:///Users/tylerhouchin/Desktop/kamiwaza-core/kamiwaza/kamiwaza-sdk
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: kamiwaza-client
  Attempting uninstall: kamiwaza-client
    Found existing installation: kamiwaza-client 0.1.0
    Uninstalling kamiwaza-client-0.1.0:
      Successfully uninstalled kamiwaza-client-0.1.0
  Running setup.py develop for kamiwaza-client
Successfully installed kamiwaza-client-0.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [27]:
from kamiwaza_client import KamiwazaClient
import platform
import re
import time
from IPython.display import clear_output
import openai
import warnings
# Suppress every warning for the remainder of the notebook session
warnings.filterwarnings('ignore')

# Connect to the Kamiwaza API; the endpoint lives in one named constant
KAMIWAZA_API_URL = "http://localhost:7777/api/"
client = KamiwazaClient(KAMIWAZA_API_URL)

## First check if we have any models deployed

In [28]:
# Ask the serving API for current deployments and report whether any exist
deployments = client.serving.list_deployments()
if deployments:
    print(deployments)
else:
    print('No models deployed')

No models deployed


## Next, let's check if we have any downloaded models that we can deploy.

In [29]:
# List model files known to the models API and report whether any exist
downloaded_models = client.models.list_model_files()
if downloaded_models:
    print(downloaded_models)
else:
    print('No model files downloaded')

No model files downloaded


# We do not have any models deployed or downloaded, so let's find a new model, download it, and deploy it.


# 1. Let's Download a Model

### 1.1. Search for Qwen2.5-7B-Instruct on Hugging Face

In [30]:
# Repo id used as the query string for the model search in the next cell
model_name = "Qwen/Qwen2.5-7B-Instruct"

In [31]:
# Run the search and print each hit together with its position in the result list
models = client.models.search_models(model_name)
for index, found_model in enumerate(models):
    print(f'model index: {index}')
    print(found_model)
    print('-----------')

model index: 0
Model: Qwen2.5-7B-Instruct
ID: None
Repo Model ID: Qwen/Qwen2.5-7B-Instruct
Version: None
Author: None
Created: None
Files being downloaded: 0
-----------
model index: 1
Model: Qwen2.5-7B-Instruct-GPTQ-Int4
ID: None
Repo Model ID: Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4
Version: None
Author: None
Created: None
Files being downloaded: 0
-----------
model index: 2
Model: Qwen2.5-7B-Instruct-GPTQ-Int8
ID: None
Repo Model ID: Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8
Version: None
Author: None
Created: None
Files being downloaded: 0
-----------
model index: 3
Model: Qwen2.5-7B-Instruct-AWQ
ID: None
Repo Model ID: Qwen/Qwen2.5-7B-Instruct-AWQ
Version: None
Author: None
Created: None
Files being downloaded: 0
-----------
model index: 4
Model: Qwen-Qwen2.5-7B-Instruct-llamafied
ID: None
Repo Model ID: llamafy/Qwen-Qwen2.5-7B-Instruct-llamafied
Version: None
Author: None
Created: None
Files being downloaded: 0
-----------
model index: 5
Model: Qwen-Qwen2.5-7B-Instruct-llamafied-GGUF
ID: Non

### 1.2. We have a helper function to filter these models to the ones compatible with our OS

In [32]:
# Filter the search results down to compatible models/files.
# Reuse `model_name` from the search step instead of repeating the literal,
# so the search and the compatibility filter can never drift apart.
compatible_models = client.models.filter_compatible_models(model_name)

print("Compatible models:")
for model_info in compatible_models:
    # Each entry pairs a model record with the list of its compatible files
    model = model_info['model']
    print(f"Model: {model.name}")
    print(f"Repo Id: {model.repo_modelId}")
    print("Compatible files:")
    for model_file in model_info['files']:
        print(f"- {model_file.name}")
    print("---")

Compatible models:
Model: Qwen-Qwen2.5-7B-Instruct-llamafied-GGUF
Repo Id: mradermacher/Qwen-Qwen2.5-7B-Instruct-llamafied-GGUF
Compatible files:
- Qwen-Qwen2.5-7B-Instruct-llamafied.IQ4_XS.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q2_K.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q3_K_L.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q3_K_M.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q3_K_S.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q4_0_4_4.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q4_K_M.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q4_K_S.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q5_K_M.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q5_K_S.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q6_K.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.Q8_0.gguf
- Qwen-Qwen2.5-7B-Instruct-llamafied.f16.gguf
---
Model: Qwen2.5-7B-Instruct-GGUF
Repo Id: Qwen/Qwen2.5-7B-Instruct-GGUF
Compatible files:
- qwen2.5-7b-instruct-fp16-00001-of-00004.gguf
- qwen2.5-7b-instruct-fp16-00002-of-00004.gguf
- qwen2.5-7b-instruct-fp16-00003-of-0

### 1.3. Using the helper function, we select a repo that we want to download the model from and a desired quantization, then initiate the download.

In [33]:
# Kick off the download of the q6_k quantization from the chosen repo
repo_id = "Qwen/Qwen2.5-7B-Instruct-GGUF"
download_info = client.models.initiate_model_download(repo_id, quantization='q6_k')

# Summarise what was requested: the model record and the files being fetched
requested_model = download_info['model']
requested_files = download_info['files']
print(f"Downloading model: {requested_model.name}")
print("Files being downloaded:")
for requested_file in requested_files:
    print(f"- {requested_file.name}")



Downloading model: Qwen2.5-7B-Instruct-GGUF
Files being downloaded:
- qwen2.5-7b-instruct-q6_k-00001-of-00002.gguf
- qwen2.5-7b-instruct-q6_k-00002-of-00002.gguf


### 1.4. We can monitor the status of the download here. 


In [34]:
repo_id = "Qwen/Qwen2.5-7B-Instruct-GGUF"

def all_downloads_complete(status):
    """Return True only when there is at least one tracked file and all are at 100%.

    Args:
        status: iterable of status records, each exposing `download_percentage`.

    Returns:
        bool: False for an empty `status` (nothing is being tracked, so nothing
        can be called complete); otherwise True iff every record reports 100.
    """
    records = list(status)
    # all([]) is vacuously True, which would announce completion with no downloads
    return bool(records) and all(r.download_percentage == 100 for r in records)

# Poll the download status every 3 seconds, redrawing the cell output each time,
# until all_downloads_complete() reports that everything is finished.
divider = "-----------------------------"
downloads_finished = False
while not downloads_finished:
    status = client.models.check_download_status(repo_id)

    # wait=True defers clearing until new output arrives
    clear_output(wait=True)
    print(f"Download Status for {repo_id}:")
    print(divider)

    for file_status in status:
        print(f"Model ID: {file_status.m_id}")
        print(f"Model File ID: {file_status.id}")
        print(f"Model Name: {file_status.name}")
        print(f"Download Progress: {file_status.download_percentage}%")
        print(divider)

    downloads_finished = all_downloads_complete(status)
    if downloads_finished:
        print("All downloads completed!")
    else:
        time.sleep(3)

Download Status for Qwen/Qwen2.5-7B-Instruct-GGUF:
-----------------------------
Model ID: 847e2da0-816c-4c37-a923-5f655faa54fa
Model File ID: 80ff9260-9f21-46ae-b1e4-6f2839b47b39
Model Name: qwen2.5-7b-instruct-q6_k-00001-of-00002.gguf
Download Progress: 100%
-----------------------------
All downloads completed!


# 2. Now let's deploy this model

### 2.1. Get the default model config file (optional)
This was created when we downloaded the model

In [35]:
# Get the model id from the download status polled above instead of pasting a
# machine-specific UUID, so the notebook works on any installation.
if not status:
    raise RuntimeError("No download status available - run the download monitoring cell first")
model_id = status[0].m_id

configs = client.models.get_model_configs(model_id)
if not configs:
    raise RuntimeError(f"No model configs found for model {model_id}")

# Prefer the config flagged as default; otherwise fall back to the first one
default_config = next((config for config in configs if config.default), configs[0])
default_config

ModelConfig: Default Model Config
ID: 2546ef96-9135-4ca7-ad8f-0f74f8cd7853
Model ID: 847e2da0-816c-4c37-a923-5f655faa54fa
Default: True
Created: 2024-12-27 23:53:37.224340
Kamiwaza Version: None

### 2.2 Deploy the model with default params

In [36]:
# Deploy the model, then list deployments to confirm it shows up
serving = client.serving
deployment_id = serving.deploy_model(model_id)
deployments = serving.list_deployments()
print(deployments)

[UIModelDeployment: ID: 92c54dc7-87ec-4427-8453-3df7b104f3b4
Model Name: Qwen2.5-7B-Instruct-GGUF
Status: DEPLOYED
Instances: 1
Host IP: None]


# 3. Let's do some inference

In [48]:
from openai import OpenAI

# Select the first deployment whose status is DEPLOYED and whose `instances` is truthy
deployments = client.serving.list_deployments()
valid_deployment = None
for candidate in deployments:
    is_usable = candidate.status == 'DEPLOYED' and candidate.instances
    if not is_usable:
        continue
    valid_deployment = candidate
    print(f"Found a deployment of {candidate.m_name} - using it")
    break
else:
    # Reached only when the loop finished without selecting a deployment
    print("No valid deployments found.")


Found a deployment of Qwen2.5-7B-Instruct-GGUF - using it


In [49]:
# Show the port of the selected deployment; the inference cell below builds its
# endpoint URL from it (presumably the load-balancer port - TODO confirm).
valid_deployment.lb_port

51100

In [54]:
import httpx
from openai import OpenAI

# The HTTP client and the OpenAI wrapper both target the deployment's lb_port
endpoint_url = f"http://localhost:{valid_deployment.lb_port}/v1"
http_client = httpx.Client(base_url=endpoint_url)

# Initialize the OpenAI client with the custom http_client
openai_client = OpenAI(api_key="local", base_url=endpoint_url, http_client=http_client)

print(f"Endpoint: {openai_client.base_url}")

# Conversation sent to the model: a system prompt plus a single user question
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of New Jersey? What about California? And how do I say 'green tea' in mandarin?"},
]

# Perform inference; any failure is reported rather than raised
try:
    chat_completion = openai_client.chat.completions.create(
        model="local-model",
        messages=chat_messages,
    )
    print("Model response:")
    first_choice = chat_completion.choices[0]
    print(first_choice.message.content)
except Exception as err:
    print(f"An error occurred during inference: {err}")

Endpoint: http://localhost:51100/v1/


2024-12-27 16:02:28,624 - httpx - INFO - HTTP Request: POST http://localhost:51100/v1/chat/completions "HTTP/1.1 200 OK"


Model response:
The capital of New Jersey is Trenton.

The capital of California is Sacramento.

In Mandarin, 'green tea' is said as '绿茶' (lǜ guā lì).
