In [None]:
# install kamiwaza client sdk
! cd ../../kamiwaza-sdk && pip install -e .

In [27]:
from kamiwaza_client import KamiwazaClient
import platform
import re
import time
from IPython.display import clear_output
import openai
import warnings
warnings.filterwarnings('ignore')

# Base URL of the Kamiwaza API that every later cell talks to.
KAMIWAZA_API_URL = "http://localhost:7777/api/"

# Single shared client instance, reused by all cells below.
client = KamiwazaClient(KAMIWAZA_API_URL)

## First check if we have any models deployed

In [None]:
# Fetch the current deployments and print either the result or a
# placeholder message when nothing is deployed.
deployments = client.serving.list_deployments()
print(deployments if len(deployments) > 0 else 'No models deployed')

## Next, let's check if we have any models downloaded that we can deploy.

In [None]:
# Fetch the model files known to the server and print either the result
# or a placeholder message when nothing has been downloaded.
downloaded_models = client.models.list_model_files()
print(downloaded_models if len(downloaded_models) > 0 else 'No model files downloaded')

# We do not have any models deployed or downloaded, so let's get a new model, download it, and deploy it.


# 1. Let's Download a Model

### 1.1. Search for Qwen2.5-7B-Instruct on Hugging Face

In [30]:
# Name of the model to look up; passed to `client.models.search_models` in the next cell.
model_name = "Qwen/Qwen2.5-7B-Instruct"

In [None]:
# Search for the model and print each hit together with its position in the
# result list, followed by a separator line.
models = client.models.search_models(model_name)
for index, found_model in enumerate(models):
    print(f'model index: {index}', found_model, '-----------', sep='\n')

### 1.2. We have a helper function to filter these models to the ones compatible with our OS

In [None]:
# Reuse `model_name` (set in the search step above) instead of repeating the
# string literal, so the search and the compatibility filter always target
# the same model.
compatible_models = client.models.filter_compatible_models(model_name)

print("Compatible models:")
for model_info in compatible_models:
    # Each entry pairs a model record with the files selected for it.
    candidate = model_info['model']
    print(f"Model: {candidate.name}")
    print(f"Repo Id: {candidate.repo_modelId}")
    print("Compatible files:")
    for file in model_info['files']:
        print(f"- {file.name}")
    print("---")

### 1.3. Using the helper function, we select a repo that we want to download the model from and a desired quantization. We initiate the download.

In [None]:
# Start the download: choose the repository and the quantization to fetch.
repo_id = "Qwen/Qwen2.5-7B-Instruct-GGUF"
download_info = client.models.initiate_model_download(repo_id, quantization='q6_k')

# Summarise what was requested: the model name, then one line per file.
requested_model = download_info['model']
requested_files = download_info['files']

print(f"Downloading model: {requested_model.name}")
print("Files being downloaded:")
for model_file in requested_files:
    print(f"- {model_file.name}")



### 1.4. We can monitor the status of the download here. 


In [None]:
repo_id = "Qwen/Qwen2.5-7B-Instruct-GGUF"

# Seconds to wait between two status polls.
POLL_INTERVAL_SECONDS = 3
# Consecutive polls with no download records before we stop waiting.
MAX_EMPTY_POLLS = 10


def all_downloads_complete(status):
    """Return True only if `status` is non-empty and every entry is at 100%.

    `all()` over an empty iterable is True, so an empty status list (no
    download records returned) is checked explicitly; otherwise the loop
    below would report completion when nothing was downloaded.
    """
    return bool(status) and all(s.download_percentage == 100 for s in status)


empty_polls = 0
while True:
    status = client.models.check_download_status(repo_id)

    # Redraw the status in place instead of appending to the cell output.
    clear_output(wait=True)
    print(f"Download Status for {repo_id}:")
    print("-----------------------------")

    if not status:
        # No records at all: keep waiting for a bounded number of polls,
        # then stop rather than loop forever or claim success.
        empty_polls += 1
        if empty_polls >= MAX_EMPTY_POLLS:
            print("No download records found - was the download initiated?")
            break
        print("No download records reported yet, waiting...")
        time.sleep(POLL_INTERVAL_SECONDS)
        continue
    empty_polls = 0

    for s in status:
        print(f"Model ID: {s.m_id}")
        print(f"Model File ID: {s.id}")
        print(f"Model Name: {s.name}")
        print(f"Download Progress: {s.download_percentage}%")
        print("-----------------------------")

    if all_downloads_complete(status):
        print("All downloads completed!")
        break

    time.sleep(POLL_INTERVAL_SECONDS)

# 2. Now let's deploy this model

### 2.1. Get the default model config file (optional)
This was created when we downloaded the model

In [None]:
# Take the model id from the download status gathered by the monitoring loop
# instead of hard-coding a UUID that only exists on one machine.
# Assumes every status entry belongs to the same model (the download above
# targeted a single repo) - TODO confirm if several models are downloaded.
if not status:
    raise RuntimeError("No download status available - run the download monitoring cell first.")
model_id = str(status[0].m_id)

configs = client.models.get_model_configs(model_id)

# Prefer the config flagged as default; otherwise fall back to the first one,
# or to None when the model has no configs (avoids an IndexError on `configs[0]`).
fallback_config = configs[0] if configs else None
default_config = next((config for config in configs if config.default), fallback_config)
default_config

### 2.2 Deploy the model with default params

In [None]:
# Deploy the model using the server-side defaults and keep the returned value.
deployment_id = client.serving.deploy_model(model_id)

# Re-query the deployments so the new one shows up in the printed list.
deployments = client.serving.list_deployments()
print(deployments)

# 3. Let's do some inference

In [None]:
from openai import OpenAI

# Pick the first deployment that is in the 'DEPLOYED' state and has at
# least one instance; `None` when no deployment qualifies.
deployments = client.serving.list_deployments()
valid_deployment = next(
    (
        candidate
        for candidate in deployments
        if candidate.status == 'DEPLOYED' and candidate.instances
    ),
    None,
)

if valid_deployment is not None:
    print(f"Found a deployment of {valid_deployment.m_name} - using it")
else:
    print("No valid deployments found.")


In [None]:
# Display the port of the selected deployment; the next cell uses it to build
# the `http://localhost:<port>/v1` endpoint URL.
valid_deployment.lb_port

In [None]:
import httpx
import time
from openai import OpenAI

# Fail early with a clear message instead of an AttributeError on `None.lb_port`
# when the deployment lookup above found nothing.
if valid_deployment is None:
    raise RuntimeError("No valid deployment found - deploy a model before running inference.")

# Build the endpoint URL once so the HTTP client and the OpenAI client cannot drift apart.
base_url = f"http://localhost:{valid_deployment.lb_port}/v1"

http_client = httpx.Client(base_url=base_url)

# Initialize the OpenAI client with the custom http_client
openai_client = OpenAI(
    api_key="local",
    base_url=base_url,
    http_client=http_client
)

print(f"Endpoint: {openai_client.base_url}")

# Initial sleep to let server initialize
time.sleep(1)

# Retry policy: a 404 is treated as "server not ready yet" and retried after a
# fixed delay; any other error, or a 404 on the last attempt, is reported.
max_retries = 3
retry_delay_seconds = 5

for attempt in range(max_retries):
    try:
        chat_completion = openai_client.chat.completions.create(
            model="local-model",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What is the capital of New Jersey? What about California? And how do I say 'green tea' in mandarin?"}
            ]
        )
        print("Model response:")
        print(chat_completion.choices[0].message.content)
        break
    except Exception as e:
        retries_left = attempt < max_retries - 1
        if "404" in str(e) and retries_left:
            print(f"Server not ready (attempt {attempt + 1}/{max_retries}), waiting {retry_delay_seconds} seconds...")
            time.sleep(retry_delay_seconds)
            continue
        print(f"An error occurred during inference: {e}")
        break