# Model Evaluation: Counting the 'r's in "strawberry"

This notebook demonstrates how to use the Kamiwaza SDK to evaluate multiple language models on a simple counting task. We'll deploy three different models, ask each one to count the number of 'r's in the word "strawberry" (the correct answer is 3), and compare their responses.

This approach shows how Kamiwaza can be used for automated model evaluation and benchmarking.

## 1. Setup

First, we'll initialize the Kamiwaza client and define the list of models we want to evaluate. We'll also create a directory to store our evaluation results.

In [None]:
from kamiwaza_sdk import kamiwaza_sdk as kz
import os
import json
from datetime import datetime

In [None]:
# Connect to the Kamiwaza API served on localhost, port 7777
KAMIWAZA_API_URL = "http://localhost:7777/api/"
client = kz(KAMIWAZA_API_URL)

# Repositories to evaluate; the evaluation loop deploys and queries them in this order
models = [
    "Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF",
    "Qwen/Qwen2.5-7B-Instruct-GGUF",
    "bartowski/Llama-3-8B-Instruct-Coder-v2-GGUF",
]

# Make sure the folder that will hold the per-model JSON results exists
os.makedirs("eval_results", exist_ok=True)

## 2. Run Evaluations

Next, we'll loop through each model and:
1. Download and deploy the model (if not already downloaded)
2. Wait for the model to fully initialize
3. Run 500 evaluation queries (set by `num_runs`) asking "How many r's are in the word 'strawberry'?" and instructing the model to respond with only a single number
4. Save the results to a JSON file
5. Stop the deployment before moving to the next model

This automated process allows us to systematically test each model with the same prompt and collect their responses for comparison.

In [None]:
import time
from datetime import datetime
import json
import os

# Evaluate every model in `models`: deploy it, send the same prompt `num_runs`
# times, write the per-run records to eval_results/<model>_<timestamp>.json,
# and stop the deployment again. A summary mapping model name -> result file
# is written once all models have been processed.

# Create output directory for results
os.makedirs("eval_results", exist_ok=True)

# Number of evaluation runs per model
num_runs = 500

# Evaluation prompt
prompt = "How many r's are in the word 'strawberry'? ONLY RESPOND WITH A SINGLE NUMBER"

# Track overall results
all_results = {}

for model_repo in models:
    print(f"Evaluating model: {model_repo}")
    model_name = model_repo.split('/')[-1]

    # Download and deploy the model
    print(f"Downloading and deploying {model_name}...")
    deploy_result = client.models.download_and_deploy_model(model_repo)
    deployment_id = deploy_result.get('deployment_id')

    if not deployment_id:
        print(f"Failed to deploy {model_name}, skipping...")
        continue

    # A deployment is now running. Everything up to saving the results is
    # wrapped in try/finally so the deployment is stopped even if a step
    # raises or the kernel is interrupted mid-run.
    try:
        # Fixed wait time to ensure model is fully loaded
        wait_time = 15  # seconds
        print(f"Waiting {wait_time} seconds for model to fully initialize...")
        time.sleep(wait_time)

        # Create OpenAI client for the model
        openai_client = client.openai.get_client(repo_id=model_repo)

        # Run evaluations and collect responses
        model_results = []
        print(f"Running {num_runs} evaluations...")

        for i in range(num_runs):
            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments during the request.
            start_time = time.perf_counter()
            try:
                response = openai_client.chat.completions.create(
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    model="model",
                )
                elapsed = time.perf_counter() - start_time

                run_record = {
                    "run_id": i,
                    "response": response.choices[0].message.content.strip(),
                    "time_seconds": elapsed
                }
                print(f"Run {i+1}/{num_runs}: Response = '{run_record['response']}' ({run_record['time_seconds']:.2f}s)")
            except Exception as e:
                # A failed run is recorded (not dropped) so the run count in
                # the result file always matches the number of attempts.
                print(f"Error on run {i+1}: {str(e)}")
                run_record = {
                    "run_id": i,
                    "response": "ERROR",
                    "time_seconds": time.perf_counter() - start_time,
                    "error": str(e)
                }
            model_results.append(run_record)

        # Save the results to a file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"eval_results/{model_name}_{timestamp}.json"

        with open(filename, "w", encoding="utf-8") as f:
            json.dump(model_results, f, indent=2)

        print(f"Results saved to {filename}")
        all_results[model_name] = {
            "filename": filename,
            "results": model_results
        }
    finally:
        # Stop the deployment
        print(f"Stopping deployment for {model_name}...")
        client.serving.stop_deployment(repo_id=model_repo)

# Save the overall results
with open("eval_results/all_models_summary.json", "w", encoding="utf-8") as f:
    # We'll save just the filenames in the summary to avoid duplicating all the data
    summary = {k: {"filename": v["filename"]} for k, v in all_results.items()}
    json.dump(summary, f, indent=2)

print("All evaluations complete!")

## 3. Display Results

Now we'll read the saved results and display a summary for each model, including:
- Total number of evaluation runs
- Average response time
- Distribution of responses
- Accuracy (percentage of correct answers)

This gives us a quick overview of how each model performed on our task.

In [None]:
import os
import json

# Print a per-file summary of the saved evaluation results: run count,
# average response time, response distribution, and accuracy against "3".
from collections import Counter

# Get all result files (sorted so the report order is stable between runs;
# os.listdir returns entries in arbitrary order)
result_files = sorted(
    f for f in os.listdir("eval_results")
    if f.endswith(".json") and f != "all_models_summary.json"
)

for file in result_files:
    print(f"\n=== Results from {file} ===")
    with open(os.path.join("eval_results", file), "r") as f:
        data = json.load(f)

    total_runs = len(data)
    if total_runs == 0:
        # Every average/percentage below divides by the run count, so an
        # empty result list would raise ZeroDivisionError.
        print("No runs recorded in this file, skipping...")
        continue

    # Count occurrences of each response
    responses = Counter(item["response"] for item in data)

    # Calculate average response time
    avg_time = sum(item["time_seconds"] for item in data) / total_runs

    # Display summary
    print(f"Total runs: {total_runs}")
    print(f"Average response time: {avg_time:.2f} seconds")
    print("Response distribution:")
    for response, count in responses.items():
        percentage = (count / total_runs) * 100
        print(f"  '{response}': {count} ({percentage:.1f}%)")

    # Check for correct answers (3); only the exact string "3" counts.
    # Counter returns 0 for a response that never occurred.
    correct = responses["3"]
    accuracy = (correct / total_runs) * 100
    print(f"Accuracy (correct answer is '3'): {correct}/{total_runs} ({accuracy:.1f}%)")

## 4. Visualize Response Distribution

Finally, we'll create a visualization comparing how each model responded to our question across all trials.

The chart shows the percentage distribution of answers from each model to the prompt "How many r's are in strawberry?" (the correct answer is 3). 

In our run, Qwen2.5-7B-Instruct-GGUF was remarkably consistent, answering correctly 96% of the time, while the smaller Qwen2.5-Coder-0.5B-Instruct-GGUF and Llama-3-8B-Instruct-Coder-v2-GGUF models gave more varied responses. Your numbers may differ, since the chart is built from whatever result files are present in `eval_results`. This simple evaluation demonstrates how Kamiwaza can be used to quickly benchmark multiple models on the same task.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import json

# Draw a grouped bar chart of the response distribution (as percentages)
# for every result file found in eval_results.
from collections import Counter

# Get all result files (sorted so bar and legend order is stable between runs;
# os.listdir returns entries in arbitrary order)
result_files = sorted(
    f for f in os.listdir("eval_results")
    if f.endswith(".json") and f != "all_models_summary.json"
)

# Prepare data for visualization
model_names = []
response_distributions = []

for file in result_files:
    with open(os.path.join("eval_results", file), "r") as f:
        data = json.load(f)

    if not data:
        # An empty result list has no distribution and would divide by zero
        # when converted to percentages.
        continue

    # Result files are named "<model_name>_<YYYYmmdd>_<HHMMSS>.json", so only
    # the two trailing timestamp fields are stripped; splitting on the first
    # underscore would truncate model names that contain one.
    model_names.append(file.rsplit('_', 2)[0])

    # Get response distribution
    response_distributions.append(Counter(item["response"] for item in data))

if not response_distributions:
    print("No evaluation results found in eval_results; nothing to plot.")
else:
    all_responses = sorted({resp for dist in response_distributions for resp in dist})
    x = np.arange(len(all_responses))

    # Bars are 0.2 wide for up to four result files; beyond that they shrink
    # so one group never spills into the neighbouring tick's group.
    width = min(0.2, 0.8 / len(model_names))

    fig, ax = plt.subplots(figsize=(14, 8))

    for i, (model, dist) in enumerate(zip(model_names, response_distributions)):
        total = sum(dist.values())
        percentages = [dist[resp] / total * 100 for resp in all_responses]

        # Centre the group of bars for all models on each tick position
        positions = x + (i - len(model_names) / 2 + 0.5) * width
        ax.bar(positions, percentages, width, label=model)

        # Add percentage labels on top of bars
        for pos, pct in zip(positions, percentages):
            if pct > 0:  # Only add labels to non-zero bars
                ax.text(pos, pct + 2, f"{pct:.0f}%", ha='center', fontsize=10)

    ax.set_ylabel('Percentage of Responses (%)', fontsize=12)
    ax.set_title('Model Response Distribution: "How many r\'s are in strawberry?"', fontsize=16)
    ax.set_xticks(x)
    ax.set_xticklabels(all_responses, fontsize=12)
    ax.legend(fontsize=12)
    ax.set_ylim(0, 105)  # Leave room for percentage labels
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    fig.tight_layout()
    plt.show()