In [1]:
import json
import jsonlines
import os
import pprint
import sys

from openai import OpenAI

In [3]:
endpoint = "https://conductor.arcee.ai/v1"
# Value passed as the `model` field of every chat completion request
model = "auto"

# You can sign up at https://conductor.arcee.ai
api_key = os.getenv("CONDUCTOR_API_KEY")
if not api_key:
    # Fail fast instead of handing None to the client: when api_key is None
    # the OpenAI client falls back to the OPENAI_API_KEY environment variable,
    # which would send an unrelated credential to this endpoint.
    raise RuntimeError(
        "CONDUCTOR_API_KEY is not set. Export it before running this notebook "
        "(you can sign up at https://conductor.arcee.ai)."
    )

# OpenAI-compatible client pointed at the Conductor endpoint
client = OpenAI(
    base_url=endpoint,
    api_key=api_key,
)

In [4]:
# Path of the JSON Lines input file read by load_jsonl()
input_data_file = "metadata-enrichment-test-data.jsonl"
# Name of the record field that holds the item name.
# NOTE(review): not referenced by the visible cells, which use the literal "Item" key — confirm whether it is still needed.
column_name = "Item"

def load_jsonl(file_path):
    """Read every record of the JSON Lines file at `file_path`.

    Returns:
        list: the decoded records, in file order.
    """
    with jsonlines.open(file_path) as records:
        # Exhaust the reader while the file is still open
        return list(records)

In [None]:
# Read the whole input file into memory
lines = load_jsonl(input_data_file)

# Sanity check: record count plus a preview of the first three records
print(f"Loaded {len(lines)} items")
print("\nSample items:")

# (printed label, record key) pairs, in display order
preview_fields = (
    ("Item", "Item"),
    ("SKU", "SKU"),
    ("Stock", "Stock"),
    ("Last Update", "LastUpdate"),
)
separator = "-" * 50

for item in lines[:3]:
    for label, key in preview_fields:
        print(f"{label}: {item[key]}")
    print(separator)



In [6]:
def _strip_code_fence(text):
    """Return `text` without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    # Drop the opening fence line, which may carry a language tag such as "json"
    _, _, text = text.partition("\n")
    text = text.rstrip()
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _parse_enrichment(raw_content):
    """Decode the model's answer into a dict; return None when it is unusable."""
    # The message content can be missing (None) rather than a string
    if not isinstance(raw_content, str):
        return None
    try:
        parsed = json.loads(_strip_code_fence(raw_content))
    except json.JSONDecodeError:
        return None
    # Valid JSON that is not an object (list, string, number...) has no fields to read
    return parsed if isinstance(parsed, dict) else None


def process_item(item_data):
    """Process a single item with the model and return enriched data.

    Args:
        item_data: mapping with the "Item", "SKU", "Stock" and "LastUpdate" keys.

    Returns:
        dict with two entries:
          - "merged_data": the four input fields plus "Description",
            "Applications" and "Risks" taken from the model's answer
            (placeholder values when the answer cannot be parsed into a
            JSON object);
          - "accounting": "model_id", "input_tokens" and "output_tokens" read
            from the response (token counts are 0 when the response carries
            no usage information).

    Raises:
        KeyError: if `item_data` lacks one of the four expected keys.
    """
    item_name = item_data["Item"]
    sku = item_data["SKU"]
    stock = item_data["Stock"]
    last_update = item_data["LastUpdate"]

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a medical equipment expert. Provide accurate descriptions, applications, and risks for medical items. Always respond with valid JSON."
            },
            {
                "role": "user",
                "content": f"""
                Based on the item "{item_name}", provide a valid JSON document with the following structure:
                {{
                  "Description": "a human-readable description, in 1-2 sentences",
                  "Applications": ["main use case 1", "main use case 2", "etc"],
                  "Risks": ["main precaution 1", "main precaution 2", "etc"]
                }}
                Your answer must stay within the medical equipment domain.
                Don't reuse the item name in the description, applications or risks.
                Ensure your response is a properly formatted JSON object with no additional text.
                Make sure to add a closing brace at the end of the JSON object.
                Don't add extra tabs or spaces.
                Don't use Markdown or backticks.
                """
            }
        ],
        stream=False,
    )

    # An empty choices list is treated like a missing answer
    raw_content = response.choices[0].message.content if response.choices else None

    enriched_data = _parse_enrichment(raw_content)
    if enriched_data is None:
        # Keep the batch going with placeholders instead of failing on one bad answer
        print(f"\nError parsing JSON for item: {item_name}")
        print(raw_content)
        enriched_data = {
            "Description": "Description unavailable due to formatting error",
            "Applications": ["Not available"],
            "Risks": ["Not available"]
        }

    # Print a dot to indicate progress
    sys.stdout.write('.')
    sys.stdout.flush()

    # Extract the model id, the number of input and output tokens from the response
    model_id = response.model
    usage = response.usage
    input_tokens = usage.prompt_tokens if usage is not None else 0
    output_tokens = usage.completion_tokens if usage is not None else 0

    merged_data = {
        "Item": item_name,
        "SKU": sku,
        "Stock": stock,
        "LastUpdate": last_update,
        "Description": enriched_data.get("Description", ""),
        "Applications": enriched_data.get("Applications", ""),
        "Risks": enriched_data.get("Risks", ""),
    }
    return {
        "merged_data": merged_data,
        "accounting": {
            "model_id": model_id,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
        }
    }

    

In [None]:
# Two parallel lists, one entry per input record: the enriched record and the
# accounting entry of the request that produced it
enriched_lines, accounting_data = [], []

for source_record in lines:
    outcome = process_item(source_record)
    enriched_lines.append(outcome["merged_data"])
    accounting_data.append(outcome["accounting"])

In [None]:
# Model price table. Each pair is (input price, output price) in dollars;
# the cost computation divides token counts by 1e6 when applying them.
_price_pairs = {
    "claude-3-7-sonnet": (3.00, 15.00),
    "deepseek/r1": (3.00, 7.00),
    "gpt-4.1": (2.00, 8.00),
    "openai/o3-mini": (1.10, 4.40),
    "arcee-ai/virtuoso-large": (0.75, 1.20),
    "arcee-ai/virtuoso-medium-v2": (0.50, 0.80),
    "arcee-ai/arcee-blitz": (0.45, 0.75),
    "arcee-ai/caller-large": (0.55, 0.85),
    "arcee-ai/maestro-32b": (0.90, 3.30),
}

# Expand the pairs into the keyed records the rest of the notebook reads
model_prices = {
    name: {
        "input_price_dollars": input_price,
        "output_price_dollars": output_price,
    }
    for name, (input_price, output_price) in _price_pairs.items()
}

# Reduce the accounting data on model_id
# Calculate total tokens and cost by model
model_summary = {}

# Lower-cased view of the price table, so the case-insensitive lookup is a
# single dict access per entry
prices_by_lowercase_id = {name.lower(): price for name, price in model_prices.items()}

# Models that were invoked but have no price entry; their cost stays at $0,
# so they are reported below instead of being ignored silently
unpriced_models = set()

for entry in accounting_data:
    model_id = entry['model_id']
    # Collapse dated ids (e.g. "claude-3-7-sonnet-20250219") onto the price table key.
    # NOTE(review): this is a plain prefix match, so any other id starting with
    # the same prefix is priced as the base model — confirm that is intended.
    if model_id.startswith("claude-3-7-sonnet"):
        model_id = "claude-3-7-sonnet"
    if model_id.startswith("gpt-4.1"):
        model_id = "gpt-4.1"
    input_tokens = entry['input_tokens']
    output_tokens = entry['output_tokens']

    summary = model_summary.setdefault(model_id, {
        'total_input_tokens': 0,
        'total_output_tokens': 0,
        'total_tokens': 0,
        'total_cost_dollars': 0,
        'invocation_count': 0
    })

    summary['total_input_tokens'] += input_tokens
    summary['total_output_tokens'] += output_tokens
    summary['total_tokens'] += (input_tokens + output_tokens)
    summary['invocation_count'] += 1

    # Calculate cost if model exists in price dictionary
    price = prices_by_lowercase_id.get(model_id.lower())
    if price is None:
        unpriced_models.add(model_id)
        continue

    input_cost = input_tokens * price['input_price_dollars'] / 1e6
    output_cost = output_tokens * price['output_price_dollars'] / 1e6
    summary['total_cost_dollars'] += (input_cost + output_cost)

if unpriced_models:
    print(f"Warning: no price configured for {sorted(unpriced_models)}; their cost is counted as $0")
    print()

# Display the summary
# The loop variable is deliberately NOT named `model`: that global holds the
# model id sent with every request, and rebinding it here would silently change
# what the processing cell asks for if it is run again afterwards.
for summary_model_id, stats in model_summary.items():
    print(f"Model: {summary_model_id}")
    print(f"  Invocation Count: {stats['invocation_count']}")
    print(f"  Total Input Tokens: {stats['total_input_tokens']}")
    print(f"  Total Output Tokens: {stats['total_output_tokens']}")
    print(f"  Total Tokens: {stats['total_tokens']}")
    print(f"  Total Cost: ${stats['total_cost_dollars']:.8f}")
    print()

# Sum all the input and output tokens
total_input_tokens = sum(stats['total_input_tokens'] for stats in model_summary.values())
total_output_tokens = sum(stats['total_output_tokens'] for stats in model_summary.values())
print(f"Total Input Tokens: {total_input_tokens}")
print(f"Total Output Tokens: {total_output_tokens}")

# Sum of all the costs
total_cost = sum(stats['total_cost_dollars'] for stats in model_summary.values())
print(f"Total Cost: ${total_cost:.8f}")
print('')

def _baseline_cost_dollars(price, input_tokens, output_tokens):
    """Cost in dollars of the given token counts at `price` (applied per 1e6 tokens)."""
    return (
        input_tokens * price['input_price_dollars']
        + output_tokens * price['output_price_dollars']
    ) / 1e6


def _savings_line(actual_cost, baseline_cost):
    """Format the savings of `actual_cost` relative to `baseline_cost`.

    A positive percentage means the actual run was cheaper than the baseline.
    """
    if baseline_cost <= 0:
        # No tokens were processed: a ratio would divide by zero
        return "Cost Savings: n/a (baseline cost is zero)"
    return f"Cost Savings: {((baseline_cost - actual_cost) / baseline_cost) * 100:.2f}%"


# Total cost if we had used claude-3-7-sonnet-20250219 for all the tokens
claude_3_7_sonnet_cost = _baseline_cost_dollars(
    model_prices['claude-3-7-sonnet'], total_input_tokens, total_output_tokens
)
print(f"Total Cost if we had used claude-3-7-sonnet: ${claude_3_7_sonnet_cost:.8f}")
print(_savings_line(total_cost, claude_3_7_sonnet_cost))

# Total cost if we had used gpt-4.1-2025-04-14 for all the tokens
gpt_4_1_cost = _baseline_cost_dollars(
    model_prices['gpt-4.1'], total_input_tokens, total_output_tokens
)
print(f"Total Cost if we had used gpt-4.1: ${gpt_4_1_cost:.8f}")
print(_savings_line(total_cost, gpt_4_1_cost))



In [9]:
# Persist the enriched records as JSON Lines: one JSON document per line
enriched_lines_file = "enriched_lines.jsonl"

with open(enriched_lines_file, "w") as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in enriched_lines)
