In [2]:
!pip install -q fastapi uvicorn pyngrok litellm langfuse requests pandas plotly kaleido

In [3]:
import os
import getpass

os.environ["GEMINI_API_KEY"] = getpass.getpass("Enter your Google AI Studio API Key: ")
# --- ADD THIS LINE ---
os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API Key: ")
# --- END ADD ---
NGROK_AUTHTOKEN = getpass.getpass("Enter your ngrok Authtoken: ")
os.environ["LANGFUSE_PUBLIC_KEY"] = getpass.getpass("Enter your Langfuse Public Key: ")
os.environ["LANGFUSE_SECRET_KEY"] = getpass.getpass("Enter your Langfuse Secret Key: ")

Enter your Google AI Studio API Key: ··········
Enter your Groq API Key: ··········
Enter your ngrok Authtoken: ··········
Enter your Langfuse Public Key: ··········
Enter your Langfuse Secret Key: ··········


In [4]:
%%writefile app.py

import os
import time
from fastapi import FastAPI, HTTPException
import litellm
import traceback # Keep for potential future debugging
from typing import List, Dict

litellm.callbacks = ["langfuse"]
app = FastAPI()

REFERENCE_ANSWERS = {
    "what is the capital of france?": "Paris",
    "what is 2+2?": "4",
    "hello": "Hello",
    "calculate 15 * 23": "345",
    "calculate 125 * 47": "5875",
    # Add reference answers for the new factual prompts if desired
    "what is 5 + 7?": "12",
    "calculate 31 multiplied by 19": "589",
    "who wrote the play 'romeo and juliet'?": "William Shakespeare",
    "what is 987 / 3?": "329",
    "what branch of science studies stars and planets?": "Astronomy",
    "what is the capital city of canada?": "Ottawa",
}

# --- All routes now point to gemini/gemini-2.5-flash, Router order corrected ---
def smart_router(prompt: str) -> dict:
    """Route based on prompt type - ALL USE FLASH MODEL."""
    p = prompt.lower()
    base_model = "gemini/gemini-2.5-flash" # Define the single model here

    # 1. Check for precise/calculation prompts first
    if any(w in p for w in ["calculate", "what is", "capital", "+", "-", "*", "/", "multiplied by", "divided by"]):
        return {"model": base_model, "temp": 0.1, "name": "Flash-Precise"}

    # 2. Check for CODING prompts BEFORE creative prompts
    if any(w in p for w in ["code", "function", "python", "debug", "error in this python code"]):
        return {"model": base_model, "temp": 0.3, "name": "Flash-Coder"}

    # 3. Check for creative prompts (now less likely to overlap with code)
    if any(w in p for w in ["write", "poem", "story", "creative", "haiku", "imagine", "describe"]):
        return {"model": base_model, "temp": 0.9, "name": "Flash-Creative"}

    # 4. Check for analytical/factual prompts (broader keywords)
    if any(w in p for w in ["analyze", "compare", "explain", "themes", "summarize", "who wrote", "what branch", "common colors"]) or len(prompt) > 200: # Reduced length threshold slightly
        return {"model": base_model, "temp": 0.5, "name": "Flash-Analyst"} # Uses Flash

    # 5. Fallback to balanced
    return {"model": base_model, "temp": 0.7, "name": "Flash-Balanced"}
# --- END MODIFICATION ---

def calculate_cost(completion, model_name: str) -> float:
    """Calculate cost based on tokens. Assumes only Flash model is used."""
    try:
        # Check if usage object and tokens exist, otherwise return 0
        if not hasattr(completion, 'usage') or not completion.usage:
             print("⚠️ WARNING: Usage object missing in completion, cannot calculate cost.")
             return 0.0
        p_tok = getattr(completion.usage, 'prompt_tokens', 0)
        c_tok = getattr(completion.usage, 'completion_tokens', 0)

        # Gemini Flash costs: $0.00001875/1K input, $0.000075/1K output
        return (p_tok * 0.00001875 + c_tok * 0.000075) / 1000
    except Exception as e:
        print(f"❌ ERROR calculating cost: {e}")
        return 0.0


def check_accuracy(prompt: str, response: str) -> dict:
    """Check if response matches reference answer."""
    ref = REFERENCE_ANSWERS.get(prompt.lower().strip().replace('?','')) # Clean prompt slightly
    if not ref:
        return {"match": None, "has_ref": False}

    # Make matching more robust (ignore case, punctuation, extra words)
    response_cleaned = response.lower().strip().replace('.','').replace(',','')
    ref_cleaned = ref.lower().strip()

    match = ref_cleaned in response_cleaned
    # For calculations, also check if the number is simply present
    if ref_cleaned.isdigit():
        match = match or f" {ref_cleaned} " in f" {response_cleaned} " or response_cleaned.endswith(f" {ref_cleaned}")

    return {"match": match, "has_ref": True, "expected": ref}


@app.post("/chat")
def chat(request_body: dict):
    """Single chat endpoint using only Gemini Flash."""
    prompt = request_body.get("prompt", "Hello")
    config = smart_router(prompt)
    timeout_duration = 45 # Keep slightly longer timeout for safety
    start = time.time()

    try:
        # LiteLLM uses GEMINI_API_KEY from environment automatically
        print(f"Attempting model: {config['model']} with name: {config['name']}") # Debug print

        completion = litellm.completion(
            model=config["model"],
            messages=[{"role": "user", "content": prompt}],
            temperature=config["temp"],
            max_tokens=500,
            timeout=timeout_duration,
            # No api_key needed here, LiteLLM uses env var for Gemini
        )

        latency = time.time() - start

        # More robust check for valid completion structure
        if not completion or not hasattr(completion, 'choices') or not completion.choices or not completion.choices[0].message:
             # Handle potential empty message or missing attributes gracefully
             if completion and hasattr(completion, 'model_response') and hasattr(completion.model_response, 'get'):
                  finish_reason = completion.model_response.get('usageMetadata', {}).get('finishReason', 'Unknown')
                  if finish_reason == 'SAFETY':
                       raise HTTPException(status_code=400, detail="API request blocked due to SAFETY filters.")
                  elif finish_reason == 'RECITATION':
                       raise HTTPException(status_code=400, detail="API request blocked due to RECITATION filters.")
             raise HTTPException(status_code=500, detail="API returned empty or invalid response structure")


        response = getattr(completion.choices[0].message, 'content', None) # Safely get content
        if response is None:
             # Check finish reason again if content is None
             finish_reason = "Unknown"
             if hasattr(completion, 'model_response') and hasattr(completion.model_response, 'get'):
                   finish_reason = completion.model_response.get('usageMetadata', {}).get('finishReason', 'Unknown')
             raise ValueError(f"Response content is None. Finish Reason: {finish_reason}")


        # Handle potential missing usage info more gracefully
        p_tok, c_tok, total_tok = 0, 0, 0
        if hasattr(completion, 'usage') and completion.usage:
            p_tok = getattr(completion.usage, 'prompt_tokens', 0)
            c_tok = getattr(completion.usage, 'completion_tokens', 0)
            total_tok = getattr(completion.usage, 'total_tokens', p_tok + c_tok) # Estimate total if missing
        else:
            print("⚠️ WARNING: Usage attribute missing, estimating tokens.")
            p_tok = len(prompt.split()) # Estimate
            c_tok = len(response.split()) if response else 0 # Estimate
            total_tok = p_tok + c_tok

        # Pass the friendly NAME from the router to calculate_cost
        cost = calculate_cost(completion, config["name"])
        accuracy = check_accuracy(prompt, response)

        return {
            "response": response,
            "model": config["name"], # Log the friendly name
            "temperature": config["temp"],
            "latency_sec": round(latency, 3),
            "tokens_per_sec": round(total_tok / latency, 2) if latency > 0 else 0,
            "cost_usd": round(cost, 9),
            "tokens": {"prompt": p_tok, "completion": c_tok, "total": total_tok},
            "accuracy": accuracy
        }

    except litellm.RateLimitError as e: # Catch specific RateLimitError
         print(f"RATE LIMIT HIT for model {config.get('model', 'N/A')}: {e}")
         raise HTTPException(status_code=429, detail=f"Rate limit exceeded. {str(e)}")
    except litellm.APIConnectionError as e:
         print(f"API CONNECTION ERROR for model {config.get('model', 'N/A')}: {e}")
         raise HTTPException(status_code=503, detail=f"API connection error. {str(e)}")
    except litellm.AuthenticationError as e:
         print(f"AUTHENTICATION ERROR for model {config.get('model', 'N/A')}: {e}")
         raise HTTPException(status_code=401, detail=f"Authentication error. Check API Key. {str(e)}")
    except HTTPException as e: # Re-raise known HTTP exceptions
         raise e
    except Exception as e: # Catch other potential errors
        error_type_name = type(e).__name__
        print(f"❌ UNEXPECTED ERROR during litellm.completion for model {config.get('model', 'N/A')}:")
        print(f"   Error Type: {error_type_name}")
        print(f"   Error Details: {str(e)}")
        # traceback.print_exc() # Uncomment for full stack trace if needed

        api_response_text = "N/A"
        # Try getting response text for common request exception types
        if hasattr(e, 'response') and hasattr(e.response, 'text'): api_response_text = e.response.text
        elif hasattr(e, 'message'): api_response_text = getattr(e, 'message', str(e))

        # Default to 500 but include more specific info
        raise HTTPException(
            status_code=500,
            detail=f"Completion failed for model {config.get('model', 'N/A')}. Type: {error_type_name}. Detail: {str(e)}. API Response: {api_response_text}"
        )


@app.post("/batch")
def batch_chat(request_body: dict):
    """Batch processing endpoint with increased delay"""
    prompts = request_body.get("prompts", [])
    if not prompts:
        raise HTTPException(status_code=400, detail="No prompts provided")
    results = []
    print(f"\n--- Starting Batch of {len(prompts)} prompts ---") # Add batch start log
    for i, prompt in enumerate(prompts):
        print(f"Batch progress: Processing prompt {i+1}/{len(prompts)}") # Add progress log
        try:
            result = chat({"prompt": prompt})
            results.append({"success": True, "data": result})
        except Exception as e:
            error_message = str(e)
            status_code = 500 # Default
            if isinstance(e, HTTPException):
                 error_message = e.detail # Get the detailed message from chat()
                 status_code = e.status_code
            results.append({"success": False, "error": f"Status {status_code}: {error_message}", "prompt": prompt})
            print(f"   Batch Error on prompt {i+1} (Status {status_code}): {error_message[:200]}...") # Log error during batch

        # --- Increased delay to 5 seconds ---
        print(f"   Waiting 5 seconds before next request...")
        time.sleep(5.0)
        # --- END Increased delay ---

    print(f"--- Finished Batch of {len(prompts)} prompts ---") # Add batch end log
    return {
        "results": results,
        "total": len(prompts),
        "successful": sum(1 for r in results if r.get("success")),
        "failed": sum(1 for r in results if not r.get("success"))
    }

@app.get("/")
def root():
    # Update the list of models to reflect actual usage
    return {
        "status": "OK",
        "models": ["Flash-Precise", "Flash-Creative", "Flash-Coder", "Flash-Analyst", "Flash-Balanced"],
        "endpoints": { "/chat": "Single prompt processing", "/batch": "Batch prompt processing", "/": "Status check" }
    }

@app.get("/health")
def health():
    return {"status": "healthy", "timestamp": time.time()}

Writing app.py


In [5]:
import os
import time
from pyngrok import ngrok
import requests

# ⭐⭐⭐ ADD THIS LINE - Define your token BEFORE using it! ⭐⭐⭐
NGROK_AUTHTOKEN = "34f1taGA1zAnbN2Vd8LHdSUd6E4_tZpJwbGk5a8M74ebwTJd"  # Get from https://dashboard.ngrok.com

# Now set it
ngrok.set_auth_token(NGROK_AUTHTOKEN)

print("Starting lightweight server...")
os.system("uvicorn app:app --host 0.0.0.0 --port 8000 > server.log 2>&1 &")

time.sleep(8)

try:
    test = requests.get("http://localhost:8000/", timeout=5)
    print(f"✅ Server running: {test.json()}")
except Exception as e:
    print(f"❌ Failed: {e}")
    os.system("tail -20 server.log")
    raise

public_url = ngrok.connect(8000)
SERVER_URL = public_url.public_url
print(f"✅ Live at: {SERVER_URL}")
print(f"📊 Stats: {SERVER_URL}/stats")

Starting lightweight server...
✅ Server running: {'status': 'OK', 'models': ['Flash-Precise', 'Flash-Creative', 'Flash-Coder', 'Flash-Analyst', 'Flash-Balanced'], 'endpoints': {'/chat': 'Single prompt processing', '/batch': 'Batch prompt processing', '/': 'Status check'}}
✅ Live at: https://obovoid-unshifting-asia.ngrok-free.dev
📊 Stats: https://obovoid-unshifting-asia.ngrok-free.dev/stats


In [7]:
import pandas as pd
from datetime import datetime
from typing import List, Dict
from collections import Counter

GPU_PROFILES = {
    "T4_COLAB": {
        "name": "NVIDIA Tesla T4",
        "compute_capability": 7.5,
        "memory_gb": 16,
        "tdp_watts": 70,
        "fp32_tflops": 8.1,
        "efficiency_factor": 1.0,
        "location": "Google Cloud US-Central (Iowa)",
        "grid_intensity_gco2_kwh": 394,
        "grid_source": "Mixed (Coal 48%, Wind 28%, Nuclear 15%, Gas 9%)",
        "pue": 1.10
    },
    "A100_SIMULATED": {
        "name": "NVIDIA A100 80GB",
        "compute_capability": 8.0,
        "memory_gb": 80,
        "tdp_watts": 400,
        "fp32_tflops": 19.5,
        "efficiency_factor": 0.4,
        "location": "AWS US-East-1 (Virginia)",
        "grid_intensity_gco2_kwh": 340,
        "grid_source": "Mixed (Gas 52%, Nuclear 31%, Coal 8%, Renewables 9%)",
        "pue": 1.15
    },
    "L40S_SIMULATED": {
        "name": "NVIDIA L40S",
        "compute_capability": 8.9,
        "memory_gb": 48,
        "tdp_watts": 350,
        "fp32_tflops": 91.6,
        "efficiency_factor": 0.5,
        "location": "Azure West Europe (Netherlands)",
        "grid_intensity_gco2_kwh": 220,
        "grid_source": "Mixed (Gas 45%, Wind 25%, Solar 15%, Coal 10%, Nuclear 5%)",
        "pue": 1.18
    }
}

def calculate_enhanced_metrics(model_name: str, total_tokens: int,
                               latency_sec: float, gpu_profile: dict) -> dict:
    """Calculate comprehensive energy and carbon metrics"""

    if "Pro" in model_name:
        base_joules_per_token = 0.5
    else:
        base_joules_per_token = 0.05

    adjusted_joules_per_token = base_joules_per_token * gpu_profile['efficiency_factor']

    compute_joules = total_tokens * adjusted_joules_per_token
    compute_kwh = compute_joules / 3_600_000

    idle_power_watts = gpu_profile['tdp_watts'] * 0.3
    idle_joules = idle_power_watts * latency_sec
    idle_kwh = idle_joules / 3_600_000

    total_kwh_with_pue = (compute_kwh + idle_kwh) * gpu_profile['pue']
    carbon_gco2e = total_kwh_with_pue * gpu_profile['grid_intensity_gco2_kwh']
    energy_cost_usd = total_kwh_with_pue * 0.10

    return {
        'compute_kwh': round(compute_kwh, 9),
        'idle_kwh': round(idle_kwh, 9),
        'total_kwh': round(total_kwh_with_pue, 9),
        'carbon_gco2e': round(carbon_gco2e, 9),
        'energy_cost_usd': round(energy_cost_usd, 9),
        'joules_per_token': round(adjusted_joules_per_token, 6)
    }

print("✅ GPU Profiles and Metrics Calculator loaded")

✅ GPU Profiles and Metrics Calculator loaded


In [8]:
class BatchProcessor:
    """Handles batch processing across different GPU profiles"""

    def __init__(self, server_url: str):
        self.server_url = server_url
        self.results = []

    def process_batch(self, prompts: List[str], gpu_profile_name: str = "T4_COLAB") -> List[dict]:
        """Process a batch of prompts on a specific GPU profile"""
        gpu_profile = GPU_PROFILES[gpu_profile_name]
        batch_results = []

        print(f"\n{'='*80}")
        print(f"🖥️  Processing on {gpu_profile['name']}")
        print(f"📍 Location: {gpu_profile['location']}")
        print(f"⚡ Grid Intensity: {gpu_profile['grid_intensity_gco2_kwh']} gCO2e/kWh")
        print(f"🌍 Grid Source: {gpu_profile['grid_source']}")
        print(f"💾 Memory: {gpu_profile['memory_gb']} GB")
        print(f"⚙️  TDP: {gpu_profile['tdp_watts']} W")
        print(f"🏢 PUE: {gpu_profile['pue']}")
        print(f"{'='*80}\n")

        batch_start_time = time.time()
        batch_id = datetime.now().strftime('%Y%m%d_%H%M%S')

        for idx, prompt in enumerate(prompts, 1):
            print(f"[{idx}/{len(prompts)}] 📤 {prompt[:60]}...")

            retry_count = 0
            # --- MODIFICATION: Reduced retries ---
            max_retries = 3
            # --- END MODIFICATION ---
            success = False

            while retry_count < max_retries and not success:
                try:
                    start = time.time()
                    response = requests.post(
                        f"{self.server_url}/chat",
                        json={"prompt": prompt},
                        timeout=45 # Keep timeout reasonable for potential delays
                    )
                    latency = time.time() - start

                    if response.status_code == 200:
                        data = response.json()
                        if not data or 'model' not in data or 'tokens' not in data:
                            raise ValueError("Invalid response structure")

                        enhanced = calculate_enhanced_metrics(
                            data['model'], data['tokens']['total'], latency, gpu_profile
                        )
                        result = {
                            'timestamp': datetime.now().isoformat(), 'batch_id': batch_id, 'prompt_id': idx,
                            'prompt': prompt, 'prompt_length': len(prompt),
                            'model_name': data['model'], 'temperature': data['temperature'],
                            'gpu_name': gpu_profile['name'], 'gpu_memory_gb': gpu_profile['memory_gb'],
                            'gpu_tdp_watts': gpu_profile['tdp_watts'], 'gpu_tflops': gpu_profile['fp32_tflops'],
                            'gpu_location': gpu_profile['location'], 'gpu_compute_capability': gpu_profile['compute_capability'],
                            'grid_intensity_gco2_kwh': gpu_profile['grid_intensity_gco2_kwh'],
                            'grid_source': gpu_profile['grid_source'], 'datacenter_pue': gpu_profile['pue'],
                            'latency_sec': round(latency, 3),
                            'tokens_per_sec': round(data['tokens']['total'] / latency, 2) if latency > 0 else 0,
                            'prompt_tokens': data['tokens']['prompt'], 'completion_tokens': data['tokens']['completion'],
                            'total_tokens': data['tokens']['total'],
                            'api_cost_usd': data['cost_usd'], 'energy_cost_usd': enhanced['energy_cost_usd'],
                            'total_cost_usd': round(data['cost_usd'] + enhanced['energy_cost_usd'], 9),
                            'compute_kwh': enhanced['compute_kwh'], 'idle_kwh': enhanced['idle_kwh'],
                            'total_kwh': enhanced['total_kwh'], 'joules_per_token': enhanced['joules_per_token'],
                            'carbon_gco2e': enhanced['carbon_gco2e'],
                            'carbon_per_token': round(enhanced['carbon_gco2e'] / data['tokens']['total'], 9) if data['tokens']['total'] > 0 else 0,
                            'response': data['response'][:200], 'response_full_length': len(data['response']),
                            'has_reference': data['accuracy']['has_ref'],
                            'accuracy_match': data['accuracy'].get('match'),
                            'expected_answer': data['accuracy'].get('expected', '')
                        }
                        batch_results.append(result); self.results.append(result)
                        print(f"   ✅ {data['model']} | {latency:.2f}s | ${result['total_cost_usd']:.6f} | {enhanced['carbon_gco2e']:.6f} gCO2e | {data['tokens']['total']} tokens")
                        if data['accuracy']['has_ref']:
                            match_symbol = "✓" if data['accuracy']['match'] else "✗"
                            print(f"      {match_symbol} Accuracy: {data['accuracy']['match']} (expected: {data['accuracy']['expected']})")
                        success = True
                    elif response.status_code == 500:
                        # Use the detailed error from the response if available
                        error_detail = response.text
                        try:
                            error_json = response.json()
                            if 'detail' in error_json: error_detail = error_json['detail']
                        except: pass # Ignore if response is not JSON
                        print(f"   ⚠️  Server error (500) - Retry {retry_count + 1}/{max_retries}. Detail: {error_detail[:200]}...") # Log detail
                        retry_count += 1
                        time.sleep(2 ** retry_count)
                    else:
                        print(f"   ❌ Error {response.status_code}: {response.text}")
                        break
                except requests.exceptions.Timeout:
                    print(f"   ⏱️  Timeout - Retry {retry_count + 1}/{max_retries}")
                    retry_count += 1; time.sleep(2 ** retry_count)
                except requests.exceptions.RequestException as e: print(f"   ❌ Request failed: {str(e)}"); break
                except (KeyError, ValueError) as e: print(f"   ❌ Data error: {str(e)}"); break
                except Exception as e: print(f"   ❌ Unexpected error: {str(e)}"); break

            if not success: print(f"   ❌ Failed after {max_retries} retries")
            time.sleep(1.5) # Keep the delay from previous step

        batch_duration = time.time() - batch_start_time
        if batch_results:
            print(f"\n{'─'*80}")
            print(f"✅ Batch completed in {batch_duration:.2f}s")
            print(f"📊 Processed: {len(batch_results)}/{len(prompts)} prompts")
            if len(batch_results) > 0:
                 print(f"⚡ Average latency: {sum(r['latency_sec'] for r in batch_results) / len(batch_results):.2f}s")
                 print(f"💰 Total cost: ${sum(r['total_cost_usd'] for r in batch_results):.6f}")
                 print(f"🌍 Total carbon: {sum(r['carbon_gco2e'] for r in batch_results):.6f} gCO2e")
            print(f"{'─'*80}\n")
        return batch_results

    def export_to_csv(self, filename: str = None) -> str:
        if not self.results: print("❌ No results to export"); return None
        if filename is None: filename = f"ai_metrics_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df = pd.DataFrame(self.results); df.to_csv(filename, index=False)
        print(f"\n{'='*80}\n📁 EXPORTED TO CSV: {filename}\n{'='*80}")
        print(f"📊 Total Records: {len(self.results)}\n📋 Total Columns: {len(df.columns)}\n💾 File Size: {os.path.getsize(filename) / 1024:.2f} KB")
        print(f"\n💰 COST SUMMARY:\n   Total API Cost: ${df['api_cost_usd'].sum():.6f}\n   Total Energy Cost: ${df['energy_cost_usd'].sum():.6f}\n   Total Cost: ${df['total_cost_usd'].sum():.6f}")
        print(f"\n⚡ ENERGY SUMMARY:\n   Total Energy: {df['total_kwh'].sum():.9f} kWh\n   Compute Energy: {df['compute_kwh'].sum():.9f} kWh\n   Idle Energy: {df['idle_kwh'].sum():.9f} kWh")
        print(f"\n🌍 CARBON SUMMARY:\n   Total Carbon: {df['carbon_gco2e'].sum():.6f} gCO2e\n   Average per Request: {df['carbon_gco2e'].mean():.6f} gCO2e\n   Average per Token: {df['carbon_per_token'].mean():.9f} gCO2e")
        print(f"\n⏱️  PERFORMANCE SUMMARY:\n   Average Latency: {df['latency_sec'].mean():.3f}s\n   Average Throughput: {df['tokens_per_sec'].mean():.2f} tokens/sec\n   Total Tokens Processed: {df['total_tokens'].sum():,}")
        print(f"\n🖥️  GPU BREAKDOWN:")
        gpu_summary = df.groupby('gpu_name').agg({'prompt_id': 'count', 'total_cost_usd': 'sum', 'carbon_gco2e': 'sum', 'latency_sec': 'mean', 'total_tokens': 'sum'}).round(6)
        gpu_summary.columns = ['Requests', 'Total Cost ($)', 'Carbon (gCO2e)', 'Avg Latency (s)', 'Total Tokens']; print(gpu_summary.to_string())
        print(f"\n🤖 MODEL BREAKDOWN:")
        model_summary = df.groupby('model_name').agg({'prompt_id': 'count', 'total_cost_usd': 'sum', 'latency_sec': 'mean'}).round(6)
        model_summary.columns = ['Requests', 'Total Cost ($)', 'Avg Latency (s)']; print(model_summary.to_string())
        if df['has_reference'].any():
            print(f"\n✓ ACCURACY SUMMARY:")
            accuracy_df = df[df['has_reference'] == True]; correct = accuracy_df['accuracy_match'].sum(); total_with_ref = len(accuracy_df)
            if total_with_ref > 0: print(f"   Correct: {correct}/{total_with_ref} ({100*correct/total_with_ref:.1f}%)")
            else: print("   No reference data found for accuracy calculation.")
        print(f"{'='*80}\n")
        return filename

print("✅ BatchProcessor class loaded with retry logic (max 3 retries)")

✅ BatchProcessor class loaded with retry logic (max 3 retries)


In [9]:
TEST_PROMPTS = [
    "Hello", # 1 (Greeting -> Balanced)
    "What is the capital of France?", # 2 (Factual Question -> Precise)
    "What is 5 + 7?", # 3 (Simple Calculation -> Precise)
    "Calculate 31 multiplied by 19", # 4 (Calculation -> Precise)
    "What are common colors for domestic cats?", # 5 (Factual Question -> Analyst/Balanced) - NEW
    "Write a Python function that takes two numbers and returns their sum", # 6 (Coding -> Coder)
    "Who wrote the play 'Romeo and Juliet'?", # 7 (Factual Literature Question -> Precise/Analyst)
    "What is 987 / 3?", # 8 (Direct Calculation -> Precise) - NEW
    "Write a three-line poem (haiku) about the moon", # 9 (Specific Creative -> Creative)
    "What branch of science studies stars and planets?", # 10 (Factual Science Question -> Precise/Analyst)
    "Identify the error in this Python code: print('Hello'", # 11 (Coding/Debug -> Coder)
    "What is the capital city of Canada?", # 12 (Factual Question -> Precise)
]

print(f"✅ Defined {len(TEST_PROMPTS)} ultra-safe test prompts")

✅ Defined 12 ultra-safe test prompts


In [10]:
processor = BatchProcessor(SERVER_URL)

print("="*80)
print("🚀 STARTING COMPREHENSIVE BATCH PROCESSING TEST")
print("="*80)
print(f"📝 Total Prompts: {len(TEST_PROMPTS)}")
print(f"🖥️  GPU Profiles: {len(GPU_PROFILES)}")
print(f"📊 Total Requests: {len(TEST_PROMPTS) * len(GPU_PROFILES)}")
print("="*80)

# Process on T4 (Colab)
print("\n🔄 Processing batch on T4 (Colab)...")
processor.process_batch(TEST_PROMPTS, gpu_profile_name="T4_COLAB")

# Process on A100 (Simulated)
print("\n🔄 Processing batch on A100 (Simulated)...")
processor.process_batch(TEST_PROMPTS, gpu_profile_name="A100_SIMULATED")

# Process on L40S (Simulated)
print("\n🔄 Processing batch on L40S (Simulated)...")
processor.process_batch(TEST_PROMPTS, gpu_profile_name="L40S_SIMULATED")

# Export to CSV
csv_filename = processor.export_to_csv()

print(f"\n{'='*80}")
print(f"✅ ALL PROCESSING COMPLETE!")
print(f"📁 Results saved to: {csv_filename}")
print(f"📊 Ready for Plotly visualization!")
print(f"{'='*80}\n")

🚀 STARTING COMPREHENSIVE BATCH PROCESSING TEST
📝 Total Prompts: 12
🖥️  GPU Profiles: 3
📊 Total Requests: 36

🔄 Processing batch on T4 (Colab)...

🖥️  Processing on NVIDIA Tesla T4
📍 Location: Google Cloud US-Central (Iowa)
⚡ Grid Intensity: 394 gCO2e/kWh
🌍 Grid Source: Mixed (Coal 48%, Wind 28%, Nuclear 15%, Gas 9%)
💾 Memory: 16 GB
⚙️  TDP: 70 W
🏢 PUE: 1.1

[1/12] 📤 Hello...
   ✅ Flash-Balanced | 2.04s | $0.000004 | 0.005379 gCO2e | 38 tokens
      ✓ Accuracy: True (expected: Hello)
[2/12] 📤 What is the capital of France?...
   ✅ Flash-Precise | 1.16s | $0.000003 | 0.003172 gCO2e | 39 tokens
[3/12] 📤 What is 5 + 7?...
   ✅ Flash-Precise | 0.82s | $0.000003 | 0.002315 gCO2e | 39 tokens
[4/12] 📤 Calculate 31 multiplied by 19...
   ✅ Flash-Precise | 3.75s | $0.000041 | 0.012530 gCO2e | 508 tokens
      ✓ Accuracy: True (expected: 589)
[5/12] 📤 What are common colors for domestic cats?...
   ⚠️  Server error (500) - Retry 1/3. Detail: Completion failed for model gemini/gemini-2.5-flash. Ty

In [11]:
df = pd.read_csv(csv_filename)

print("📊 CSV DATA PREVIEW:")
print("="*80)
print(f"\nShape: {df.shape[0]} rows × {df.shape[1]} columns")
print(f"\nColumns ({len(df.columns)}):")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col}")

print(f"\n\nFirst 3 rows:")
print(df.head(3))

print(f"\n\nGPU Distribution:")
print(df['gpu_name'].value_counts())

print(f"\n\nModel Distribution:")
print(df['model_name'].value_counts())

print("\n✅ CSV file is ready for Plotly analysis!")

# Store filename for next cells
CSV_FILENAME = csv_filename

📊 CSV DATA PREVIEW:

Shape: 27 rows × 35 columns

Columns (35):
   1. timestamp
   2. batch_id
   3. prompt_id
   4. prompt
   5. prompt_length
   6. model_name
   7. temperature
   8. gpu_name
   9. gpu_memory_gb
  10. gpu_tdp_watts
  11. gpu_tflops
  12. gpu_location
  13. gpu_compute_capability
  14. grid_intensity_gco2_kwh
  15. grid_source
  16. datacenter_pue
  17. latency_sec
  18. tokens_per_sec
  19. prompt_tokens
  20. completion_tokens
  21. total_tokens
  22. api_cost_usd
  23. energy_cost_usd
  24. total_cost_usd
  25. compute_kwh
  26. idle_kwh
  27. total_kwh
  28. joules_per_token
  29. carbon_gco2e
  30. carbon_per_token
  31. response
  32. response_full_length
  33. has_reference
  34. accuracy_match
  35. expected_answer


First 3 rows:
                    timestamp         batch_id  prompt_id  \
0  2025-10-28T21:06:54.840368  20251028_210652          1   
1  2025-10-28T21:06:57.502454  20251028_210652          2   
2  2025-10-28T21:06:59.825873  20251028_210652    

In [12]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Reload data
df = pd.read_csv(CSV_FILENAME)

# Create comprehensive GPU comparison dashboard
fig1 = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        '🌍 Carbon Emissions by GPU',
        '💰 Cost Breakdown by GPU',
        '⏱️ Latency Distribution',
        '⚡ Energy Consumption'
    ),
    specs=[[{"type": "bar"}, {"type": "bar"}],
           [{"type": "box"}, {"type": "bar"}]]
)

# 1. Carbon emissions
gpu_carbon = df.groupby('gpu_name')['carbon_gco2e'].sum().reset_index()
fig1.add_trace(
    go.Bar(
        x=gpu_carbon['gpu_name'],
        y=gpu_carbon['carbon_gco2e'],
        name='Carbon',
        marker_color='lightgreen',
        text=gpu_carbon['carbon_gco2e'].round(6),
        textposition='auto'
    ),
    row=1, col=1
)

# 2. Cost breakdown
gpu_costs = df.groupby('gpu_name').agg({
    'api_cost_usd': 'sum',
    'energy_cost_usd': 'sum'
}).reset_index()

fig1.add_trace(
    go.Bar(
        x=gpu_costs['gpu_name'],
        y=gpu_costs['api_cost_usd'],
        name='API Cost',
        marker_color='lightblue'
    ),
    row=1, col=2
)
fig1.add_trace(
    go.Bar(
        x=gpu_costs['gpu_name'],
        y=gpu_costs['energy_cost_usd'],
        name='Energy Cost',
        marker_color='coral'
    ),
    row=1, col=2
)

# 3. Latency box plot
for gpu in df['gpu_name'].unique():
    gpu_data = df[df['gpu_name'] == gpu]
    fig1.add_trace(
        go.Box(
            y=gpu_data['latency_sec'],
            name=gpu.split()[1],
            boxmean='sd'
        ),
        row=2, col=1
    )

# 4. Energy consumption
gpu_energy = df.groupby('gpu_name')['total_kwh'].sum().reset_index()
fig1.add_trace(
    go.Bar(
        x=gpu_energy['gpu_name'],
        y=gpu_energy['total_kwh'],
        name='Energy',
        marker_color='gold',
        text=gpu_energy['total_kwh'].round(9),
        textposition='auto'
    ),
    row=2, col=2
)

# Update layout
fig1.update_xaxes(title_text="GPU", row=1, col=1)
fig1.update_yaxes(title_text="gCO2e", row=1, col=1)
fig1.update_xaxes(title_text="GPU", row=1, col=2)
fig1.update_yaxes(title_text="Cost (USD)", row=1, col=2)
fig1.update_xaxes(title_text="GPU", row=2, col=1)
fig1.update_yaxes(title_text="Latency (seconds)", row=2, col=1)
fig1.update_xaxes(title_text="GPU", row=2, col=2)
fig1.update_yaxes(title_text="kWh", row=2, col=2)

fig1.update_layout(
    height=800,
    showlegend=True,
    title_text="GPU Performance Comparison Dashboard",
    title_font_size=20
)

fig1.show()





This means that static image generation (e.g. `fig.write_image()`) will not work.

Please upgrade Plotly to version 6.1.1 or greater, or downgrade Kaleido to version 0.2.1.




In [13]:
# Model performance analysis
fig2 = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        '🤖 Requests by Model',
        '💰 Cost per Model',
        '⚡ Tokens per Second',
        '🎯 Accuracy Rate'
    ),
    specs=[[{"type": "pie"}, {"type": "bar"}],
           [{"type": "bar"}, {"type": "bar"}]]
)

# 1. Request distribution
model_counts = df['model_name'].value_counts()
fig2.add_trace(
    go.Pie(
        labels=model_counts.index,
        values=model_counts.values,
        hole=0.3
    ),
    row=1, col=1
)

# 2. Cost per model
model_costs = df.groupby('model_name')['total_cost_usd'].sum().reset_index()
fig2.add_trace(
    go.Bar(
        x=model_costs['model_name'],
        y=model_costs['total_cost_usd'],
        marker_color='purple',
        text=model_costs['total_cost_usd'].round(6),
        textposition='auto'
    ),
    row=1, col=2
)

# 3. Tokens per second
model_perf = df.groupby('model_name')['tokens_per_sec'].mean().reset_index()
fig2.add_trace(
    go.Bar(
        x=model_perf['model_name'],
        y=model_perf['tokens_per_sec'],
        marker_color='teal',
        text=model_perf['tokens_per_sec'].round(2),
        textposition='auto'
    ),
    row=2, col=1
)

# 4. Accuracy rate
accuracy_df = df[df['has_reference'] == True]
if not accuracy_df.empty:
    accuracy_by_model = accuracy_df.groupby('model_name')['accuracy_match'].apply(
        lambda x: (x.sum() / len(x)) * 100
    ).reset_index()
    accuracy_by_model.columns = ['model_name', 'accuracy_rate']

    fig2.add_trace(
        go.Bar(
            x=accuracy_by_model['model_name'],
            y=accuracy_by_model['accuracy_rate'],
            marker_color='lightcoral',
            text=accuracy_by_model['accuracy_rate'].round(1).astype(str) + '%',
            textposition='auto'
        ),
        row=2, col=2
    )

# Update layout
fig2.update_xaxes(title_text="Model", row=1, col=2)
fig2.update_yaxes(title_text="Total Cost (USD)", row=1, col=2)
fig2.update_xaxes(title_text="Model", row=2, col=1)
fig2.update_yaxes(title_text="Tokens/Second", row=2, col=1)
fig2.update_xaxes(title_text="Model", row=2, col=2)
fig2.update_yaxes(title_text="Accuracy (%)", row=2, col=2)

fig2.update_layout(
    height=800,
    showlegend=False,
    title_text="Model Performance Analysis",
    title_font_size=20
)

fig2.show()

In [14]:
# Carbon and energy deep dive
fig3 = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        '🌍 Carbon by Location',
        '⚡ Energy Breakdown',
        '📊 Grid Intensity Impact',
        '💡 Carbon per Token'
    ),
    specs=[[{"type": "bar"}, {"type": "bar"}],
           [{"type": "scatter"}, {"type": "box"}]]
)

# 1. Carbon by location
location_carbon = df.groupby('gpu_location')['carbon_gco2e'].sum().reset_index()
location_carbon = location_carbon.sort_values('carbon_gco2e', ascending=False)

fig3.add_trace(
    go.Bar(
        x=location_carbon['gpu_location'],
        y=location_carbon['carbon_gco2e'],
        marker_color='darkgreen',
        text=location_carbon['carbon_gco2e'].round(6),
        textposition='auto'
    ),
    row=1, col=1
)

# 2. Energy breakdown (compute vs idle)
energy_breakdown = df.groupby('gpu_name').agg({
    'compute_kwh': 'sum',
    'idle_kwh': 'sum'
}).reset_index()

fig3.add_trace(
    go.Bar(
        x=energy_breakdown['gpu_name'],
        y=energy_breakdown['compute_kwh'],
        name='Compute',
        marker_color='orange'
    ),
    row=1, col=2
)
fig3.add_trace(
    go.Bar(
        x=energy_breakdown['gpu_name'],
        y=energy_breakdown['idle_kwh'],
        name='Idle',
        marker_color='yellow'
    ),
    row=1, col=2
)

# 3. Grid intensity impact
grid_data = df.groupby('gpu_name').agg({
    'grid_intensity_gco2_kwh': 'first',
    'carbon_gco2e': 'sum'
}).reset_index()

fig3.add_trace(
    go.Scatter(
        x=grid_data['grid_intensity_gco2_kwh'],
        y=grid_data['carbon_gco2e'],
        mode='markers+text',
        marker=dict(size=15, color='red'),
        text=grid_data['gpu_name'].str.split().str[1],
        textposition='top center',
        name='GPUs'
    ),
    row=2, col=1
)

# 4. Carbon per token distribution
for gpu in df['gpu_name'].unique():
    gpu_data = df[df['gpu_name'] == gpu]
    fig3.add_trace(
        go.Box(
            y=gpu_data['carbon_per_token'] * 1000,  # Convert to mgCO2e
            name=gpu.split()[1],
            boxmean='sd'
        ),
        row=2, col=2
    )

# Update layout
fig3.update_xaxes(title_text="Location", row=1, col=1)
fig3.update_yaxes(title_text="Total Carbon (gCO2e)", row=1, col=1)
fig3.update_xaxes(title_text="GPU", row=1, col=2)
fig3.update_yaxes(title_text="Energy (kWh)", row=1, col=2)
fig3.update_xaxes(title_text="Grid Intensity (gCO2e/kWh)", row=2, col=1)
fig3.update_yaxes(title_text="Total Carbon (gCO2e)", row=2, col=1)
fig3.update_xaxes(title_text="GPU", row=2, col=2)
fig3.update_yaxes(title_text="Carbon per Token (mgCO2e)", row=2, col=2)

fig3.update_layout(
    height=800,
    showlegend=True,
    title_text="Carbon & Energy Analysis",
    title_font_size=20,
    barmode='stack'
)

fig3.show()

In [16]:
# Advanced metrics visualization
fig5 = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        '🔥 TDP vs Carbon Emissions',
        '💾 Memory vs Performance',
        '🌐 Grid Source Distribution',
        '⚙️ PUE Impact on Energy'
    ),
    specs=[[{"type": "scatter"}, {"type": "scatter"}],
           [{"type": "bar"}, {"type": "scatter"}]]
)

# 1. TDP vs Carbon (bubble chart with token count as size)
gpu_tdp = df.groupby('gpu_name').agg({
    'gpu_tdp_watts': 'first',
    'carbon_gco2e': 'sum',
    'total_tokens': 'sum'
}).reset_index()

fig5.add_trace(
    go.Scatter(
        x=gpu_tdp['gpu_tdp_watts'],
        y=gpu_tdp['carbon_gco2e'],
        mode='markers+text',
        marker=dict(
            size=gpu_tdp['total_tokens'] / 50,
            color=gpu_tdp['carbon_gco2e'],
            colorscale='Reds',
            showscale=True,
            colorbar=dict(title="Carbon")
        ),
        text=gpu_tdp['gpu_name'].str.split().str[1],
        textposition='top center',
        name='GPUs'
    ),
    row=1, col=1
)

# 2. Memory vs Performance
gpu_mem = df.groupby('gpu_name').agg({
    'gpu_memory_gb': 'first',
    'tokens_per_sec': 'mean',
    'latency_sec': 'mean'
}).reset_index()

fig5.add_trace(
    go.Scatter(
        x=gpu_mem['gpu_memory_gb'],
        y=gpu_mem['tokens_per_sec'],
        mode='markers+text',
        marker=dict(size=20, color='blue'),
        text=gpu_mem['gpu_name'].str.split().str[1],
        textposition='top center',
        name='Throughput'
    ),
    row=1, col=2
)

# 3. Grid source breakdown
grid_sources = df.groupby('grid_source')['carbon_gco2e'].sum().reset_index()
grid_sources = grid_sources.sort_values('carbon_gco2e', ascending=True)

fig5.add_trace(
    go.Bar(
        y=grid_sources['grid_source'],
        x=grid_sources['carbon_gco2e'],
        orientation='h',
        marker_color='forestgreen',
        text=grid_sources['carbon_gco2e'].round(6),
        textposition='auto'
    ),
    row=2, col=1
)

# 4. PUE impact
pue_data = df.groupby('gpu_name').agg({
    'datacenter_pue': 'first',
    'total_kwh': 'sum',
    'compute_kwh': 'sum'
}).reset_index()
pue_data['overhead_kwh'] = pue_data['total_kwh'] - pue_data['compute_kwh']

fig5.add_trace(
    go.Scatter(
        x=pue_data['datacenter_pue'],
        y=pue_data['overhead_kwh'],
        mode='markers+text',
        marker=dict(size=15, color='purple'),
        text=pue_data['gpu_name'].str.split().str[1],
        textposition='top center',
        name='Overhead'
    ),
    row=2, col=2
)

# Update layout
fig5.update_xaxes(title_text="TDP (Watts)", row=1, col=1)
fig5.update_yaxes(title_text="Carbon (gCO2e)", row=1, col=1)
fig5.update_xaxes(title_text="Memory (GB)", row=1, col=2)
fig5.update_yaxes(title_text="Tokens/Second", row=1, col=2)
fig5.update_xaxes(title_text="Carbon (gCO2e)", row=2, col=1)
fig5.update_yaxes(title_text="Grid Source", row=2, col=1)
fig5.update_xaxes(title_text="PUE", row=2, col=2)
fig5.update_yaxes(title_text="Overhead Energy (kWh)", row=2, col=2)

fig5.update_layout(
    height=800,
    showlegend=False,
    title_text="Advanced Hardware Metrics",
    title_font_size=20
)

fig5.show()

In [17]:
# Efficiency metrics
fig6 = make_subplots(
    rows=1, cols=2,
    subplot_titles=(
        '💡 Cost Efficiency ($ per 1000 tokens)',
        '🌱 Carbon Efficiency (gCO2e per 1000 tokens)'
    )
)

# Calculate efficiency metrics
efficiency = df.groupby('gpu_name').agg({
    'total_cost_usd': 'sum',
    'carbon_gco2e': 'sum',
    'total_tokens': 'sum'
}).reset_index()

efficiency['cost_per_1k_tokens'] = (efficiency['total_cost_usd'] / efficiency['total_tokens']) * 1000
efficiency['carbon_per_1k_tokens'] = (efficiency['carbon_gco2e'] / efficiency['total_tokens']) * 1000

# Sort by efficiency
efficiency = efficiency.sort_values('cost_per_1k_tokens')

# 1. Cost efficiency
fig6.add_trace(
    go.Bar(
        x=efficiency['gpu_name'],
        y=efficiency['cost_per_1k_tokens'],
        marker_color='dodgerblue',
        text=efficiency['cost_per_1k_tokens'].round(6),
        textposition='auto',
        name='Cost Efficiency'
    ),
    row=1, col=1
)

# 2. Carbon efficiency
fig6.add_trace(
    go.Bar(
        x=efficiency['gpu_name'],
        y=efficiency['carbon_per_1k_tokens'],
        marker_color='limegreen',
        text=efficiency['carbon_per_1k_tokens'].round(6),
        textposition='auto',
        name='Carbon Efficiency'
    ),
    row=1, col=2
)

# Update layout
fig6.update_xaxes(title_text="GPU", row=1, col=1)
fig6.update_yaxes(title_text="$ per 1K tokens", row=1, col=1)
fig6.update_xaxes(title_text="GPU", row=1, col=2)
fig6.update_yaxes(title_text="gCO2e per 1K tokens", row=1, col=2)

fig6.update_layout(
    height=500,
    showlegend=False,
    title_text="Efficiency Comparison - Which GPU is Best?",
    title_font_size=20
)

fig6.show()

# Print efficiency summary
print("\n" + "="*80)
print("🏆 EFFICIENCY RANKINGS")
print("="*80)

cost_ranked = efficiency.sort_values('cost_per_1k_tokens')
print("\n💰 MOST COST EFFICIENT:")
for idx, row in cost_ranked.iterrows():
    print(f"  {row['gpu_name']}: ${row['cost_per_1k_tokens']:.6f} per 1K tokens")

carbon_ranked = efficiency.sort_values('carbon_per_1k_tokens')
print("\n🌱 MOST CARBON EFFICIENT:")
for idx, row in carbon_ranked.iterrows():
    print(f"  {row['gpu_name']}: {row['carbon_per_1k_tokens']:.6f} gCO2e per 1K tokens")

print("="*80 + "\n")


🏆 EFFICIENCY RANKINGS

💰 MOST COST EFFICIENT:
  NVIDIA Tesla T4: $0.000081 per 1K tokens
  NVIDIA A100 80GB: $0.000110 per 1K tokens
  NVIDIA L40S: $0.000113 per 1K tokens

🌱 MOST CARBON EFFICIENT:
  NVIDIA Tesla T4: 0.032371 gCO2e per 1K tokens
  NVIDIA L40S: 0.090464 gCO2e per 1K tokens
  NVIDIA A100 80GB: 0.126973 gCO2e per 1K tokens



In [18]:
# Save all figures to HTML files
print("💾 Saving visualizations to HTML files...")

fig1.write_html("gpu_comparison_dashboard.html")
print("✅ Saved: gpu_comparison_dashboard.html")

fig2.write_html("model_performance_analysis.html")
print("✅ Saved: model_performance_analysis.html")

fig3.write_html("carbon_energy_analysis.html")
print("✅ Saved: carbon_energy_analysis.html")

fig4.write_html("time_series_analysis.html")
print("✅ Saved: time_series_analysis.html")

fig5.write_html("advanced_hardware_metrics.html")
print("✅ Saved: advanced_hardware_metrics.html")

fig6.write_html("efficiency_comparison.html")
print("✅ Saved: efficiency_comparison.html")

print("\n✅ All visualizations saved!")
print("📁 You can open these HTML files in any web browser")

💾 Saving visualizations to HTML files...
✅ Saved: gpu_comparison_dashboard.html
✅ Saved: model_performance_analysis.html
✅ Saved: carbon_energy_analysis.html
✅ Saved: time_series_analysis.html
✅ Saved: advanced_hardware_metrics.html
✅ Saved: efficiency_comparison.html

✅ All visualizations saved!
📁 You can open these HTML files in any web browser


In [19]:
print("\n" + "="*80)
print("📊 FINAL COMPREHENSIVE REPORT")
print("="*80)

print(f"\n📝 DATASET OVERVIEW:")
print(f"   Total Requests: {len(df)}")
print(f"   Date Range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"   GPU Profiles Tested: {df['gpu_name'].nunique()}")
print(f"   Models Used: {df['model_name'].nunique()}")
print(f"   Unique Prompts: {df['prompt'].nunique()}")

print(f"\n💰 FINANCIAL METRICS:")
print(f"   Total API Cost: ${df['api_cost_usd'].sum():.6f}")
print(f"   Total Energy Cost: ${df['energy_cost_usd'].sum():.6f}")
print(f"   Total Cost: ${df['total_cost_usd'].sum():.6f}")
print(f"   Average Cost per Request: ${df['total_cost_usd'].mean():.6f}")
print(f"   Cost Range: ${df['total_cost_usd'].min():.6f} - ${df['total_cost_usd'].max():.6f}")

print(f"\n⚡ ENERGY METRICS:")
print(f"   Total Energy Consumed: {df['total_kwh'].sum():.9f} kWh")
print(f"   Compute Energy: {df['compute_kwh'].sum():.9f} kWh ({100*df['compute_kwh'].sum()/df['total_kwh'].sum():.1f}%)")
print(f"   Idle Energy: {df['idle_kwh'].sum():.9f} kWh ({100*df['idle_kwh'].sum()/df['total_kwh'].sum():.1f}%)")
print(f"   Average Energy per Request: {df['total_kwh'].mean():.9f} kWh")

print(f"\n🌍 CARBON METRICS:")
print(f"   Total Carbon Emissions: {df['carbon_gco2e'].sum():.6f} gCO2e")
print(f"   Average per Request: {df['carbon_gco2e'].mean():.6f} gCO2e")
print(f"   Average per Token: {df['carbon_per_token'].mean():.9f} gCO2e")
print(f"   Carbon Range: {df['carbon_gco2e'].min():.6f} - {df['carbon_gco2e'].max():.6f} gCO2e")

print(f"\n⏱️  PERFORMANCE METRICS:")
print(f"   Total Tokens Processed: {df['total_tokens'].sum():,}")
print(f"   Average Latency: {df['latency_sec'].mean():.3f}s")
print(f"   Median Latency: {df['latency_sec'].median():.3f}s")
print(f"   Average Throughput: {df['tokens_per_sec'].mean():.2f} tokens/sec")
print(f"   Max Throughput: {df['tokens_per_sec'].max():.2f} tokens/sec")

print(f"\n🖥️  GPU PERFORMANCE:")
for gpu in df['gpu_name'].unique():
    gpu_data = df[df['gpu_name'] == gpu]
    print(f"\n   {gpu}:")
    print(f"      Requests: {len(gpu_data)}")
    print(f"      Avg Latency: {gpu_data['latency_sec'].mean():.3f}s")
    print(f"      Total Cost: ${gpu_data['total_cost_usd'].sum():.6f}")
    print(f"      Total Carbon: {gpu_data['carbon_gco2e'].sum():.6f} gCO2e")
    print(f"      Throughput: {gpu_data['tokens_per_sec'].mean():.2f} tokens/sec")

print(f"\n🤖 MODEL USAGE:")
for model in df['model_name'].unique():
    model_data = df[df['model_name'] == model]
    print(f"\n   {model}:")
    print(f"      Requests: {len(model_data)} ({100*len(model_data)/len(df):.1f}%)")
    print(f"      Avg Cost: ${model_data['total_cost_usd'].mean():.6f}")
    print(f"      Avg Latency: {model_data['latency_sec'].mean():.3f}s")

if df['has_reference'].any():
    accuracy_df = df[df['has_reference'] == True]
    correct = accuracy_df['accuracy_match'].sum()
    total = len(accuracy_df)
    print(f"\n✓ ACCURACY:")
    print(f"   Validated Requests: {total}")
    print(f"   Correct: {correct}/{total} ({100*correct/total:.1f}%)")
    print(f"   Incorrect: {total-correct}/{total} ({100*(total-correct)/total:.1f}%)")

print("\n" + "="*80)
print("✅ REPORT COMPLETE")
print("="*80 + "\n")


📊 FINAL COMPREHENSIVE REPORT

📝 DATASET OVERVIEW:
   Total Requests: 27
   Date Range: 2025-10-28T21:06:54.840368 to 2025-10-28T21:10:30.913152
   GPU Profiles Tested: 3
   Models Used: 4
   Unique Prompts: 11

💰 FINANCIAL METRICS:
   Total API Cost: $0.000413
   Total Energy Cost: $0.000165
   Total Cost: $0.000578
   Average Cost per Request: $0.000021
   Cost Range: $0.000003 - $0.000056

⚡ ENERGY METRICS:
   Total Energy Consumed: 0.001651028 kWh
   Compute Energy: 0.000048169 kWh (2.9%)
   Idle Energy: 0.001382621 kWh (83.7%)
   Average Energy per Request: 0.000061149 kWh

🌍 CARBON METRICS:
   Total Carbon Emissions: 0.511250 gCO2e
   Average per Request: 0.018935 gCO2e
   Average per Token: 0.000145320 gCO2e
   Carbon Range: 0.002315 - 0.067805 gCO2e

⏱️  PERFORMANCE METRICS:
   Total Tokens Processed: 5,696
   Average Latency: 2.169s
   Median Latency: 1.327s
   Average Throughput: 74.92 tokens/sec
   Max Throughput: 149.20 tokens/sec

🖥️  GPU PERFORMANCE:

   NVIDIA Tesla T4:


In [None]:
# Uncomment to stop server and ngrok tunnel when you're done

import subprocess

print("🛑 Stopping server and ngrok...")
subprocess.run(['pkill', '-9', '-f', 'uvicorn'], stderr=subprocess.DEVNULL)
subprocess.run(['pkill', '-9', '-f', 'ngrok'], stderr=subprocess.DEVNULL)

import time
time.sleep(3)
print("✅ Cleanup complete - Server stopped")

🛑 Stopping server and ngrok...
✅ Cleanup complete - Server stopped
