In [None]:
# Extended Multi-Server Performance Comparison: GPUs vs CPUs

import requests
import time
import pandas as pd
from datetime import datetime

# Benchmark targets keyed by a short server id. Every entry holds the URL the
# benchmark POSTs to plus descriptive hardware metadata that is copied into
# the result rows.
servers = {
    # Disabled entries, kept for reference. Uncomment one to include it in
    # the run.
    # "RTX_5090": dict(url="...", hardware="RTX 5090", type="GPU",
    #                  memory="32GB VRAM", cores="N/A"),
    # "RTX_4090": dict(url="...", hardware="RTX 4090", type="GPU",
    #                  memory="24GB VRAM", cores="N/A"),
    # "AMD_EPYC": dict(url="...", hardware="AMD EPYC (with IBPB)", type="CPU",
    #                  memory="System RAM", cores="Multiple cores"),
    # "AMD_Threadripper": dict(url="...",
    #                          hardware="AMD Ryzen Threadripper 7960X 24-Cores",
    #                          type="CPU", memory="System RAM", cores="24 cores"),
    "A40_GPU": dict(
        url="...",
        hardware="A40 GPU",
        type="GPU",
        memory="48GB VRAM",
        cores="N/A",
    ),
    "Contabo_Server": dict(
        url="...",
        hardware="Contabo Server",
        type="CPU",
        memory="System RAM",
        cores="Multiple cores",
    ),
}

# Prompts: a short one for the warm-up request that loads the model, and a
# longer one for the request whose duration is measured.
DUMMY_PROMPT = "Hello"
REAL_PROMPT = (
    "Explain the concept of machine learning in detail, "
    "including its types, applications, and future prospects."
)

# Model tags to benchmark, in the order they are run.
models = ["deepseek-r1:70b", "deepseek-r1:32b"]

def test_model_on_hardware(model_name, server_name, server_config):
    """Benchmark one model on one server and return its metrics.

    Two non-streaming POST requests are sent to ``server_config['url']``: a
    warm-up carrying ``DUMMY_PROMPT`` (its duration is reported as the loading
    time) followed by the measured request carrying ``REAL_PROMPT``.

    Durations are taken with ``time.perf_counter()``, a monotonic clock, so a
    system clock adjustment during a long-running request cannot distort them.

    Args:
        model_name: Model tag placed in the request payload.
        server_name: Key of the server in the configuration; copied to the result.
        server_config: Mapping with ``url``, ``type``, ``hardware`` and
            ``memory`` entries and, optionally, ``cores``.

    Returns:
        A dict of metrics on success. ``None`` when either request returns a
        non-200 status, times out, or anything else raises; the failure is
        printed instead of being propagated.
    """
    hardware_type = server_config['type']
    hardware_name = server_config['hardware']

    print(f"\n🔄 Testing {model_name} on {hardware_name} ({hardware_type})")

    try:
        # 1. Dummy request to load model
        payload = {
            "model": model_name,
            "prompt": DUMMY_PROMPT,
            "stream": False,
        }

        if hardware_type == "GPU":
            print(f"   📥 Loading model into {hardware_name} VRAM...")
        else:
            print(f"   📥 Loading model into {hardware_name} memory...")

        dummy_start = time.perf_counter()
        dummy_response = requests.post(server_config['url'], json=payload, timeout=300)  # Longer timeout for 70B model
        loading_time = time.perf_counter() - dummy_start

        if dummy_response.status_code != 200:
            print(f"   ❌ Dummy request failed: {dummy_response.status_code}")
            print(f"   Response: {dummy_response.text[:200]}...")
            return None

        print(f"   ✅ Model loaded in {loading_time:.2f}s")

        # 2. Real request to measure performance (same payload, longer prompt)
        payload["prompt"] = REAL_PROMPT

        print("   🚀 Measuring inference performance...")
        start_time = time.perf_counter()
        response = requests.post(server_config['url'], json=payload, timeout=3600)  # Extended timeout for 70B model (1 hour)
        total_time = time.perf_counter() - start_time

        if response.status_code != 200:
            print(f"   ❌ Error: {response.status_code}")
            print(f"   Response: {response.text[:200]}...")
            return None

        data = response.json()

        # Client-side metrics derived from the generated text
        response_text = data.get("response", "")
        words_generated = len(response_text.split())
        chars_generated = len(response_text)
        words_per_second = words_generated / total_time if total_time > 0 else 0

        # Server-reported metrics; the duration fields are divided by 1e9,
        # i.e. treated as nanoseconds and converted to seconds.
        ollama_total_duration = data.get("total_duration", 0) / 1e9
        ollama_load_duration = data.get("load_duration", 0) / 1e9
        eval_count = data.get("eval_count", 0)
        eval_duration = data.get("eval_duration", 0) / 1e9
        eval_tokens_per_sec = eval_count / eval_duration if eval_duration > 0 else 0
        prompt_eval_count = data.get("prompt_eval_count", 0)
        prompt_eval_duration = data.get("prompt_eval_duration", 0) / 1e9

        print(f"   ✅ Results for {hardware_name}:")
        print(f"      ⏱️  Response Time: {total_time:.2f}s")
        print(f"      ⚡ Speed: {words_per_second:.2f} words/sec")
        print(f"      🏃 Ollama Tokens/sec: {eval_tokens_per_sec:.2f}")
        print(f"      📝 Words Generated: {words_generated}")

        return {
            "model": model_name,
            "server": server_name,
            "hardware": hardware_name,
            "hardware_type": hardware_type,
            "memory": server_config['memory'],
            "cores": server_config.get('cores', 'N/A'),
            "timestamp": datetime.now().isoformat(),
            "total_time": total_time,
            "words_generated": words_generated,
            "words_per_second": words_per_second,
            "chars_generated": chars_generated,
            "loading_time": loading_time,
            "ollama_total_duration": ollama_total_duration,
            "ollama_load_duration": ollama_load_duration,
            "ollama_eval_count": eval_count,
            "ollama_eval_duration": eval_duration,
            "ollama_eval_tokens_per_sec": eval_tokens_per_sec,
            "prompt_eval_count": prompt_eval_count,
            "prompt_eval_duration": prompt_eval_duration,
            # Only the first 200 characters are kept, marked with "..." when cut
            "response_preview": response_text[:200] + "..." if len(response_text) > 200 else response_text,
        }

    except requests.exceptions.Timeout:
        print(f"   ❌ Timeout error on {hardware_name}")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: one failing server must not abort
        # the whole comparison run.
        print(f"   ❌ Error on {hardware_name}: {e}")
        return None

def run_extended_comparison():
    """Benchmark every configured model on every configured server.

    Prints a ranking per model (when at least two servers succeeded) and a
    final comparison table covering all successful tests.

    Returns:
        A DataFrame with one row per successful test, or ``None`` when no
        test succeeded.
    """
    print("🚀 DEEPSEEK-R1 MODELS PERFORMANCE COMPARISON")
    print("🎯 A40 GPU vs Contabo Server Analysis (70B & 32B)")
    print("=" * 80)

    medals = ["🥇", "🥈", "🥉", "4️⃣", "5️⃣"]
    banner = "=" * 60
    collected = []
    started_at = time.time()

    for model in models:
        print(f"\n{banner}")
        print(f"🧠 TESTING MODEL: {model}")
        print(banner)

        # Successful outcomes for this model only, used for the quick ranking
        per_model = []
        for server_name, server_config in servers.items():
            outcome = test_model_on_hardware(model, server_name, server_config)
            if not outcome:
                continue
            collected.append(outcome)
            per_model.append(outcome)

        if len(per_model) >= 2:
            print(f"\n   📊 Quick Performance Overview for {model}:")

            # Fastest first
            ranked = sorted(per_model, key=lambda x: x['words_per_second'], reverse=True)

            for position, entry in enumerate(ranked):
                # Positions past the fifth reuse the last symbol
                medal = medals[min(position, 4)]
                print(f"      {medal} {entry['hardware']} ({entry['hardware_type']}): {entry['words_per_second']:.2f} words/sec | {entry['loading_time']:.2f}s loading")

        print("-" * 60)

    finished_at = time.time()

    if not collected:
        print("❌ No successful tests completed")
        return None

    df = pd.DataFrame(collected)

    print("\n📊 DEEPSEEK-R1 MODELS A40 vs CONTABO COMPARISON")
    print(f"⏱️  Total Test Time: {finished_at - started_at:.2f}s")
    print("=" * 100)

    # Result column -> heading shown in the printed table
    headings = {
        'model': 'Model',
        'hardware': 'Hardware',
        'hardware_type': 'Type',
        'total_time': 'Response Time (s)',
        'words_per_second': 'Words/sec',
        'ollama_eval_tokens_per_sec': 'Tokens/sec',
        'words_generated': 'Words Generated',
        'loading_time': 'Loading Time (s)',
    }
    table = df[list(headings)].round(2)
    table.columns = list(headings.values())

    print(table.to_string(index=False))

    return df

# Run the extended comparison when this cell/module is executed. The result is
# bound to `extended_results` (a DataFrame, or None when no test succeeded),
# which the analysis step further down looks up by name.
print("Starting DeepSeek-R1 Models Performance Comparison...")
extended_results = run_extended_comparison()


In [None]:
# Comprehensive GPU vs CPU Performance Analysis

def _safe_pct(delta, baseline):
    """Return ``delta / baseline`` as a percentage, or NaN when *baseline* is 0."""
    if baseline == 0:
        return float("nan")
    return (delta / baseline) * 100


def _summarize_hardware_type(rows, heading, missing_message):
    """Print every row of *rows* and return ``(best_row, mean_speed, mean_loading)``.

    When *rows* is empty, *missing_message* is printed and ``(None, 0, 0)`` is
    returned.
    """
    if rows.empty:
        print(missing_message)
        return None, 0, 0

    print(heading)
    for _, row in rows.iterrows():
        print(f"      • {row['hardware']}: {row['words_per_second']:.2f} words/sec | {row['loading_time']:.2f}s loading")

    best = rows.loc[rows['words_per_second'].idxmax()]
    return best, rows['words_per_second'].mean(), rows['loading_time'].mean()


def _known_vram_gb(hardware_name):
    """Return the VRAM size in GB for the GPU names this script recognises, else None."""
    if 'RTX' in hardware_name:
        return 32 if '5090' in hardware_name else 24
    if 'A40' in hardware_name:
        return 48
    return None


def analyze_gpu_vs_cpu_performance(df, model_names=None):
    """Print a GPU vs CPU analysis of benchmark results and return its summary.

    Args:
        df: Results with at least the columns ``model``, ``hardware``,
            ``hardware_type`` ('GPU' or 'CPU'), ``words_per_second`` and
            ``loading_time``. May be ``None`` or empty.
        model_names: Models to report on, in order. Defaults to the
            module-level ``models`` list.

    Returns:
        A DataFrame with one row per model that has both GPU and CPU results,
        or ``None`` when there is nothing to compare. Percentage advantages
        are NaN when the CPU baseline is zero, and such values are skipped by
        the printed overall averages.
    """
    if df is None or df.empty:
        print("No results to analyze")
        return None

    if model_names is None:
        # Backward-compatible default: the list the benchmark itself iterates.
        model_names = models

    print("\n🔬 COMPREHENSIVE GPU vs CPU PERFORMANCE ANALYSIS")
    print("=" * 80)

    # Separate results by hardware type
    gpu_results = df[df['hardware_type'] == 'GPU']
    cpu_results = df[df['hardware_type'] == 'CPU']

    if gpu_results.empty and cpu_results.empty:
        print("No results found for analysis")
        return None

    # Approximate model sizes in GB, used for the size and utilization figures
    model_sizes = {
        "deepseek-r1:70b": 45,
        "deepseek-r1:32b": 19,
    }

    print("\n📊 HARDWARE TYPE PERFORMANCE OVERVIEW")
    print("-" * 60)

    comparison_summary = []

    for model in model_names:
        print(f"\n🧠 {model} (Model Size: {model_sizes.get(model, 'Unknown')} GB)")
        print("-" * 50)

        best_gpu, avg_gpu_speed, avg_gpu_loading = _summarize_hardware_type(
            gpu_results[gpu_results['model'] == model],
            "   🎮 GPU Performance:",
            "   🎮 GPU Performance: No GPU results available",
        )
        best_cpu, avg_cpu_speed, avg_cpu_loading = _summarize_hardware_type(
            cpu_results[cpu_results['model'] == model],
            "   🖥️  CPU Performance:",
            "   🖥️  CPU Performance: No CPU results available",
        )

        # A comparison needs results from both hardware types
        if best_gpu is None or best_cpu is None:
            continue

        # Both advantages are relative to the CPU averages; for loading time a
        # positive value means the GPU loaded faster.
        speed_advantage = _safe_pct(avg_gpu_speed - avg_cpu_speed, avg_cpu_speed)
        loading_advantage = _safe_pct(avg_cpu_loading - avg_gpu_loading, avg_cpu_loading)

        print("\n   📈 GPU vs CPU Comparison:")
        print(f"      Speed Advantage (GPU): {speed_advantage:+.1f}%")
        print(f"      Loading Advantage (GPU): {loading_advantage:+.1f}%")
        print(f"      Best GPU: {best_gpu['hardware']} ({best_gpu['words_per_second']:.2f} words/sec)")
        print(f"      Best CPU: {best_cpu['hardware']} ({best_cpu['words_per_second']:.2f} words/sec)")

        comparison_summary.append({
            'model': model,
            'model_size_gb': model_sizes.get(model, 0),
            'best_gpu_hardware': best_gpu['hardware'],
            'best_gpu_speed': best_gpu['words_per_second'],
            'best_gpu_loading': best_gpu['loading_time'],
            'avg_gpu_speed': avg_gpu_speed,
            'avg_gpu_loading': avg_gpu_loading,
            'best_cpu_hardware': best_cpu['hardware'],
            'best_cpu_speed': best_cpu['words_per_second'],
            'best_cpu_loading': best_cpu['loading_time'],
            'avg_cpu_speed': avg_cpu_speed,
            'avg_cpu_loading': avg_cpu_loading,
            'gpu_speed_advantage_pct': speed_advantage,
            'gpu_loading_advantage_pct': loading_advantage,
        })

    if not comparison_summary:
        return None

    # Overall Performance Summary
    summary_df = pd.DataFrame(comparison_summary)

    print("\n🏆 OVERALL GPU vs CPU PERFORMANCE SUMMARY")
    print("=" * 70)

    avg_gpu_advantage = summary_df['gpu_speed_advantage_pct'].mean()
    avg_loading_advantage = summary_df['gpu_loading_advantage_pct'].mean()

    print(f"Average GPU Speed Advantage: {avg_gpu_advantage:+.1f}%")
    print(f"Average GPU Loading Advantage: {avg_loading_advantage:+.1f}%")

    # Hardware Rankings
    print("\n🥇 PERFORMANCE RANKINGS BY HARDWARE")
    print("-" * 50)

    # One aggregation pass per hardware; sort=False keeps first-appearance
    # order before ranking by average speed.
    hardware_df = (
        df.groupby('hardware', sort=False)
        .agg(
            type=('hardware_type', 'first'),
            avg_speed=('words_per_second', 'mean'),
        )
        .reset_index()
        .sort_values('avg_speed', ascending=False)
    )

    medals = ["🥇", "🥈", "🥉", "4️⃣", "5️⃣"]
    for i, (_, hw_row) in enumerate(hardware_df.iterrows()):
        # Positions past the fifth reuse the last symbol
        rank_emoji = medals[min(i, 4)]
        print(f"{rank_emoji} {hw_row['hardware']} ({hw_row['type']}): {hw_row['avg_speed']:.2f} words/sec avg")

    # Efficiency Analysis
    print("\n⚡ EFFICIENCY ANALYSIS")
    print("-" * 40)

    print("GPU Efficiency (Performance per VRAM GB):")
    for entry in comparison_summary:
        vram_gb = _known_vram_gb(entry['best_gpu_hardware'])
        if vram_gb is None:
            # GPUs with unknown VRAM are left out of this listing
            continue
        efficiency = entry['best_gpu_speed'] / vram_gb
        print(f"  • {entry['model']} on {entry['best_gpu_hardware']}: {efficiency:.2f} words/sec/GB")

    print("\nCPU Efficiency (Performance characteristics):")
    for entry in comparison_summary:
        print(f"  • {entry['model']} on {entry['best_cpu_hardware']}: {entry['best_cpu_speed']:.2f} words/sec")

    # Use Case Recommendations
    print("\n🎯 USE CASE RECOMMENDATIONS")
    print("-" * 40)
    for line in (
        "✅ Choose GPUs when:",
        "   • Maximum speed is critical",
        "   • Working with large models frequently",
        "   • Running batch inference",
        "   • Power consumption is not a primary concern",
        "\n✅ Choose CPUs when:",
        "   • Cost efficiency is important",
        "   • Power consumption is a concern",
        "   • Running smaller models occasionally",
        "   • Need more flexible memory allocation",
    ):
        print(line)

    # Memory Utilization
    print("\n💾 MEMORY UTILIZATION ANALYSIS")
    print("-" * 40)

    largest_model = max(model_sizes, key=model_sizes.get)
    print(f"Largest tested model: {model_sizes[largest_model]}GB ({largest_model})")
    print("\nGPU VRAM Utilization:")
    # Largest model first; the label is the size tag after the colon in the
    # model name, upper-cased ("70B", "32B").
    for name, size_gb in sorted(model_sizes.items(), key=lambda item: item[1], reverse=True):
        size_label = name.rsplit(':', 1)[-1].upper()
        print(f"  • A40 GPU (48GB): {(size_gb/48)*100:.1f}% utilization ({size_label} model)")
    print("\nCPU RAM Utilization:")
    print("  • Contabo Server: Flexible allocation from system RAM")
    print("  • Note: CPU inference can use system RAM dynamically")

    return summary_df

def save_extended_results(df, analysis_df=None,
                          raw_path="gpu_cpu_comparison_raw.csv",
                          analysis_path="gpu_cpu_performance_analysis.csv"):
    """Write the comparison results, and optionally the analysis, to CSV.

    Args:
        df: Raw results DataFrame. Nothing is written when it is ``None``.
        analysis_df: Optional analysis summary DataFrame; written only when
            *df* is also given.
        raw_path: Destination for *df*.
        analysis_path: Destination for *analysis_df*.
    """
    if df is None:
        return

    # Save raw results
    df.to_csv(raw_path, index=False)
    print(f"\n💾 Raw GPU vs CPU results saved to {raw_path}")

    # Save analysis summary if available
    if analysis_df is not None:
        analysis_df.to_csv(analysis_path, index=False)
        print(f"💾 Performance analysis saved to {analysis_path}")

# Analyze and persist the benchmark results, provided the comparison has
# already run and produced a result.
if 'extended_results' not in locals() or extended_results is None:
    print("Run the extended comparison first to get results for GPU vs CPU analysis")
else:
    print("\n" + "=" * 80)
    analysis_summary = analyze_gpu_vs_cpu_performance(extended_results)
    save_extended_results(extended_results, analysis_summary)
