# Qwen3 Optimizer Comparison Analysis

This notebook provides an interactive analysis of the optimizer comparison results.

## Overview

We compare three optimizers:
- **AdamW**: Adaptive moment estimation with weight decay
- **SGD + Momentum**: Stochastic gradient descent with momentum
- **AdaBound**: Smooth transition from Adam to SGD

All models are fine-tuned on CommonsenseQA using LoRA (Low-Rank Adaptation).

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import warnings
# Silence all Python warnings for the rest of the session (keeps cell output tidy,
# but also hides deprecation notices from pandas/matplotlib/seaborn).
warnings.filterwarnings('ignore')

# Global plot appearance: whitegrid base style, husl colour cycle, and larger
# default figure and font sizes applied in a single rcParams update.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

print("üìä Analysis environment ready!")

## 1. Load and Explore Data

In [None]:
# Load the evaluation results; fall back to a small hard-coded example table
# when the results file has not been generated yet, so the rest of the notebook
# can still be run end to end.
results_path = "../results/results.csv"

try:
    df = pd.read_csv(results_path)
    print(f"‚úÖ Loaded {len(df)} results")
    # astype(str) keeps the join from raising TypeError if the model labels
    # were parsed as numbers or contain missing values.
    print(f"Models: {', '.join(df['model'].astype(str))}")
except FileNotFoundError:
    print("‚ùå Results file not found. Run the evaluation first.")
    print("Example data for demonstration:")

    # Example data with the same columns the analysis cells below read.
    df = pd.DataFrame({
        'model': ['baseline', 'adamw', 'sgd', 'adabound'],
        'accuracy': [0.6245, 0.6892, 0.6634, 0.6978],
        'items_per_second': [2.34, 2.12, 2.28, 2.05],
        'evaluation_time_seconds': [527.3, 581.2, 540.8, 601.4],
        'gpu_memory_used_mb': [8234, 8456, 8321, 8492]
    })

# Display basic info
print("\nüìã Dataset Info:")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display() -- that would render a stray "None" under the report.
df.info()

print("\nüìä Summary Statistics:")
display(df.describe().round(4))

In [None]:
# Show every result row as a styled table: fixed number formats per metric and
# a red-to-green gradient on the accuracy column.
print("üéØ Complete Results:")

column_formats = {
    'accuracy': '{:.4f}',
    'items_per_second': '{:.2f}',
    'evaluation_time_seconds': '{:.1f}',
    'gpu_memory_used_mb': '{:.0f}',
}
styled_results = df.style.format(column_formats)
styled_results = styled_results.background_gradient(subset=['accuracy'], cmap='RdYlGn')
display(styled_results)

## 2. Accuracy Analysis

In [None]:
# Accuracy comparison: two views of the same numbers. The left panel keeps the
# table order; the right panel ranks the models by accuracy (ascending, so the
# best model ends up at the top of the horizontal chart).
fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(16, 6))

# Both panels share the same axis ceiling: 10% headroom above the best accuracy
# so the value labels are not clipped.
accuracy_ceiling = max(df['accuracy']) * 1.1

# --- Left panel: vertical bars, baseline highlighted in orange ---
bar_colors = ['#1f77b4' if name != 'baseline' else '#ff7f0e' for name in df['model']]
rects = ax_left.bar(df['model'], df['accuracy'], color=bar_colors, alpha=0.8, edgecolor='black')

for rect, value in zip(rects, df['accuracy']):
    x_center = rect.get_x() + rect.get_width()/2.
    ax_left.text(x_center, rect.get_height() + 0.001,
                 f'{value:.4f}', ha='center', va='bottom', fontweight='bold')

ax_left.set_ylabel('Accuracy')
ax_left.set_title('Model Accuracy Comparison')
ax_left.set_ylim(0, accuracy_ceiling)
ax_left.tick_params(axis='x', rotation=45)

# --- Right panel: horizontal bars sorted by accuracy ---
df_sorted = df.sort_values('accuracy')
ranked_colors = ['#1f77b4' if name != 'baseline' else '#ff7f0e' for name in df_sorted['model']]
ax_right.barh(df_sorted['model'], df_sorted['accuracy'],
              color=ranked_colors, alpha=0.8, edgecolor='black')

# Bars sit at integer positions 0..n-1, so the enumerate index is the y coordinate.
for position, value in enumerate(df_sorted['accuracy']):
    ax_right.text(value + 0.002, position, f'{value:.4f}', va='center', fontweight='bold')

ax_right.set_xlabel('Accuracy')
ax_right.set_title('Model Accuracy (Ranked)')
ax_right.set_xlim(0, accuracy_ceiling)

plt.tight_layout()
plt.show()

# Report the row with the highest accuracy.
best_model = df.loc[df['accuracy'].idxmax()]
print(f"üèÜ Best Model: {best_model['model']} with {best_model['accuracy']:.4f} accuracy")

## 3. Performance Metrics Dashboard

In [None]:
# 2x2 dashboard: one bar chart per metric, all sharing the same layout.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (axes, column, title, y-label, fill colour, edge colour) for each panel.
metric_panels = [
    (axes[0, 0], 'accuracy', 'Accuracy by Optimizer', 'Accuracy', 'skyblue', 'navy'),
    (axes[0, 1], 'items_per_second', 'Inference Speed by Optimizer', 'Items per Second', 'lightgreen', 'darkgreen'),
    (axes[1, 0], 'evaluation_time_seconds', 'Evaluation Time by Optimizer', 'Time (seconds)', 'salmon', 'darkred'),
]

# GPU memory is optional in the results file; only chart it when present.
has_gpu_memory = 'gpu_memory_used_mb' in df.columns
if has_gpu_memory:
    metric_panels.append(
        (axes[1, 1], 'gpu_memory_used_mb', 'GPU Memory Usage by Optimizer', 'Memory (MB)', 'gold', 'orange')
    )

for panel, column, title, ylabel, fill, edge in metric_panels:
    panel.bar(df['model'], df[column], color=fill, alpha=0.8, edgecolor=edge)
    panel.set_title(title, fontsize=14, fontweight='bold')
    panel.set_ylabel(ylabel)
    panel.tick_params(axis='x', rotation=45)

if not has_gpu_memory:
    # Keep the fourth slot occupied with a placeholder message instead of an empty chart.
    gpu_panel = axes[1, 1]
    gpu_panel.text(0.5, 0.5, 'GPU Memory\nData Not Available',
                   ha='center', va='center', transform=gpu_panel.transAxes, fontsize=14)
    gpu_panel.set_title('GPU Memory Usage')

plt.tight_layout()
plt.show()

## 4. Improvement Analysis

In [None]:
# Improvement over baseline.
# The improvement is the absolute accuracy difference scaled by 100, i.e. it is
# measured in percentage points (0.6892 vs 0.6245 -> +6.47 pp), not as a
# relative percentage of the baseline; labels below say so explicitly.
baseline_row = df[df['model'] == 'baseline']

if not baseline_row.empty:
    # If several rows are labelled 'baseline', the first one is the reference.
    baseline_acc = baseline_row['accuracy'].iloc[0]

    # Build the comparison table in one vectorized step (every non-baseline row).
    tuned = df.loc[df['model'] != 'baseline', ['model', 'accuracy']]
    imp_df = pd.DataFrame({
        'model': tuned['model'],
        'improvement_pct': (tuned['accuracy'] - baseline_acc) * 100,
        'accuracy': tuned['accuracy'],
    }).reset_index(drop=True)

    if not imp_df.empty:
        # Plot improvements: green for gains, red otherwise.
        fig, ax = plt.subplots(figsize=(12, 6))
        imp_colors = ['#2ca02c' if imp > 0 else '#d62728' for imp in imp_df['improvement_pct']]
        ax.bar(imp_df['model'], imp_df['improvement_pct'],
               color=imp_colors, alpha=0.8, edgecolor='black')

        # Value labels sit just outside the bar tip: above for gains, below otherwise.
        for position, imp in enumerate(imp_df['improvement_pct']):
            is_gain = imp > 0
            ax.text(position, imp + 0.1 if is_gain else imp - 0.1, f'{imp:+.2f} pp',
                    ha='center', va='bottom' if is_gain else 'top', fontweight='bold')

        ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        ax.set_xlabel('Optimizer')
        ax.set_ylabel('Accuracy Improvement (percentage points)')
        ax.set_title(f'Accuracy Improvement over Baseline\n(Baseline: {baseline_acc:.4f})',
                     fontsize=14, fontweight='bold')
        fig.tight_layout()
        plt.show()

        # Summary table
        print("üìà Improvement Summary:")
        display(imp_df.style.format({
            'improvement_pct': '{:+.2f} pp',
            'accuracy': '{:.4f}'
        }).background_gradient(subset=['improvement_pct'], cmap='RdYlGn'))
    else:
        # Only the baseline row exists, so there is nothing to compare it with.
        print("No non-baseline models found to compare against the baseline")

else:
    print("‚ö†Ô∏è No baseline model found for comparison")

## 5. Efficiency Analysis: Accuracy vs Speed

In [None]:
# Accuracy vs. inference speed: one point per model, with mean lines splitting
# the plane into four quadrants.
plt.figure(figsize=(12, 8))

# The baseline is drawn larger and in orange so it stands out from the tuned models.
is_baseline = [name == 'baseline' for name in df['model']]
point_colors = ['#ff7f0e' if flag else '#1f77b4' for flag in is_baseline]
point_sizes = [150 if flag else 100 for flag in is_baseline]

scatter = plt.scatter(df['items_per_second'], df['accuracy'],
                      s=point_sizes, c=point_colors, alpha=0.7, edgecolors='black', linewidth=2)

# Name tag next to every point, offset so it does not cover the marker.
label_box = dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8)
for name, speed, acc in zip(df['model'], df['items_per_second'], df['accuracy']):
    plt.annotate(name,
                 (speed, acc),
                 xytext=(10, 10), textcoords='offset points',
                 fontsize=11, fontweight='bold',
                 bbox=label_box)

# Quadrant boundaries at the mean accuracy and mean speed.
plt.axhline(y=df['accuracy'].mean(), color='gray', linestyle='--', alpha=0.5, linewidth=2)
plt.axvline(x=df['items_per_second'].mean(), color='gray', linestyle='--', alpha=0.5, linewidth=2)

# Corner captions, positioned in axes-fraction coordinates:
# (x, y, caption, horizontal anchor, vertical anchor, box colour)
quadrant_captions = [
    (0.02, 0.98, 'High Accuracy\nLow Speed', 'left', 'top', 'lightcoral'),
    (0.98, 0.98, 'High Accuracy\nHigh Speed', 'right', 'top', 'lightgreen'),
    (0.02, 0.02, 'Low Accuracy\nLow Speed', 'left', 'bottom', 'lightgray'),
    (0.98, 0.02, 'Low Accuracy\nHigh Speed', 'right', 'bottom', 'lightyellow'),
]
for x_frac, y_frac, caption, h_anchor, v_anchor, box_color in quadrant_captions:
    plt.text(x_frac, y_frac, caption, transform=plt.gca().transAxes,
             ha=h_anchor, va=v_anchor, fontsize=10,
             bbox=dict(boxstyle='round', facecolor=box_color, alpha=0.7))

plt.xlabel('Inference Speed (items/second)', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Efficiency Analysis: Accuracy vs Speed Trade-off', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Pearson correlation between speed and accuracy across the models.
correlation = df['items_per_second'].corr(df['accuracy'])
print(f"üîÑ Speed-Accuracy Correlation: {correlation:.3f}")
if abs(correlation) > 0.5:
    if correlation > 0:
        trend = "positive"
    else:
        trend = "negative"
    print(f"üìä Strong {trend} correlation detected!")
else:
    print("üìä Weak correlation - independent performance factors")

## 6. Statistical Analysis

In [None]:
# Text summary of accuracy and speed across all models, plus the coefficient
# of variation (std / mean, in percent) as a scale-free spread measure.
print("üìä Statistical Analysis:")
print("=" * 50)

# --- Accuracy ---
acc_stats = df['accuracy'].describe()
print("\nüéØ Accuracy Statistics:")
print(f"   Mean: {acc_stats['mean']:.4f}")
print(f"   Std:  {acc_stats['std']:.4f}")
least_accurate = df.loc[df['accuracy'].idxmin(), 'model']
print(f"   Min:  {acc_stats['min']:.4f} ({least_accurate})")
most_accurate = df.loc[df['accuracy'].idxmax(), 'model']
print(f"   Max:  {acc_stats['max']:.4f} ({most_accurate})")
acc_range = acc_stats['max'] - acc_stats['min']
print(f"   Range: {acc_range:.4f}")

# --- Speed ---
speed_stats = df['items_per_second'].describe()
print("\n‚ö° Speed Statistics:")
print(f"   Mean: {speed_stats['mean']:.2f} items/sec")
print(f"   Std:  {speed_stats['std']:.2f}")
slowest = df.loc[df['items_per_second'].idxmin(), 'model']
print(f"   Min:  {speed_stats['min']:.2f} ({slowest})")
fastest = df.loc[df['items_per_second'].idxmax(), 'model']
print(f"   Max:  {speed_stats['max']:.2f} ({fastest})")

# --- Variability ---
acc_cv = (acc_stats['std'] / acc_stats['mean']) * 100
speed_cv = (speed_stats['std'] / speed_stats['mean']) * 100

print("\nüìà Coefficient of Variation:")
print(f"   Accuracy: {acc_cv:.2f}%")
print(f"   Speed: {speed_cv:.2f}%")

# Below 5% the models are treated as performing alike; above 15% as clearly different.
if acc_cv > 15:
    print("   ‚Üí High accuracy variability (significant differences)")
elif acc_cv < 5:
    print("   ‚Üí Low accuracy variability (models perform similarly)")
else:
    print("   ‚Üí Moderate accuracy variability")

## 7. Key Insights and Recommendations

In [None]:
# Generate insights: best / fastest / most balanced model, comparison against
# the baseline, and a numbered list of recommendations.
# Side effect: adds 'normalized_acc', 'normalized_speed' and 'balance_score'
# columns to df; the export cell reads 'balance_score'.


def _min_max_normalize(values):
    """Scale a numeric Series to [0, 1]; a constant Series maps to 0.5 everywhere.

    Plain min-max scaling divides by (max - min). When every model has the same
    value that span is zero and the whole column would become NaN, which in turn
    makes the balance score NaN. A neutral constant keeps the ranking decided by
    the other metric instead.
    """
    span = values.max() - values.min()
    if span == 0:
        return pd.Series(0.5, index=values.index)
    return (values - values.min()) / span


print("üîç KEY INSIGHTS:")
print("=" * 60)

# Best performing model
best_model = df.loc[df['accuracy'].idxmax()]
print(f"\nüèÜ BEST ACCURACY: {best_model['model'].upper()}")
print(f"   ‚Ä¢ Accuracy: {best_model['accuracy']:.4f}")
print(f"   ‚Ä¢ Speed: {best_model['items_per_second']:.2f} items/sec")

# Fastest model
fastest_model = df.loc[df['items_per_second'].idxmax()]
print(f"\n‚ö° FASTEST MODEL: {fastest_model['model'].upper()}")
print(f"   ‚Ä¢ Speed: {fastest_model['items_per_second']:.2f} items/sec")
print(f"   ‚Ä¢ Accuracy: {fastest_model['accuracy']:.4f}")

# Most balanced model: equal-weight average of normalized accuracy and speed.
df['normalized_acc'] = _min_max_normalize(df['accuracy'])
df['normalized_speed'] = _min_max_normalize(df['items_per_second'])
df['balance_score'] = (df['normalized_acc'] + df['normalized_speed']) / 2

balanced_model = df.loc[df['balance_score'].idxmax()]
print(f"\n‚öñÔ∏è MOST BALANCED: {balanced_model['model'].upper()}")
print(f"   ‚Ä¢ Balance Score: {balanced_model['balance_score']:.3f}")
print(f"   ‚Ä¢ Accuracy: {balanced_model['accuracy']:.4f}")
print(f"   ‚Ä¢ Speed: {balanced_model['items_per_second']:.2f} items/sec")

# Baseline comparison
baseline = df[df['model'] == 'baseline']
if not baseline.empty:
    baseline_acc = baseline['accuracy'].iloc[0]
    # The strict '>' already leaves the baseline row out of this selection, so
    # its length is the number of models beating the baseline -- subtracting 1
    # would under-count by one.
    improved_models = df[df['accuracy'] > baseline_acc]

    print(f"\nüìà BASELINE COMPARISON:")
    print(f"   ‚Ä¢ Baseline accuracy: {baseline_acc:.4f}")
    print(f"   ‚Ä¢ Models beating baseline: {len(improved_models)}")

    if not improved_models.empty:
        # Absolute accuracy difference scaled by 100, i.e. percentage points.
        avg_improvement = (improved_models['accuracy'].mean() - baseline_acc) * 100
        print(f"   ‚Ä¢ Average improvement: {avg_improvement:.2f} percentage points")

print("\n" + "=" * 60)
print("üí° RECOMMENDATIONS:")
print("=" * 60)

# Collect (heading, detail lines) first and number them while printing, so the
# list stays consecutive (1, 2, 3, ...) even when a conditional item is skipped.
recommendations = []

if best_model['model'] != 'baseline':
    recommendations.append(("üéØ FOR HIGHEST ACCURACY:", [
        f"Use {best_model['model'].upper()} optimizer",
        f"Expected accuracy: {best_model['accuracy']:.4f}",
    ]))

if fastest_model['model'] != best_model['model']:
    recommendations.append(("‚ö° FOR FASTEST INFERENCE:", [
        f"Use {fastest_model['model'].upper()} optimizer",
        f"Expected speed: {fastest_model['items_per_second']:.2f} items/sec",
    ]))

recommendations.append(("‚öñÔ∏è FOR BALANCED PERFORMANCE:", [
    f"Use {balanced_model['model'].upper()} optimizer",
    "Good trade-off between accuracy and speed",
]))

recommendations.append(("üî¨ FOR FURTHER RESEARCH:", [
    "Try ensemble methods combining top performers",
    "Experiment with learning rate schedules",
    "Consider longer training with best optimizer",
]))

recommendations.append(("üè≠ FOR PRODUCTION:", [
    "Consider your accuracy vs speed requirements",
    "Monitor resource usage in your specific environment",
    "Test with your actual data distribution",
]))

for number, (heading, details) in enumerate(recommendations, start=1):
    print(f"\n{number}. {heading}")
    for detail in details:
        print(f"   ‚Üí {detail}")

## 8. Export Results

In [None]:
# Export the processed table and a short markdown summary.
# Relies on df['balance_score'], best_model, fastest_model and balanced_model
# created by the "Key Insights" cell, so that cell must have been run first.
output_dir = Path("../results/notebook_analysis")
output_dir.mkdir(parents=True, exist_ok=True)

# Save processed dataframe (core metrics plus the balance score, 4 decimals).
df_export = df[['model', 'accuracy', 'items_per_second', 'evaluation_time_seconds', 'balance_score']].copy()
df_export = df_export.round(4)
df_export.to_csv(output_dir / "processed_results.csv", index=False)

# Save insights as markdown
insights_md = f"""# Analysis Insights

## Best Performing Models

- **Highest Accuracy**: {best_model['model']} ({best_model['accuracy']:.4f})
- **Fastest Speed**: {fastest_model['model']} ({fastest_model['items_per_second']:.2f} items/sec)
- **Most Balanced**: {balanced_model['model']} (score: {balanced_model['balance_score']:.3f})

## Statistical Summary

- **Mean Accuracy**: {df['accuracy'].mean():.4f} ¬± {df['accuracy'].std():.4f}
- **Mean Speed**: {df['items_per_second'].mean():.2f} ¬± {df['items_per_second'].std():.2f} items/sec
- **Speed-Accuracy Correlation**: {df['items_per_second'].corr(df['accuracy']):.3f}

## Recommendations

1. **For production use**: {balanced_model['model']} provides the best overall balance
2. **For research**: {best_model['model']} achieves highest accuracy
3. **For real-time applications**: {fastest_model['model']} offers fastest inference
"""

# The summary contains non-ASCII characters; write it as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding (which can
# raise UnicodeEncodeError or produce a differently-encoded file on Windows).
(output_dir / "insights.md").write_text(insights_md, encoding="utf-8")

print(f"üìÅ Results exported to: {output_dir}")
print("   ‚Ä¢ processed_results.csv")
print("   ‚Ä¢ insights.md")
print("\n‚úÖ Analysis complete!")