## Set Up

In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports modules before
# each cell runs, so edits to imported code are picked up without a kernel
# restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from pathlib import Path

# Keep a `workding_dir` that was defined before this cell ran; otherwise
# default to the parent of the current working directory.
try:
    workding_dir
except NameError:
    workding_dir = str(Path.cwd().parent)

# Run from that directory and make it importable.
os.chdir(workding_dir)
sys.path.append(workding_dir)
print("workding dir:", workding_dir)

from dotenv import find_dotenv, load_dotenv

# find_dotenv yields an empty string when nothing is found, so `or` falls
# through to the .env.example lookup in exactly that case.
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=True)

In [None]:
# run cells above before running anything below

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.lines import Line2D

# Global typography for every figure, set one rc group at a time; each keyword
# maps onto the rcParams entry named '<group>.<keyword>'.
plt.rc('font', size=14, weight='normal')
plt.rc('axes', titlesize=18, labelsize=16, labelweight='bold', titleweight='bold')
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)
plt.rc('legend', fontsize=13)
plt.rc('figure', titlesize=20)

# Metrics CSV file names grouped by benchmark key. Every group lists one file
# per file-name prefix (deepseek, granite-magistral, qwen3).
csv_files = dict(
    amazon=[
        'deepseek-amazon_reviews_metrics.csv',
        'granite-magistral-amazon_metrics.csv',
        'qwen3-amazon_metrics.csv',
    ],
    imdb=[
        'deepseek-imdb_reviews_results_all_metrics.csv',
        'granite-magistral-imdb_metrics.csv',
        'qwen3-imdb_metrics.csv',
    ],
    goemotions=[
        'deepseek-GoEmotions_results_metrics.csv',
        'granite-magistral-GoEmotions_metrics.csv',
        'qwen3-goemotions_metrics.csv',
    ],
)

def categorize_model(model_name):
    """Classify a model name as a reasoning model or a base model.

    The check is a case-insensitive substring match: a name counts as a
    reasoning model when it contains ``(t)``, ``thinking``, ``deepseek-r1``
    or ``deepseek_r1`` anywhere.

    Args:
        model_name: Model identifier string.

    Returns:
        ``'Reasoning/Thinking'`` if any marker is present, otherwise
        ``'Base/Non-thinking'``.
    """
    reasoning_indicators = ('(t)', 'thinking', 'deepseek-r1', 'deepseek_r1')
    model_lower = model_name.lower()

    # '(t)' already covers Magistral in thinking mode, so that case needs no
    # separate branch.
    if any(indicator in model_lower for indicator in reasoning_indicators):
        return 'Reasoning/Thinking'

    # Everything else is base/non-thinking
    return 'Base/Non-thinking'

def load_dataset(files, metric_col):
    """Load metrics CSVs and flatten them into one row per model/shot setting.

    Each file is read from ``results/paper/`` first and, when it is not found
    there, from the path exactly as given. The frames are concatenated and
    walked in order; the first row seen for a ``model``/``shots`` pair wins
    and later rows for the same pair are reported and skipped.

    Args:
        files: CSV file names. Each CSV must provide the columns ``model``,
            ``shots``, ``eval_time`` and ``metric_col``.
        metric_col: Name of the score column; its values are multiplied by
            100 to express them as percentages.

    Returns:
        DataFrame with the columns ``model``, ``shots``, ``f1``, ``cost``,
        ``speed`` and ``model_type``. The columns are present even when no
        row qualifies, so callers can always index them.

    Raises:
        FileNotFoundError: If a file exists in neither location.
    """
    frames = []
    for file in files:
        try:
            df = pd.read_csv(f"results/paper/{file}")
        except FileNotFoundError:
            # Try without the results/paper/ prefix
            df = pd.read_csv(file)
        frames.append(df)

    combined_df = pd.concat(frames, ignore_index=True)

    result_columns = ['model', 'shots', 'f1', 'cost', 'speed', 'model_type']
    results = []
    seen_model_shots = set()  # set membership keeps the duplicate check O(1)
    for _, row in combined_df.iterrows():
        model_shots = f"{row['model']}_{row['shots']}"
        if model_shots in seen_model_shots:
            print(f"Skipping duplicate model_shots: {model_shots}")
            continue
        # NOTE(review): the pair is registered before the row is validated,
        # so a first occurrence with a missing metric also hides any later
        # valid duplicate. Kept as-is to leave existing results unchanged.
        seen_model_shots.add(model_shots)

        if pd.isna(row[metric_col]) or pd.isna(row['eval_time']):
            continue

        model = row['model']
        if model.lower() in ('deepseek-r1', 'deepseek-v3'):
            print(f"Skipping model {model}")
            continue

        # Display name. NOTE(review): the final replace upper-cases every 'b'
        # in the name, not only a trailing size suffix such as '8b'.
        display_name = (
            model.replace('granite', 'Granite')
            .replace('qwen3', 'Qwen3')
            .replace('llama', 'Llama-')
            .replace(':', '-')
            .replace('b', 'B')
        )

        results.append({
            'model': display_name,
            'shots': row['shots'],
            'f1': row[metric_col] * 100,  # Convert to percentage
            'cost': row['eval_time'],
            'speed': 1 / row['eval_time'],
            # Categorised on the raw name, before the display renaming
            'model_type': categorize_model(row['model']),
        })

    print("#included_model_shots:", len(seen_model_shots))

    return pd.DataFrame(results, columns=result_columns)

# Load every benchmark with its own metric column (Amazon is the only one
# scored on 'f1_5_level').
amazon_data = load_dataset(csv_files['amazon'], 'f1_5_level')
imdb_data = load_dataset(csv_files['imdb'], 'f1')
goemotions_data = load_dataset(csv_files['goemotions'], 'f1')

# Tag each frame with the display name and the colour key used for plotting
for _frame, _display_name, _colour_key in (
    (amazon_data, 'Amazon (5-class)', 'green'),
    (imdb_data, 'IMDB (binary)', 'blue'),
    (goemotions_data, 'GoEmotions (27-class)', 'red'),
):
    _frame['dataset'] = _display_name
    _frame['color'] = _colour_key

# One combined table with a fresh 0..n-1 index
all_data = pd.concat([amazon_data, imdb_data, goemotions_data], ignore_index=True)

# Row subsets by model type
_is_reasoning = all_data['model_type'] == 'Reasoning/Thinking'
_is_base = all_data['model_type'] == 'Base/Non-thinking'
reasoning_data = all_data[_is_reasoning]
base_data = all_data[_is_base]

# Single wide canvas for the cost-vs-F1 scatter
fig, ax = plt.subplots(figsize=(16, 8))

# Hex values behind the colour keys: forest green, dodger blue, crimson
colors = dict(green='#228B22', blue='#1E90FF', red='#DC143C')

# Shared marker styling
marker_size, edge_width, alpha = 80, 1.2, 0.8

# Base models are drawn as circles and reasoning models as triangles; only the
# circle series carries a label.
for dataset, color in [('IMDB (binary)', 'blue'), ('Amazon (5-class)', 'green'), ('GoEmotions (27-class)', 'red')]:
    for _frame, _marker, _extra in (
        (base_data, 'o', {'label': dataset}),
        (reasoning_data, '^', {}),
    ):
        _points = _frame[_frame['dataset'] == dataset]
        if len(_points) == 0:
            continue
        ax.scatter(
            _points['cost'], _points['f1'],
            c=colors[color], alpha=alpha, s=marker_size,
            marker=_marker, edgecolors='white', linewidth=edge_width,
            **_extra,
        )

# Cost is shown on a logarithmic x-axis
ax.set_xscale('log')

# Both axis labels share the same text styling
_axis_label_style = dict(fontsize=16, fontweight='bold', labelpad=15)
ax.set_xlabel('Computational Cost (Log-scaled Mean Per-sample Latency in Seconds)',
              **_axis_label_style)
ax.set_ylabel('F1 Score (%)', **_axis_label_style)

# Title currently disabled:
# title_line1 = 'Performance vs. Computational Cost Trade-offs by Model Type'
# title_line2 = 'Across All Model Configurations'
# ax.set_title(f'{title_line1}\n{title_line2}',
#             fontsize=18, fontweight='bold', pad=25)

# Dashed grid drawn underneath the markers
ax.grid(True, alpha=0.4, linestyle='--', linewidth=0.8)
ax.set_axisbelow(True)

# Custom legend: a "Datasets:" heading with one coloured circle per dataset, a
# blank spacer row, then a "Model Types:" heading with a grey circle (base)
# and a grey triangle (reasoning).
def _legend_heading(text):
    """Return a legend entry that shows only text (no line is drawn)."""
    return Line2D([0], [0], color='black', linewidth=2,
                  label=text, linestyle='None')


def _legend_marker(shape, face_color, text):
    """Return a legend entry showing one filled marker with a white edge."""
    return Line2D([0], [0], marker=shape, color='w',
                  markerfacecolor=face_color,
                  markersize=10, label=text,
                  markeredgecolor='white', markeredgewidth=1.2)


legend_elements = [
    _legend_heading('Datasets:'),
    *[
        _legend_marker('o', colors[color_key], f'  {dataset_name}')
        for dataset_name, color_key in [
            ('IMDB (binary)', 'blue'),
            ('Amazon (5-class)', 'green'),
            ('GoEmotions (27-class)', 'red'),
        ]
    ],
    Line2D([0], [0], color='white', label=''),  # spacer row
    _legend_heading('Model Types:'),
    _legend_marker('o', '#666666', '  Base/Non-thinking'),
    _legend_marker('^', '#666666', '  Reasoning/Thinking'),
]

# Place the legend just inside the lower-right corner of the axes
legend = ax.legend(
    handles=legend_elements,
    loc='lower right',
    bbox_to_anchor=(0.98, 0.02),
    fontsize=13,
    frameon=True,
    fancybox=True,
    shadow=True,
    framealpha=0.95,
    borderpad=1.2,
    columnspacing=1.5,
    handletextpad=0.8,
)

# White box with a grey outline
_legend_frame = legend.get_frame()
_legend_frame.set_facecolor('white')
_legend_frame.set_edgecolor('gray')
_legend_frame.set_linewidth(1.5)

# Per-dataset views of the combined table
amazon_subset, imdb_subset, goemotions_subset = (
    all_data[all_data['dataset'] == _name]
    for _name in ('Amazon (5-class)', 'IMDB (binary)', 'GoEmotions (27-class)')
)

# Row with the highest F1 in each dataset
amazon_best, imdb_best, goemotions_best = (
    _frame.loc[_frame['f1'].idxmax()]
    for _frame in (amazon_subset, imdb_subset, goemotions_subset)
)

# Shared annotation styling. `annotation_props` is not passed to the annotate
# calls below; each call spells out its own colours.
annotation_fontsize = 12
annotation_props = dict(
    fontsize=annotation_fontsize,
    ha='center', va='center',
    bbox=dict(boxstyle='round,pad=0.5', alpha=0.9, edgecolor='black', linewidth=1.5),
    arrowprops=dict(arrowstyle='->', lw=2, color='black')
)

# One callout per dataset: heading, best row, text offset in points, box fill
# colour, and the colour shared by the box edge and the arrow.
for _heading, _best, _offset, _fill, _edge in (
    ('Best Amazon', amazon_best, (-180, 15), 'lightgreen', 'darkgreen'),
    ('Best IMDB', imdb_best, (80, -60), 'lightblue', 'darkblue'),
    ('Best GoEmotions', goemotions_best, (60, 40), 'lightcoral', 'darkred'),
):
    ax.annotate(
        f'{_heading}\n{_best["model"]}\n{_best["f1"]:.1f}%',
        xy=(_best['cost'], _best['f1']),
        xytext=_offset, textcoords='offset points',
        bbox=dict(boxstyle='round,pad=0.5', facecolor=_fill,
                  alpha=0.9, edgecolor=_edge, linewidth=1.5),
        arrowprops=dict(arrowstyle='->', lw=2, color=_edge),
        fontsize=annotation_fontsize, ha='center', va='center',
    )

# Overlay each dataset's efficiency frontier: walk the points from cheapest to
# most expensive and keep only those whose F1 beats everything kept so far.
for dataset, color in [('Amazon (5-class)', 'green'), ('IMDB (binary)', 'blue'), ('GoEmotions (27-class)', 'red')]:
    data_subset = all_data[all_data['dataset'] == dataset].sort_values('cost')

    # Collect index labels rather than rows; the running best starts at 0
    _frontier_labels = []
    _best_f1 = 0
    for _label, _f1 in zip(data_subset.index, data_subset['f1']):
        if _f1 > _best_f1:
            _frontier_labels.append(_label)
            _best_f1 = _f1

    if _frontier_labels:
        # all_data has a unique index, so .loc returns exactly these rows in
        # cost order.
        frontier_df = data_subset.loc[_frontier_labels]
        ax.plot(
            frontier_df['cost'], frontier_df['f1'],
            color=colors[color], linestyle='--', alpha=0.9,
            linewidth=3, zorder=10,
        )

# Fixed F1 window; the x-range is padded by 20% beyond the cheapest and the
# most expensive configuration.
ax.set_ylim(20, 102)
ax.set_xlim(left=all_data['cost'].min() * 0.8, right=all_data['cost'].max() * 1.2)

# Tick styling: heavier major ticks, lighter minor ticks
ax.tick_params(axis='both', which='major', labelsize=14, width=1.2, length=6)
ax.tick_params(axis='both', which='minor', width=0.8, length=4)

# Subtle off-white plot background
ax.set_facecolor('#fafafa')

# Dark grey frame around the axes
for spine in ax.spines.values():
    spine.set_linewidth(1.2)
    spine.set_color('#333333')

# Adjust layout to prevent clipping
plt.tight_layout(pad=2.0)

# The CSV loader tolerates a missing results/paper/ folder, so create it here
# instead of letting savefig fail after the whole figure has been built.
from pathlib import Path

_output_dir = Path('results/paper')
_output_dir.mkdir(parents=True, exist_ok=True)

# Save as PDF (600 dpi) and PNG (300 dpi)
for _fmt, _dpi in (('pdf', 600), ('png', 300)):
    plt.savefig(_output_dir / f'pareto_frontier_v2.{_fmt}',
                dpi=_dpi, bbox_inches='tight',
                facecolor='white', edgecolor='none',
                format=_fmt)

plt.show()

# Text report, part 1: banner and overall counts and ranges.
# The section emoji was mis-encoded in the printed text and is restored here.
print("=" * 60)
print("COMPREHENSIVE EFFICIENCY ANALYSIS REPORT")
print("=" * 60)

print("\n📊 DATASET OVERVIEW")
print(f"   Total configurations analyzed: {len(all_data):,}")
# Share of each model type among all configurations
for _type_label, _frame in (
    ("Base/Non-thinking", base_data),
    ("Reasoning/Thinking", reasoning_data),
):
    print(f"   {_type_label} models: {len(_frame):,} ({len(_frame)/len(all_data)*100:.1f}%)")
print(f"   Computational cost range: {all_data['cost'].min():.6f} - {all_data['cost'].max():.6f}s")
print(f"   Performance range: {all_data['f1'].min():.1f}% - {all_data['f1'].max():.1f}% F1")

# Text report, part 2: mean/spread of F1 and cost per model type, followed by
# the reasoning-vs-base comparison. Bullets, ± and × were mis-encoded in the
# printed text and are restored here.
print("\n📈 OVERALL MODEL TYPE COMPARISON")
for _heading, _frame in (("Base models", base_data), ("Reasoning models", reasoning_data)):
    if len(_frame) == 0:
        continue
    _f1_scores, _costs = _frame['f1'], _frame['cost']
    print(f"   {_heading}:")
    print(f"     • Average F1: {_f1_scores.mean():.1f}% (±{_f1_scores.std():.1f}%)")
    print(f"     • Average Cost: {_costs.mean():.6f}s (±{_costs.std():.6f}s)")
    print(f"     • Median Cost: {_costs.median():.6f}s")

if len(base_data) > 0 and len(reasoning_data) > 0:
    # Ratio of mean costs and difference of mean F1 (reasoning relative to base)
    cost_ratio = reasoning_data['cost'].mean() / base_data['cost'].mean()
    perf_diff = reasoning_data['f1'].mean() - base_data['f1'].mean()
    print("\n   📊 RELATIVE COMPARISON:")
    print(f"     • Cost ratio (Reasoning/Base): {cost_ratio:.1f}× slower")
    print(f"     • Performance difference: {perf_diff:+.1f} percentage points")
    print(f"     • Efficiency ratio: {perf_diff/cost_ratio:.2f} F1 points per cost unit")

# Text report, part 3: per-dataset ranges, base-vs-reasoning averages and the
# cheapest configuration among the top F1 scores. Emoji, arrows and × were
# mis-encoded in the printed text and are restored here.
print("\n🎯 DATASET-SPECIFIC DETAILED ANALYSIS")
print("-" * 60)

for dataset in ['Amazon (5-class)', 'IMDB (binary)', 'GoEmotions (27-class)']:
    subset = all_data[all_data['dataset'] == dataset]
    base_subset = base_data[base_data['dataset'] == dataset]
    reasoning_subset = reasoning_data[reasoning_data['dataset'] == dataset]

    print(f"\n📁 {dataset}:")
    print(f"   Total configurations: {len(subset):,}")
    print(f"   Model split: {len(base_subset)} Base, {len(reasoning_subset)} Reasoning")
    print(f"   F1 range: {subset['f1'].min():.1f}% → {subset['f1'].max():.1f}%")
    print(f"   Cost range: {subset['cost'].min():.6f}s → {subset['cost'].max():.6f}s")

    # The comparison is only meaningful when both model types are present
    if len(base_subset) > 0 and len(reasoning_subset) > 0:
        base_avg_f1 = base_subset['f1'].mean()
        base_avg_cost = base_subset['cost'].mean()
        reasoning_avg_f1 = reasoning_subset['f1'].mean()
        reasoning_avg_cost = reasoning_subset['cost'].mean()

        print(f"   📊 Base models: {base_avg_f1:.1f}% F1, {base_avg_cost:.6f}s cost")
        print(f"   🧠 Reasoning models: {reasoning_avg_f1:.1f}% F1, {reasoning_avg_cost:.6f}s cost")

        perf_advantage = reasoning_avg_f1 - base_avg_f1
        cost_ratio = reasoning_avg_cost / base_avg_cost
        print(f"   💡 Reasoning advantage: {perf_advantage:+.1f}% F1 at {cost_ratio:.1f}× cost")

    # "Top performers" are rows strictly above the 90th-percentile F1; the
    # cheapest of those is reported.
    high_performers = subset[subset['f1'] > subset['f1'].quantile(0.9)]
    if len(high_performers) > 0:
        most_efficient = high_performers.loc[high_performers['cost'].idxmin()]
        print("   🏆 Most efficient top performer:")
        print(f"       {most_efficient['model']} ({most_efficient['model_type']})")
        print(f"       {most_efficient['f1']:.1f}% F1, {most_efficient['cost']:.6f}s cost")

# Text report, part 4: composition of each dataset's efficiency frontier.
# Emoji and bullets were mis-encoded in the printed text and are restored here.
print("\n🚀 EFFICIENCY FRONTIER ANALYSIS")
print("-" * 60)

for dataset in ['Amazon (5-class)', 'IMDB (binary)', 'GoEmotions (27-class)']:
    data_subset = all_data[all_data['dataset'] == dataset].copy()
    data_subset = data_subset.sort_values('cost')

    # Walk from cheapest to most expensive and keep every point whose F1
    # beats all points kept so far (the running best starts at 0).
    frontier_points = []
    max_f1_so_far = 0

    for _, point in data_subset.iterrows():
        if point['f1'] > max_f1_so_far:
            frontier_points.append(point)
            max_f1_so_far = point['f1']

    if not frontier_points:
        continue

    frontier_df = pd.DataFrame(frontier_points)
    base_on_frontier = len(frontier_df[frontier_df['model_type'] == 'Base/Non-thinking'])
    reasoning_on_frontier = len(frontier_df[frontier_df['model_type'] == 'Reasoning/Thinking'])

    print(f"\n🎯 {dataset}:")
    print(f"   Pareto frontier points: {len(frontier_df)}")
    print(f"   Base models on frontier: {base_on_frontier} ({base_on_frontier/len(frontier_df)*100:.1f}%)")
    print(f"   Reasoning models on frontier: {reasoning_on_frontier} ({reasoning_on_frontier/len(frontier_df)*100:.1f}%)")

    # List every frontier point; the model type is abbreviated to its first
    # four characters.
    print("   🏅 Frontier models:")
    for _, model in frontier_df.iterrows():
        efficiency_score = model['f1'] / (model['cost'] * 1000)  # F1 per millisecond
        print(f"       • {model['model']} ({model['model_type'][:4]}): "
              f"{model['f1']:.1f}% F1, {model['cost']:.6f}s, "
              f"Efficiency: {efficiency_score:.1f}")

# Text report, part 5: rule-based recommendations and the saved-figure note.
# Emoji, bullets and × were mis-encoded in the printed text and are restored
# here; the PNG name now matches the file that is actually saved.
print("\n" + "=" * 60)
print("📋 KEY RECOMMENDATIONS FOR PRACTITIONERS")
print("=" * 60)

# Generate recommendations based on analysis
recommendations = []


def _reasoning_advantage(dataset_name):
    """Return mean reasoning F1 minus mean base F1 for one dataset.

    Returns 0 when either model type has no rows for that dataset.
    """
    reasoning_f1 = reasoning_data[reasoning_data['dataset'] == dataset_name]['f1']
    base_f1 = base_data[base_data['dataset'] == dataset_name]['f1']
    if len(reasoning_f1) == 0 or len(base_f1) == 0:
        return 0
    return reasoning_f1.mean() - base_f1.mean()


# Task complexity recommendations
amazon_reasoning_advantage = _reasoning_advantage('Amazon (5-class)')
imdb_reasoning_advantage = _reasoning_advantage('IMDB (binary)')
goemotions_reasoning_advantage = _reasoning_advantage('GoEmotions (27-class)')

if goemotions_reasoning_advantage > 2:
    recommendations.append("✅ Use reasoning models for complex multi-class emotion tasks (27+ classes)")
if imdb_reasoning_advantage < 0:
    recommendations.append("❌ Avoid reasoning models for simple binary classification tasks")
if amazon_reasoning_advantage < 1:
    recommendations.append("⚠️  Exercise caution with reasoning models for moderate complexity tasks")

# Cost efficiency recommendations (ratio defaults to 1 when a group is empty)
if len(base_data) > 0 and len(reasoning_data) > 0:
    overall_cost_ratio = reasoning_data['cost'].mean() / base_data['cost'].mean()
else:
    overall_cost_ratio = 1
if overall_cost_ratio > 5:
    recommendations.append(f"💰 Consider {overall_cost_ratio:.1f}× computational cost when deploying reasoning models")

# Performance recommendations
best_overall = all_data.loc[all_data['f1'].idxmax()]
recommendations.append(f"🏆 Best overall performer: {best_overall['model']} ({best_overall['f1']:.1f}% F1)")

for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")

print("\n📊 Figure saved as:")
print("   • pareto_frontier_v2.pdf (High-resolution)")
print("   • pareto_frontier_v2.png (Web-ready)")
print("=" * 60)

In [None]:
# Scratch arithmetic: 7 * 2 * 12 = 168. What the factors count is not stated
# in this notebook — TODO(review): label them or remove this cell.
7 * 2 * (5 + 4 + 3)

In [None]:
# Scratch arithmetic: 168 * 3 = 504. Presumably the 168 above taken once per
# dataset — TODO(review): confirm.
168 * 3

In [None]:
# Scratch arithmetic: (504 - 462) / 3 / 7 = 2.0. The origin of 462 is not
# shown in this notebook — TODO(review): document it or remove this cell.
(168 * 3 - 462) / 3 / 7