In [2]:
import os
import json
import glob
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from pathlib import Path
import warnings

In [9]:
# Results root. The path embeds the value of $USER ("default_user" when it is unset).
user = os.environ.get("USER", "default_user")
BASE_RESULTS_PATH = Path(f"/home/users/ntu/{user}/slimsc/prune/results")

# Memory figures (GiB) per model, read by calculate_mean_kv_cache_memory_gib.
# NOTE(review): the "* 2" / "* 4" factors presumably are the number of GPUs
# each model is served on - confirm.
MODEL_CONFIGS = {
    "R1-Distill-Qwen-14B": dict(
        total_gpu_memory_gib=39.56 * 2,
        model_weights_gib=15.41 * 2,
        activation_memory_gib=2.95,
    ),
    "QwQ-32B": dict(
        total_gpu_memory_gib=39.56 * 4,
        model_weights_gib=13.95 * 4,
        activation_memory_gib=10.15,
    ),
}

# Multiplied with total_gpu_memory_gib before weights and activations are subtracted
GPU_MEMORY_UTILIZATION = 0.9
# File inside each run directory that holds the precomputed mean cache usage value
PRECOMPUTED_CACHE_USAGE_FILENAME = "precomputed_mean_gpu_cache_perc.txt"

# Which model / dataset result folders are scanned
MODELS_TO_PROCESS = ["QwQ-32B", "R1-Distill-Qwen-14B"]
DATASETS_TO_PROCESS = ["aqua_rat", "aime", "gpqa_diamond"]

# Where the figures are written, and under which file names
PLOT_OUTPUT_DIR = "plots"
PLOT_FILENAME_TEMPLATE_MEMORY = "memory_combined.png"
PLOT_FILENAME_TEMPLATE_TOKENS = "tokens_combined.png"

# --- Color/Style setup ---
# Colour encodes the model: consecutive tab10 entries in the order listed
# (index 0 is blue, index 1 is orange). Append a model name to extend.
model_colors = {
    name: cm.tab10(palette_idx)
    for palette_idx, name in enumerate(["QwQ-32B", "R1-Distill-Qwen-14B"])
}

# Linestyle encodes the dataset: solid, dashed, dotted.
dataset_linestyles = dict(
    aqua_rat="-",
    aime="--",
    gpqa_diamond=":",
)

# Marker encodes the model as well: circle and square.
model_markers = dict([
    ("QwQ-32B", "o"),
    ("R1-Distill-Qwen-14B", "s"),
])



In [10]:
def get_precomputed_mean_gpu_cache_usage_perc(run_path: Path):
    """
    Loads the mean GPU cache usage value that was precomputed for one run.

    The value is read from the file named PRECOMPUTED_CACHE_USAGE_FILENAME
    inside `run_path`; surrounding whitespace is stripped before parsing.

    Args:
        run_path: Directory of a single run.

    Returns:
        The file content parsed as a float, or None when the file is
        missing, cannot be read, or does not contain a valid float. A warning
        is issued in each of the None cases.
    """
    cache_file = run_path / PRECOMPUTED_CACHE_USAGE_FILENAME

    if cache_file.exists():
        try:
            with open(cache_file, 'r') as handle:
                return float(handle.read().strip())
        except (IOError, ValueError) as e:
            warnings.warn(f"Error reading or parsing precomputed file {cache_file}: {e}")
            return None

    # Reached only when the file does not exist
    warnings.warn(f"Precomputed file '{PRECOMPUTED_CACHE_USAGE_FILENAME}' not found in {run_path}. "
                  f"Please run the precomputation script.")
    return None


def get_total_time_for_run(run_path: Path):
    """
    Calculates total processing time from aggregated_metrics.json.

    The total is metrics.mean_processing_duration_sec_per_question multiplied
    by metrics.num_questions_processed.

    Args:
        run_path: Directory of a single run, expected to contain
            aggregated_metrics.json.

    Returns:
        The total time as a float, or None when the file is missing, cannot
        be read, is not valid JSON, or lacks / has malformed values for the
        two keys above. A warning is issued in each of the None cases.
    """
    metrics_file = run_path / "aggregated_metrics.json"
    if not metrics_file.exists():
        warnings.warn(f"aggregated_metrics.json not found in {run_path}")
        return None

    try:
        with open(metrics_file, 'r') as f:
            metrics_data = json.load(f)

        metrics = metrics_data["metrics"]
        mean_duration = float(metrics["mean_processing_duration_sec_per_question"])
        num_questions = int(metrics["num_questions_processed"])
    except (OSError, KeyError, ValueError, TypeError) as e:
        # OSError: the path exists but cannot be opened/read (directory, permissions,
        # removed after the exists() check). ValueError also covers json.JSONDecodeError.
        warnings.warn(f"Error processing {metrics_file}: {e}")
        return None

    return mean_duration * num_questions


def calculate_mean_kv_cache_memory_gib(model_name: str, mean_gpu_cache_usage_perc: float):
    """
    Converts a mean cache usage value into mean KV cache memory in GiB.

    The memory budget is
        GPU_MEMORY_UTILIZATION * total_gpu_memory_gib
        - model_weights_gib - activation_memory_gib
    using the MODEL_CONFIGS entry for `model_name`. A negative budget is
    clamped to 0 with a warning. The result is budget * mean_gpu_cache_usage_perc,
    so the usage value acts as a plain multiplier.

    Args:
        model_name: Key into MODEL_CONFIGS.
        mean_gpu_cache_usage_perc: Mean cache usage value, or None.

    Returns:
        The mean KV cache memory, or None when the model is unknown (a
        warning is issued) or the usage value is None.
    """
    if model_name not in MODEL_CONFIGS:
        warnings.warn(f"Model {model_name} not found in MODEL_CONFIGS.")
        return None
    # Callers may pass None when the precomputed value was unavailable
    if mean_gpu_cache_usage_perc is None:
        return None

    model_cfg = MODEL_CONFIGS[model_name]

    usable_gib = GPU_MEMORY_UTILIZATION * model_cfg["total_gpu_memory_gib"]
    kv_budget_gib = usable_gib - model_cfg["model_weights_gib"] - model_cfg["activation_memory_gib"]

    if kv_budget_gib < 0:
        warnings.warn(f"Calculated torch_available_memory is negative for {model_name}. Check config.")
        kv_budget_gib = 0  # never report negative memory

    return kv_budget_gib * mean_gpu_cache_usage_perc

In [11]:
def _read_time_and_tokens(metrics_file: Path):
    """
    Reads total processing time and mean completion tokens from one metrics file.

    Args:
        metrics_file: Path to a run's aggregated_metrics.json.

    Returns:
        A tuple (total_time_s, mean_completion_tokens). total_time_s is
        mean_processing_duration_sec_per_question * num_questions_processed;
        mean_completion_tokens is mean_total_completion_tokens_per_question
        as a float. An entry is None when it could not be obtained, and a
        warning is issued. The time is read first, so a failure there also
        leaves the token count as None.
    """
    total_time_s = None
    mean_completion_tokens = None

    if not metrics_file.exists():
        warnings.warn(f"aggregated_metrics.json not found in {metrics_file.parent}")
        return total_time_s, mean_completion_tokens

    try:
        with open(metrics_file, 'r') as f:
            metrics_data = json.load(f)

        metrics = metrics_data["metrics"]

        # Time metrics
        mean_duration = float(metrics["mean_processing_duration_sec_per_question"])
        num_questions = int(metrics["num_questions_processed"])
        total_time_s = mean_duration * num_questions

        # Completion tokens metric
        mean_completion_tokens = float(metrics["mean_total_completion_tokens_per_question"])
    except (OSError, KeyError, ValueError, TypeError) as e:
        # OSError: the file exists but cannot be opened/read; without it a single
        # unreadable file would abort the whole aggregation.
        # ValueError also covers json.JSONDecodeError.
        warnings.warn(f"Error processing {metrics_file}: {e}")

    return total_time_s, mean_completion_tokens


all_results = []

# Run directories are named "sc_<N>_control"; <N> is captured and stored as sc_i.
# NOTE(review): the pattern is only anchored at the start of the name, so a
# directory such as "sc_8_control_old" is accepted too - confirm this is intended.
run_dir_pattern = re.compile(r"sc_(\d+)_control")

for model_name in MODELS_TO_PROCESS:
    for dataset_name in DATASETS_TO_PROCESS:
        dataset_path = BASE_RESULTS_PATH / model_name / dataset_name
        if not dataset_path.is_dir():
            warnings.warn(f"Dataset path not found: {dataset_path}")
            continue

        for run_dir in dataset_path.iterdir():
            # One match both selects the directory and yields sc_i
            match = run_dir_pattern.match(run_dir.name)
            if match is None or not run_dir.is_dir():
                continue

            run_name = run_dir.name
            sc_i = int(match.group(1))

            print(f"Processing: {model_name} / {dataset_name} / {run_name} (sc_i={sc_i})")

            # 1. Get the precomputed KV cache usage percentage
            mean_gpu_cache_usage_perc = get_precomputed_mean_gpu_cache_usage_perc(run_dir)
            if mean_gpu_cache_usage_perc is None:
                print(f"  Skipping {run_name} due to missing or invalid precomputed KV cache usage percentage.")
                continue  # Skip this run entirely if cache perc is missing

            # 2. Get metrics from aggregated_metrics.json.
            # The run is kept even if this fails; the affected fields stay None.
            metrics_file = run_dir / "aggregated_metrics.json"
            total_time_s, mean_completion_tokens = _read_time_and_tokens(metrics_file)

            # 3. Calculate KV memory and cost ONLY if total_time_s was successfully extracted
            mean_kv_mem_gib = None
            cost = None
            if total_time_s is not None:
                mean_kv_mem_gib = calculate_mean_kv_cache_memory_gib(model_name, mean_gpu_cache_usage_perc)
                if mean_kv_mem_gib is not None:
                    cost = mean_kv_mem_gib * total_time_s
                else:
                    warnings.warn(f"KV cache memory calculation failed for {model_name} with perc {mean_gpu_cache_usage_perc}")

            # One record per run; fields that could not be computed stay None
            all_results.append({
                "model_name": model_name,
                "dataset_name": dataset_name,
                "sc_i": sc_i,
                "mean_gpu_cache_usage_perc": mean_gpu_cache_usage_perc,
                "total_time_s": total_time_s,  # May be None
                "mean_kv_cache_memory_gib": mean_kv_mem_gib,  # May be None
                "cost_kv_mem_gib_total_time_s": cost,  # May be None
                "mean_completion_tokens": mean_completion_tokens  # May be None
            })

# Gather every per-run record into a single table
results_df = pd.DataFrame(all_results)

# Rows with missing values are deliberately kept here:
#   - the cost plot needs total_time_s and mean_kv_cache_memory_gib,
#   - the tokens plot needs mean_completion_tokens,
# and each plotting loop drops the rows it cannot use.

# Order rows by model, dataset and chain count (nothing to sort when empty)
if not results_df.empty:
    sort_keys = ["model_name", "dataset_name", "sc_i"]
    results_df = results_df.sort_values(by=sort_keys).reset_index(drop=True)

print("\nProcessed Data:")
print("No data was processed successfully." if results_df.empty else results_df)

Processing: QwQ-32B / aqua_rat / sc_32_control (sc_i=32)
Processing: QwQ-32B / aqua_rat / sc_8_control (sc_i=8)
Processing: QwQ-32B / aqua_rat / sc_2_control (sc_i=2)
Processing: QwQ-32B / aqua_rat / sc_16_control (sc_i=16)
Processing: QwQ-32B / aqua_rat / sc_1_control (sc_i=1)
Processing: QwQ-32B / aime / sc_8_control (sc_i=8)
Processing: QwQ-32B / aime / sc_2_control (sc_i=2)
Processing: QwQ-32B / aime / sc_4_control (sc_i=4)
Processing: QwQ-32B / aime / sc_16_control (sc_i=16)
Processing: QwQ-32B / aime / sc_1_control (sc_i=1)
Processing: QwQ-32B / gpqa_diamond / sc_32_control (sc_i=32)
Processing: QwQ-32B / gpqa_diamond / sc_8_control (sc_i=8)
Processing: QwQ-32B / gpqa_diamond / sc_2_control (sc_i=2)
Processing: QwQ-32B / gpqa_diamond / sc_16_control (sc_i=16)
Processing: QwQ-32B / gpqa_diamond / sc_1_control (sc_i=1)
Processing: R1-Distill-Qwen-14B / aqua_rat / sc_32_control (sc_i=32)
Processing: R1-Distill-Qwen-14B / aqua_rat / sc_64_control (sc_i=64)
Processing: R1-Distill-Qwen

In [19]:
def _save_combined_plot(df, value_column, ylabel, save_path, figsize, xticks, y_divisor=1):
    """
    Draws `value_column` against sc_i, one line per (model, dataset), and saves the figure.

    Colour and marker are looked up by model name (model_colors, model_markers)
    and linestyle by dataset name (dataset_linestyles); unknown names fall back
    to 'gray', 'x' and '-'.

    Args:
        df: Table with model_name, dataset_name, sc_i and `value_column` columns.
        value_column: Column plotted on the y axis; rows where it is missing are skipped.
        ylabel: Label of the y axis.
        save_path: Destination file of the figure.
        figsize: (width, height) passed to plt.subplots.
        xticks: Tick positions for the x axis; left at the default when empty.
        y_divisor: Values are divided by this before plotting (1 leaves them unchanged).

    A failure while saving is reported as a warning, not raised. The
    figure is closed in all cases, including when plotting itself raises.
    """
    fig, ax = plt.subplots(figsize=figsize)
    try:
        for (model_name, dataset_name), group in df.groupby(['model_name', 'dataset_name']):
            # Keep only rows that have this metric, ordered by sc_i so the line is drawn left to right
            valid_rows = group[group[value_column].notna()].sort_values(by="sc_i")
            if valid_rows.empty:
                continue

            y_values = valid_rows[value_column]
            if y_divisor != 1:
                y_values = y_values / y_divisor

            ax.plot(valid_rows['sc_i'],
                    y_values,
                    label=f"{model_name} - {dataset_name}",
                    color=model_colors.get(model_name, 'gray'),
                    linestyle=dataset_linestyles.get(dataset_name, '-'),
                    marker=model_markers.get(model_name, 'x'),
                    markersize=4)

        ax.set_xlabel("Number of Chains")
        ax.set_ylabel(ylabel)
        ax.grid(True, which="both", linestyle="--", linewidth=0.5)

        # Legend sits inside the axes; matplotlib picks the location
        ax.legend(title="Model - Dataset", loc='best', fontsize='small')

        if len(xticks) > 0:
            ax.set_xticks(xticks)

        fig.tight_layout()

        try:
            fig.savefig(save_path, dpi=300)
            print(f"  Combined plot saved to {save_path}")
        except Exception as e:
            warnings.warn(f"Error saving combined plot {save_path}: {e}")
    finally:
        plt.close(fig)


if results_df.empty:
    print("No data to plot.")
else:
    # Ensure plot output directory exists
    plot_output_path = Path(PLOT_OUTPUT_DIR)
    plot_output_path.mkdir(parents=True, exist_ok=True)

    # Not read by this cell; kept in case other cells rely on them
    unique_models = sorted(results_df['model_name'].unique())
    unique_datasets = sorted(results_df['dataset_name'].unique())

    # Both plots share the same x-ticks: every sc_i value present in the data
    unique_sc_i_all = sorted(results_df['sc_i'].unique())

    # --- Plot 1: KV Cache Memory-Time Cost ---
    print("\nGenerating combined plot for KV Cache Memory-Time Cost...")
    _save_combined_plot(
        results_df,
        value_column='cost_kv_mem_gib_total_time_s',
        ylabel="Mean KV Cache Memory (GiB) * Total Time (s)",
        save_path=plot_output_path / PLOT_FILENAME_TEMPLATE_MEMORY,
        figsize=(7, 5),
        xticks=unique_sc_i_all,
    )

    # --- Plot 2: Mean Total Completion Tokens ---
    print("\nGenerating combined plot for Mean Total Completion Tokens...")
    # NOTE(review): this figure is 5x4 while the memory plot is 7x5, although an
    # earlier comment said the sizes were consistent - confirm which is intended.
    _save_combined_plot(
        results_df,
        value_column='mean_completion_tokens',
        ylabel="Tokens (Thousands)",
        save_path=plot_output_path / PLOT_FILENAME_TEMPLATE_TOKENS,
        figsize=(5, 4),
        xticks=unique_sc_i_all,
        y_divisor=1000,  # y axis is in thousands of tokens
    )

    print("\nAll combined plots generated and saved.")


Generating combined plot for KV Cache Memory-Time Cost...
  Combined plot saved to plots/memory_combined.png

Generating combined plot for Mean Total Completion Tokens...
  Combined plot saved to plots/tokens_combined.png

All combined plots generated and saved.
