In [None]:
import numpy as np

# side_by_side_bars.py
import matplotlib.pyplot as plt

def plot_side_by_side(labels, data, series_labels=None, colors=None,
                      total_width=0.8, figsize=(8, 4), ylabel=None, title=None,
                      annotate=False, ylim=None):
    """
    Draw a grouped ("side-by-side") bar chart and display it with plt.show().

    labels: list of group labels (length G), used as the x tick labels
    data: sequence of length S, each item is a sequence of length G (S series);
          a flat sequence of length G is treated as a single series
    series_labels: list of at least S labels for the legend
                   (defaults to "Series 1", "Series 2", ...)
    colors: list of at least S colors (None lets matplotlib pick them)
    total_width: fraction of group width occupied by bars (0-1); it is split
                 equally between the S bars of a group
    figsize: figure size passed to plt.subplots
    ylabel, title: optional y-axis label and title; skipped when None or empty
    annotate: if True, write each value as text above its bar
    ylim: optional y-axis limits passed to ax.set_ylim

    Returns None; the figure is shown, not returned.

    Raises ValueError if data is not 1-D/2-D, contains no series, if
    len(labels) != G, or if series_labels / colors have fewer than S entries.
    """
    data = np.asarray(data)
    if data.ndim == 1:
        # A flat sequence is a single series.
        data = data[np.newaxis, :]
    if data.ndim != 2:
        raise ValueError(
            f"data must be 1-D or 2-D (series x groups), got {data.ndim} dimensions")

    S, G = data.shape
    # Validate up front so a bad call fails with a clear message instead of a
    # ZeroDivisionError / IndexError halfway through drawing.
    if S == 0:
        raise ValueError("data must contain at least one series")
    if len(labels) != G:
        raise ValueError(f"expected {G} labels (one per group), got {len(labels)}")
    if series_labels is not None and len(series_labels) < S:
        raise ValueError(
            f"expected at least {S} series_labels, got {len(series_labels)}")
    if colors is not None and len(colors) < S:
        raise ValueError(f"expected at least {S} colors, got {len(colors)}")

    indices = np.arange(G)
    bar_w = total_width / S
    # center the group around each index
    offsets = (np.arange(S) - (S - 1) / 2) * bar_w

    fig, ax = plt.subplots(figsize=figsize)
    for i, (offset, values) in enumerate(zip(offsets, data)):
        ax.bar(indices + offset, values, width=bar_w,
               label=(series_labels[i] if series_labels is not None else f"Series {i+1}"),
               color=(colors[i] if colors is not None else None))

    ax.set_xticks(indices)
    ax.set_xticklabels(labels)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    # Compare against None: truth-testing a numpy array raises ValueError.
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.legend()

    if annotate:
        for offset, values in zip(offsets, data):
            for x, value in zip(indices + offset, values):
                # va='bottom' anchors the text's lower edge at the bar top.
                ax.text(x, value, str(value),
                        ha='center', va='bottom', fontsize=8, rotation=0)

    fig.tight_layout()
    plt.show()


if __name__ == "__main__":
    # Example usage: two prompt settings (the groups on the x-axis), each
    # compared across three system prompts (the series / legend entries).
    groups = ["No ICL System Prompt", "ICL System Prompt"]
    # One row per series, in the same order as series_labels below.
    # NOTE(review): the last two rows are identical — confirm this is real
    # data and not a copy-paste leftover.
    series = [
        [20, 34],  # Initial System Prompt
        [25, 32],  # Opt System Prompt 1
        [25, 32],  # Opt System Prompt 2
    ]
    plot_side_by_side(
        labels=groups,
        data=series,
        series_labels=["Initial System Prompt", "Opt System Prompt 1", "Opt System Prompt 2"],
        colors=["#4C72B0", "#55A868", "#C44E52"],
        total_width=0.2,
        ylabel="Average Accuracy (%)",
        title="Answer Accuracy",
        annotate=True,
    )

In [10]:
import matplotlib.pyplot as plt
import numpy as np

# Data for all three generators (SuperGPQA).
# Each generator maps to:
#   'generator_accuracy' - drawn as the dashed "Generator" baseline
#   'mentors'            - x-axis tick labels
#   one entry per method - that method's accuracy (%) for each mentor, in the
#                          same order as 'mentors'
data = {
    'Gemma-3-4B-PT': {
        'generator_accuracy': 13.80,
        'mentors': ['Qwen3-14B', 'Qwen3-32B', 'R1-Distilled-\nLlama-70B'],
        'Average Decoding': [9.60, 3.60, 18.20],
        'Nudging\n(γ=0.40)': [8.60, 9.20, 11.60],
        'CoSD\n(α=0.50, β=0.50)': [6.40, 2.60, 11.80],
        'R-Stitch\n(τ=0.03)': [11.20, 9.80, 16.60],
        'MENTORCOLLAB-FREE\n(ρ=25%)': [12.60, 12.40, 15.20],
        'MENTORCOLLAB-MLP\n(ρ=25%)': [15.20, 14.40, 14.80]
    },
    'Llama3.1-8B': {
        'generator_accuracy': 18.00,
        'mentors': ['Qwen3-14B', 'Qwen3-32B', 'R1-Distilled-\nLlama-70B'],
        'Average Decoding': [8.80, 2.80, 21.00],
        'Nudging\n(γ=0.40)': [11.40, 9.00, 10.60],
        'CoSD\n(α=0.50, β=0.50)': [7.80, 2.80, 16.40],
        'R-Stitch\n(τ=0.03)': [10.80, 10.20, 16.60],
        'MENTORCOLLAB-FREE\n(ρ=25%)': [17.00, 16.60, 16.80],
        'MENTORCOLLAB-MLP\n(ρ=25%)': [17.00, 17.40, 15.40]
    },
    'Qwen3-8B-Base': {
        'generator_accuracy': 15.40,
        'mentors': ['Qwen3-14B', 'Qwen3-32B', 'R1-Distilled-\nLlama-70B'],
        'Average Decoding': [13.20, 13.00, 18.60],
        'Nudging\n(γ=0.40)': [11.40, 10.00, 14.40],
        'CoSD\n(α=0.50, β=0.50)': [15.40, 15.20, 12.80],
        'R-Stitch\n(τ=0.03)': [10.20, 10.40, 15.80],
        'MENTORCOLLAB-FREE\n(ρ=25%)': [17.20, 16.20, 18.60],
        'MENTORCOLLAB-MLP\n(ρ=25%)': [16.80, 16.80, 17.60]
    }
}

# Colors for different methods (keyed by the same method names as `data`)
colors = {
    'Average Decoding': '#8B4513',
    'Nudging\n(γ=0.40)': '#FF6B6B',
    'CoSD\n(α=0.50, β=0.50)': '#4ECDC4',
    'R-Stitch\n(τ=0.03)': '#95E1D3',
    'MENTORCOLLAB-FREE\n(ρ=25%)': '#F38181',
    'MENTORCOLLAB-MLP\n(ρ=25%)': '#AA96DA'
}

# Methods list: fixes the left-to-right bar order inside each mentor group
# and the order of the legend entries.
methods = ['Average Decoding', 'Nudging\n(γ=0.40)', 'CoSD\n(α=0.50, β=0.50)',
           'R-Stitch\n(τ=0.03)', 'MENTORCOLLAB-FREE\n(ρ=25%)', 'MENTORCOLLAB-MLP\n(ρ=25%)']

# Create separate PDF for each generator.
# Iterating over `data` itself (dicts keep insertion order) avoids keeping a
# second, hand-maintained list of generator names in sync with the keys.
# NOTE(review): this loop mirrors the one in the commonsense cell below;
# consider factoring both into one shared helper.
for generator_name, plot_data in data.items():
    n_mentors = len(plot_data['mentors'])
    n_methods = len(methods)

    # Fail fast on a mistyped table: every method needs one value per mentor.
    for method in methods:
        assert len(plot_data[method]) == n_mentors, (
            f"{generator_name} / {method!r}: expected {n_mentors} values, "
            f"got {len(plot_data[method])}")

    fig, ax = plt.subplots(figsize=(12, 6))

    # Bar width and positions: groups sit 1 apart on the x-axis, so with six
    # methods 6 * 0.13 = 0.78 leaves a gap between neighbouring groups.
    bar_width = 0.13
    x_positions = np.arange(n_mentors)

    # Plot bars for each method, centring the whole group on its tick
    for i, method in enumerate(methods):
        offset = (i - n_methods/2 + 0.5) * bar_width
        ax.bar(x_positions + offset, plot_data[method], bar_width,
               label=method.replace('\n', ' '),  # legend entries on one line
               color=colors[method],
               edgecolor='black',
               linewidth=0.5)

    # Draw generator baseline (dashed grey line); zorder=0 keeps it behind the bars
    ax.axhline(y=plot_data['generator_accuracy'], color='grey',
               linestyle='--', linewidth=2.5,
               label=f'Generator ({plot_data["generator_accuracy"]:.1f}%)',
               zorder=0)

    # Formatting
    ax.set_xlabel('Mentor Model', fontsize=14, fontweight='bold')
    ax.set_ylabel('Accuracy (%)', fontsize=14, fontweight='bold')
    ax.set_title(f'Generator: {generator_name} (SuperGPQA)', fontsize=16, fontweight='bold', pad=20)

    ax.set_xticks(x_positions)
    ax.set_xticklabels(plot_data['mentors'], fontsize=11)

    # Set y-axis limits based on the generator
    # NOTE(review): every Qwen3-8B-Base value in this cell is below 20, so the
    # 0-50 range leaves most of the axis empty — confirm this is intentional.
    if generator_name == 'Qwen3-8B-Base':
        ax.set_ylim(0, 50)
        ax.set_yticks(np.arange(0, 51, 10))
    else:
        ax.set_ylim(0, 25)
        ax.set_yticks(np.arange(0, 26, 5))

    ax.grid(True, alpha=0.3, linestyle='-', linewidth=0.5, axis='y')
    # Legend is anchored outside the axes, to the right of the plot area
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=9,
              framealpha=0.95, edgecolor='gray', fancybox=True, ncol=1)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.tight_layout()

    # Save to separate PDF; '.' and '-' in the generator name become '_'
    name_slug = generator_name.replace(".", "_").replace("-", "_").lower()
    filename = f'mentor_comparison_SuperGPQA_{name_slug}.pdf'
    # Address the figure explicitly instead of relying on pyplot's "current
    # figure", and close exactly this figure so figures do not pile up.
    fig.savefig(filename, format='pdf', bbox_inches='tight', dpi=300)
    plt.close(fig)

    print(f"Plot saved to '{filename}'")

print("\nAll plots saved to separate PDF files")

Plot saved to 'mentor_comparison_SuperGPQA_gemma_3_4b_pt.pdf'
Plot saved to 'mentor_comparison_SuperGPQA_llama3_1_8b.pdf'
Plot saved to 'mentor_comparison_SuperGPQA_qwen3_8b_base.pdf'

All plots saved to separate PDF files


In [3]:
import matplotlib.pyplot as plt
import numpy as np

# Data for all three generators (Com2-hard Intervention; files are saved with
# the "commonsense" prefix).
# Each generator maps to:
#   'generator_accuracy' - drawn as the dashed "Generator" baseline
#   'mentors'            - x-axis tick labels
#   one entry per method - that method's accuracy (%) for each mentor, in the
#                          same order as 'mentors'
data = {
    'Gemma-3-4B-PT': {
        'generator_accuracy': 24.07,
        'mentors': ['Qwen3-14B', 'Qwen3-32B', 'R1-Distilled-\nLlama-70B'],
        'Average Decoding': [3.73, 7.47, 5.39],
        'Nudging\n(γ=0.40)': [2.90, 9.96, 7.47],
        'CoSD\n(α=0.50, β=0.50)': [0.83, 3.73, 2.49],
        'R-Stitch\n(τ=0.03)': [4.98, 7.88, 1.66],
        'MENTORCOLLAB-FREE\n(ρ=25%)': [23.24, 21.99, 19.09],
        'MENTORCOLLAB-MLP\n(ρ=25%)': [25.73, 23.24, 26.97]
    },
    'Llama3.1-8B': {
        'generator_accuracy': 30.29,
        'mentors': ['Qwen3-14B', 'Qwen3-32B', 'R1-Distilled-\nLlama-70B'],
        'Average Decoding': [3.73, 9.96, 6.64],
        'Nudging\n(γ=0.40)': [2.49, 11.62, 4.98],
        'CoSD\n(α=0.50, β=0.50)': [0.83, 3.73, 2.90],
        'R-Stitch\n(τ=0.03)': [2.49, 7.88, 2.07],
        'MENTORCOLLAB-FREE\n(ρ=25%)': [31.54, 30.71, 32.78],
        'MENTORCOLLAB-MLP\n(ρ=25%)': [33.61, 30.29, 26.56]
    },
    'Qwen3-8B-Base': {
        'generator_accuracy': 54.77,
        'mentors': ['Qwen3-14B', 'Qwen3-32B', 'R1-Distilled-\nLlama-70B'],
        'Average Decoding': [3.73, 12.45, 35.27],
        'Nudging\n(γ=0.40)': [18.26, 20.89, 22.41],
        'CoSD\n(α=0.50, β=0.50)': [7.05, 15.77, 15.77],
        'R-Stitch\n(τ=0.03)': [2.90, 9.54, 3.32],
        'MENTORCOLLAB-FREE\n(ρ=25%)': [51.87, 49.79, 54.77],
        'MENTORCOLLAB-MLP\n(ρ=25%)': [42.32, 49.79, 48.55]
    }
}

# Colors for different methods (keyed by the same method names as `data`)
colors = {
    'Average Decoding': '#8B4513',
    'Nudging\n(γ=0.40)': '#FF6B6B',
    'CoSD\n(α=0.50, β=0.50)': '#4ECDC4',
    'R-Stitch\n(τ=0.03)': '#95E1D3',
    'MENTORCOLLAB-FREE\n(ρ=25%)': '#F38181',
    'MENTORCOLLAB-MLP\n(ρ=25%)': '#AA96DA'
}

# Methods list: fixes the left-to-right bar order inside each mentor group
# and the order of the legend entries.
methods = ['Average Decoding', 'Nudging\n(γ=0.40)', 'CoSD\n(α=0.50, β=0.50)',
           'R-Stitch\n(τ=0.03)', 'MENTORCOLLAB-FREE\n(ρ=25%)', 'MENTORCOLLAB-MLP\n(ρ=25%)']

# Create separate PDF for each generator.
# Iterating over `data` itself (dicts keep insertion order) avoids keeping a
# second, hand-maintained list of generator names in sync with the keys.
# NOTE(review): this loop mirrors the one in the SuperGPQA cell above;
# consider factoring both into one shared helper.
for generator_name, plot_data in data.items():
    n_mentors = len(plot_data['mentors'])
    n_methods = len(methods)

    # Fail fast on a mistyped table: every method needs one value per mentor.
    for method in methods:
        assert len(plot_data[method]) == n_mentors, (
            f"{generator_name} / {method!r}: expected {n_mentors} values, "
            f"got {len(plot_data[method])}")

    fig, ax = plt.subplots(figsize=(12, 6))

    # Bar width and positions: groups sit 1 apart on the x-axis, so with six
    # methods 6 * 0.13 = 0.78 leaves a gap between neighbouring groups.
    bar_width = 0.13
    x_positions = np.arange(n_mentors)

    # Plot bars for each method, centring the whole group on its tick
    for i, method in enumerate(methods):
        offset = (i - n_methods/2 + 0.5) * bar_width
        ax.bar(x_positions + offset, plot_data[method], bar_width,
               label=method.replace('\n', ' '),  # legend entries on one line
               color=colors[method],
               edgecolor='black',
               linewidth=0.5)

    # Draw generator baseline (dashed grey line); zorder=0 keeps it behind the bars
    ax.axhline(y=plot_data['generator_accuracy'], color='grey',
               linestyle='--', linewidth=2.5,
               label=f'Generator ({plot_data["generator_accuracy"]:.1f}%)',
               zorder=0)

    # Formatting
    ax.set_xlabel('Mentor Model', fontsize=14, fontweight='bold')
    ax.set_ylabel('Accuracy (%)', fontsize=14, fontweight='bold')
    ax.set_title(f'Generator: {generator_name} (Com2-hard Intervention)', fontsize=16, fontweight='bold', pad=20)

    ax.set_xticks(x_positions)
    ax.set_xticklabels(plot_data['mentors'], fontsize=11)

    # Set y-axis limits based on the generator: Qwen3-8B-Base reaches the
    # mid-50s here, the other two generators stay below 40.
    if generator_name == 'Qwen3-8B-Base':
        ax.set_ylim(0, 60)
        ax.set_yticks(np.arange(0, 61, 10))
    else:
        ax.set_ylim(0, 40)
        ax.set_yticks(np.arange(0, 41, 5))

    ax.grid(True, alpha=0.3, linestyle='-', linewidth=0.5, axis='y')
    # Legend is anchored outside the axes, to the right of the plot area
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=9,
              framealpha=0.95, edgecolor='gray', fancybox=True, ncol=1)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.tight_layout()

    # Save to separate PDF; '.' and '-' in the generator name become '_'
    name_slug = generator_name.replace(".", "_").replace("-", "_").lower()
    filename = f'mentor_comparison_commonsense_{name_slug}.pdf'
    # Address the figure explicitly instead of relying on pyplot's "current
    # figure", and close exactly this figure so figures do not pile up.
    fig.savefig(filename, format='pdf', bbox_inches='tight', dpi=300)
    plt.close(fig)

    print(f"Plot saved to '{filename}'")

print("\nAll plots saved to separate PDF files")

Plot saved to 'mentor_comparison_commonsense_gemma_3_4b_pt.pdf'
Plot saved to 'mentor_comparison_commonsense_llama3_1_8b.pdf'
Plot saved to 'mentor_comparison_commonsense_qwen3_8b_base.pdf'

All plots saved to separate PDF files
