# Final Model Comparison for Thesis
This notebook generates all comparison tables and figures needed for the thesis.

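**Run this AFTER running both the DistilGPT2 and Qwen notebooks.** This notebook reads their evaluation results from `results/eval_results_distilgpt.json` and `results/eval_results_qwen15.json`.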

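**Generates:**
- Table 2: Model Comparison (`results/table2_model_comparison.csv`)
- Figure 4: Model Performance Bar Chart (plus Figure 4b: F1-score only)
- Figure 5: Per-CWE Comparison (data also saved as `results/table3_per_cwe_comparison_all_models.csv`)
- Figure 7: Efficiency vs Performance Trade-off
- Figure 8: Model Size vs Performance
- Statistical summary of per-CWE F1-scores (`results/statistical_summary.csv`)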

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

# Global plot styling: seaborn grid theme, 300 dpi on screen and on disk, base font size 10.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.size': 10,
})

Imports successful!


In [None]:
# Load all model results

def load_results(path, model_name):
    """Load one model's evaluation results from a JSON file.

    Args:
        path: Location of the JSON file to read.
        model_name: Fallback display name. It is stored under the
            ``'model_name'`` key only when the file does not already
            contain that key.

    Returns:
        The decoded JSON content, with a ``'model_name'`` entry present.

    Raises:
        FileNotFoundError: If ``path`` does not exist (propagated from ``open``).
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        results = json.load(f)
    # A name recorded in the file takes precedence over the caller-supplied label.
    if 'model_name' not in results:
        results['model_name'] = model_name
    return results

# Result files to compare, each paired with a fallback display name
model_files = [
    ('results/eval_results_distilgpt.json', 'DistilGPT2'),
    ('results/eval_results_qwen15.json', 'Qwen2.5-Coder-1.5B'),
]
all_results = []

# A missing file is reported and skipped so the remaining models can still be compared.
for json_path, display_name in model_files:
    try:
        loaded = load_results(json_path, display_name)
    except FileNotFoundError:
        print(f"{display_name} results not found at {json_path}")
    else:
        all_results.append(loaded)
        print(f"✓ Loaded {display_name}")

print(f"\nLoaded {len(all_results)} models for comparison")

In [None]:
# Model Comparison

# Table column -> key inside results['metrics']['test']
test_metric_columns = (
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision_macro'),
    ('Recall', 'recall_macro'),
    ('F1-Score', 'f1_macro'),
    ('Inference (ms)', 'inference_time_ms'),
)

# One row per model: size information, headline test metrics, then training time.
comparison_data = []
for entry in all_results:
    row = {
        'Model': entry['model_name'],
        # NOTE(review): 'total_parameters' may not be the key the result files use
        # (the sibling key is 'trainable_params') - confirm against the JSON.
        'Parameters': entry.get('total_parameters', 'N/A'),
        'Trainable': entry['trainable_params'],
    }
    test_metrics = entry['metrics']['test']
    for column, metric_key in test_metric_columns:
        row[column] = test_metrics[metric_key]
    row['Training (s)'] = entry.get('training_time_s', 'N/A')
    comparison_data.append(row)

# Best macro-F1 first
df_comparison = pd.DataFrame(comparison_data).sort_values('F1-Score', ascending=False)


print("TABLE 2: MODEL COMPARISON - OVERALL PERFORMANCE")
print(df_comparison.to_string(index=False))

# Save as CSV
df_comparison.to_csv('results/table2_model_comparison.csv', index=False)

# LaTeX export is currently disabled; re-enable the lines below to write the .tex file.
# latex_table = df_comparison.to_latex(index=False, float_format="%.4f")
# with open('results/table2_model_comparison.tex', 'w') as f:
#     f.write(latex_table)

# print("\nTable 2 saved as CSV and LaTeX")


TABLE 2: MODEL COMPARISON - OVERALL PERFORMANCE
                  Model Parameters  Trainable  Accuracy  Precision   Recall  F1-Score  Inference (ms) Training (s)
             distilgpt2        N/A      16916  0.880702   0.888545 0.889516  0.887538        0.581030          N/A
Qwen/Qwen2.5-Coder-1.5B        N/A    2394644  0.861404   0.858158 0.865028  0.856458        8.093195          N/A

Table 2 saved as CSV and LaTeX


In [None]:
#@title Figure 4: Model Performance Comparison

def _label_bar_heights(ax, bars, fmt, **text_kwargs):
    """Write each bar's height just above the bar.

    Args:
        ax: Matplotlib axes the bars were drawn on.
        bars: Bar container returned by ``ax.bar``.
        fmt: Format spec applied to the height (e.g. ``'.3f'``).
        **text_kwargs: Extra keyword arguments forwarded to ``ax.text``.
    """
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                format(height, fmt),
                ha='center', va='bottom', **text_kwargs)


if not all_results:
    # With no loaded model the max() used for the y-limit below would raise.
    print("No model results loaded - skipping Figure 4")
else:
    # savefig() does not create missing directories, so make sure the target exists.
    Path('figures').mkdir(parents=True, exist_ok=True)

    models = [r['model_name'] for r in all_results]
    metrics = {
        'Accuracy': [r['metrics']['test']['accuracy'] for r in all_results],
        'Precision': [r['metrics']['test']['precision_macro'] for r in all_results],
        'Recall': [r['metrics']['test']['recall_macro'] for r in all_results],
        'F1-Score': [r['metrics']['test']['f1_macro'] for r in all_results]
    }

    # Grouped bars: one group per model, one bar per metric
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(models))
    width = 0.2

    colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
    for i, (metric, values) in enumerate(metrics.items()):
        # Centre the four bars on the tick: offsets -1.5w, -0.5w, +0.5w, +1.5w
        offset = width * (i - 1.5)
        bars = ax.bar(x + offset, values, width, label=metric, color=colors[i], alpha=0.8)
        _label_bar_heights(ax, bars, '.3f', fontsize=8)

    ax.set_xlabel('Model', fontweight='bold', fontsize=12)
    ax.set_ylabel('Score', fontweight='bold', fontsize=12)
    ax.set_title('Model Performance Comparison Across Metrics', fontweight='bold', fontsize=14)
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=45, ha='right')
    ax.set_ylim(0, 1.1)
    ax.legend(loc='upper left')
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.savefig('figures/figure4_model_comparison.png', dpi=300, bbox_inches='tight')
    plt.savefig('figures/figure4_model_comparison.pdf', bbox_inches='tight')
    print("Figure 4 saved: figures/figure4_model_comparison.png")
    plt.close()

    # just F1 scores
    fig, ax = plt.subplots(figsize=(10, 6))
    f1_scores = [r['metrics']['test']['f1_macro'] for r in all_results]
    bars = ax.bar(models, f1_scores, color=['#3498db', '#e74c3c', '#2ecc71', '#f39c12'][:len(models)], alpha=0.8)
    _label_bar_heights(ax, bars, '.4f', fontweight='bold', fontsize=11)

    ax.set_xlabel('Model', fontweight='bold', fontsize=12)
    ax.set_ylabel('F1-Score (Macro)', fontweight='bold', fontsize=12)
    ax.set_title('Model F1-Score Comparison', fontweight='bold', fontsize=14)
    # Leave 15% headroom above the tallest bar for its value label
    ax.set_ylim(0, max(f1_scores) * 1.15)
    ax.grid(axis='y', alpha=0.3)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('figures/figure4b_f1_comparison.png', dpi=300, bbox_inches='tight')
    plt.savefig('figures/figure4b_f1_comparison.pdf', bbox_inches='tight')
    plt.close()

In [None]:
#Per-CWE Performance Comparison

def _cwe_sort_key(label):
    """Sort key giving CWE labels a deterministic, numeric-aware order.

    Purely numeric labels are ordered by their integer value. Any other label
    sorts before them (as it did when every such label mapped to 0), ordered
    by the digits it contains and then by its text, so the result no longer
    depends on set iteration order.
    """
    text = str(label)
    if text.isdigit():
        return (int(text), 0, text)
    embedded = ''.join(ch for ch in text if ch.isdigit())
    return (0, int(embedded) if embedded else -1, text)


# Collect the union of CWE labels reported by any model
all_cwes = set()
for results in all_results:
    if 'per_class_test' in results:
        all_cwes.update(results['per_class_test'].keys())

ordered_cwes = sorted(all_cwes, key=_cwe_sort_key)
# Print the ordered list rather than the raw set so the output is reproducible
print(ordered_cwes)

# cwe -> {model name -> F1}; a model that lacks a CWE simply has no entry for it
cwe_comparison = {}
for cwe in ordered_cwes:
    cwe_comparison[cwe] = {}
    for results in all_results:
        if 'per_class_test' in results and cwe in results['per_class_test']:
            model_name = results['model_name']
            cwe_comparison[cwe][model_name] = results['per_class_test'][cwe]['f1']

# Rows are CWE labels, columns are models
df_cwe = pd.DataFrame(cwe_comparison).T

if df_cwe.empty:
    # DataFrame.plot raises when there is nothing numeric to draw
    print("No per-CWE metrics found - skipping Figure 5")
else:
    # savefig() does not create missing directories, so make sure the target exists.
    Path('figures').mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(14, 7))
    df_cwe.plot(kind='bar', ax=ax, width=0.8, color=['#3498db', '#e74c3c', '#2ecc71', '#f39c12'][:len(all_results)])

    ax.set_xlabel('CWE Type', fontweight='bold', fontsize=12)
    ax.set_ylabel('F1-Score', fontweight='bold', fontsize=12)
    ax.set_title('Per-CWE Performance Across Models', fontweight='bold', fontsize=14)
    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.set_ylim(0, 1.1)
    ax.grid(axis='y', alpha=0.3)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    plt.savefig('figures/figure5_per_cwe_comparison.png', dpi=300, bbox_inches='tight')
    plt.savefig('figures/figure5_per_cwe_comparison.pdf', bbox_inches='tight')
    print("Figure 5 saved: figures/figure5_per_cwe_comparison.png")
    plt.close()

    df_cwe.to_csv('results/table3_per_cwe_comparison_all_models.csv')

In [None]:
# Efficiency vs Performance Trade-off

# Filter models with inference time data
models_with_inference = [r for r in all_results if r['metrics']['test']['inference_time_ms'] > 0]

if len(models_with_inference) > 0:
    # savefig() does not create missing directories, so make sure the target exists.
    Path('figures').mkdir(parents=True, exist_ok=True)

    model_names = [r['model_name'] for r in models_with_inference]
    f1_scores = [r['metrics']['test']['f1_macro'] for r in models_with_inference]
    inference_times = [r['metrics']['test']['inference_time_ms'] for r in models_with_inference]
    params = [r.get('total_parameters', 0) for r in models_with_inference]

    # Marker area grows from 200 to 1000 with parameter count; when no model
    # reports a count every marker gets the base size of 200.
    max_params = max(params) if max(params) > 0 else 1
    marker_sizes = [200 + (p / max_params) * 800 for p in params]

    fig, ax = plt.subplots(figsize=(10, 7))

    colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6', '#1abc9c']
    for i, (name, f1, inf_time, size) in enumerate(zip(model_names, f1_scores, inference_times, marker_sizes)):
        ax.scatter(inf_time, f1, s=size, alpha=0.6, c=[colors[i % len(colors)]], edgecolors='black', linewidth=1.5)
        ax.annotate(name, (inf_time, f1), xytext=(10, 5), textcoords='offset points',
                   fontsize=10, fontweight='bold',
                   bbox=dict(boxstyle='round,pad=0.3', facecolor=colors[i % len(colors)], alpha=0.3))

    ax.set_xlabel('Inference Time (ms/sample)', fontweight='bold', fontsize=12)
    ax.set_ylabel('F1-Score (Macro)', fontweight='bold', fontsize=12)
    ax.set_title('Model Efficiency vs Performance Trade-off', fontweight='bold', fontsize=14)
    ax.grid(True, alpha=0.3)

    # Add ideal region shading (top-left = fast + accurate)
    ax.axhspan(0.7, 1.0, alpha=0.05, color='green', label='High Performance')
    # The "fast" band (left of the median time) is drawn only when some model exceeds 10 ms/sample
    if max(inference_times) > 10:
        ax.axvspan(0, np.percentile(inference_times, 50), alpha=0.05, color='blue', label='Fast Inference')

    # The shaded regions carry labels; without a legend they would never be shown.
    ax.legend(loc='lower right')

    plt.tight_layout()
    plt.savefig('figures/figure7_efficiency_tradeoff.png', dpi=300, bbox_inches='tight')
    plt.savefig('figures/figure7_efficiency_tradeoff.pdf', bbox_inches='tight')
    plt.close()
else:
    print("No inference-time data available - skipping Figure 7")

In [None]:
# Model Size vs Performance

# Only models that report a positive total parameter count can be placed on this chart
models_with_params = [r for r in all_results if r.get('total_parameters', 0) > 0]

if len(models_with_params) > 0:
    # savefig() does not create missing directories, so make sure the target exists.
    Path('figures').mkdir(parents=True, exist_ok=True)

    model_names = [r['model_name'] for r in models_with_params]
    # Macro-F1 is stored under metrics -> test, the same location the other cells read it from
    f1_scores = [r['metrics']['test']['f1_macro'] for r in models_with_params]
    params = [r['total_parameters'] / 1e6 for r in models_with_params]  # Convert to millions

    fig, ax = plt.subplots(figsize=(10, 6))

    colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6', '#1abc9c']
    bars = ax.bar(range(len(model_names)), f1_scores, color=[colors[i % len(colors)] for i in range(len(model_names))], alpha=0.7)

    # Add F1 and parameter count as text on top of each bar
    for bar, param in zip(bars, params):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.4f}\n({param:.1f}M params)',
                ha='center', va='bottom', fontsize=9, fontweight='bold')

    ax.set_xlabel('Model', fontweight='bold', fontsize=12)
    ax.set_ylabel('F1-Score (Macro)', fontweight='bold', fontsize=12)
    ax.set_title('Model Size vs Performance', fontweight='bold', fontsize=14)
    ax.set_xticks(range(len(model_names)))
    ax.set_xticklabels(model_names, rotation=45, ha='right')
    # Leave 20% headroom for the two-line labels above the bars
    ax.set_ylim(0, max(f1_scores) * 1.2)
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.savefig('figures/figure8_model_size_vs_performance.png', dpi=300, bbox_inches='tight')
    plt.savefig('figures/figure8_model_size_vs_performance.pdf', bbox_inches='tight')
    plt.close()
else:
    # Make the skip visible instead of silently producing no figure
    print("No model reports 'total_parameters' - skipping Figure 8")

In [None]:
# Statistical Summary Table

# Column label -> aggregate applied to a model's per-class F1-scores.
# np.std is used with its default ddof=0 (population standard deviation).
f1_aggregates = {
    'Mean F1': np.mean,
    'Std F1': np.std,
    'Min F1': np.min,
    'Max F1': np.max,
    'Median F1': np.median,
}

# One summary row per model; models without per-class metrics are left out.
summary_stats = []
for model_result in all_results:
    class_metrics = model_result.get('per_class_test', {})
    if not class_metrics:
        continue
    class_f1 = [entry['f1'] for entry in class_metrics.values()]
    row = {'Model': model_result['model_name']}
    for label, aggregate in f1_aggregates.items():
        row[label] = aggregate(class_f1)
    summary_stats.append(row)

if summary_stats:
    df_stats = pd.DataFrame(summary_stats)

    print("STATISTICAL SUMMARY: PER-CWE F1-SCORE DISTRIBUTION")
    print(df_stats.to_string(index=False))

    df_stats.to_csv('results/statistical_summary.csv', index=False)


STATISTICAL SUMMARY: PER-CWE F1-SCORE DISTRIBUTION
                  Model  Mean F1   Std F1   Min F1  Max F1  Median F1
             distilgpt2 0.887538 0.237348 0.234043     1.0   0.984495
Qwen/Qwen2.5-Coder-1.5B 0.856458 0.263951 0.000000     1.0   0.973786

Statistical summary saved to results/statistical_summary.csv


In [None]:
# Quick Statistics for Writing

if len(all_results) > 0:
    print("QUICK STATS FOR YOUR THESIS WRITING")

    # Headline metrics are stored under results['metrics']['test'], the same
    # location the comparison table and figures read them from.

    # Best model
    best = max(all_results, key=lambda x: x['metrics']['test']['f1_macro'])
    best_test = best['metrics']['test']
    print(f"\n Best Model: {best['model_name']}")
    print(f"  - F1-Score: {best_test['f1_macro']:.4f}")
    print(f"  - Accuracy: {best_test['accuracy']:.4f}")
    print(f"  - Precision: {best_test['precision_macro']:.4f}")
    print(f"  - Recall: {best_test['recall_macro']:.4f}")

    # Comparison with worst
    worst = min(all_results, key=lambda x: x['metrics']['test']['f1_macro'])
    worst_test = worst['metrics']['test']
    # The relative gain is undefined when the weakest model scored exactly 0
    if worst_test['f1_macro']:
        improvement = ((best_test['f1_macro'] - worst_test['f1_macro']) / worst_test['f1_macro'] * 100)
    else:
        improvement = float('nan')
    print(f"\nPerformance Range:")
    print(f"   - Best: {best['model_name']} (F1={best_test['f1_macro']:.4f})")
    print(f"   - Worst: {worst['model_name']} (F1={worst_test['f1_macro']:.4f})")
    print(f"   - Relative Improvement: {improvement:.1f}%")

    # Efficiency analysis (only models with a positive measured inference time)
    with_inference = [r for r in all_results if r['metrics']['test'].get('inference_time_ms', 0) > 0]
    if with_inference:
        fastest = min(with_inference, key=lambda x: x['metrics']['test']['inference_time_ms'])
        fastest_test = fastest['metrics']['test']
        print(f"\nMost Efficient:")
        print(f"   - Model: {fastest['model_name']}")
        print(f"   - Inference Time: {fastest_test['inference_time_ms']:.2f} ms/sample")
        print(f"   - F1-Score: {fastest_test['f1_macro']:.4f}")
        # ms per sample -> samples per second
        print(f"   - Throughput: ~{1000/fastest_test['inference_time_ms']:.0f} samples/second")

    # Sentences for thesis
    print("\nSample Sentences for Your Thesis:")
    print("\n1. Abstract/Introduction:")
    print(f'   "We evaluate {len(all_results)} models for CWE classification in Go code, ')
    print(f'   achieving a best F1-score of {best_test["f1_macro"]:.4f} with {best["model_name"]}."')

    print("\n2. Results:")
    print(f'   "Our experiments show that {best["model_name"]} outperforms baseline methods ')
    print(f'   by {improvement:.1f}%, demonstrating the effectiveness of small language models."')
    

In [None]:
# Final Summary Report

# Headline metrics are stored under results['metrics']['test'], the same
# location the comparison table and figures read them from.

print("\nBest Performing Models:")
sorted_results = sorted(all_results, key=lambda x: x['metrics']['test']['f1_macro'], reverse=True)
# Show at most the top three models by macro-F1
for i, r in enumerate(sorted_results[:3], 1):
    test_metrics = r['metrics']['test']
    print(f"  {i}. {r['model_name']}: F1={test_metrics['f1_macro']:.4f}, Acc={test_metrics['accuracy']:.4f}")

print("\nMost Efficient Model (if data available):")
# Only models with a positive measured inference time are considered
efficient = [r for r in all_results if r['metrics']['test'].get('inference_time_ms', 0) > 0]
if efficient:
    fastest = min(efficient, key=lambda x: x['metrics']['test']['inference_time_ms'])
    fastest_test = fastest['metrics']['test']
    print(f"  {fastest['model_name']}: {fastest_test['inference_time_ms']:.2f} ms/sample, F1={fastest_test['f1_macro']:.4f}")