# 09 - Model Comparison: Base vs Fine-Tuned vs Gemini

Compare three models on the same test CVs:
1. **Base DistilGPT-2** (no fine-tuning)
2. **Fine-Tuned DistilGPT-2** (after LoRA training)
3. **Gemini 2.0 Flash** (reference baseline)

## Evaluation Approach
- Same 10 test CVs for all models
- LLM-as-Judge evaluation
- Statistical comparison
- Memory-optimized for CPU

---

In [1]:
# Imports
import pandas as pd
import numpy as np
import json
from pathlib import Path
import time
import gc
from tqdm import tqdm
import sys
sys.path.append('..')

import google.generativeai as genai
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch & Transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Visualization
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 6)

print("✓ All imports loaded")

An error occurred: module 'importlib.metadata' has no attribute 'packages_distributions'




✓ All imports loaded


## 1. Setup & Configuration

In [2]:
# Filesystem locations: the LoRA adapter to load and the folder for exported artefacts
MODEL_PATH = Path('../models/medium_roaster_lora')
RESULTS_DIR = Path('../results')
RESULTS_DIR.mkdir(exist_ok=True)

# Gemini credentials come from the project's config module (found via the
# '..' entry appended to sys.path in the imports cell)
from config import GEMINI_API_KEY

genai.configure(api_key=GEMINI_API_KEY)

print("✓ API configured")
print(f"✓ Model path: {MODEL_PATH}")
print(f"✓ Results directory: {RESULTS_DIR}")

✓ API configured
✓ Model path: ../models/medium_roaster_lora
✓ Results directory: ../results


## 2. Load Data

In [3]:
# Full resume dataset, one row per CV
df = pd.read_csv('../data/resume_data.csv')

# Row positions of the held-out CVs that every model is evaluated on
with open('../data/test_cv_indices.json', 'r') as f:
    index_payload = json.load(f)
test_cv_indices = index_payload['indices']

print(
    f"✓ Loaded {len(df):,} CVs",
    f"✓ Test set: {len(test_cv_indices)} CVs",
    f"   Indices: {test_cv_indices}",
    sep="\n",
)

✓ Loaded 9,544 CVs
✓ Test set: 10 CVs
   Indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [4]:
# CV formatting function
def format_cv_for_llm(resume_row):
    """Render one resume record as plain text for an LLM prompt.

    `resume_row` is anything with `.get(key)` and `[key]` access (a pandas
    Series or a dict). Fields that are missing or NaN are skipped; a section
    heading is emitted only when at least one of its fields is present.
    Every section after CAREER OBJECTIVE begins with a newline, so sections
    end up separated by a blank line once joined.
    """
    def present(column):
        return pd.notna(resume_row.get(column))

    def labelled(pairs):
        # "Label: value" lines for the columns that actually hold data
        return [f"{label}: {resume_row[column]}" for label, column in pairs if present(column)]

    sections = []

    if present('career_objective'):
        sections.append(f"CAREER OBJECTIVE:\n{resume_row['career_objective']}")

    if present('skills'):
        sections.append(f"\nSKILLS:\n{resume_row['skills']}")

    education = labelled([
        ('Institution', 'educational_institution_name'),
        ('Degree', 'degree_names'),
        ('Major', 'major_field_of_studies'),
        ('Year', 'passing_years'),
    ])
    if education:
        sections.append("\nEDUCATION:\n" + "\n".join(education))

    work = labelled([
        ('Company', 'professional_company_names'),
        ('Position', 'positions'),
    ])
    if present('start_dates'):
        # The end date is only reported alongside a start date
        period = f"Period: {resume_row['start_dates']}"
        if present('end_dates'):
            period += f" to {resume_row['end_dates']}"
        work.append(period)
    if present('responsibilities'):
        work.append(f"Responsibilities:\n{resume_row['responsibilities']}")
    if work:
        sections.append("\nWORK EXPERIENCE:\n" + "\n".join(work))

    for heading, column in (('LANGUAGES', 'languages'), ('CERTIFICATIONS', 'certification_skills')):
        if present(column):
            sections.append(f"\n{heading}:\n{resume_row[column]}")

    return "\n".join(sections)

print("✓ Helper functions defined")

✓ Helper functions defined


## 3. Load Models

In [5]:
print("Loading models...\n")

# Step 1 - tokenizer shared by both local models; EOS is reused as the padding token
print("[1/3] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token
print("  ✓ Tokenizer loaded")

# Step 2 - untouched DistilGPT-2, switched to inference mode
print("\n[2/3] Loading base DistilGPT-2...")
base_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
base_model.eval()
print(f"  ✓ Base model loaded ({base_model.num_parameters():,} parameters)")

# Step 3 - a second DistilGPT-2 copy wrapped with the LoRA adapter, if the
# adapter directory exists; otherwise fine_tuned_model stays None and the
# later cells skip it
print("\n[3/3] Loading fine-tuned model with LoRA...")
if not MODEL_PATH.exists():
    print(f"  ✗ Fine-tuned model not found at {MODEL_PATH}")
    print("  → Run notebook 07 first to train the model")
    fine_tuned_model = None
else:
    ft_base = AutoModelForCausalLM.from_pretrained("distilgpt2")
    fine_tuned_model = PeftModel.from_pretrained(ft_base, MODEL_PATH)
    fine_tuned_model.eval()
    print(f"  ✓ Fine-tuned model loaded from {MODEL_PATH}")

print("\n" + "="*80, "MODEL LOADING COMPLETE", "="*80, sep="\n")

Loading models...

[1/3] Loading tokenizer...
  ✓ Tokenizer loaded

[2/3] Loading base DistilGPT-2...
  ✓ Base model loaded (81,912,576 parameters)

[3/3] Loading fine-tuned model with LoRA...
  ✓ Fine-tuned model loaded from ../models/medium_roaster_lora

MODEL LOADING COMPLETE


## 4. Model Inference Functions

In [6]:
# Memory-optimized local model inference
def generate_local_critique(model, cv_text, max_new_tokens=50):
    """
    Generate a short critique of a CV with a local causal LM (base or fine-tuned).

    The CV is cut to its first 400 characters and the prompt to at most 200
    tokens to keep memory use low. Decoding is greedy, so repeated calls with
    the same inputs give the same text.

    Args:
        model: Object exposing ``generate(...)`` (the base model or the
            LoRA-wrapped model), or ``None`` when no model could be loaded.
        cv_text: Formatted CV text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation with surrounding whitespace stripped.
        ``"[Model not loaded]"`` when ``model`` is ``None`` and
        ``"[Generation failed: <error>]"`` when tokenization or generation
        raises, so callers can recognise failures by their bracketed prefix.
    """
    if model is None:
        return "[Model not loaded]"

    inputs_dict = None
    outputs = None

    try:
        # Short prompt to reduce memory
        cv_short = cv_text[:400]
        prompt = f"Criticize this CV briefly:\n{cv_short}\n\nCritique:"

        # Tokenize with short max length
        inputs_dict = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=200
        )

        # Generate (greedy for consistency and less memory)
        with torch.no_grad():
            outputs = model.generate(
                inputs_dict['input_ids'],
                attention_mask=inputs_dict['attention_mask'],
                max_new_tokens=max_new_tokens,
                do_sample=False,  # Greedy
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # The returned sequence is the prompt tokens followed by the new ones.
        # Slice by prompt length instead of splitting the decoded text on
        # "Critique:": the text split returned part of the CV whenever the CV
        # itself contained that marker, and echoed the whole prompt whenever
        # the 200-token truncation had cut the marker off.
        prompt_length = inputs_dict['input_ids'].shape[1]
        new_tokens = outputs[0][prompt_length:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    except Exception as e:
        return f"[Generation failed: {str(e)}]"

    finally:
        # Drop tensor references before collecting to keep peak memory low
        del inputs_dict, outputs
        gc.collect()

print("✓ Local model function defined")

✓ Local model function defined


In [7]:
# Gemini inference
# System-style instructions prepended to every CV sent to Gemini by
# generate_gemini_critique: sets the reviewer persona and the five-section
# output structure.
ROASTER_PROMPT = """You are an experienced hiring manager providing direct, honest CV feedback.

Your approach:
1. Be direct and honest - no sugarcoating
2. Point out obvious flaws and red flags
3. Call out generic buzzwords and filler content
4. Be professional but don't hold back
5. Focus on what actually matters to employers

Structure:
FIRST IMPRESSION: What stands out
MAJOR ISSUES: Problems that need fixing
CONCERNS: Things that raise questions
WHAT WORKS: Brief strengths
BOTTOM LINE: Final verdict
"""

def generate_gemini_critique(cv_text):
    """Ask Gemini 2.0 Flash to review a CV using ROASTER_PROMPT.

    Returns the response text, or "[ERROR: <message>]" when the API call
    (or reading the response text) raises.
    """
    sampling = genai.GenerationConfig(
        temperature=0.7,
        top_p=0.95,
        max_output_tokens=1024,
    )
    gemini = genai.GenerativeModel(
        model_name="gemini-2.0-flash",
        generation_config=sampling,
    )
    request = f"{ROASTER_PROMPT}\n\nReview this CV:\n\n{cv_text}"

    try:
        return gemini.generate_content(request).text
    except Exception as e:
        return f"[ERROR: {str(e)}]"

print("✓ Gemini function defined")

✓ Gemini function defined


## 5. Generate Critiques from All Models

In [None]:
# Checkpoint file: lets an interrupted run resume instead of regenerating every CV
CHECKPOINT = Path('../data/model_comparison_checkpoint.json')

# Prefixes of the placeholder strings the inference helpers return when a call
# fails. "[Model not available]" is intentionally not listed: a missing adapter
# is a stable condition that a retry cannot fix.
FAILURE_PREFIXES = ('[Generation failed', '[ERROR')
CRITIQUE_KEYS = ('base_critique', 'ft_critique', 'gemini_critique')


def _has_failed_critique(entry):
    """Return True when any critique stored in a checkpoint entry is a failure placeholder."""
    return any(str(entry.get(key, '')).startswith(FAILURE_PREFIXES) for key in CRITIQUE_KEYS)


def _status_mark(critique):
    """Progress marker printed after a generation step: ✓ on success, ✗ on a failure placeholder."""
    if critique.startswith(FAILURE_PREFIXES):
        return " ✗ (failed - will be retried on the next run)"
    return " ✓"


# Load existing or start fresh
if CHECKPOINT.exists():
    with open(CHECKPOINT, 'r', encoding='utf-8') as f:
        checkpointed = json.load(f)
    # Entries holding a failure placeholder are dropped here so the loop below
    # regenerates them; otherwise one transient API error would be cached
    # forever and silently excluded from the evaluation.
    results = [entry for entry in checkpointed if not _has_failed_critique(entry)]
    print(f"✓ Loaded {len(results)} completed evaluations")
    if len(results) < len(checkpointed):
        print(f"  ↻ Retrying {len(checkpointed) - len(results)} CV(s) whose critiques failed last run")
else:
    results = []
    print("Starting fresh...")

completed = {entry['cv_idx'] for entry in results}

# Determine remaining
remaining = [idx for idx in test_cv_indices if idx not in completed]

print(f"\nProgress: {len(completed)}/{len(test_cv_indices)} CVs")
print(f"Remaining: {remaining}")
print(f"\nEstimated time: ~{len(remaining) * 10} seconds\n")

if len(remaining) > 0:
    print("="*80)
    print("GENERATING CRITIQUES FROM ALL 3 MODELS")
    print("="*80)

    for cv_idx in remaining:
        print(f"\nCV #{cv_idx} ({test_cv_indices.index(cv_idx)+1}/{len(test_cv_indices)})")
        print("-"*60)

        try:
            # Format CV
            cv_text = format_cv_for_llm(df.iloc[cv_idx])

            result = {
                'cv_idx': cv_idx,
                'cv_text': cv_text
            }

            # 1. Base model
            print("  [1/3] Base model...", end='', flush=True)
            result['base_critique'] = generate_local_critique(base_model, cv_text, max_new_tokens=50)
            print(_status_mark(result['base_critique']))
            gc.collect()

            # 2. Fine-tuned model
            print("  [2/3] Fine-tuned...", end='', flush=True)
            if fine_tuned_model is not None:
                result['ft_critique'] = generate_local_critique(fine_tuned_model, cv_text, max_new_tokens=50)
                print(_status_mark(result['ft_critique']))
            else:
                result['ft_critique'] = "[Model not available]"
                print(" ⊙ (skipped)")
            gc.collect()

            # 3. Gemini
            print("  [3/3] Gemini...", end='', flush=True)
            result['gemini_critique'] = generate_gemini_critique(cv_text)
            print(_status_mark(result['gemini_critique']))
            time.sleep(1.0)  # pause between Gemini requests

            # Save result (failed entries are kept too, so later cells still
            # see every CV; they are retried the next time this cell runs)
            results.append(result)

            # Save checkpoint after every CV so an interruption loses at most one
            with open(CHECKPOINT, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            print(f"  ✓ Saved ({len(results)}/{len(test_cv_indices)})")

        except Exception as e:
            print(f"\n  ✗ Error: {e}")
            continue

# Keep results in test-set order even when retried CVs were appended last;
# entries for indices no longer in the test set sort to the end.
test_order = {idx: position for position, idx in enumerate(test_cv_indices)}
results.sort(key=lambda entry: test_order.get(entry['cv_idx'], len(test_order)))

print(f"\n{'='*80}")
print(f"✓ COMPLETE: Generated {len(results)} x 3 = {len(results)*3} critiques")
print(f"✓ Saved to: {CHECKPOINT}")
print("="*80)

Starting fresh...

Progress: 0/10 CVs
Remaining: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

Estimated time: ~100 seconds

GENERATING CRITIQUES FROM ALL 3 MODELS

CV #0 (1/10)
------------------------------------------------------------
  [1/3] Base model...

In [None]:
# Show example outputs
def _preview(text, limit=300):
    """Return at most `limit` characters of text, adding an ellipsis only when it was cut."""
    return text if len(text) <= limit else text[:limit] + "..."


if len(results) > 0:
    example = results[0]

    print("\n" + "="*80)
    # Label with the CV actually shown; results[0] is not guaranteed to be CV 0
    print(f"EXAMPLE CRITIQUES - CV #{example['cv_idx']}")
    print("="*80)

    for position, (title, key) in enumerate(
        [('BASE MODEL', 'base_critique'), ('FINE-TUNED MODEL', 'ft_critique'), ('GEMINI', 'gemini_critique')],
        start=1,
    ):
        print(f"\n{position}. {title}:")
        print("-"*80)
        print(_preview(example[key]))

## 6. LLM-as-Judge Evaluation

In [None]:
# Rubric sent to the judge model by evaluate_critique. The five JSON score
# keys listed here are the same names later used as `score_cols` when the
# evaluations are aggregated.
JUDGE_PROMPT = """You are an expert evaluator of CV critique quality.

Evaluate this CV critique on these criteria (score 1-10 for each):

1. **Specificity**: How specific and actionable is the feedback?
2. **Relevance**: How relevant are the points to actual CV improvement?
3. **Coherence**: Is the critique coherent and well-structured?
4. **Completeness**: Does it cover important aspects of the CV?
5. **Overall Usefulness**: How useful would this be to the job seeker?

Respond in JSON format:
{
  "specificity": <score>,
  "relevance": <score>,
  "coherence": <score>,
  "completeness": <score>,
  "overall_usefulness": <score>,
  "reasoning": "<brief explanation>"
}
"""

def evaluate_critique(critique_text, model_name, cv_text):
    """Score one critique with the Gemini judge.

    Args:
        critique_text: The critique to be scored.
        model_name: Label of the model that wrote the critique. It is used only
            in the error log and is deliberately NOT sent to the judge, so the
            scoring stays blind to the critique's origin (the judge is itself
            Gemini, one of the compared models).
        cv_text: The formatted CV; only its first 500 characters are sent.

    Returns:
        The judge's JSON object as a dict in which 'specificity', 'relevance',
        'coherence', 'completeness' and 'overall_usefulness' are floats in
        [1, 10]; other keys (e.g. 'reasoning') are passed through unchanged.
        Returns None when the API call fails, the reply contains no JSON
        object, or any score is missing, non-numeric or out of range.
    """
    judge = genai.GenerativeModel(
        model_name="gemini-2.0-flash",
        generation_config=genai.GenerationConfig(temperature=0.2)
    )

    prompt = f"""{JUDGE_PROMPT}

Original CV (excerpt):
{cv_text[:500]}...

Critique to Evaluate:
{critique_text}
"""

    try:
        response = judge.generate_content(prompt)
        text = response.text

        # The reply may wrap the JSON in prose or a code fence: take the
        # outermost {...} span.
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end <= start:
            raise ValueError("no JSON object in judge response")

        parsed = json.loads(text[start:end])

        # Validate here so malformed scores ("8/10", missing keys, 0-100
        # scales) never reach the aggregated statistics.
        for key in ('specificity', 'relevance', 'coherence', 'completeness', 'overall_usefulness'):
            score = float(parsed[key])
            if not 1 <= score <= 10:
                raise ValueError(f"{key} score {score} is outside 1-10")
            parsed[key] = score

        return parsed
    except Exception as e:
        print(f"Error evaluating {model_name} critique: {e!r}")

    return None

print("✓ Judge function defined")

In [None]:
# Run evaluations
print("="*80)
print("EVALUATING ALL CRITIQUES WITH LLM JUDGE")
print("="*80)
print(f"\nEvaluating {len(results)} CVs x 3 models = {len(results)*3} evaluations")
print("This will take ~3-5 minutes...\n")

# Exact prefixes of the placeholder strings written by the generation step.
# Matching these, rather than any text starting with '[', keeps genuine
# critiques that happen to open with a bracket (e.g. "[1] ...") in the evaluation.
PLACEHOLDER_PREFIXES = (
    '[Model not loaded]',
    '[Model not available]',
    '[Generation failed:',
    '[ERROR:',
)

evaluations = []
skipped = 0

for result in tqdm(results, desc="Evaluating"):
    cv_idx = result['cv_idx']
    cv_text = result['cv_text']

    # Evaluate each model
    for model_name, critique_key in [
        ('Base', 'base_critique'),
        ('Fine-Tuned', 'ft_critique'),
        ('Gemini', 'gemini_critique')
    ]:
        critique = result[critique_key]

        # Skip if error or not available
        if critique.startswith(PLACEHOLDER_PREFIXES):
            skipped += 1
            continue

        try:
            eval_result = evaluate_critique(critique, model_name, cv_text)

            if eval_result:
                eval_result['model'] = model_name
                eval_result['cv_idx'] = cv_idx
                evaluations.append(eval_result)

            time.sleep(0.5)  # pause between judge requests

        except Exception as e:
            print(f"Error on CV {cv_idx} - {model_name}: {e}")
            continue

print(f"\n✓ Completed {len(evaluations)} evaluations")
if skipped:
    print(f"  ⊙ Skipped {skipped} placeholder critiques (failed or unavailable model)")
if not evaluations:
    print("  ✗ No evaluations were collected - the results cells below cannot run")

# Save evaluations
eval_file = Path('../data/model_comparison_evaluations.json')
with open(eval_file, 'w', encoding='utf-8') as f:
    json.dump(evaluations, f, indent=2, ensure_ascii=False)
print(f"✓ Saved to: {eval_file}")

# Convert to DataFrame
df_eval = pd.DataFrame(evaluations)

## 7. Results & Comparison

In [None]:
# Calculate metrics
score_cols = ['specificity', 'relevance', 'coherence', 'completeness', 'overall_usefulness']

if df_eval.empty:
    # Fail with an actionable message instead of a KeyError on the score columns
    raise RuntimeError("No judge evaluations available - re-run the evaluation cell first")

# Judge output is LLM-generated JSON: scores can arrive as strings or be
# missing. Coerce to numbers (unparseable values become NaN and are ignored
# by mean/std) so the aggregation below cannot fail on dtype.
for col in score_cols:
    if col not in df_eval.columns:
        df_eval[col] = np.nan
    df_eval[col] = pd.to_numeric(df_eval[col], errors='coerce')

df_eval['average_score'] = df_eval[score_cols].mean(axis=1)

# Aggregate by model: columns become a (metric, 'mean'|'std') MultiIndex
model_summary = df_eval.groupby('model')[score_cols + ['average_score']].agg(['mean', 'std']).round(2)

print("="*80)
print("MODEL COMPARISON RESULTS")
print("="*80)
print(f"\nTest CVs: {len(results)}")
print(f"Evaluations per model: {df_eval.groupby('model').size().to_dict()}")
print(f"\nEvaluation method: LLM-as-Judge (Gemini 2.0 Flash)")

print("\n" + "="*80)
print("COMPARISON TABLE: Mean Scores (Scale 1-10)")
print("="*80)
print()

# Build comparison table: one row per model, "mean ± std" per metric
comparison_data = []
for model in ['Base', 'Fine-Tuned', 'Gemini']:
    if model in model_summary.index:
        row = {'Model': model}
        for metric in score_cols:
            mean_val = model_summary.loc[model, (metric, 'mean')]
            std_val = model_summary.loc[model, (metric, 'std')]
            row[metric.replace('_', ' ').title()] = f"{mean_val:.2f} ± {std_val:.2f}"

        mean_val = model_summary.loc[model, ('average_score', 'mean')]
        std_val = model_summary.loc[model, ('average_score', 'std')]
        row['Average'] = f"{mean_val:.2f} ± {std_val:.2f}"

        comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))

# Overall averages
print("\n" + "="*80)
print("OVERALL AVERAGE SCORES")
print("="*80)
for model in ['Base', 'Fine-Tuned', 'Gemini']:
    if model in model_summary.index:
        mean_val = model_summary.loc[model, ('average_score', 'mean')]
        std_val = model_summary.loc[model, ('average_score', 'std')]
        print(f"{model:15s}: {mean_val:.2f} ± {std_val:.2f} / 10")

In [None]:
# Distribution of the per-evaluation average score for each model
# (count, mean, std, min, quartiles, max), rounded to two decimals
average_score_stats = df_eval.groupby('model')['average_score'].describe().round(2)

print("\n" + "="*80, "STATISTICAL SUMMARY BY MODEL", "="*80, sep="\n")
print("\n", average_score_stats)

## 8. Visualizations

In [None]:
# Create comparison visualizations: per-metric grouped bars (left) and the
# overall average with std whiskers (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Grouped bar chart
ax1 = axes[0]
metric_names = [m.replace('_', ' ').title() for m in score_cols]
x = np.arange(len(metric_names))
width = 0.25

# Only models that actually received evaluations; also used by later cells
models_available = [m for m in ['Base', 'Fine-Tuned', 'Gemini'] if m in model_summary.index]
colors = {'Base': '#d62728', 'Fine-Tuned': '#ff7f0e', 'Gemini': '#2ca02c'}

for i, model in enumerate(models_available):
    scores = [model_summary.loc[model, (m, 'mean')] for m in score_cols]
    # Centre the group of bars on each metric's tick
    offset = (i - len(models_available)/2 + 0.5) * width
    ax1.bar(x + offset, scores, width, label=model, color=colors[model], alpha=0.8)

ax1.set_ylabel('Score (1-10)', fontsize=12, fontweight='bold')
ax1.set_xlabel('Evaluation Metrics', fontsize=12, fontweight='bold')
ax1.set_title('Model Comparison Across All Metrics', fontsize=14, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(metric_names, rotation=45, ha='right')
ax1.legend(loc='upper left')
ax1.grid(axis='y', alpha=0.3)
ax1.set_ylim(0, 10)

# Plot 2: Overall average with error bars
ax2 = axes[1]
means = [model_summary.loc[m, ('average_score', 'mean')] for m in models_available]
stds = [model_summary.loc[m, ('average_score', 'std')] for m in models_available]
bar_colors = [colors[m] for m in models_available]

# std is NaN when a model has a single evaluation. Use a zero-length whisker
# there so the value label below still gets a finite position.
whiskers = [0.0 if pd.isna(s) else s for s in stds]

bars = ax2.bar(models_available, means, color=bar_colors, alpha=0.8, yerr=whiskers, capsize=10, error_kw={'linewidth': 2})

ax2.set_ylabel('Average Score (1-10)', fontsize=12, fontweight='bold')
ax2.set_xlabel('Model', fontsize=12, fontweight='bold')
ax2.set_title('Overall Average Score (Mean ± Std)', fontsize=14, fontweight='bold')
ax2.set_ylim(0, 10)
ax2.grid(axis='y', alpha=0.3)

# Add value labels just above each whisker
for bar, mean, std, whisker in zip(bars, means, stds, whiskers):
    label = f'{mean:.2f}' if pd.isna(std) else f'{mean:.2f}±{std:.2f}'
    ax2.text(bar.get_x() + bar.get_width()/2., bar.get_height() + whisker + 0.2,
            label, ha='center', va='bottom',
            fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'model_comparison.png', dpi=300, bbox_inches='tight')
print(f"\n✓ Figure saved: {RESULTS_DIR / 'model_comparison.png'}")
plt.show()

In [None]:
# Heatmap comparison: rows are models, columns are metrics, cells are mean scores
fig, ax = plt.subplots(figsize=(10, 6))

# One row of mean scores per available model
heatmap_data = [
    [model_summary.loc[model, (metric, 'mean')] for metric in score_cols]
    for model in models_available
]

# Fixed 0-10 colour scale so figures from different runs are comparable
im = ax.imshow(heatmap_data, cmap='RdYlGn', aspect='auto', vmin=0, vmax=10)

# Tick positions and labels
metric_labels = [m.replace('_', ' ').title() for m in score_cols]
ax.set_xticks(np.arange(len(score_cols)))
ax.set_yticks(np.arange(len(models_available)))
ax.set_xticklabels(metric_labels, rotation=45, ha='right')
ax.set_yticklabels(models_available)

# Colorbar
cbar = plt.colorbar(im, ax=ax)
cbar.set_label('Score (1-10)', rotation=270, labelpad=20, fontweight='bold')

# Write each mean score inside its cell
for row_idx, row_values in enumerate(heatmap_data):
    for col_idx, value in enumerate(row_values):
        ax.text(col_idx, row_idx, f'{value:.1f}',
                ha="center", va="center", color="black", fontweight='bold', fontsize=11)

ax.set_title('Model Performance Heatmap', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
heatmap_path = RESULTS_DIR / 'model_comparison_heatmap.png'
plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
print(f"✓ Heatmap saved: {heatmap_path}")
plt.show()

## 9. Export Results

In [None]:
# Raw judge scores: one row per (CV, model) evaluation
scores_path = RESULTS_DIR / 'model_comparison_scores.csv'
df_eval.to_csv(scores_path, index=False)
print(f"✓ Scores saved: {scores_path}")

# Long-format summary: one row per (model, metric) with its mean and std
summary_export = [
    {
        'Model': model,
        'Metric': metric.replace('_', ' ').title(),
        'Mean': model_summary.loc[model, (metric, 'mean')],
        'Std': model_summary.loc[model, (metric, 'std')],
    }
    for model in models_available
    for metric in score_cols + ['average_score']
]

summary_path = RESULTS_DIR / 'model_comparison_summary.csv'
pd.DataFrame(summary_export).to_csv(summary_path, index=False)
print(f"✓ Summary saved: {summary_path}")

print("\n" + "="*80)
print("EXPORT COMPLETE")
print("="*80)
print("\n".join([
    "\nGenerated files:",
    "  1. model_comparison.png - Bar charts",
    "  2. model_comparison_heatmap.png - Score heatmap",
    "  3. model_comparison_scores.csv - All evaluation scores",
    "  4. model_comparison_summary.csv - Summary statistics",
]))

## 10. Key Findings

In [None]:
print("="*80)
print("KEY FINDINGS")
print("="*80)

has_base = 'Base' in model_summary.index
has_ft = 'Fine-Tuned' in model_summary.index
has_gemini = 'Gemini' in model_summary.index

# Look the fine-tuned average up on its own: it is needed by both the
# fine-tuning section and the Gemini section, and the latter must not depend
# on the Base model being present.
if has_ft:
    ft_avg = model_summary.loc['Fine-Tuned', ('average_score', 'mean')]

# Calculate improvements
if has_base and has_ft:
    base_avg = model_summary.loc['Base', ('average_score', 'mean')]
    improvement = ft_avg - base_avg
    improvement_pct = (improvement / base_avg) * 100

    print(f"\n1. Fine-Tuning Impact:")
    print(f"   Base model:      {base_avg:.2f}/10")
    print(f"   Fine-tuned:      {ft_avg:.2f}/10")
    print(f"   Improvement:     {improvement:+.2f} points ({improvement_pct:+.1f}%)")

    if improvement > 0.5:
        print("   → Significant improvement from fine-tuning")
    elif improvement > 0:
        print("   → Modest improvement from fine-tuning")
    elif improvement >= -0.5:
        print("   → Minimal/no improvement from fine-tuning")
    else:
        # A clear drop must not be reported as "no improvement"
        print("   → Fine-tuned model scored lower than the base model")

if has_gemini:
    gemini_avg = model_summary.loc['Gemini', ('average_score', 'mean')]
    print(f"\n2. Gemini Performance:")
    print(f"   Score: {gemini_avg:.2f}/10")

    if has_ft:
        gap = gemini_avg - ft_avg
        print(f"   Gap from fine-tuned: {gap:.2f} points")
        if gap > 0 and ft_avg > 0:
            # Report an actual ratio; the point gap is not a multiplier
            print(f"   → Gemini scores {gemini_avg / ft_avg:.1f}x the fine-tuned model")
        else:
            print("   → Models comparable")

# Best aspects
print(f"\n3. Strongest Metrics Across All Models:")
overall_means = df_eval[score_cols].mean().sort_values(ascending=False)
for i, (metric, score) in enumerate(overall_means.head(3).items(), 1):
    print(f"   {i}. {metric.replace('_', ' ').title()}: {score:.2f}/10")

# Share of expected (CV, model) pairs that received a judge score
expected_evaluations = len(results) * len(models_available)
completeness_pct = len(evaluations) / expected_evaluations * 100 if expected_evaluations else 0.0

print(f"\n4. Test Coverage:")
print(f"   CVs evaluated: {len(results)}")
print(f"   Total critiques: {expected_evaluations}")
print(f"   Evaluation completeness: {completeness_pct:.0f}%")

print("\n" + "="*80)
print("COMPARISON COMPLETE ✓")
print("="*80)