# 05 - Evaluation & Comparison 

This notebook compares all three CV roasting models and evaluates their effectiveness.

## Objectives
- Load results from all three models
- Compare the roasts side by side
- Analyze characteristics such as length, tone and specificity
- Evaluate effectiveness (using another LLM)

---

In [None]:
import pandas as pd
import json
from pathlib import Path
from datetime import datetime
import sys
sys.path.append('..')
import google.generativeai as genai
import matplotlib.pyplot as plt
import seaborn as sns
# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Display settings: show every DataFrame column and allow up to 100
# characters per cell before pandas truncates the text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

## Setup

In [None]:
# Load the Gemini API key from config.py and hand it to the genai client.
import sys
# Put the parent directory on the import path before importing config
# (presumably config.py lives one level above this notebook - confirm).
sys.path.append('..')
from config import GEMINI_API_KEY
genai.configure(api_key=GEMINI_API_KEY)
print("API key loaded from config.py")

## Load All Results

In [None]:
def load_results(model_name):
    """
    Load all results for a specific model.

    Reads every ``*.json`` file found in ``../results/<model_name>``,
    visiting the files in sorted path order.

    Args:
        model_name: Name of the model (gentle_roaster, medium_roaster, brutal_roaster)

    Returns:
        list: Parsed JSON content of each file, one entry per file
    """
    results_dir = Path(f'../results/{model_name}')

    # Decode explicitly as UTF-8 instead of the platform default encoding:
    # critiques can contain emoji and other non-ASCII text, which a legacy
    # default code page (e.g. cp1252 on Windows) may fail to decode.
    return [
        json.loads(file_path.read_text(encoding='utf-8'))
        for file_path in sorted(results_dir.glob('*.json'))
    ]

# Read the saved critiques of each roaster into its own list
gentle_results, medium_results, brutal_results = (
    load_results(roaster)
    for roaster in ('gentle_roaster', 'medium_roaster', 'brutal_roaster')
)

# Report how many critiques were found per model
print("Loaded results:")
for label, loaded in (
    ('Gentle', gentle_results),
    ('Medium', medium_results),
    ('Brutal', brutal_results),
):
    print(f"  {label}: {len(loaded)} critiques")

## Side-by-Side Comparison

In [None]:
def _print_roast_section(title, result):
    """Print one roaster's banner, then its critique or a placeholder."""
    print("\n" + "="*120)
    print(title)
    print("="*120)
    if result:
        print(result['critique'])
    else:
        print("No results found")


def display_comparison(cv_index):
    """
    Display all three roasts for a given CV side-by-side.

    Looks up the record with a matching 'cv_index' in the module-level
    gentle_results, medium_results and brutal_results lists and prints the
    original CV followed by one section per roaster. A roaster without a
    matching record gets a "No results found" placeholder.

    Args:
        cv_index: Value compared against each record's 'cv_index' field

    Returns:
        None: Output is written to stdout
    """
    def find_result(results):
        # First record for this CV, or None when the model has none.
        return next((r for r in results if r['cv_index'] == cv_index), None)

    print("="*120)
    print(f"CV #{cv_index} - THREE ROASTING STYLES COMPARISON")
    print("="*120)

    # Find results for this CV
    gentle = find_result(gentle_results)
    medium = find_result(medium_results)
    brutal = find_result(brutal_results)

    # Records of all three models carry 'cv_text', so use whichever record
    # exists; previously the CV was omitted whenever the gentle roast was
    # missing, even if another model had it.
    cv_source = gentle or medium or brutal
    if cv_source:
        print("\n ORIGINAL CV:")
        print("-"*120)
        print(cv_source['cv_text'])

    _print_roast_section(" GENTLE ROASTER (Temperature: 0.4)", gentle)
    _print_roast_section(" MEDIUM ROASTER (Temperature: 0.7)", medium)
    _print_roast_section(" BRUTAL ROASTER (Temperature: 0.9)", brutal)

    print("\n" + "="*120 + "\n")

# Walk the gentle-roaster results and show the three-way comparison for
# each CV they reference (nothing is shown when there are none)
for entry in (gentle_results or ()):
    display_comparison(entry['cv_index'])

## Quantitative Analysis

In [None]:
# Inclusive code-point ranges counted as emoji. This is a block-level
# heuristic rather than the full Unicode emoji property: a few non-emoji
# symbols inside these blocks are counted too.
_EMOJI_RANGES = (
    (0x2600, 0x27BF),    # Miscellaneous Symbols and Dingbats (check mark, cross mark, sparkles, warning sign)
    (0x1F300, 0x1FAFF),  # Pictographs, emoticons, transport and supplemental symbol blocks
)


def analyze_critique(critique_text):
    """
    Analyze characteristics of a critique.

    Args:
        critique_text: The critique as a string

    Returns:
        dict: Analysis metrics with the keys
            'char_count' (number of characters),
            'word_count' (number of whitespace-separated words),
            'line_count' (number of newline-separated segments; 1 for an empty string),
            'avg_word_length' (mean characters per word; 0 when there are no words),
            'emoji_count' (number of code points inside _EMOJI_RANGES; modifiers and
            joined sequences are counted per code point, not per rendered glyph)
    """
    # Split once and reuse; the word list feeds three of the metrics.
    words = critique_text.split()
    letters_in_words = sum(len(word) for word in words)

    # Bounded, inclusive ranges: the previous "> 0x1F300" test skipped
    # U+1F300 itself, counted every higher code point (e.g. rare CJK
    # ideographs) and missed common emoji below U+1F300.
    emoji_count = sum(
        1
        for char in critique_text
        if any(low <= ord(char) <= high for low, high in _EMOJI_RANGES)
    )

    return {
        'char_count': len(critique_text),
        'word_count': len(words),
        'line_count': len(critique_text.split('\n')),
        # max(..., 1) avoids dividing by zero for text without words.
        'avg_word_length': letters_in_words / max(len(words), 1),
        'emoji_count': emoji_count,
    }

# Build one metrics row per critique, tagged with its model, CV and temperature
analysis_data = []

model_runs = (
    ('Gentle', gentle_results),
    ('Medium', medium_results),
    ('Brutal', brutal_results),
)
for model_name, results in model_runs:
    for result in results:
        analysis_data.append({
            **analyze_critique(result['critique']),
            'model': model_name,
            'cv_index': result['cv_index'],
            'temperature': result['temperature'],
        })

df_analysis = pd.DataFrame(analysis_data)

# Per-model means of the size and emoji metrics, rounded to two decimals
stat_columns = ['char_count', 'word_count', 'line_count', 'emoji_count']
print(" CRITIQUE STATISTICS BY MODEL")
print("=" * 80)
print(df_analysis.groupby('model')[stat_columns].mean().round(2))

## Visualizations

In [None]:
# Side-by-side bar charts of the average word, character and emoji counts
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# The colour list is written in Gentle/Medium/Brutal order (green, gold,
# red), but groupby() returns its groups sorted alphabetically (Brutal,
# Gentle, Medium). Reindex the means so each model gets its own colour;
# a model without results becomes a NaN row instead of shifting the colours.
model_order = ['Gentle', 'Medium', 'Brutal']
model_colors = ['#90EE90', '#FFD700', '#FF6347']
model_means = (
    df_analysis.groupby('model')[['word_count', 'char_count', 'emoji_count']]
    .mean()
    .reindex(model_order)
)

# (metric column, panel title, y-axis label) for each of the three panels
panels = [
    ('word_count', 'Average Word Count by Model', 'Words'),
    ('char_count', 'Average Character Count by Model', 'Characters'),
    ('emoji_count', 'Average Emoji Usage by Model', 'Emojis'),
]

for ax, (column, title, ylabel) in zip(axes, panels):
    model_means[column].plot(kind='bar', ax=ax, color=model_colors)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel(ylabel)
    ax.set_xlabel('')
    ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.savefig('../results/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print(" Saved visualization to results/model_comparison.png")

In [None]:
# Scatter plot of critique length against sampling temperature,
# drawn as one labelled series per model
plt.figure(figsize=(10, 6))

for label in ('Gentle', 'Medium', 'Brutal'):
    rows = df_analysis.loc[df_analysis['model'] == label]
    plt.scatter(
        rows['temperature'],
        rows['word_count'],
        label=label,
        s=100,
        alpha=0.6,
    )

plt.title('Temperature vs Output Length', fontsize=14, fontweight='bold')
plt.xlabel('Temperature', fontsize=12)
plt.ylabel('Word Count', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)

plt.savefig('../results/temperature_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print(" Saved visualization to results/temperature_analysis.png")

## Automated Evaluation with LLM Judge

Use Gemini to evaluate the quality of each roast.

In [None]:
# Rubric given to the judge model: five criteria, each scored 1-10, plus a
# short justification, with the answer requested as a single JSON object.
# evaluate_critique() places this text at the start of every judge prompt.
JUDGE_PROMPT = """You are an expert evaluator of CV critique quality.

Evaluate this CV critique on the following criteria (score 1-10 for each):

1. **Specificity**: How specific and actionable is the feedback?
2. **Relevance**: How relevant are the points to actual CV improvement?
3. **Tone Appropriateness**: How well does the tone match the intended style?
4. **Completeness**: Does it cover all major aspects of the CV?
5. **Overall Usefulness**: How useful would this be to the job seeker?

Respond in JSON format:
{
  "specificity": <score>,
  "relevance": <score>,
  "tone_appropriateness": <score>,
  "completeness": <score>,
  "overall_usefulness": <score>,
  "reasoning": "<brief explanation>"
}
"""

def evaluate_critique(critique_text, model_type, cv_text):
    """
    Use LLM to evaluate critique quality.

    Builds a prompt from JUDGE_PROMPT, the model type, an excerpt of the CV
    (at most the first 500 characters) and the critique, sends it to Gemini
    and parses the span from the first '{' to the last '}' of the reply as
    JSON.

    Args:
        critique_text: The critique to be scored
        model_type: Label of the roaster that wrote it (included in the prompt)
        cv_text: Original CV text; only an excerpt is sent to the judge

    Returns:
        The parsed JSON value (a dict of scores when the judge follows the
        requested format), or None when the request fails or the reply
        contains no parseable JSON. Every failure is reported on stdout.
    """
    model = genai.GenerativeModel(
        model_name="gemini-2.0-flash",
        generation_config=genai.GenerationConfig(
            temperature=0.2,  # Low temperature for consistent evaluation
        )
    )

    # Mark the excerpt with "..." only when text was actually cut off, so the
    # judge is not told a short CV continues beyond what it sees.
    excerpt_limit = 500
    cv_excerpt = cv_text[:excerpt_limit]
    if len(cv_text) > excerpt_limit:
        cv_excerpt += "..."

    prompt = f"""{JUDGE_PROMPT}

Model Type: {model_type}

Original CV:
{cv_excerpt}

Critique to Evaluate:
{critique_text}
"""

    try:
        response = model.generate_content(prompt)
        text = response.text
    except Exception as e:
        # Deliberately broad: this is the boundary to an external service,
        # and one failed request should not abort the whole evaluation run.
        print(f"Error evaluating: {e}")
        return None

    # The reply may wrap the JSON in prose or a code fence, so take the span
    # from the first opening brace to the last closing brace.
    start = text.find('{')
    end = text.rfind('}') + 1
    if start == -1 or end <= start:
        print("Error evaluating: no JSON object found in judge response")
        return None

    try:
        return json.loads(text[start:end])
    except json.JSONDecodeError as e:
        print(f"Error evaluating: {e}")
        return None

print("Evaluating critiques with LLM judge...")
print("This may take a minute...\n")

# Collected judge verdicts, each tagged with the model and CV it belongs to
evaluations = []

judged_models = (
    ('Gentle', gentle_results),
    ('Medium', medium_results),
    ('Brutal', brutal_results),
)
for model_name, results in judged_models:
    for result in results:
        cv_id = result['cv_index']
        print(f"Evaluating {model_name} model for CV #{cv_id}...")
        eval_result = evaluate_critique(result['critique'], model_name, result['cv_text'])

        # Nothing usable came back for this critique (None or empty result)
        if not eval_result:
            continue

        eval_result.update(model=model_name, cv_index=cv_id)
        evaluations.append(eval_result)

print(f"\n Completed {len(evaluations)} evaluations")

In [None]:
# Display evaluation results
if evaluations:
    df_eval = pd.DataFrame(evaluations)

    score_cols = ['specificity', 'relevance', 'tone_appropriateness', 'completeness', 'overall_usefulness']

    # The judge's JSON is free-form model output: a criterion can be missing
    # or come back as text such as "8" or "8/10". Add absent columns and
    # coerce the scores to numbers (unparseable values become NaN) so the
    # means below are computed on numeric data and ignore unusable scores.
    for col in score_cols:
        if col not in df_eval.columns:
            df_eval[col] = float('nan')
        df_eval[col] = pd.to_numeric(df_eval[col], errors='coerce')

    print("\n EVALUATION SCORES BY MODEL")
    print("="*80)

    # Mean score per criterion for each model, rounded to two decimals
    summary = df_eval.groupby('model')[score_cols].mean().round(2)
    print(summary)

    # Calculate overall average: per-evaluation mean across the five criteria
    df_eval['average_score'] = df_eval[score_cols].mean(axis=1)

    print("\n OVERALL AVERAGE SCORES")
    print(df_eval.groupby('model')['average_score'].mean().round(2).sort_values(ascending=False))

In [None]:
# Visualization of evaluation scores
if evaluations:
    fig, ax = plt.subplots(figsize=(12, 6))

    # `summary` comes from a groupby, so its models are in alphabetical order
    # (Brutal, Gentle, Medium). Select them in Gentle/Medium/Brutal order and
    # look each colour up by name, so green/gold/red always land on the
    # intended model even when one model has no evaluations.
    model_colors = {'Gentle': '#90EE90', 'Medium': '#FFD700', 'Brutal': '#FF6347'}
    plotted_models = [name for name in model_colors if name in summary.index]

    summary.loc[plotted_models].T.plot(
        kind='bar',
        ax=ax,
        color=[model_colors[name] for name in plotted_models],
    )
    ax.set_title('Evaluation Scores by Model', fontsize=16, fontweight='bold')
    ax.set_ylabel('Score (1-10)', fontsize=12)
    ax.set_xlabel('Evaluation Criteria', fontsize=12)
    ax.legend(title='Model', loc='upper right')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3, axis='y')
    ax.set_ylim(0, 10)

    plt.tight_layout()
    plt.savefig('../results/evaluation_scores.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(" Saved visualization to results/evaluation_scores.png")

## Key Findings & Insights

In [None]:
# Text summary of the run: fixed observations plus figures computed from
# df_eval (when evaluations exist) and df_analysis
print("\n KEY FINDINGS")
print("=" * 80)

print("\n1. TEMPERATURE EFFECTS:")
for line in (
    "   • Gentle (T=0.4): Most consistent, professional feedback",
    "   • Medium (T=0.7): Good balance of directness and variety",
    "   • Brutal (T=0.9): Maximum creativity and humor",
):
    print(line)

if evaluations:
    print("\n2. QUALITY ASSESSMENT:")
    # Mean of the per-evaluation average score for each model
    mean_by_model = df_eval.groupby('model')['average_score'].mean()
    best_model = mean_by_model.idxmax()
    best_score = mean_by_model.max()
    print(f"   • Best Overall Model: {best_model} (avg score: {best_score:.2f}/10)")

    for model in ['Gentle', 'Medium', 'Brutal']:
        # Models without any evaluation are left out of the list
        if model not in df_eval['model'].values:
            continue
        model_data = df_eval[df_eval['model'] == model]
        print(f"   • {model}: {model_data['average_score'].mean():.2f}/10")

print("\n3. CHARACTERISTICS:")
for label in ('Gentle', 'Medium', 'Brutal'):
    avg_words = df_analysis[df_analysis['model'] == label]['word_count'].mean()
    print(f"   • {label}: {avg_words:.0f} avg words")

print("\n4. USE CASES:")
for line in (
    "   • Gentle: Best for sensitive job seekers, entry-level candidates",
    "   • Medium: Best for professionals seeking honest feedback",
    "   • Brutal: Best for entertainment, thick-skinned individuals",
):
    print(line)

print("\n" + "=" * 80)

## Export Summary Report

In [None]:
# Create summary report
# (report key, label used in df_analysis, temperature, loaded results)
model_specs = (
    ('gentle', 'Gentle', 0.4, gentle_results),
    ('medium', 'Medium', 0.7, medium_results),
    ('brutal', 'Brutal', 0.9, brutal_results),
)

models_section = {}
for report_key, model_label, model_temperature, model_results in model_specs:
    mean_words = df_analysis[df_analysis['model'] == model_label]['word_count'].mean()
    models_section[report_key] = {
        'temperature': model_temperature,
        'num_critiques': len(model_results),
        # The mean of an empty selection is NaN, which json.dump would write
        # as the bare token NaN (not valid JSON). Store None so the report
        # contains null for a model without critiques.
        'avg_word_count': None if pd.isna(mean_words) else float(mean_words),
    }

summary_report = {
    'timestamp': datetime.now().isoformat(),
    'models': models_section,
    'analysis': df_analysis.to_dict('records'),
}

# Judge verdicts are included only when at least one evaluation succeeded
if evaluations:
    summary_report['evaluations'] = evaluations

# Save report
with open('../results/summary_report.json', 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, indent=2)

print(" Summary report saved to results/summary_report.json")

## Conclusion

This evaluation compared three CV roasting models with different temperatures:

### Summary
-  **Gentle Roaster (T=0.4)**: Consistent, professional and encouraging feedback
-  **Medium Roaster (T=0.7)**: Direct, honest, yet balanced criticism
-  **Brutal Roaster (T=0.9)**: Creative, humorous, outright savage roasts

### Key Takeaways
1. **Temperature**: As suspected, higher temperatures produce more creative and varied outputs
2. **Prompt engineering**: Different prompts create distinguishably different tones without fine-tuning
3. **Trade-offs exist**: Between consistency and creativity, and between professionalism and entertainment
4. **Context matters**: The right model depends on the user's needs and "sensitivity"

### Future Improvements
- Test with more diverse CVs
- Add user feedback collection


---

## Next: 06_quick_cv_roaster.ipynb
### Application with Input Options: 
1. **Drag & drop** a PDF into the `uploaded_cvs/` folder
2. **Use existing CV** from the dataset