# Ground Truth Generation: ROME-Based Strategy Selection

This notebook generates ground truth data for evaluating LLMs on job offer feedback tasks.

**Key Features:**
- Uses ROME code to select weighted degradation strategies
- Applies degradation to create (JO_good, JO_bad) pairs
- Generates structured feedback with numerical scores
- Focuses on HIGH severity flaws only
- Uses up to 3 Gemini API keys in parallel for efficiency (falls back to sequential processing when only one key is configured)


In [8]:
import pandas as pd
import numpy as np
import json
import re
from pathlib import Path
import warnings
import os
import random
import time
from copy import deepcopy
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys

# Silence library warnings and widen text columns so long descriptions stay readable.
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 200)

# --- Load environment variables ---
# python-dotenv is installed on demand when it is missing.
try:
    from dotenv import load_dotenv
except ImportError:
    print("Installing python-dotenv...")
    import subprocess
    # Run pip through the interpreter that backs this kernel; a bare `pip`
    # on PATH may belong to a different environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "python-dotenv"])
    from dotenv import load_dotenv

# The .env file is looked up two directories above the notebook.
load_dotenv(dotenv_path=Path('../..') / '.env')

# --- Gemini API Setup - Multiple API keys for parallel processing ---
# These settings are defined before the try block so that they exist even
# when the Gemini client cannot be imported or no key is configured.
GEMINI_MODEL = "gemini-2.5-flash-lite-preview-09-2025"
GEMINI_REQUEST_DELAY = 5  # seconds between requests per API key

# Rate limiting tracking: key name -> timestamp of its last request,
# guarded by key_lock because worker threads share it.
key_last_used = {}
key_lock = threading.Lock()

try:
    import google.generativeai as genai

    # Multiple API keys for parallel processing
    GEMINI_API_KEYS = {
        '1': os.getenv('GEMINI_API_KEY_1'),
        '2': os.getenv('GEMINI_API_KEY_2'),
        '3': os.getenv('GEMINI_API_KEY_3')
    }

    # Drop keys that are unset OR blank (e.g. `GEMINI_API_KEY_1=` in .env);
    # a blank key would otherwise be scheduled and fail on every request.
    available_keys = {k: v for k, v in GEMINI_API_KEYS.items() if v and v.strip()}

    if not available_keys:
        single_key = os.getenv('GEMINI_API_KEY')
        if single_key and single_key.strip():
            available_keys = {'SINGLE': single_key}
            print("‚ö†Ô∏è Only single API key found, falling back to sequential processing")
        else:
            raise ValueError("No Gemini API keys found in environment variables.")

    print(f"‚úÖ Loaded {len(available_keys)} Gemini API keys: {list(available_keys.keys())}")

except (ImportError, ValueError) as e:
    # Leave the notebook importable: downstream functions check `genai`
    # and return early when it is None.
    print(f"‚ö†Ô∏è Gemini API not configured: {e}")
    available_keys = {}
    genai = None

# --- Configuration ---
DEBUG = True
DEBUG_LIMIT = 6  # cap on the number of tasks prepared when DEBUG is on
RANDOM_SEED = 42  # seed passed to the offer sampling step
MIN_SEVERITY = 'High'  # severity label reported in the run banner
FILTER_BY_AVAILABLE_ROME = True  # keep only offers whose ROME code exists in the strategy lookup (also in DEBUG mode)

# --- Paths ---
PROJECT_ROOT = Path('../..')
DATA_PATH = PROJECT_ROOT.joinpath('data')
OUTPUT_PATH = PROJECT_ROOT.joinpath('analysis_outputs', 'ground_truth')
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Make the ROME lookup helpers importable by a later cell.
ROME_LOOKUP_PATH = PROJECT_ROOT.joinpath('degradation_strategies', 'rome_lookup')
sys.path.insert(0, str(ROME_LOOKUP_PATH))

# --- Load Source Data ---
high_var_path = DATA_PATH.joinpath('high_var_analysis.csv')
if high_var_path.exists():
    df_high_var = pd.read_csv(high_var_path)
else:
    raise FileNotFoundError(f"Source data file not found at {high_var_path}")

# Keep high-candidature offers that have a description longer than 500
# characters and a ROME code.
_is_high_candidature = df_high_var['candidatures_level'] == 'high'
_has_description = df_high_var['dc_descriptifposte'].notna()
_has_long_description = df_high_var['dc_descriptifposte'].str.len() > 500
_has_rome_code = df_high_var['dc_rome'].notna()
high_quality_pool = df_high_var[
    _is_high_candidature & _has_description & _has_long_description & _has_rome_code
].copy()

print(f"\n‚úÖ Loaded {len(df_high_var)} offers from source data.")
print(f"‚úÖ Found {len(high_quality_pool)} high-quality offers with ROME codes.")
if DEBUG:
    print(f"DEBUG mode is ON. Will process {DEBUG_LIMIT} examples.")


‚úÖ Loaded 3 Gemini API keys: ['1', '2', '3']

‚úÖ Loaded 30161 offers from source data.
‚úÖ Found 6546 high-quality offers with ROME codes.
DEBUG mode is ON. Will process 6 examples.


## 2. Load ROME Strategy Lookup

Load the ROME-code-organized strategy lookup for weighted random selection.


In [9]:
# --- Load ROME Strategy Lookup ---
# select_strategy lives in ROME_LOOKUP_PATH, which the setup cell put on sys.path.
from select_strategy import select_strategy_for_rome, list_available_rome_codes

available_rome_codes = list_available_rome_codes()
print(f"‚úÖ Loaded ROME strategy lookup with {len(available_rome_codes)} ROME codes")
print(f"   Sample ROME codes: {available_rome_codes[:5]}...")

# Filter high_quality_pool to only include offers with ROME codes in lookup if enabled
if FILTER_BY_AVAILABLE_ROME:
    initial_count = len(high_quality_pool)
    high_quality_pool = high_quality_pool[
        high_quality_pool['dc_rome'].isin(available_rome_codes)
    ].copy()
    filtered_count = len(high_quality_pool)
    print(f"\nüîç Filtered to {filtered_count} offers with ROME codes in lookup (from {initial_count} total)")
# Report the flag outside the branch so that the OFF state is visible too
# (inside the branch this line could only ever print ON).
print(f"   FILTER_BY_AVAILABLE_ROME is {'ON' if FILTER_BY_AVAILABLE_ROME else 'OFF'}")

# Sample offers for processing. DEBUG draws three times DEBUG_LIMIT offers,
# which leaves spare rows when the generation loop skips some of them.
sample_size = DEBUG_LIMIT * 3 if DEBUG else 100

source_offers = high_quality_pool.sample(
    min(sample_size, len(high_quality_pool)),
    random_state=RANDOM_SEED
).copy()

print(f"\n‚úÖ Selected {len(source_offers)} offers for processing.")


‚úÖ Loaded ROME strategy lookup with 50 ROME codes
   Sample ROME codes: ['A1101', 'A1203', 'A1208', 'A1401', 'A1419']...

üîç Filtered to 1305 offers with ROME codes in lookup (from 6546 total)
   FILTER_BY_AVAILABLE_ROME is ON

‚úÖ Selected 18 offers for processing.


## 3. Core Functions

Functions for degradation application and ground truth generation with feedback types.


In [10]:
# --- Feedback Type Definitions ---
# Accepted values for the "feedback_type" field of a feedback item:
#   ADD      - add missing information
#   REWRITE  - rewrite unclear/vague text
#   REPLACE  - replace with a better value
#   CLARIFY  - clarify ambiguous information
#   REMOVE   - remove misleading/redundant information
#   ENHANCE  - enhance existing content
FEEDBACK_TYPES = ["ADD", "REWRITE", "REPLACE", "CLARIFY", "REMOVE", "ENHANCE"]

def apply_degradation_strategy(jo_good_dict, strategy_text, api_key, key_name, max_retries=3):
    """
    Applies a degradation strategy to create JO_bad from JO_good.
    Uses Gemini to determine the specific field modifications.

    Args:
        jo_good_dict: The original job offer as a dict; it is not mutated.
        strategy_text: Improvement strategy that the model is asked to invert.
        api_key: Gemini API key used for the request.
        key_name: Name of that key; used as the rate-limiting bucket.
        max_retries: Maximum number of request attempts.

    Returns: (jo_bad_dict, fields_modified_list) or (None, None) when Gemini
    is not configured or every attempt failed.
    """
    if not genai:
        return None, None

    # default=str keeps prompt construction from raising on values the json
    # module cannot encode natively; encodable values render unchanged.
    offer_json = json.dumps(jo_good_dict, indent=2, ensure_ascii=False, default=str)

    prompt = f"""You are tasked with degrading a job offer by applying a specific strategy.

## Strategy to Apply:
**Strategy**: {strategy_text}

This strategy describes what SHOULD be done to improve an offer. To DEGRADE the offer, do the OPPOSITE:
- If it says "ADD" something, REMOVE or OMIT it
- If it says "REWRITE" to clarify, make it VAGUE or AMBIGUOUS
- If it says "REPLACE" with specific info, replace with generic/vague info
- If it suggests making something more attractive, make it LESS attractive

## Current Job Offer (Good Version):
```json
{offer_json}
```

## Task:
Determine which specific fields need to be modified and what values they should have in the degraded version.
Make realistic but negative changes that reduce attractiveness.

## Output Format:
Return a JSON object:
{{
  "modifications": {{
    "field_name_1": "new_value_1 or null",
    "field_name_2": "new_value_2 or null"
  }},
  "fields_modified": ["field_name_1", "field_name_2"]
}}

CRITICAL: Return ONLY the JSON object. No other text.
"""

    last_error = None
    for attempt in range(max_retries):
        # Rate limiting: reserve the next free slot for this key while holding
        # the lock, then sleep OUTSIDE the lock. Sleeping inside it would stall
        # threads working with other keys, and stamping the slot up front stops
        # two threads from using the same key at the same moment. Running this
        # on every attempt makes retries honour the per-key delay as well, so
        # no extra sleep is needed after a call.
        with key_lock:
            now = time.time()
            slot = max(now, key_last_used.get(key_name, 0.0) + GEMINI_REQUEST_DELAY)
            key_last_used[key_name] = slot
        if slot > now:
            time.sleep(slot - now)

        # NOTE(review): genai.configure() appears to set a process-wide default
        # key, so another worker thread could reconfigure it before the request
        # below is sent - confirm against the SDK documentation.
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(GEMINI_MODEL)

        try:
            response = model.generate_content(prompt)

            # Measure the delay for the next request from the end of this one.
            with key_lock:
                key_last_used[key_name] = max(key_last_used.get(key_name, 0.0), time.time())

            match = re.search(r'\{.*\}', response.text, re.DOTALL)
            if match is None:
                raise ValueError("no JSON object found in the model response")
            result = json.loads(match.group(0))

            modifications = result.get('modifications')
            if not isinstance(modifications, dict) or not modifications:
                # Without modifications the "bad" offer would equal the good one.
                raise ValueError("model response contains no field modifications")

            # Apply modifications on a copy so the good offer stays intact
            jo_bad = deepcopy(jo_good_dict)
            jo_bad.update(modifications)

            fields_modified = result.get('fields_modified')
            if not isinstance(fields_modified, list) or not fields_modified:
                fields_modified = list(modifications.keys())
            # Callers join these names into text, so make sure they are strings.
            fields_modified = [str(field) for field in fields_modified]

            return jo_bad, fields_modified
        except Exception as e:
            # Keep the reason so that a final failure is not silent.
            last_error = e

    if last_error is not None:
        print(f"   Degradation failed after {max_retries} attempts (key {key_name}): {last_error}")
    return None, None


def generate_ground_truth_feedback(jo_bad_dict, pillar, strategy_text, fields_modified, api_key, key_name, max_retries=3):
    """
    Generates structured feedback for a degraded job offer.

    Args:
        jo_bad_dict: The degraded job offer as a dict.
        pillar: Name of the pillar the feedback must focus on.
        strategy_text: Strategy that was used to degrade the offer.
        fields_modified: Names of the fields changed by the degradation.
        api_key: Gemini API key used for the request.
        key_name: Name of that key; used as the rate-limiting bucket.
        max_retries: Maximum number of request attempts.

    Returns:
        The parsed model response, a dict whose 'actionable_feedback' list
        holds at least one item; the first item has severity forced to
        'High' and a feedback_type taken from FEEDBACK_TYPES. On failure a
        dict with an 'error' key is returned instead.
    """
    if not genai:
        return {"error": "Gemini model not configured."}

    # Tolerate a missing list or non-string entries when building the prompt.
    modified_fields_text = ', '.join(str(field) for field in (fields_modified or []))
    # default=str keeps prompt construction from raising on values the json
    # module cannot encode natively; encodable values render unchanged.
    offer_json = json.dumps(jo_bad_dict, indent=2, ensure_ascii=False, default=str)

    prompt = f"""You are a Strategic Talent Attraction Consultant. Evaluate this job offer and identify the single most critical HIGH-severity flaw within the "{pillar}" pillar.

## Context:
- This offer was degraded by: {strategy_text}
- Modified fields: {modified_fields_text}
- Focus ONLY on the "{pillar}" pillar

## Job Offer:
```json
{offer_json}
```

## Required Output:
Return JSON with this structure:
{{
  "actionable_feedback": [{{
    "feedback_text": "Clear explanation of the flaw and how to fix it",
    "json_path": ["$.field_name1", "$.field_name2"],
    "feedback_type": "ADD|REWRITE|REPLACE|CLARIFY|REMOVE|ENHANCE",
    "severity": "High",
    "reasoning": "Why this flaw significantly impacts attractiveness"
  }}]
}}

CRITICAL: 
- Return exactly ONE feedback item
- Severity MUST be "High"
- json_path must point to the modified fields
- feedback_type must be one of: ADD, REWRITE, REPLACE, CLARIFY, REMOVE, ENHANCE
- Return ONLY JSON, no other text.
"""

    last_error = None
    for attempt in range(max_retries):
        # Rate limiting: reserve the next free slot for this key while holding
        # the lock, then sleep OUTSIDE the lock. Sleeping inside it would stall
        # threads working with other keys, and stamping the slot up front stops
        # two threads from using the same key at the same moment. Running this
        # on every attempt makes retries honour the per-key delay as well, so
        # no extra sleep is needed after a call.
        with key_lock:
            now = time.time()
            slot = max(now, key_last_used.get(key_name, 0.0) + GEMINI_REQUEST_DELAY)
            key_last_used[key_name] = slot
        if slot > now:
            time.sleep(slot - now)

        # NOTE(review): genai.configure() appears to set a process-wide default
        # key, so another worker thread could reconfigure it before the request
        # below is sent - confirm against the SDK documentation.
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(GEMINI_MODEL)

        try:
            response = model.generate_content(prompt)

            # Measure the delay for the next request from the end of this one.
            with key_lock:
                key_last_used[key_name] = max(key_last_used.get(key_name, 0.0), time.time())

            match = re.search(r'\{.*\}', response.text, re.DOTALL)
            if match is None:
                raise ValueError("no JSON object found in the model response")
            result = json.loads(match.group(0))

            # Validate feedback: a response without a usable item counts as a
            # failed attempt instead of being returned as a success.
            items = result.get('actionable_feedback')
            if not isinstance(items, list) or not items or not isinstance(items[0], dict):
                raise ValueError("model response contains no usable 'actionable_feedback' item")

            feedback = items[0]
            # Ensure severity is High
            feedback['severity'] = 'High'
            # Validate feedback_type
            if feedback.get('feedback_type') not in FEEDBACK_TYPES:
                feedback['feedback_type'] = 'REWRITE'  # Default

            return result
        except Exception as e:
            # Keep the reason so that it can be reported with the failure.
            last_error = e

    failure = {"error": f"Failed after {max_retries} attempts."}
    if last_error is not None:
        failure["detail"] = str(last_error)
    return failure


def calculate_pillar_scores(jo_dict, pillar_name):
    """
    Score a job offer on each of the six pillars (higher = more attractive).

    Every pillar starts at 50.0. Simple content checks then add bonuses:
    a minimum salary (+10), a CDI contract (+5) and a salary comment (+5)
    raise "Compensation and Benefits"; a description longer than 500
    characters (+10) raises "Clarity and Realism".

    `pillar_name` is accepted but does not influence the result.
    Returns a dict mapping pillar name to score.
    """
    # Placeholder heuristic - a real scoring model can replace it later.
    pillar_names = (
        "Compensation and Benefits",
        "Career Development and Advancement",
        "Work Environment and Culture",
        "Work-Life Balance and Flexibility",
        "Nature of Work and Impact",
        "Clarity and Realism",
    )
    scores = dict.fromkeys(pillar_names, 50.0)

    # A NaN value is truthy, hence the additional pd.notna() check.
    min_salary = jo_dict.get('dn_salaireminimum')
    if min_salary and pd.notna(min_salary):
        scores["Compensation and Benefits"] += 10

    description = jo_dict.get('dc_descriptifposte')
    if description and len(str(description)) > 500:
        scores["Clarity and Realism"] += 10

    contract_type = jo_dict.get('dc_typecontrat')
    if contract_type and str(contract_type).upper() == 'CDI':
        scores["Compensation and Benefits"] += 5  # CDI is more attractive

    salary_comment = jo_dict.get('dc_commentairesalaire')
    if salary_comment and pd.notna(salary_comment):
        scores["Compensation and Benefits"] += 5

    return scores

# Visible confirmation that this cell ran and the functions above are defined.
print("‚úÖ Core functions defined")


‚úÖ Core functions defined


## 4. Main Generation Loop

Generate (JO_good, JO_bad) pairs using ROME-based strategy selection with parallel processing.


In [11]:
# --- Main Generation Loop ---
processed_offers = set()  # offer ids that already have a task
challenge_dataset = []

print("\n" + "="*80)
print("Starting Ground Truth Generation")
print("="*80)
print(f"Mode: {'DEBUG' if DEBUG else 'FULL'}")
print(f"API Keys: {len(available_keys)}")
print(f"Min Severity: {MIN_SEVERITY}")
print("\n" + "="*80)

# Prepare tasks: (jo_good, strategy_info, api_key, key_name, rome_code, offer_id)
tasks = []
key_names = list(available_keys.keys())

if not key_names:
    # Without this check the round-robin below fails with ZeroDivisionError.
    raise RuntimeError("No Gemini API key is available; configure one before generating ground truth.")

# Seed the global RNG so that repeated runs can pick the same strategies.
# Presumably select_strategy_for_rome draws from the `random` module -
# TODO confirm; the call is harmless if it does not.
random.seed(RANDOM_SEED)

for _, offer_row in source_offers.iterrows():
    if DEBUG and len(tasks) >= DEBUG_LIMIT:
        break

    offer_id = offer_row['kc_offre']
    rome_code = offer_row.get('dc_rome', 'UNKNOWN')

    # Skip offers that already have a task
    if offer_id in processed_offers:
        continue

    # Skip if ROME code is invalid
    if pd.isna(rome_code) or rome_code == 'UNKNOWN':
        continue

    # Select strategy for this ROME code
    strategy_info = select_strategy_for_rome(rome_code)

    if not strategy_info:
        continue

    # Convert offer row to dict, mapping missing values to None
    jo_good = {k: (None if pd.isna(v) else v) for k, v in offer_row.to_dict().items()}

    # Assign API key (round-robin over accepted tasks, so that skipped rows
    # do not skew the distribution between keys)
    key_name = key_names[len(tasks) % len(key_names)]
    api_key = available_keys[key_name]

    tasks.append((jo_good, strategy_info, api_key, key_name, rome_code, offer_id))
    # Record the id; otherwise the duplicate check above can never trigger.
    processed_offers.add(offer_id)

print(f"\n‚úÖ Prepared {len(tasks)} tasks for processing")
print(f"   Using {len(available_keys)} API keys in parallel")

# Process tasks in parallel
def process_offer_task(task):
    """
    Run the full pipeline for one prepared task.

    `task` is the tuple (jo_good, strategy_info, api_key, key_name,
    rome_code, offer_id). The offer is degraded, feedback is generated for
    the degraded version, and both versions are scored.

    Returns a result dict, or None when the degradation or the feedback step
    fails or an exception is raised inside the pipeline.
    """
    jo_good, strategy_info, api_key, key_name, rome_code, offer_id = task

    # Looked up before the try block: a missing key raises KeyError to the caller.
    pillar = strategy_info['pillar']
    strategy_text = strategy_info['strategy']
    subcategory = strategy_info['subcategory']  # not used below

    try:
        # Step 1: create the degraded offer
        jo_bad, fields_modified = apply_degradation_strategy(
            jo_good, strategy_text, api_key, key_name
        )
        if not jo_bad:
            return None

        # Step 2: generate the feedback with the key that follows the
        # degradation key (wrapping around at the end of the key list)
        ordered_key_names = list(available_keys.keys())
        next_key_idx = (ordered_key_names.index(key_name) + 1) % len(ordered_key_names)
        feedback_key_name = ordered_key_names[next_key_idx]
        feedback_api_key = available_keys[feedback_key_name]

        ground_truth = generate_ground_truth_feedback(
            jo_bad, pillar, strategy_text, fields_modified,
            feedback_api_key, feedback_key_name
        )
        if 'error' in ground_truth:
            return None

        # Step 3: score both versions of the offer
        scores_good = calculate_pillar_scores(jo_good, pillar)
        scores_bad = calculate_pillar_scores(jo_bad, pillar)

        # Step 4: the degraded pillar must score lower; when the heuristic
        # did not lower it, set it to 15 points below the good score.
        if scores_bad[pillar] - scores_good[pillar] >= 0:
            scores_bad[pillar] = scores_good[pillar] - 15.0

        return {
            'jo_good': jo_good,
            'jo_bad': jo_bad,
            'strategy_info': strategy_info,
            'fields_modified': fields_modified,
            'ground_truth': ground_truth,
            'scores_good': scores_good,
            'scores_bad': scores_bad,
            'pillar': pillar,
            'rome_code': rome_code,
            'offer_id': offer_id
        }
    except Exception as e:
        print(f"   ‚ùå Error processing {offer_id}: {e}")
        return None

# Run the tasks: one after another when at most one key is available,
# otherwise on a thread pool with one worker per API key.
results = []
if len(available_keys) <= 1:
    print(f"\n‚ö†Ô∏è Processing {len(tasks)} offers sequentially...")
    for task in tasks:
        result = process_offer_task(task)
        if not result:
            continue
        results.append(result)
        print(f"   ‚úÖ Processed {result['offer_id']} (ROME: {result['rome_code']}, Pillar: {result['pillar']})")
else:
    print(f"\nüöÄ Processing {len(tasks)} offers in parallel using {len(available_keys)} API keys...")
    with ThreadPoolExecutor(max_workers=len(available_keys)) as executor:
        futures = {executor.submit(process_offer_task, task): task for task in tasks}
        # Results are collected in completion order, not submission order.
        for future in as_completed(futures):
            result = future.result()
            if not result:
                continue
            results.append(result)
            print(f"   ‚úÖ Processed {result['offer_id']} (ROME: {result['rome_code']}, Pillar: {result['pillar']})")

print(f"\n‚úÖ Successfully processed {len(results)} offers")



Starting Ground Truth Generation
Mode: DEBUG
API Keys: 3
Min Severity: High


‚úÖ Prepared 6 tasks for processing
   Using 3 API keys in parallel

üöÄ Processing 6 offers in parallel using 3 API keys...
   ‚úÖ Processed 193YBZR (ROME: D1106, Pillar: Clarity and Realism)
   ‚úÖ Processed 193RJPN (ROME: D1208, Pillar: Compensation and Benefits)
   ‚úÖ Processed 192YCYJ (ROME: A1208, Pillar: Clarity and Realism)
   ‚úÖ Processed 193NJRP (ROME: F1703, Pillar: Work Environment and Culture)
   ‚úÖ Processed 194HZNB (ROME: D1102, Pillar: Nature of Work and Impact)
   ‚úÖ Processed 195QMQK (ROME: D1507, Pillar: Compensation and Benefits)

‚úÖ Successfully processed 6 offers


## 5. Format and Save Dataset

Format results into structured dataset for LLM training/evaluation.


In [12]:
# --- Format Final Dataset ---
final_dataset = []

for result in results:
    gt_payload = result['ground_truth']

    # Results without any feedback item are left out of the dataset.
    if 'actionable_feedback' not in gt_payload or len(gt_payload['actionable_feedback']) == 0:
        continue
    feedback = gt_payload['actionable_feedback'][0]

    # Only HIGH severity feedback is kept.
    if feedback.get('severity', '').lower() != 'high':
        continue

    strategy = result['strategy_info']
    scores_good = result['scores_good']
    scores_bad = result['scores_bad']

    record = {
        # Metadata
        'offer_id': result['offer_id'],
        'rome_code': result['rome_code'],
        'pillar_degraded': result['pillar'],

        # Job offers: original and degraded version
        'JO_good': result['jo_good'],
        'JO_bad': result['jo_bad'],

        # Strategy that produced the degraded version
        'degradation_strategy': {
            'strategy_text': strategy['strategy'],
            'pillar': strategy['pillar'],
            'subcategory': strategy['subcategory'],
            'weight': strategy['weight']
        },

        'fields_modified': result['fields_modified'],

        # Ground truth feedback, with defaults for missing fields
        'ground_truth': {
            'feedback_text': feedback.get('feedback_text', ''),
            'json_path': feedback.get('json_path', []),
            'feedback_type': feedback.get('feedback_type', 'REWRITE'),
            'severity': 'High',
            'reasoning': feedback.get('reasoning', '')
        },

        # Numerical scores; 'difference' is bad minus good for each pillar
        'scores': {
            'JO_good': scores_good,
            'JO_bad': scores_bad,
            'difference': {
                pillar: scores_bad[pillar] - scores_good[pillar]
                for pillar in scores_good.keys()
            },
            'expected_change': result['pillar']  # Which pillar should have changed
        }
    }

    final_dataset.append(record)

print(f"\n‚úÖ Formatted {len(final_dataset)} records for final dataset")
print(f"   All records have HIGH severity flaws")

# Save dataset
output_file = OUTPUT_PATH / 'ground_truth_dataset.json'

# Serialize completely before opening the file: open(..., 'w') truncates it
# immediately, so a serialization error raised while streaming with
# json.dump would destroy a previously saved dataset.
serialized_dataset = json.dumps(final_dataset, ensure_ascii=False, indent=2)
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(serialized_dataset)

print(f"\n‚úÖ Saved dataset to: {output_file}")

# Display sample record
# Prints a readable summary of the first record; nothing is shown when the
# dataset is empty.
if final_dataset:
    print("\n" + "="*80)
    print("SAMPLE RECORD")
    print("="*80)
    
    sample = final_dataset[0]
    print(f"\nüìã Offer ID: {sample['offer_id']}")
    print(f"üìä ROME Code: {sample['rome_code']}")
    print(f"üéØ Pillar Degraded: {sample['pillar_degraded']}")
    print(f"\nüìù Ground Truth Feedback:")
    gt = sample['ground_truth']
    print(f"   Type: {gt['feedback_type']}")
    print(f"   Severity: {gt['severity']}")
    # Long texts are cut to their first 100 / 80 characters for display.
    print(f"   Text: {gt['feedback_text'][:100]}...")
    # NOTE(review): join() requires json_path to be a list of strings; the
    # value comes from the model response and is not validated - confirm.
    print(f"   JSON Path: {', '.join(gt['json_path'])}")
    print(f"   Reasoning: {gt['reasoning'][:80]}...")
    print(f"\nüìä Scores:")
    # Only the scores of the degraded pillar are shown.
    print(f"   {sample['pillar_degraded']}: Good={sample['scores']['JO_good'][sample['pillar_degraded']]:.1f}, Bad={sample['scores']['JO_bad'][sample['pillar_degraded']]:.1f}")
    print(f"   Difference: {sample['scores']['difference'][sample['pillar_degraded']]:.1f}")
    print(f"\nüìã Strategy:")
    print(f"   Subcategory: {sample['degradation_strategy']['subcategory']}")
    print(f"   Weight: {sample['degradation_strategy']['weight']} pair citations")
    print(f"   Fields Modified: {', '.join(sample['fields_modified'])}")
    print("\n" + "="*80)



‚úÖ Formatted 6 records for final dataset
   All records have HIGH severity flaws

‚úÖ Saved dataset to: ../../analysis_outputs/ground_truth/ground_truth_dataset.json

SAMPLE RECORD

üìã Offer ID: 193YBZR
üìä ROME Code: D1106
üéØ Pillar Degraded: Clarity and Realism

üìù Ground Truth Feedback:
   Type: REWRITE
   Severity: High
   Text: The job description states the contract is a 'Remplacement √† dur√©e ind√©termin√©e, potentiellement jus...
   JSON Path: $.dc_descriptifposte, $.dc_typecontrat, $.dn_dureecontrat
   Reasoning: Conflicting information regarding the core nature of employment (Permanent vs. C...

üìä Scores:
   Clarity and Realism: Good=60.0, Bad=50.0
   Difference: -10.0

üìã Strategy:
   Subcategory: Uncategorized
   Weight: 11 pair citations
   Fields Modified: dc_descriptifposte, dc_lbletatoffre, dc_motifetat, dc_lblmotifetat, dn_dureecontrat, dc_lblnaturecontrat, dc_lblexperienceprof, dc_commentaireexperienceprof, dn_salaireminimum, dn_salairemaximum, dc_comme