# Phase 05g: Quick Validation with 2,000 Ground Truth Samples

## Purpose
Validate whether increasing the sample size from 500 to 2,000 improves model performance:
- **Current (500 samples)**: ElasticNet R² = 0.127, Random Forest R² = 0.103
- **Expected (2000 samples)**: ElasticNet R² = 0.15-0.18, Random Forest R² = 0.18-0.22

## Key Questions
1.  Does more data significantly improve R²?
2.  Does Random Forest outperform ElasticNet with more data?
3.  Is the overfitting gap reduced?
4.  Should we continue to 10k or full 34k samples?

## Data Source
- **OCEAN Ground Truth**: Generated from 05d using Gemma-2-9B
- **Samples**: 2,000 samples with high-quality OCEAN labels
- **Embeddings**: BGE-Large (1024 dimensions) to be extracted
- **Dimension ratio**: ≈2:1 samples to features (2,000 samples vs 1,024 dimensions; much better than ≈1:2 with 500 samples)

## Expected Training Time
- BGE embedding extraction: ~15-20 minutes (HF API)
- ElasticNet training: ~30 seconds
- Random Forest training: ~5 minutes
- Gradient Boosting training: ~4 minutes
- **Total**: ~25-30 minutes

## Prerequisites
You must run 05d_generate_2k_ocean_ground_truth.ipynb first to generate:
- ocean_targets_2000.csv
- samples_2000_with_desc.csv
- samples_2000_metadata.csv

## Setup

In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import time

# HuggingFace API
from huggingface_hub import InferenceClient
from dotenv import load_dotenv
import os

# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load environment variables and read the HuggingFace token from them
load_dotenv()
HF_TOKEN = os.getenv('HF_TOKEN')

# Fail fast: every embedding request below needs an authenticated client
if not HF_TOKEN:
    raise ValueError("HF_TOKEN not found in .env file!")

# Initialize HF client
client = InferenceClient(token=HF_TOKEN)

# Set random seed (also passed explicitly to every sklearn estimator/splitter below)
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print(" Setup complete")
# Never echo any part of the secret: cell output is stored inside the notebook
# file, so a printed prefix/suffix would be leaked whenever the notebook is
# shared or committed. Reporting the length is enough to confirm it loaded.
print(f"   HF API Token: loaded ({len(HF_TOKEN)} characters)")
print(f"   Random seed: {RANDOM_STATE}")

## Configuration

In [None]:
# ---------------------------------------------------------------------------
# Inputs: ground-truth artifacts written by notebook 05d
# ---------------------------------------------------------------------------
OCEAN_GROUND_TRUTH_FILE = "../ocean_targets_2000.csv"
SAMPLES_WITH_DESC_FILE = "../samples_2000_with_desc.csv"
SAMPLES_METADATA_FILE = "../samples_2000_metadata.csv"

# ---------------------------------------------------------------------------
# Embedding model and the vector size expected from it
# ---------------------------------------------------------------------------
BGE_MODEL = "BAAI/bge-large-en-v1.5"
EMBEDDING_DIM = 1024

# ---------------------------------------------------------------------------
# Experiment size and hold-out fraction
# ---------------------------------------------------------------------------
SAMPLE_SIZE = 2000
TEST_SIZE = 0.2

# Target columns, one regression model is trained per trait
OCEAN_DIMS = [
    "openness",
    "conscientiousness",
    "extraversion",
    "agreeableness",
    "neuroticism",
]

# ---------------------------------------------------------------------------
# Outputs written by this notebook
# ---------------------------------------------------------------------------
EMBEDDING_FILE = "../bge_embeddings_2k_ground_truth.npy"
REPORT_FILE = "../05g_validation_2k_report.json"
COMPARISON_CSV = "../05g_500_vs_2000_comparison.csv"
PLOT_FILE = "../05g_500_vs_2000_comparison.png"

# Echo the effective configuration so it is recorded in the cell output
print(
    "\n".join(
        [
            " Configuration:",
            f"   OCEAN ground truth: {OCEAN_GROUND_TRUTH_FILE}",
            f"   Samples file: {SAMPLES_WITH_DESC_FILE}",
            f"   Sample size: {SAMPLE_SIZE:,}",
            f"   Train/Test split: {int((1-TEST_SIZE)*100)}/{int(TEST_SIZE*100)}",
            f"   BGE model: {BGE_MODEL}",
            f"   Embedding dim: {EMBEDDING_DIM}",
            f"   Dimension ratio: {SAMPLE_SIZE}:{EMBEDDING_DIM} ≈ {SAMPLE_SIZE/EMBEDDING_DIM:.1f}:1",
        ]
    )
)

## Step 1: Load and Sample Data

In [None]:
print(" Loading OCEAN ground truth from 05d...")

# --- OCEAN targets ----------------------------------------------------------
# Stop with an actionable message if the upstream notebook has not been run.
if not Path(OCEAN_GROUND_TRUTH_FILE).exists():
    raise FileNotFoundError(
        f"OCEAN ground truth file not found: {OCEAN_GROUND_TRUTH_FILE}\n"
        f"Please run 05d_generate_2k_ocean_ground_truth.ipynb first!"
    )

df_ocean = pd.read_csv(OCEAN_GROUND_TRUTH_FILE)
print(f"   Loaded OCEAN ground truth: {len(df_ocean):,} samples")

# --- Text descriptions ------------------------------------------------------
if not Path(SAMPLES_WITH_DESC_FILE).exists():
    raise FileNotFoundError(
        f"Samples file not found: {SAMPLES_WITH_DESC_FILE}\n"
        f"Please run 05d_generate_2k_ocean_ground_truth.ipynb first!"
    )

df_samples = pd.read_csv(SAMPLES_WITH_DESC_FILE)
print(f"   Loaded sample descriptions: {len(df_samples):,} samples")

# --- Sanity checks ----------------------------------------------------------
# Both files must have the same number of rows, and that number must match the
# configured experiment size. (Only the counts are compared here.)
assert len(df_ocean) == len(df_samples), "Mismatch between OCEAN targets and samples!"
assert len(df_ocean) == SAMPLE_SIZE, f"Expected {SAMPLE_SIZE} samples, got {len(df_ocean)}"

print("\n OCEAN statistics (ground truth):")
print(df_ocean[OCEAN_DIMS].describe())

# --- Description length overview --------------------------------------------
# Character count of each description, stored as an extra column
df_samples['desc_length'] = df_samples['desc'].str.len()
print(
    "\n".join(
        [
            "\n Description length statistics:",
            f"   Min: {df_samples['desc_length'].min()}",
            f"   Mean: {df_samples['desc_length'].mean():.1f}",
            f"   Max: {df_samples['desc_length'].max()}",
        ]
    )
)

print("\n Data loading complete!")

## Step 2: Extract BGE Embeddings

**Note**: This will take ~15-20 minutes using HuggingFace API.
- Processing ~2,000 descriptions
- Using BAAI/bge-large-en-v1.5 (1024 dimensions)
- Progress bar will show estimated time remaining

In [None]:
def extract_bge_embedding(text, max_retries=3):
    """
    Extract a BGE embedding for a single text using the HuggingFace API.

    API/conversion errors are retried up to ``max_retries`` times with
    exponential backoff (1s, 2s, 4s, ...) between attempts.

    Args:
        text: The description to embed. Anything that is not a non-blank
            string (None, a pandas NaN, "", whitespace) is rejected up front.
        max_retries: Maximum number of API attempts.

    Returns:
        A 1-D numpy array of length EMBEDDING_DIM, or None when the input is
        unusable, every attempt failed, or the response has an unexpected
        shape.
    """
    # A missing/blank description can never succeed, so do not spend
    # max_retries API calls (and the backoff sleeps) on it.
    if not isinstance(text, str) or not text.strip():
        print("     Skipping empty or non-text input")
        return None

    for attempt in range(max_retries):
        try:
            result = client.feature_extraction(
                text=text,
                model=BGE_MODEL
            )
            embedding = np.asarray(result)
        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
                continue
            print(f"    Failed after {max_retries} attempts: {e}")
            return None

        # Tolerate a response wrapped in a leading batch axis of size 1
        if embedding.ndim == 3 and embedding.shape[0] == 1:
            embedding = embedding[0]

        # Token-level output (tokens x dim) is collapsed by mean pooling.
        # NOTE(review): the BGE model card describes CLS pooling with
        # normalisation; confirm mean pooling is intended if this path is hit.
        if embedding.ndim == 2:
            embedding = embedding.mean(axis=0)

        # A wrong-sized response is not retried; the caller decides what to do
        if embedding.shape != (EMBEDDING_DIM,):
            print(f"     Unexpected embedding shape: {embedding.shape}, expected ({EMBEDDING_DIM},)")
            return None

        return embedding

    # Only reached when max_retries <= 0 (no attempt was made)
    return None

# Descriptions come from df_samples (the frame loaded in Step 1)
n_samples = len(df_samples)

# np.save() appends ".npy" to any path that does not already end with it, so
# the checkpoint name must carry the suffix itself; otherwise the cleanup at
# the end of this cell would look for a file name that was never written.
TEMP_EMBEDDING_FILE = EMBEDDING_FILE + '.temp.npy'

print(f" Extracting BGE embeddings for {n_samples:,} samples...")
print(f"   Model: {BGE_MODEL}")
print("   Expected time: ~15-20 minutes")
print("   Progress will be saved every 100 samples\n")

embeddings = []
failed_indices = []
start_time = time.time()

for idx, desc in enumerate(tqdm(df_samples['desc'], desc="Extracting embeddings")):
    embedding = extract_bge_embedding(desc)

    if embedding is not None:
        embeddings.append(embedding)
    else:
        # Zero-vector placeholder keeps row i of the embeddings aligned with
        # row i of the targets. NOTE(review): these all-zero rows are still
        # used for training downstream; consider dropping failed_indices.
        embeddings.append(np.zeros(EMBEDDING_DIM))
        failed_indices.append(idx)

    # Save intermediate results every 100 samples
    if (idx + 1) % 100 == 0:
        np.save(TEMP_EMBEDDING_FILE, np.array(embeddings))
        elapsed = time.time() - start_time
        rate = (idx + 1) / elapsed
        remaining = (n_samples - idx - 1) / rate
        print(f"   Progress: {idx+1}/{n_samples} ({(idx+1)/n_samples*100:.1f}%) | "
              f"Rate: {rate:.1f} samples/sec | ETA: {remaining/60:.1f} min")

# Convert to numpy array
embeddings = np.array(embeddings)

elapsed_time = time.time() - start_time
print("\n Embedding extraction complete!")
print(f"   Shape: {embeddings.shape}")
print(f"   Time: {elapsed_time/60:.1f} minutes")
print(f"   Rate: {n_samples/elapsed_time:.2f} samples/sec")
print(f"   Failed: {len(failed_indices)} samples")

if failed_indices:
    # Show at most the first ten failures to keep the output readable
    if len(failed_indices) > 10:
        print(f"     Failed indices: {failed_indices[:10]}...")
    else:
        print(f"     Failed indices: {failed_indices}")

# Save embeddings
np.save(EMBEDDING_FILE, embeddings)
print(f"\n Saved embeddings to {EMBEDDING_FILE}")

# Clean up the checkpoint now that the final file exists
if os.path.exists(TEMP_EMBEDDING_FILE):
    os.remove(TEMP_EMBEDDING_FILE)

## Step 3: Prepare Training Data

In [None]:
print(" Preparing training data...")

# Load embeddings
X_full = np.load(EMBEDDING_FILE)
print(f"   Embeddings shape: {X_full.shape}")

# Verify dimension
assert X_full.shape[1] == EMBEDDING_DIM, f"Expected {EMBEDDING_DIM} dimensions, got {X_full.shape[1]}"

# Prepare OCEAN targets from ground truth.
# An explicit float copy is taken so the imputation below cannot write back
# into df_ocean and so NaN checks work regardless of the CSV column dtypes.
y_full = df_ocean[OCEAN_DIMS].to_numpy(dtype=float, copy=True)
print(f"   Targets shape: {y_full.shape}")

# Features and targets are paired purely by row position, so the counts must agree
assert len(X_full) == len(y_full), (
    f"Embeddings ({len(X_full)} rows) and OCEAN targets ({len(y_full)} rows) are misaligned"
)

# Check for any missing values
missing_mask = np.isnan(y_full)
missing_count = missing_mask.sum()
if missing_count > 0:
    print(f"   WARNING: {missing_count} missing OCEAN values detected")
    print("   Filling missing values with column means...")
    # NOTE(review): the means are computed over all rows before the split, so
    # imputed test targets are influenced by training rows (and vice versa).
    col_means = np.nanmean(y_full, axis=0)
    _, missing_cols = np.where(missing_mask)
    y_full[missing_mask] = col_means[missing_cols]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE
)

print("\n Data split complete:")
print(f"   Training samples: {len(X_train):,}")
print(f"   Test samples: {len(X_test):,}")
print(f"   Feature dimensions: {X_train.shape[1]:,}")
print(f"   Dimension ratio (train): {len(X_train)}:{X_train.shape[1]} ≈ {len(X_train)/X_train.shape[1]:.1f}:1")

In [None]:
print(" Training ElasticNet models...\n")

elasticnet_results = {}
elasticnet_models = {}

# Regularisation search space handed to ElasticNetCV
ALPHAS = [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
L1_RATIOS = [0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99]

start_time = time.time()

# One independent single-output model per OCEAN trait; column i of the
# target matrices belongs to OCEAN_DIMS[i].
for i, dim in enumerate(OCEAN_DIMS):
    print(f"Training {dim}...", end=" ")

    target_train = y_train[:, i]
    target_test = y_test[:, i]

    # Cross-validated fit over the alpha / l1_ratio grid
    model = ElasticNetCV(
        alphas=ALPHAS,
        l1_ratio=L1_RATIOS,
        cv=5,
        random_state=RANDOM_STATE,
        max_iter=10000,
        n_jobs=-1,
    )
    model.fit(X_train, target_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Fit quality on both splits; the gap between them measures overfitting
    train_r2 = r2_score(target_train, y_train_pred)
    test_r2 = r2_score(target_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(target_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(target_test, y_test_pred))
    gap = train_r2 - test_r2

    # Share of coefficients driven exactly to zero by the L1 penalty
    non_zero = np.count_nonzero(model.coef_)
    sparsity = (1 - non_zero / len(model.coef_)) * 100

    elasticnet_models[dim] = model
    elasticnet_results[dim] = dict(
        train_r2=train_r2,
        test_r2=test_r2,
        train_rmse=train_rmse,
        test_rmse=test_rmse,
        overfitting_gap=gap,
        best_alpha=model.alpha_,
        best_l1_ratio=model.l1_ratio_,
        non_zero_features=int(non_zero),
        sparsity_percent=sparsity,
    )

    print(f"R²={test_r2:.3f}, Gap={gap:.3f}, Sparsity={sparsity:.1f}%")

elapsed = time.time() - start_time
print(f"\n ElasticNet training complete ({elapsed:.1f} seconds)")

# Averages across the five traits
avg_test_r2 = np.mean([elasticnet_results[d]['test_r2'] for d in elasticnet_results])
avg_gap = np.mean([elasticnet_results[d]['overfitting_gap'] for d in elasticnet_results])
print(f"   Average Test R²: {avg_test_r2:.3f}")
print(f"   Average Overfitting Gap: {avg_gap:.3f}")

### Model 2: Random Forest (Optimized for 2k samples)

In [None]:
print(" Training Random Forest models...\n")

randomforest_results = {}
randomforest_models = {}

# Random Forest hyperparameters (optimized for 2k samples with 1024 features)
RF_PARAM_GRID = {
    'n_estimators': [100, 200],
    'max_depth': [5, 7, 10],
    'min_samples_split': [20, 30],
    'min_samples_leaf': [10, 15],
    'max_features': ['sqrt', 'log2']
}

# A grid search evaluates every combination of options, i.e. the PRODUCT of
# the per-parameter option counts (2*3*2*2*2 = 48), not their sum.
n_rf_combinations = int(np.prod([len(v) for v in RF_PARAM_GRID.values()]))
print(f"Hyperparameter grid: {n_rf_combinations} combinations per dimension")
print("Expected time: ~5 minutes\n")

start_time = time.time()

# One independent model per OCEAN trait; column i of y_* is OCEAN_DIMS[i]
for i, dim in enumerate(OCEAN_DIMS):
    dim_start = time.time()
    print(f"Training {dim}...", end=" ")

    # GridSearch with cross-validation.
    # NOTE(review): both the forest and the grid search request all cores
    # (n_jobs=-1); confirm this nesting does not oversubscribe the machine.
    rf = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)
    grid_search = GridSearchCV(
        rf,
        RF_PARAM_GRID,
        cv=5,
        scoring='r2',
        n_jobs=-1,
        verbose=0
    )

    grid_search.fit(X_train, y_train[:, i])
    best_model = grid_search.best_estimator_

    # Predictions
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # Metrics
    train_r2 = r2_score(y_train[:, i], y_train_pred)
    test_r2 = r2_score(y_test[:, i], y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train[:, i], y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test[:, i], y_test_pred))

    randomforest_results[dim] = {
        'train_r2': train_r2,
        'test_r2': test_r2,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'overfitting_gap': train_r2 - test_r2,
        'best_params': grid_search.best_params_,
        'cv_best_score': grid_search.best_score_,
        'training_time': time.time() - dim_start
    }

    randomforest_models[dim] = best_model

    print(f"R²={test_r2:.3f}, Gap={train_r2-test_r2:.3f} ({time.time()-dim_start:.1f}s)")

elapsed = time.time() - start_time
print(f"\n Random Forest training complete ({elapsed/60:.1f} minutes)")

# Average metrics
avg_test_r2 = np.mean([r['test_r2'] for r in randomforest_results.values()])
avg_gap = np.mean([r['overfitting_gap'] for r in randomforest_results.values()])
print(f"   Average Test R²: {avg_test_r2:.3f}")
print(f"   Average Overfitting Gap: {avg_gap:.3f}")

### Model 3: Gradient Boosting

In [None]:
print(" Training Gradient Boosting models...\n")

gb_results = {}
gb_models = {}

# Gradient Boosting hyperparameters
GB_PARAM_GRID = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8],
    'min_samples_leaf': [10, 20]
}

# A grid search evaluates every combination of options, i.e. the PRODUCT of
# the per-parameter option counts (2*2*3*2*2 = 48), not their sum.
n_gb_combinations = int(np.prod([len(v) for v in GB_PARAM_GRID.values()]))
print(f"Hyperparameter grid: {n_gb_combinations} combinations per dimension")
print("Expected time: ~4 minutes\n")

start_time = time.time()

# One independent model per OCEAN trait; column i of y_* is OCEAN_DIMS[i]
for i, dim in enumerate(OCEAN_DIMS):
    dim_start = time.time()
    print(f"Training {dim}...", end=" ")

    # GridSearch with cross-validation
    gb = GradientBoostingRegressor(random_state=RANDOM_STATE)
    grid_search = GridSearchCV(
        gb,
        GB_PARAM_GRID,
        cv=5,
        scoring='r2',
        n_jobs=-1,
        verbose=0
    )

    grid_search.fit(X_train, y_train[:, i])
    best_model = grid_search.best_estimator_

    # Predictions
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # Metrics
    train_r2 = r2_score(y_train[:, i], y_train_pred)
    test_r2 = r2_score(y_test[:, i], y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train[:, i], y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test[:, i], y_test_pred))

    gb_results[dim] = {
        'train_r2': train_r2,
        'test_r2': test_r2,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'overfitting_gap': train_r2 - test_r2,
        'best_params': grid_search.best_params_,
        'cv_best_score': grid_search.best_score_,
        'training_time': time.time() - dim_start
    }

    gb_models[dim] = best_model

    print(f"R²={test_r2:.3f}, Gap={train_r2-test_r2:.3f} ({time.time()-dim_start:.1f}s)")

elapsed = time.time() - start_time
print(f"\n Gradient Boosting training complete ({elapsed/60:.1f} minutes)")

# Average metrics
avg_test_r2 = np.mean([r['test_r2'] for r in gb_results.values()])
avg_gap = np.mean([r['overfitting_gap'] for r in gb_results.values()])
print(f"   Average Test R²: {avg_test_r2:.3f}")
print(f"   Average Overfitting Gap: {avg_gap:.3f}")

## Step 5: Compare with 500-Sample Baseline

In [None]:
print(" Comparing 500-sample vs 2000-sample results...\n")

# Reference numbers from the earlier 500-sample experiments (hard-coded)
baseline_500 = {
    'ElasticNet': {'avg_test_r2': 0.127, 'avg_overfitting_gap': 0.17},
    'RandomForest': {'avg_test_r2': 0.103, 'avg_overfitting_gap': 0.45},
}

# Per-model averages over the five traits for the current 2000-sample run.
# The key order (ElasticNet, RandomForest, GradientBoosting) is kept because
# it decides which model wins a tie in the max() below.
results_2k = {
    name: {
        'avg_test_r2': np.mean([r['test_r2'] for r in per_dim.values()]),
        'avg_overfitting_gap': np.mean([r['overfitting_gap'] for r in per_dim.values()]),
    }
    for name, per_dim in (
        ('ElasticNet', elasticnet_results),
        ('RandomForest', randomforest_results),
        ('GradientBoosting', gb_results),
    )
}

# ---- Comparison table ------------------------------------------------------
rule = "═" * 80
header_cells = ['Model', '500-R²', '2k-R²', 'Δ R²', 'Gap-500', 'Gap-2k', 'Δ Gap']
header_widths = [20, 10, 10, 10, 10, 10, 10]

print(rule)
print("                  500 SAMPLES vs 2000 SAMPLES COMPARISON")
print(rule)
print(" ".join(f"{cell:<{width}}" for cell, width in zip(header_cells, header_widths)))
print("─" * 80)

# Only the two models that have a 500-sample baseline get delta columns
for model in ('ElasticNet', 'RandomForest'):
    before = baseline_500[model]
    after = results_2k[model]

    r2_500 = before['avg_test_r2']
    r2_2k = after['avg_test_r2']
    gap_500 = before['avg_overfitting_gap']
    gap_2k = after['avg_overfitting_gap']

    r2_delta = r2_2k - r2_500
    gap_delta = gap_2k - gap_500

    print(f"{model:<20} {r2_500:<10.3f} {r2_2k:<10.3f} {r2_delta:+10.3f} {gap_500:<10.3f} {gap_2k:<10.3f} {gap_delta:+10.3f}")

# Gradient Boosting is new in this phase, so there is nothing to compare against
gb_summary = results_2k['GradientBoosting']
print(f"{'GradientBoosting':<20} {'N/A':<10} {gb_summary['avg_test_r2']:<10.3f} {'NEW':<10} "
      f"{'N/A':<10} {gb_summary['avg_overfitting_gap']:<10.3f} {'NEW':<10}")

print(rule)

# (name, summary) pair of the model with the highest average test R²
best_model_2k = max(results_2k.items(), key=lambda item: item[1]['avg_test_r2'])
print(f"\n Best model (2k samples): {best_model_2k[0]} with R² = {best_model_2k[1]['avg_test_r2']:.3f}")

# Absolute R² change versus the 500-sample baseline
elasticnet_improvement = results_2k['ElasticNet']['avg_test_r2'] - baseline_500['ElasticNet']['avg_test_r2']
rf_improvement = results_2k['RandomForest']['avg_test_r2'] - baseline_500['RandomForest']['avg_test_r2']

# The percentage in brackets is the change relative to the baseline R²
print("\n R² Improvements:")
print(f"   ElasticNet: {elasticnet_improvement:+.3f} ({elasticnet_improvement/baseline_500['ElasticNet']['avg_test_r2']*100:+.1f}%)")
print(f"   Random Forest: {rf_improvement:+.3f} ({rf_improvement/baseline_500['RandomForest']['avg_test_r2']*100:+.1f}%)")

## Step 6: Decision & Recommendations

In [None]:
print("\n" + "═" * 80)
print("                         DECISION & RECOMMENDATIONS")
print("═" * 80 + "\n")

# The decision is driven by the larger of the two absolute R² gains that have
# a 500-sample baseline (ElasticNet and Random Forest).
max_improvement = max(elasticnet_improvement, rf_improvement)

# max_improvement is an absolute R² delta, so it is reported in percentage
# points to avoid confusion with the relative percentages printed in Step 5.
improvement_text = f"{max_improvement:.3f} ({max_improvement*100:.1f} percentage points)"

if max_improvement > 0.05:
    verdict = " SIGNIFICANT IMPROVEMENT"
    recommendation = "Continue to 10,000 samples"
    explanation = (
        f"Increasing from 500 to 2,000 samples improved R² by {improvement_text}. "
        "This indicates that more data substantially helps model performance. "
        # 10,000 samples against 1,024 embedding dimensions is roughly 10:1
        "Scaling to 10,000 samples (≈10:1 sample-to-feature ratio) should yield R² ≈ 0.22-0.26."
    )
    next_steps = [
        " Create 05h_train_10k_samples.ipynb",
        " Focus on Random Forest or Gradient Boosting (non-linear models)",
        " Consider using all 34,529 samples for final production model",
        " Expected final R² with 34k samples: 0.28-0.35"
    ]

elif max_improvement > 0.02:
    verdict = "  MODERATE IMPROVEMENT"
    recommendation = "Try 10,000 samples with feature engineering"
    explanation = (
        f"R² improved by {improvement_text}, showing modest gains. "
        "More data helps, but may not be sufficient alone. "
        "Consider combining more samples with feature engineering."
    )
    next_steps = [
        "  Try 10,000 samples but don't expect dramatic improvement",
        "  Add feature engineering:",
        "   - TF-IDF features from descriptions",
        "   - Loan metadata (amount, grade, purpose, etc.)",
        # The ground truth used here comes from a single model (Gemma-2-9B)
        "   - Ensemble of 5 LLM labels (not just a single LLM)",
        "  Consider dimensionality reduction (PCA, feature selection)"
    ]

else:
    verdict = " MINIMAL IMPROVEMENT"
    recommendation = "Sample size is NOT the bottleneck"
    explanation = (
        f"R² only improved by {improvement_text}, indicating that "
        "the problem is not lack of data, but rather: "
        "(1) BGE embeddings don't capture personality well, or "
        "(2) predicting OCEAN from loan descriptions is inherently difficult."
    )
    next_steps = [
        " Do NOT scale to 10k or 34k samples - it won't help significantly",
        " Root causes to investigate:",
        "   - Try different embeddings (sentence-transformers specialized for personality)",
        "   - Add non-text features (loan amount, income, employment, etc.)",
        "   - Ensemble 5 LLM labels instead of relying on a single LLM (Gemma-2-9B)",
        "   - Consider this may be the performance ceiling for this task",
        " Alternative: Multi-modal model (text + structured features)"
    ]

print(f"VERDICT: {verdict}")
print(f"\n Recommendation: {recommendation}")
print(f"\n Explanation:\n{explanation}")
print("\n Next Steps:")
for step in next_steps:
    print(f"   {step}")

print("\n" + "═" * 80)

## Step 7: Save Results

In [None]:
# Prepare comprehensive report: configuration, baseline, per-trait results for
# every model, the cross-model summary, and the decision text from Step 6.
report = {
    'phase': '05g - Validation with 2,000 Ground Truth Samples',
    'timestamp': datetime.now().isoformat(),
    'configuration': {
        'sample_size': SAMPLE_SIZE,
        'test_size': TEST_SIZE,
        'ocean_source': '05d generated ground truth (Gemma-2-9B)',
        'embedding_model': BGE_MODEL,
        'embedding_dimension': EMBEDDING_DIM,
        'training_samples': len(X_train),
        'test_samples': len(X_test)
    },
    'baseline_500_samples': baseline_500,
    'results_2k_samples': {
        'elasticnet': {dim: elasticnet_results[dim] for dim in OCEAN_DIMS},
        'randomforest': {dim: randomforest_results[dim] for dim in OCEAN_DIMS},
        'gradient_boosting': {dim: gb_results[dim] for dim in OCEAN_DIMS}
    },
    'summary': {
        'elasticnet_avg_test_r2': results_2k['ElasticNet']['avg_test_r2'],
        'randomforest_avg_test_r2': results_2k['RandomForest']['avg_test_r2'],
        'gb_avg_test_r2': results_2k['GradientBoosting']['avg_test_r2'],
        'elasticnet_improvement': elasticnet_improvement,
        'rf_improvement': rf_improvement,
        'best_model': best_model_2k[0],
        'best_r2': best_model_2k[1]['avg_test_r2']
    },
    'decision': {
        'verdict': verdict,
        'recommendation': recommendation,
        'explanation': explanation,
        'next_steps': next_steps
    }
}

# Save JSON report
with open(REPORT_FILE, 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2)

print(f" Saved detailed report to {REPORT_FILE}")

# Create comparison CSV: one row per (model, trait). Only models that have a
# 500-sample baseline are included, so Gradient Boosting is not part of it.
baseline_models = {
    'ElasticNet': elasticnet_results,
    'RandomForest': randomforest_results,
}

comparison_data = []
for model_name, results_dict in baseline_models.items():
    baseline = baseline_500[model_name]
    for dim in OCEAN_DIMS:
        current = results_dict[dim]
        comparison_data.append({
            'Model': model_name,
            'Dimension': dim,
            # The baseline only exists as a cross-trait average, so every
            # trait is compared against that same average value.
            'Samples_500_R2': baseline['avg_test_r2'],
            'Samples_2000_R2': current['test_r2'],
            'R2_Improvement': current['test_r2'] - baseline['avg_test_r2'],
            'Samples_500_Gap': baseline['avg_overfitting_gap'],
            'Samples_2000_Gap': current['overfitting_gap'],
            'Gap_Improvement': current['overfitting_gap'] - baseline['avg_overfitting_gap']
        })

df_comparison = pd.DataFrame(comparison_data)
df_comparison.to_csv(COMPARISON_CSV, index=False)
print(f" Saved comparison CSV to {COMPARISON_CSV}")

print("\n Phase 05g complete!")

In [None]:
# Create comparison visualization: a 2x2 grid of summary panels
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('500 Samples vs 2,000 Samples Comparison', fontsize=16, fontweight='bold')

# Plot 1: R² comparison
ax1 = axes[0, 0]
models = ['ElasticNet', 'RandomForest', 'GradientBoosting']
r2_500 = [baseline_500['ElasticNet']['avg_test_r2'],
          baseline_500['RandomForest']['avg_test_r2'],
          0]  # No baseline for GB
r2_2k = [results_2k[name]['avg_test_r2'] for name in models]

x = np.arange(len(models))
width = 0.35
ax1.bar(x - width/2, r2_500, width, label='500 samples', alpha=0.8)
ax1.bar(x + width/2, r2_2k, width, label='2,000 samples', alpha=0.8)
ax1.set_ylabel('Test R²', fontsize=12)
ax1.set_title('Average Test R² Comparison', fontsize=13, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(models)
ax1.legend()
ax1.grid(axis='y', alpha=0.3)

# Plot 2: Overfitting Gap comparison
ax2 = axes[0, 1]
gap_500 = [baseline_500['ElasticNet']['avg_overfitting_gap'],
           baseline_500['RandomForest']['avg_overfitting_gap'],
           0]  # No baseline for GB
gap_2k = [results_2k[name]['avg_overfitting_gap'] for name in models]

ax2.bar(x - width/2, gap_500, width, label='500 samples', alpha=0.8, color='orange')
ax2.bar(x + width/2, gap_2k, width, label='2,000 samples', alpha=0.8, color='green')
ax2.set_ylabel('Overfitting Gap (Train R² - Test R²)', fontsize=12)
ax2.set_title('Overfitting Gap Comparison (Lower is Better)', fontsize=13, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(models)
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

# Plot 3: Per-dimension R² for best model.
# The winner is picked across ALL three models in results_2k, so the panel
# still shows the true best model when Gradient Boosting comes out on top.
per_dim_by_model = {
    'ElasticNet': ('ElasticNet', elasticnet_results),
    'RandomForest': ('Random Forest', randomforest_results),
    'GradientBoosting': ('Gradient Boosting', gb_results),
}
best_key = max(results_2k, key=lambda name: results_2k[name]['avg_test_r2'])
best_name, best_results = per_dim_by_model[best_key]
r2_per_dim = [best_results[dim]['test_r2'] for dim in OCEAN_DIMS]

ax3 = axes[1, 0]
ax3.barh(OCEAN_DIMS, r2_per_dim, color='steelblue', alpha=0.8)
ax3.set_xlabel('Test R²', fontsize=12)
ax3.set_title(f'Per-Dimension Performance ({best_name}, 2k samples)', fontsize=13, fontweight='bold')
ax3.grid(axis='x', alpha=0.3)

# Add R² values just beyond the end of each bar (to the left for negative R²)
for i, v in enumerate(r2_per_dim):
    if v >= 0:
        ax3.text(v + 0.005, i, f'{v:.3f}', va='center', ha='left')
    else:
        ax3.text(v - 0.005, i, f'{v:.3f}', va='center', ha='right')

# Plot 4: Improvement summary
ax4 = axes[1, 1]
improvements = [
    ('ElasticNet\nR² Δ', elasticnet_improvement),
    ('RandomForest\nR² Δ', rf_improvement),
    ('ElasticNet\nGap Δ', results_2k['ElasticNet']['avg_overfitting_gap'] - baseline_500['ElasticNet']['avg_overfitting_gap']),
    ('RandomForest\nGap Δ', results_2k['RandomForest']['avg_overfitting_gap'] - baseline_500['RandomForest']['avg_overfitting_gap'])
]

labels, values = zip(*improvements)
# Green always means "better": a HIGHER R² is better (first two bars), while a
# LOWER overfitting gap is better (last two bars), so the sign test flips.
colors = (['green' if v > 0 else 'red' for v in values[:2]]
          + ['green' if v < 0 else 'red' for v in values[2:]])
ax4.bar(range(len(labels)), values, color=colors, alpha=0.7)
ax4.set_xticks(range(len(labels)))
ax4.set_xticklabels(labels, fontsize=10)
ax4.set_ylabel('Change (500 → 2000 samples)', fontsize=12)
ax4.set_title('Improvement Analysis', fontsize=13, fontweight='bold')
ax4.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax4.grid(axis='y', alpha=0.3)

# Add value labels above positive bars and below negative ones
for i, v in enumerate(values):
    ax4.text(i, v + 0.002 if v > 0 else v - 0.002, f'{v:+.3f}',
             ha='center', va='bottom' if v > 0 else 'top', fontweight='bold')

plt.tight_layout()
plt.savefig(PLOT_FILE, dpi=300, bbox_inches='tight')
print(f" Saved comparison plot to {PLOT_FILE}")
plt.show()