# 05f - Train Ridge Regression Models (All LLMs)

**Purpose**: Train Ridge regression models for all 5 LLMs to learn the mapping from BGE embeddings to OCEAN scores

**Why Ridge?**
- Simple linear model with L2 regularization
- Baseline comparison for more complex methods (Elastic Net, PCA, etc.)
- Fast training and inference

**Input Files**:
- bge_embeddings_500.npy - BGE embeddings (500x1024)
- ocean_ground_truth/[llm]_ocean_500.csv - OCEAN ground truth for each LLM

**Output Files** (per LLM):
- ridge_models_[llm].pkl - 5 Ridge models (one per OCEAN dimension) + fitted StandardScaler, bundled with training metrics and metadata
- 05f_ridge_training_report_[llm].json - Training report

**Summary Output**:
- 05f_ridge_comparison.csv - Performance comparison across LLMs
- 05f_ridge_visualization.png - Performance visualization

**Estimated Time**: Approximately 5-10 minutes (5 LLMs)

## Step 1: Import Libraries

In [None]:
# Data handling and numerics
import pandas as pd
import numpy as np
# Plotting (NOTE(review): seaborn is not referenced in the visible cells — confirm it is still needed)
import matplotlib.pyplot as plt
import seaborn as sns
# Serialization and filesystem helpers
import json
import pickle
import os
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# which may also hide numerical warnings raised during model fitting — consider narrowing.
warnings.filterwarnings('ignore')

# scikit-learn: data splitting, feature scaling, the Ridge estimator and regression metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from datetime import datetime

# Confirm the environment is ready and record when this run started
print("Libraries loaded successfully")
print(f"Timestamp: {datetime.now()}")

## Step 2: Configuration

In [None]:
# One row per LLM: (short key, display name, path to its OCEAN ground-truth CSV).
# The row order is preserved in LLM_CONFIGS, so it is also the iteration order.
_LLM_TABLE = [
    ('llama', 'Llama-3.1-8B', '../ocean_ground_truth/llama_3.1_8b_ocean_500.csv'),
    ('gpt', 'GPT-OSS-120B', '../ocean_ground_truth/gpt_oss_120b_ocean_500.csv'),
    ('gemma', 'Gemma-2-9B', '../ocean_ground_truth/gemma_2_9b_ocean_500.csv'),
    ('deepseek', 'DeepSeek-V3.1', '../ocean_ground_truth/deepseek_v3.1_ocean_500.csv'),
    ('qwen', 'Qwen-2.5-72B', '../ocean_ground_truth/qwen_2.5_72b_ocean_500.csv'),
]

# Short key -> {'name': display name, 'ocean_file': CSV path}
LLM_CONFIGS = {
    key: {'name': display_name, 'ocean_file': csv_path}
    for key, display_name, csv_path in _LLM_TABLE
}

# Target columns, one regression model is trained for each
OCEAN_DIMS = [
    'openness',
    'conscientiousness',
    'extraversion',
    'agreeableness',
    'neuroticism',
]

# L2 regularization strength passed to Ridge
RIDGE_ALPHA = 1.0

# Seed shared by every randomized step so runs are repeatable
RANDOM_STATE = 42

# Echo the effective configuration
print("Configuration loaded:")
print(f"  LLM models: {len(LLM_CONFIGS)}")
print(f"  OCEAN dimensions: {len(OCEAN_DIMS)}")
print(f"  Ridge alpha: {RIDGE_ALPHA}")

## Step 3: Load BGE Embeddings (Shared)

In [None]:
# Load the embedding matrix shared by all LLMs and print basic diagnostics.
embedding_file = '../bge_embeddings_500.npy'

print("="*80)
print("Loading BGE Embeddings")
print("="*80)
print(f"\nLoading: {embedding_file}")

X_full = np.load(embedding_file)

# Derived figures are computed up front so the report lines stay simple
size_mb = X_full.nbytes / 1024 / 1024
value_min = X_full.min()
value_max = X_full.max()

print(f"Embeddings shape: {X_full.shape}")
print(f"  Data type: {X_full.dtype}")
print(f"  Memory usage: {size_mb:.1f} MB")
print(f"  Value range: [{value_min:.4f}, {value_max:.4f}]")

## Step 4: Train Ridge Models for Each LLM

In [None]:
# Train one Ridge model per OCEAN dimension for each configured LLM.
# For every LLM this cell loads the target CSV, removes rows whose targets are
# missing, splits 80/20, standardizes features using training statistics only,
# fits and scores the models, then writes a pickle bundle and a JSON report.

# Storage for all results: llm_key -> {'name', 'training_results', 'summary'}
all_results = {}
n_dims = len(OCEAN_DIMS)

for llm_key, llm_config in LLM_CONFIGS.items():
    print("\n" + "="*80)
    print(f"Training Ridge Models: {llm_config['name']}")
    print("="*80)

    # Load OCEAN targets
    print(f"\n[1/6] Loading OCEAN targets...")
    ocean_file = llm_config['ocean_file']
    y_df = pd.read_csv(ocean_file)
    print(f"  Shape: {y_df.shape}")
    print(f"  Columns: {y_df.columns.tolist()}")

    # Fail early, with the file name, if a target column is absent
    missing_cols = [dim for dim in OCEAN_DIMS if dim not in y_df.columns]
    if missing_cols:
        raise KeyError(f"{ocean_file} is missing OCEAN columns: {missing_cols}")

    # Targets are paired with embeddings by row position, so the row counts
    # must agree before any filtering is applied
    if len(y_df) != len(X_full):
        raise ValueError(f"Data inconsistency: X={len(X_full)}, y={len(y_df)}")

    # Only NaNs in the target columns matter; NaNs in unrelated CSV columns
    # must not cost training samples
    target_nan = y_df[OCEAN_DIMS].isna()
    nan_count_total = int(target_nan.sum().sum())
    if nan_count_total > 0:
        print(f"  Warning: Found {nan_count_total} NaN values")
        # A positional boolean mask keeps X and y aligned regardless of index labels
        valid_rows = ~target_nan.any(axis=1).to_numpy()
        y_df = y_df.loc[valid_rows]
        X = X_full[valid_rows]
        print(f"  After dropping NaN: {len(y_df)} samples")
    else:
        # No copy needed: the split and the scaler both produce new arrays
        X = X_full

    # Train/test split
    print(f"\n[2/6] Splitting data (80/20)...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_df,
        test_size=0.2,
        random_state=RANDOM_STATE
    )
    print(f"  Training: {X_train.shape[0]} samples")
    print(f"  Test: {X_test.shape[0]} samples")
    print(f"  Feature-to-sample ratio: {X_train.shape[1] / X_train.shape[0]:.2f}:1")

    # Standardize: statistics come from the training split only, then are
    # applied unchanged to the test split
    print(f"\n[3/6] Standardizing features...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print(f"  Train mean={X_train_scaled.mean():.6f}, std={X_train_scaled.std():.6f}")
    print(f"  Test mean={X_test_scaled.mean():.6f}, std={X_test_scaled.std():.6f}")

    # Train models
    print(f"\n[4/6] Training Ridge models ({n_dims} dimensions)...")
    print(f"  Ridge alpha: {RIDGE_ALPHA}")

    ridge_models = {}
    training_results = {}

    for i, dim in enumerate(OCEAN_DIMS, start=1):
        print(f"\n  [{i}/{n_dims}] Training {dim}...")

        # Get target
        y_train_dim = y_train[dim].to_numpy()
        y_test_dim = y_test[dim].to_numpy()

        # Train Ridge
        model = Ridge(
            alpha=RIDGE_ALPHA,
            random_state=RANDOM_STATE,
            max_iter=10000
        )
        model.fit(X_train_scaled, y_train_dim)

        # Predict on both splits so the train/test gap can be reported
        y_train_pred = model.predict(X_train_scaled)
        y_test_pred = model.predict(X_test_scaled)

        # Metrics
        train_r2 = r2_score(y_train_dim, y_train_pred)
        test_r2 = r2_score(y_test_dim, y_test_pred)
        train_rmse = np.sqrt(mean_squared_error(y_train_dim, y_train_pred))
        test_rmse = np.sqrt(mean_squared_error(y_test_dim, y_test_pred))
        train_mae = mean_absolute_error(y_train_dim, y_train_pred)
        test_mae = mean_absolute_error(y_test_dim, y_test_pred)

        # Save model and results (plain floats keep the report JSON-serializable)
        ridge_models[dim] = model
        training_results[dim] = {
            'train_r2': float(train_r2),
            'test_r2': float(test_r2),
            'train_rmse': float(train_rmse),
            'test_rmse': float(test_rmse),
            'train_mae': float(train_mae),
            'test_mae': float(test_mae),
            'model_coef_shape': model.coef_.shape,
            'model_intercept': float(model.intercept_)
        }

        print(f"      Train R²: {train_r2:.4f} | Test R²: {test_r2:.4f}")
        print(f"      Train RMSE: {train_rmse:.4f} | Test RMSE: {test_rmse:.4f}")

    # Save models together with the scaler needed to reproduce the preprocessing
    print(f"\n[5/6] Saving models...")
    model_data = {
        'models': ridge_models,
        'scaler': scaler,
        'ocean_dims': OCEAN_DIMS,
        'training_results': training_results,
        'training_timestamp': datetime.now().isoformat(),
        'llm_model': llm_config['name'],
        'hyperparameters': {
            'alpha': RIDGE_ALPHA
        }
    }

    model_file = f'../ridge_models_{llm_key}.pkl'
    with open(model_file, 'wb') as f:
        pickle.dump(model_data, f)
    print(f"  Saved: {model_file} ({os.path.getsize(model_file) / 1024:.1f} KB)")

    # Generate report
    print(f"\n[6/6] Generating training report...")
    report = {
        'phase': f'05f - Train Ridge Models ({llm_config["name"]})',
        'timestamp': datetime.now().isoformat(),
        'llm_model': llm_config['name'],
        'embedding_model': 'BAAI/bge-large-en-v1.5',
        # Taken from the data actually used rather than assumed
        'embedding_dimension': int(X.shape[1]),
        'training_samples': int(X_train.shape[0]),
        'test_samples': int(X_test.shape[0]),
        'model_type': 'Ridge Regression',
        'model_alpha': RIDGE_ALPHA,
        'ocean_dimensions': OCEAN_DIMS,
        'model_file': model_file,
        'training_results': training_results
    }

    # Summary metrics across the OCEAN dimensions
    test_r2_scores = [training_results[dim]['test_r2'] for dim in OCEAN_DIMS]
    test_rmse_scores = [training_results[dim]['test_rmse'] for dim in OCEAN_DIMS]
    test_mae_scores = [training_results[dim]['test_mae'] for dim in OCEAN_DIMS]

    report['summary_metrics'] = {
        'avg_test_r2': float(np.mean(test_r2_scores)),
        'avg_test_rmse': float(np.mean(test_rmse_scores)),
        'avg_test_mae': float(np.mean(test_mae_scores)),
        'min_test_r2': float(np.min(test_r2_scores)),
        'max_test_r2': float(np.max(test_r2_scores))
    }

    report_file = f'../05f_ridge_training_report_{llm_key}.json'
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"  Report saved: {report_file}")

    # Store for final comparison
    all_results[llm_key] = {
        'name': llm_config['name'],
        'training_results': training_results,
        'summary': report['summary_metrics']
    }

    # Print summary
    print(f"\n  Summary for {llm_config['name']}:")
    print(f"    Avg Test R²: {report['summary_metrics']['avg_test_r2']:.4f}")
    print(f"    Test R² range: [{report['summary_metrics']['min_test_r2']:.4f}, {report['summary_metrics']['max_test_r2']:.4f}]")

print("\n" + "="*80)
print("All Ridge models trained successfully!")
print("="*80)

## Step 5: Generate Comparison Report

In [None]:
# Flatten the per-LLM results into a long-format table
# (one row per LLM x OCEAN dimension) and write it to CSV.
print("\n" + "="*80)
print("Generating Comparison Report")
print("="*80)

# Each record carries the test-set metrics for one (LLM, dimension) pair
comparison_data = [
    {
        'LLM': results['name'],
        'llm_key': llm_key,
        'Dimension': dim,
        'Test_R2': results['training_results'][dim]['test_r2'],
        'Test_RMSE': results['training_results'][dim]['test_rmse'],
        'Test_MAE': results['training_results'][dim]['test_mae'],
    }
    for llm_key, results in all_results.items()
    for dim in OCEAN_DIMS
]
comparison_df = pd.DataFrame(comparison_data)

# Persist the table without the positional index column
comparison_file = '../05f_ridge_comparison.csv'
comparison_df.to_csv(comparison_file, index=False)

print(f"\nComparison table saved: {comparison_file}")
print(f"\nPreview:")
print(comparison_df.head(10))

## Step 6: Generate Visualizations

In [None]:
# Four-panel overview of test-set performance; saved as a PNG and then displayed.

def _sign_colors(values):
    """Return one bar colour per value: red when negative, green otherwise."""
    return ['#e74c3c' if v < 0 else '#2ecc71' for v in values]


fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax1, ax2), (ax3, ax4) = axes
fig.suptitle('Ridge Regression Performance (All LLMs, BGE Embeddings)', fontsize=16, fontweight='bold')

# Panel 1 (top-left): test R² for every LLM/dimension pair
x_pos = np.arange(len(comparison_df))
pair_labels = [
    f"{key[:3]}-{dim_name[:3]}"
    for key, dim_name in zip(comparison_df['llm_key'], comparison_df['Dimension'])
]
ax1.bar(x_pos, comparison_df['Test_R2'], color=_sign_colors(comparison_df['Test_R2']), alpha=0.8)
ax1.axhline(y=0, color='black', linestyle='--', linewidth=1, alpha=0.5)
ax1.set_xlabel('Model-Dimension', fontsize=10)
ax1.set_ylabel('Test R² Score', fontsize=10)
ax1.set_title('Test R² by Model-Dimension', fontsize=12, fontweight='bold')
ax1.grid(True, alpha=0.3)
ax1.tick_params(axis='x', rotation=90, labelsize=7)
ax1.set_xticks(x_pos)
ax1.set_xticklabels(pair_labels)

# Panel 2 (top-right): mean test R² per LLM, best first
avg_by_llm = comparison_df.groupby('LLM')['Test_R2'].mean().sort_values(ascending=False)
avg_by_llm.plot(kind='bar', ax=ax2, color=_sign_colors(avg_by_llm.values), alpha=0.8)
ax2.axhline(y=0, color='black', linestyle='--', linewidth=1, alpha=0.5)
ax2.set_xlabel('LLM Model', fontsize=10)
ax2.set_ylabel('Average Test R²', fontsize=10)
ax2.set_title('Average Test R² by LLM', fontsize=12, fontweight='bold')
ax2.grid(True, alpha=0.3)
ax2.tick_params(axis='x', rotation=45)

# Panel 3 (bottom-left): mean test R² per OCEAN dimension
avg_by_dim = comparison_df.groupby('Dimension')['Test_R2'].mean().sort_values(ascending=False)
avg_by_dim.plot(kind='barh', ax=ax3, color=_sign_colors(avg_by_dim.values), alpha=0.8)
ax3.axvline(x=0, color='black', linestyle='--', linewidth=1, alpha=0.5)
ax3.set_xlabel('Average Test R²', fontsize=10)
ax3.set_ylabel('OCEAN Dimension', fontsize=10)
ax3.set_title('Average Test R² by OCEAN Dimension', fontsize=12, fontweight='bold')
ax3.grid(True, alpha=0.3, axis='x')

# Panel 4 (bottom-right): mean test RMSE per LLM, sorted ascending
avg_rmse_by_llm = comparison_df.groupby('LLM')['Test_RMSE'].mean().sort_values()
avg_rmse_by_llm.plot(kind='barh', ax=ax4, color='#3498db', alpha=0.8)
ax4.set_xlabel('Average Test RMSE', fontsize=10)
ax4.set_ylabel('LLM Model', fontsize=10)
ax4.set_title('Average Test RMSE by LLM (Lower is Better)', fontsize=12, fontweight='bold')
ax4.grid(True, alpha=0.3, axis='x')

# Write the figure to disk before showing it
plt.tight_layout()
viz_file = '../05f_ridge_visualization.png'
plt.savefig(viz_file, dpi=300, bbox_inches='tight')
print(f"\nVisualization saved: {viz_file}")
plt.show()

## Step 7: Final Summary

In [None]:
# Print the headline numbers of the run: overall test R² statistics, the best
# LLM and dimension, the single worst model, the files written, and a closing
# note on the train/test gap computed from the measured metrics.
print("\n" + "="*80)
print("FINAL SUMMARY - Ridge Regression (BGE Embeddings)")
print("="*80)

test_r2 = comparison_df['Test_R2']
# Group means are computed once and reused for both the label and the value
mean_r2_by_llm = comparison_df.groupby('LLM')['Test_R2'].mean()
mean_r2_by_dim = comparison_df.groupby('Dimension')['Test_R2'].mean()

print("\n1. Overall Performance:")
print(f"   Average Test R² across all models: {test_r2.mean():.4f}")
print(f"   Best Test R²: {test_r2.max():.4f}")
print(f"   Worst Test R²: {test_r2.min():.4f}")
print(f"   Std Dev: {test_r2.std():.4f}")
print(f"   Models with positive R²: {(test_r2 > 0).sum()}/{len(comparison_df)}")

print("\n2. Best Performing LLM:")
best_llm = mean_r2_by_llm.idxmax()
best_r2 = mean_r2_by_llm.max()
print(f"   {best_llm}: {best_r2:.4f}")

print("\n3. Best Performing Dimension:")
best_dim = mean_r2_by_dim.idxmax()
best_dim_r2 = mean_r2_by_dim.max()
print(f"   {best_dim}: {best_dim_r2:.4f}")

print("\n4. Worst Performing:")
worst_row = comparison_df.loc[test_r2.idxmin()]
print(f"   {worst_row['LLM']} - {worst_row['Dimension']}: {worst_row['Test_R2']:.4f}")

print("\n" + "="*80)
print("Output Files Generated:")
print("="*80)
print("Models:")
for llm_key in LLM_CONFIGS:
    print(f"  - ridge_models_{llm_key}.pkl")
print("\nReports:")
for llm_key in LLM_CONFIGS:
    print(f"  - 05f_ridge_training_report_{llm_key}.json")
print("\nComparison:")
print(f"  - 05f_ridge_comparison.csv")
print(f"  - 05f_ridge_visualization.png")

print("\n" + "="*80)
print("05f Ridge Training Complete!")
print("="*80)

# Report the train/test gap from this run's metrics instead of fixed numbers,
# so the note cannot contradict the results printed above.
train_r2_values = [
    dim_metrics['train_r2']
    for llm_results in all_results.values()
    for dim_metrics in llm_results['training_results'].values()
]
avg_train_r2 = float(np.mean(train_r2_values))
avg_test_r2 = float(test_r2.mean())
r2_gap = avg_train_r2 - avg_test_r2
print(f"\nNote: mean train R² = {avg_train_r2:.3f}, mean test R² = {avg_test_r2:.3f} (gap = {r2_gap:.3f})")
# A negative test R² means the model predicts held-out data worse than the mean
if avg_test_r2 < 0:
    print("Ridge shows severe overfitting (mean test R² < 0).")
    print("Consider using Elastic Net (05f_train_elasticnet_all_models.ipynb) for better results.")