# 05f - Train Ridge Regression Models (Gemma-2-9B OCEAN)

**Purpose**: Train 5 Ridge regression models (one per OCEAN dimension) that map BGE embeddings to OCEAN scores, using Gemma-2-9B ratings as ground truth

**Input Files**:
- bge_embeddings_500.npy - BGE embeddings (500x1024)
- ocean_ground_truth/gemma_2_9b_ocean_500.csv - Gemma-2-9B OCEAN ground truth (500x5)

**Output Files**:
- ridge_models_gemma.pkl - 5 Ridge models + Scaler
- 05f_ridge_training_report_gemma.json - Training report
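- 05f_ridge_evaluation_gemma.png - Visualization (note: no cell shown in this notebook writes this file; confirm whether a plotting step is missing)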

**Estimated Time**: Approximately 2-3 minutes

## Step 1: Import Libraries and Load Data

In [None]:
# Data handling and numerics
import pandas as pd
import numpy as np
# Plotting libraries (no plotting call appears in the cells visible here)
import matplotlib.pyplot as plt
import seaborn as sns
# Serialization and filesystem helpers
import json
import pickle
import os
import warnings
# Install a filter that ignores all warnings raised after this point
warnings.filterwarnings('ignore')

# Model selection, preprocessing, estimator and metrics from scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Used for the timestamps stored in the model bundle and the report
from datetime import datetime

print("Libraries loaded successfully")

## Step 2: Load Embeddings and OCEAN Targets

In [None]:
# Load the embedding matrix X and the OCEAN target table y_df, then remove
# every sample whose target row contains a missing value.
print("="*80)
print("Loading data")
print("="*80)

# Load embeddings
embedding_file = '../bge_embeddings_500.npy'
print(f"\nLoading Embeddings: {embedding_file}")
X = np.load(embedding_file)
print(f"Embeddings shape: {X.shape}")
print(f"  Data type: {X.dtype}")
print(f"  Memory usage: {X.nbytes / 1024 / 1024:.1f} MB")

# Load OCEAN targets (Gemma-2-9B)
ocean_target_file = '../ocean_ground_truth/gemma_2_9b_ocean_500.csv'
print(f"\nLoading OCEAN targets (Gemma-2-9B): {ocean_target_file}")
y_df = pd.read_csv(ocean_target_file)
print(f"OCEAN shape: {y_df.shape}")
print(f"  Columns: {y_df.columns.tolist()}")
print(f"\nOCEAN statistics:")
print(y_df.describe())


# Rows of X and y_df are paired by position, so the counts must agree BEFORE
# any row is removed; otherwise the NaN filter below would drop the wrong
# embeddings or fail with an unrelated indexing error.
if len(X) != len(y_df):
    raise ValueError(f"Data inconsistency: X={len(X)}, y={len(y_df)}")

# Check and handle NaN values
nan_per_column = y_df.isnull().sum()
nan_count_total = nan_per_column.sum()
if nan_count_total > 0:
    print(f"\nWarning: Found {nan_count_total} NaN values in OCEAN data")
    print(f"NaN count per column:")
    for col, nan_count in nan_per_column.items():
        if nan_count > 0:
            print(f"  {col}: {nan_count}")

    # Positional boolean mask: True for rows with no missing value. Using
    # positions (not index labels) keeps X and y_df aligned regardless of
    # what index the DataFrame carries.
    complete_rows = y_df.notnull().all(axis=1).to_numpy()
    X = X[complete_rows]
    y_df = y_df[complete_rows]

    print(f"\nAfter dropping NaN rows: {len(y_df)} samples remaining")

# Both objects were filtered with the same mask, so they are still aligned
print(f"\nData consistency check passed ({len(X)} samples)")

## Step 3: Data Split and Standardization

In [None]:
# Split the samples into train/test partitions and standardize the embedding
# features using statistics estimated on the training partition only.
banner = "=" * 80
print(banner)
print("Data split and standardization")
print(banner)

# Hold out 20% of the samples; the fixed seed makes the split reproducible
print("\nPerforming 80/20 split...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y_df, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Fit the scaler on the training rows, then apply it to both partitions so
# no test-set statistics leak into the transformation
print("\nStandardizing embeddings...")
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for split_name, scaled in (("Training", X_train_scaled), ("Test", X_test_scaled)):
    print(f"{split_name} set standardized: mean={scaled.mean():.6f}, std={scaled.std():.6f}")

## Step 4: Train Ridge Models

In [None]:
# Fit one Ridge regressor per OCEAN dimension on the standardized training
# embeddings and record train/test R2, RMSE and MAE for each of them.
print("="*80)
print("Training Ridge regression models (Gemma-2-9B Ground Truth)")
print("="*80)

# Define OCEAN dimensions
OCEAN_DIMS = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']

# Fail before any model is fitted if a target column is absent
missing_dims = [dim for dim in OCEAN_DIMS if dim not in y_train.columns]
if missing_dims:
    raise KeyError(f"OCEAN columns missing from targets: {missing_dims}")

ridge_models = {}
training_results = {}

# enumerate supplies the progress counter directly instead of searching the
# list for the current name on every iteration
for position, dim in enumerate(OCEAN_DIMS, start=1):
    print(f"\n[{position}/{len(OCEAN_DIMS)}] Training {dim}...")

    # Get target variable
    y_train_dim = y_train[dim].values
    y_test_dim = y_test[dim].values

    # Train Ridge model
    # alpha=1.0 is the default value, can be optimized through cross-validation
    model = Ridge(alpha=1.0, random_state=42, max_iter=10000)
    model.fit(X_train_scaled, y_train_dim)

    # Predict
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Evaluate
    train_r2 = r2_score(y_train_dim, y_train_pred)
    test_r2 = r2_score(y_test_dim, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train_dim, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test_dim, y_test_pred))
    train_mae = mean_absolute_error(y_train_dim, y_train_pred)
    test_mae = mean_absolute_error(y_test_dim, y_test_pred)

    # Save model
    ridge_models[dim] = model

    # Save results as plain Python floats so they can be written to JSON
    training_results[dim] = {
        'train_r2': float(train_r2),
        'test_r2': float(test_r2),
        'train_rmse': float(train_rmse),
        'test_rmse': float(test_rmse),
        'train_mae': float(train_mae),
        'test_mae': float(test_mae),
        'model_coef_shape': model.coef_.shape,
        'model_intercept': float(model.intercept_)
    }

    print(f"  Train R2:  {train_r2:.4f}")
    print(f"  Test R2:   {test_r2:.4f}")
    print(f"  Train RMSE: {train_rmse:.4f}")
    print(f"  Test RMSE:  {test_rmse:.4f}")
    print(f"  Train MAE:  {train_mae:.4f}")
    print(f"  Test MAE:   {test_mae:.4f}")

print(f"\nRidge model training complete")

## Step 5: Save Models

In [None]:
# Bundle the fitted models, the scaler and the metrics into one pickle file,
# then read the file back as a sanity check.
separator = "=" * 80
print(separator)
print("Saving models")
print(separator)

# Everything a later step needs to reproduce predictions lives in this bundle
model_data = dict(
    models=ridge_models,
    scaler=scaler,
    ocean_dims=OCEAN_DIMS,
    training_results=training_results,
    training_timestamp=datetime.now().isoformat(),
    llm_model='Gemma-2-9B',
)

model_file = '../ridge_models_gemma.pkl'
with open(model_file, 'wb') as out_handle:
    pickle.dump(model_data, out_handle)

print(f"\nModels saved: {model_file}")
size_mb = os.path.getsize(model_file) / 1024 / 1024
print(f"  File size: {size_mb:.2f} MB")
for summary_line in (
    "  Contents:",
    "    - 5 Ridge models (Gemma-2-9B)",
    "    - 1 StandardScaler",
    "    - Training results",
):
    print(summary_line)

# Verify loading: a failure here is reported but does not stop the notebook
try:
    with open(model_file, 'rb') as in_handle:
        loaded_data = pickle.load(in_handle)
    print(f"\nModel loading verification passed")
    print(f"  Loaded OCEAN dimensions: {loaded_data['ocean_dims']}")
    print(f"  LLM model: {loaded_data['llm_model']}")
except Exception as e:
    print(f"\nModel loading failed: {e}")

## Step 6: Generate Training Report

In [None]:
# Generate training report: run metadata, per-dimension metrics and
# test-set summary statistics, written to JSON and echoed to the output.

# Test-split metrics collected in OCEAN_DIMS order
test_r2_scores = [training_results[dim]['test_r2'] for dim in OCEAN_DIMS]
test_rmse_scores = [training_results[dim]['test_rmse'] for dim in OCEAN_DIMS]
test_mae_scores = [training_results[dim]['test_mae'] for dim in OCEAN_DIMS]

report = {
    'phase': '05f - Train Ridge Models (Gemma-2-9B)',
    'timestamp': datetime.now().isoformat(),
    'llm_model': 'Gemma-2-9B',
    'embedding_model': 'BAAI/bge-large-en-v1.5',
    # Read from the data actually used for training rather than hard-coded
    'embedding_dimension': int(X_train.shape[1]),
    'training_samples': int(X_train.shape[0]),
    'test_samples': int(X_test.shape[0]),
    'model_type': 'Ridge Regression',
    # Read from a fitted model so the report cannot drift from the training
    # cell; all models are constructed with the same alpha there
    'model_alpha': float(ridge_models[OCEAN_DIMS[0]].alpha),
    'ocean_dimensions': OCEAN_DIMS,
    'model_file': model_file,
    'training_results': training_results,
    'summary_metrics': {
        'avg_test_r2': float(np.mean(test_r2_scores)),
        'avg_test_rmse': float(np.mean(test_rmse_scores)),
        'avg_test_mae': float(np.mean(test_mae_scores)),
        'min_test_r2': float(np.min(test_r2_scores)),
        'max_test_r2': float(np.max(test_r2_scores))
    }
}

# Save report
report_file = '../05f_ridge_training_report_gemma.json'
with open(report_file, 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2)

print(f"Training report saved: {report_file}")
print("\n" + "="*80)
print("Training Summary (Gemma-2-9B)")
print("="*80)
print(json.dumps(report['summary_metrics'], indent=2))
print("\n" + "="*80)
print("Detailed Results by Dimension")
print("="*80)
for dim in OCEAN_DIMS:
    result = training_results[dim]
    print(f"\n{dim}:")
    print(f"  Train R2: {result['train_r2']:.4f}, Test R2: {result['test_r2']:.4f}")
    print(f"  Train RMSE: {result['train_rmse']:.4f}, Test RMSE: {result['test_rmse']:.4f}")
    print(f"  Train MAE: {result['train_mae']:.4f}, Test MAE: {result['test_mae']:.4f}")

## Summary

Step 05f Complete (Gemma-2-9B)

**Output Files**:
- `ridge_models_gemma.pkl` - Trained Ridge models (5 dimensions)
- `05f_ridge_training_report_gemma.json` - Training report

**Model Performance**:
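- Average Test R2: see `summary_metrics.avg_test_r2` in `05f_ridge_training_report_gemma.json`
- Average Test RMSE: see `summary_metrics.avg_test_rmse` in `05f_ridge_training_report_gemma.json`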

**Next Step**:
Run `05g_apply_ridge_gemma.ipynb` to generate OCEAN features for full dataset