# Nonlinear Model Training
**Purpose**: Train tree-based and ensemble models to capture nonlinear patterns in financial data

## Pipeline
1. Load prepared dataset & selected features from 03a
2. Train Random Forest, Gradient Boosting, and XGBoost
3. Hyperparameter selection: compare a fixed set of configurations on the validation set
4. Feature importance analysis
5. Compare all models (including linear baseline)
6. Save best model & artifacts

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import json
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import joblib

# Global plot appearance for every figure below: matplotlib style sheet, then
# seaborn's default colour palette. Both calls change global plotting state,
# so their order is kept as written.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Confirmation that the import/setup cell ran to completion.
print('✓ Libraries loaded')

In [None]:
# === Configuration ===
# Column to predict, and the columns that are never offered to the models as
# inputs when the fallback "all numeric features" path is taken.
TARGET = 'forward_return_30d'
EXCLUDE_COLS = ['ticker', 'date', 'close', 'forward_return_5d', 'forward_return_30d']

# Single seed passed to every estimator so repeated runs are comparable.
RANDOM_STATE = 42

# Where the prepared parquet splits are read from, and where models/metadata
# are written (the output directory is created on first run).
DATA_DIR = Path('/home/archy/Desktop/Server/FinancialData/model_regime_comparison/data/prepared')
MODEL_DIR = Path('/home/archy/Desktop/Server/FinancialData/model_regime_comparison/models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

print(f'Data dir: {DATA_DIR}')
print(f'Target: {TARGET}')

## 1. Load Data & Features

In [None]:
# Read the train / validation / test splits from DATA_DIR, in that order.
train_df, val_df, test_df = (
    pd.read_parquet(DATA_DIR / filename)
    for filename in (
        'finance_train.parquet',
        'finance_val.parquet',
        'finance_test.parquet',
    )
)

# Report each split's shape; labels are padded so the shapes line up.
for split_label, split_frame in (
    ('Train:', train_df),
    ('Val:  ', val_df),
    ('Test: ', test_df),
):
    print(f'{split_label} {split_frame.shape}')

In [None]:
# Load selected features from 03a (if available), otherwise use all numeric features
features_path = MODEL_DIR / 'selected_features.json'
if features_path.exists():
    with open(features_path, encoding='utf-8') as f:
        feat_info = json.load(f)
    selected_features = feat_info['selected_features']

    # The saved list can be stale relative to the parquet files loaded above.
    # Fail here with the offending names instead of letting a later column
    # selection raise a less informative indexing error.
    missing_features = [c for c in selected_features if c not in train_df.columns]
    if missing_features:
        raise KeyError(
            f'{features_path} lists features that are not in the training data: '
            f'{missing_features}'
        )
    print(f'Loaded {len(selected_features)} selected features from 03a')
else:
    # Fallback: every numeric column that is not an identifier, price or target.
    all_feature_cols = [c for c in train_df.columns if c not in EXCLUDE_COLS]
    selected_features = train_df[all_feature_cols].select_dtypes(include=[np.number]).columns.tolist()
    print(f'No saved features found, using all {len(selected_features)} numeric features')

print(f'Features: {selected_features}')

In [None]:
# Imputation values: per-feature median over every training row, so that no
# information from the validation or test splits is used.
train_medians = train_df[selected_features].median()

# Rows are usable only when the target is present.
train_mask = train_df[TARGET].notna()
val_mask = val_df[TARGET].notna()
test_mask = test_df[TARGET].notna()

# Feature matrices: keep rows with a target, then fill gaps with the training medians.
X_train = train_df.loc[train_mask, selected_features].fillna(train_medians)
X_val = val_df.loc[val_mask, selected_features].fillna(train_medians)
X_test = test_df.loc[test_mask, selected_features].fillna(train_medians)

# Matching target vectors.
y_train = train_df.loc[train_mask, TARGET]
y_val = val_df.loc[val_mask, TARGET]
y_test = test_df.loc[test_mask, TARGET]

# A feature whose training median is itself NaN cannot be imputed; remove any
# such column from all three splits and from the feature list.
still_nan = [col for col in X_train.columns if X_train[col].isnull().any()]
if still_nan:
    print(f'Dropping {len(still_nan)} columns still containing NaN: {still_nan}')
    X_train, X_val, X_test = (
        matrix.drop(columns=still_nan) for matrix in (X_train, X_val, X_test)
    )
    selected_features = [c for c in selected_features if c not in still_nan]

for split_label, matrix in (('Train:', X_train), ('Val:  ', X_val), ('Test: ', X_test)):
    print(f'{split_label} {matrix.shape}')

## 2. Train Nonlinear Models

In [None]:
def evaluate_model(model, X_tr, y_tr, X_v, y_v):
    """Fit ``model`` on the training split and score it on both splits.

    The estimator is fitted in place, so the object passed in is trained
    once this function returns.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_tr, y_tr: Training features and target.
        X_v, y_v: Validation features and target.

    Returns:
        dict mapping 'Train RMSE', 'Val RMSE', 'Train MAE', 'Val MAE',
        'Train R²' and 'Val R²' (in that order) to their values.
    """
    model.fit(X_tr, y_tr)
    train_pred = model.predict(X_tr)
    val_pred = model.predict(X_v)

    # RMSE is derived from MSE; the other metrics are used directly.
    train_rmse = np.sqrt(mean_squared_error(y_tr, train_pred))
    val_rmse = np.sqrt(mean_squared_error(y_v, val_pred))

    scores = {'Train RMSE': train_rmse, 'Val RMSE': val_rmse}
    scores['Train MAE'] = mean_absolute_error(y_tr, train_pred)
    scores['Val MAE'] = mean_absolute_error(y_v, val_pred)
    scores['Train R²'] = r2_score(y_tr, train_pred)
    scores['Val R²'] = r2_score(y_v, val_pred)
    return scores

In [None]:
# Define models with varying hyperparameters
# Keyword arguments common to every configuration within a family.
rf_shared = dict(random_state=RANDOM_STATE, n_jobs=-1)
gbr_shared = dict(subsample=0.8, random_state=RANDOM_STATE)
xgb_shared = dict(
    subsample=0.8, colsample_bytree=0.8,
    random_state=RANDOM_STATE, n_jobs=-1, verbosity=0,
)

models = {
    # Random Forest variants
    'RF (100, d=5)': RandomForestRegressor(
        n_estimators=100, max_depth=5, min_samples_leaf=20, **rf_shared),
    'RF (200, d=8)': RandomForestRegressor(
        n_estimators=200, max_depth=8, min_samples_leaf=10, **rf_shared),
    'RF (300, d=12)': RandomForestRegressor(
        n_estimators=300, max_depth=12, min_samples_leaf=5, **rf_shared),

    # Gradient Boosting variants
    'GBR (100, d=3, lr=0.1)': GradientBoostingRegressor(
        n_estimators=100, max_depth=3, learning_rate=0.1, **gbr_shared),
    'GBR (200, d=4, lr=0.05)': GradientBoostingRegressor(
        n_estimators=200, max_depth=4, learning_rate=0.05, **gbr_shared),
    'GBR (500, d=3, lr=0.01)': GradientBoostingRegressor(
        n_estimators=500, max_depth=3, learning_rate=0.01, **gbr_shared),

    # XGBoost variants
    'XGB (100, d=3, lr=0.1)': xgb.XGBRegressor(
        n_estimators=100, max_depth=3, learning_rate=0.1,
        reg_alpha=0.1, reg_lambda=1.0, **xgb_shared),
    'XGB (200, d=4, lr=0.05)': xgb.XGBRegressor(
        n_estimators=200, max_depth=4, learning_rate=0.05,
        reg_alpha=0.1, reg_lambda=1.0, **xgb_shared),
    'XGB (500, d=3, lr=0.01)': xgb.XGBRegressor(
        n_estimators=500, max_depth=3, learning_rate=0.01,
        reg_alpha=0.1, reg_lambda=1.0, **xgb_shared),
    'XGB (1000, d=4, lr=0.005)': xgb.XGBRegressor(
        n_estimators=1000, max_depth=4, learning_rate=0.005,
        reg_alpha=0.5, reg_lambda=2.0, **xgb_shared),
}

# Fit every configuration and keep its metrics together with the fitted
# estimator, so the best one can be picked up later without retraining.
results = []
for name, model in models.items():
    metrics = evaluate_model(model, X_train, y_train, X_val, y_val)
    metrics.update({'Model': name, 'model_obj': model})
    results.append(metrics)
    progress_line = (
        f'{name:32s}'
        f'  Train RMSE={metrics["Train RMSE"]:.6f}'
        f'  Val RMSE={metrics["Val RMSE"]:.6f}'
        f'  Val R²={metrics["Val R²"]:.6f}'
    )
    print(progress_line)

results_df = pd.DataFrame(results)
print('\n✓ All models trained')

## 3. Model Comparison

In [None]:
# Metrics table, best validation RMSE first (fitted estimators are left out).
metric_columns = ['Model', 'Train RMSE', 'Val RMSE', 'Train MAE', 'Val MAE', 'Train R²', 'Val R²']
display_df = results_df[metric_columns].sort_values('Val RMSE')
print(display_df.to_string(index=False))

In [None]:
# Model comparison plot: one panel per metric, train and validation bars side
# by side, models ordered by validation RMSE.
sorted_df = results_df.sort_values('Val RMSE')
positions = np.arange(len(sorted_df))
bar_width = 0.35

# (panel title, train column, validation column)
panels = [
    ('RMSE', 'Train RMSE', 'Val RMSE'),
    ('MAE', 'Train MAE', 'Val MAE'),
    ('R² Score', 'Train R²', 'Val R²'),
]

fig, axes = plt.subplots(1, 3, figsize=(20, 7))
for ax, (panel_title, train_col, val_col) in zip(axes, panels):
    ax.bar(positions - bar_width / 2, sorted_df[train_col], bar_width, label='Train', alpha=0.8)
    ax.bar(positions + bar_width / 2, sorted_df[val_col], bar_width, label='Validation', alpha=0.8)
    ax.set_xticks(positions)
    ax.set_xticklabels(sorted_df['Model'], rotation=55, ha='right', fontsize=8)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Overfitting analysis: train vs val gap
sorted_df = results_df.sort_values('Val RMSE').copy()
# Both indicators grow as a model overfits: the gap is validation error minus
# training error, and the ratio is validation error over training error.
sorted_df['Overfit Gap'] = sorted_df['Val RMSE'] - sorted_df['Train RMSE']
sorted_df['Overfit Ratio'] = sorted_df['Val RMSE'] / sorted_df['Train RMSE']

# One bar per model (ordered by validation RMSE) against the ratio=1 reference.
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(sorted_df))
ax.bar(x, sorted_df['Overfit Ratio'], color='coral', alpha=0.7)
ax.axhline(1.0, color='green', linestyle='--', lw=1.5, label='No overfit (ratio=1.0)')
ax.set_xticks(x)
ax.set_xticklabels(sorted_df['Model'], rotation=55, ha='right', fontsize=8)
ax.set_ylabel('Val RMSE / Train RMSE')
ax.set_title('Overfitting Analysis (closer to 1.0 = less overfit)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 4. Best Model Evaluation

In [None]:
# The winner is the configuration with the lowest validation RMSE.
best_idx = results_df['Val RMSE'].idxmin()
best_name = results_df.loc[best_idx, 'Model']
best_model = results_df.loc[best_idx, 'model_obj']
best_val_rmse = results_df.loc[best_idx, 'Val RMSE']
best_val_r2 = results_df.loc[best_idx, 'Val R²']

print(f'Best Model: {best_name}')
print(f'   Val RMSE: {best_val_rmse:.6f}')
print(f'   Val R²:   {best_val_r2:.6f}')

# Score the selected model on the test split, which played no part in the
# selection above.
y_test_pred = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print()
for metric_label, metric_value in (
    ('Test RMSE:', test_rmse),
    ('Test MAE: ', test_mae),
    ('Test R²:  ', test_r2),
):
    print(f'   {metric_label} {metric_value:.6f}')

In [None]:
# Validation diagnostics for the best model: predicted-vs-actual scatter,
# residuals against predictions, and the residual histogram.
y_val_pred = best_model.predict(X_val)
residuals = y_val - y_val_pred

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_fit, ax_resid, ax_hist = axes

# Predicted vs actual, with the y = x reference line spanning both ranges
lower = min(y_val.min(), y_val_pred.min())
upper = max(y_val.max(), y_val_pred.max())
lims = [lower, upper]
ax_fit.scatter(y_val, y_val_pred, alpha=0.3, s=10)
ax_fit.plot(lims, lims, 'r--', lw=1.5)
ax_fit.set_xlabel('Actual')
ax_fit.set_ylabel('Predicted')
ax_fit.set_title('Validation: Predicted vs Actual')

# Residuals against the prediction they belong to
ax_resid.scatter(y_val_pred, residuals, alpha=0.3, s=10)
ax_resid.axhline(0, color='red', linestyle='--', lw=1.5)
ax_resid.set_xlabel('Predicted')
ax_resid.set_ylabel('Residual')
ax_resid.set_title('Residual Plot')

# Residual distribution, annotated with its mean and standard deviation
ax_hist.hist(residuals, bins=40, edgecolor='black', alpha=0.7)
ax_hist.axvline(0, color='red', linestyle='--', lw=1.5)
ax_hist.set_xlabel('Residual')
ax_hist.set_ylabel('Count')
ax_hist.set_title(f'Residual Distribution (μ={residuals.mean():.4f}, σ={residuals.std():.4f})')

for panel in axes:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Feature Importance

In [None]:
# Feature importance from best model
if hasattr(best_model, 'feature_importances_'):
    imp = best_model.feature_importances_
    # Descending order so the printed table starts with the strongest feature.
    imp_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': imp
    }).sort_values('Importance', ascending=False)

    # Figure height grows with the number of features so labels stay readable.
    fig, ax = plt.subplots(figsize=(12, max(6, len(imp_df) * 0.35)))
    ax.barh(range(len(imp_df)), imp_df['Importance'], color='teal', alpha=0.7)
    ax.set_yticks(range(len(imp_df)))
    ax.set_yticklabels(imp_df['Feature'])
    # barh draws row 0 at the bottom; flip the axis so the chart reads top-down
    # in the same order as the table printed below.
    ax.invert_yaxis()
    ax.set_xlabel('Feature Importance')
    ax.set_title(f'Feature Importance - {best_name}')
    ax.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.show()

    print(imp_df.to_string(index=False))
else:
    print('Best model does not expose feature_importances_')

In [None]:
# Compare feature importance across model families
# Pick the best from each family
families = {'RF': 'RF', 'GBR': 'GBR', 'XGB': 'XGB'}  # model-name prefix -> panel label
family_best = {}
for prefix, label in families.items():
    family_rows = results_df[results_df['Model'].str.startswith(prefix)]
    if not family_rows.empty:
        best_family_idx = family_rows['Val RMSE'].idxmin()
        family_best[label] = family_rows.loc[best_family_idx, 'model_obj']

if family_best:
    fig, axes = plt.subplots(
        1, len(family_best),
        figsize=(7 * len(family_best), max(6, len(selected_features) * 0.35)),
    )
    # plt.subplots returns a bare Axes when there is a single panel; normalise
    # to an array so the loop below works for any number of families.
    axes = np.atleast_1d(axes)

    # Panel-local names are used so this cell does not overwrite the `imp`,
    # `imp_df` or `model` variables left behind by earlier cells.
    for ax, (label, family_model) in zip(axes, family_best.items()):
        if not hasattr(family_model, 'feature_importances_'):
            ax.set_title(f'{label} (no feature importances)')
            continue
        # Ascending sort places the most important feature at the top of barh.
        family_imp_df = pd.DataFrame({
            'Feature': selected_features,
            'Importance': family_model.feature_importances_,
        }).sort_values('Importance', ascending=True)
        ax.barh(range(len(family_imp_df)), family_imp_df['Importance'], alpha=0.7)
        ax.set_yticks(range(len(family_imp_df)))
        ax.set_yticklabels(family_imp_df['Feature'])
        ax.set_xlabel('Importance')
        ax.set_title(f'{label} (best)')
        ax.grid(True, alpha=0.3, axis='x')

    plt.suptitle('Feature Importance Comparison Across Model Families', fontsize=14)
    plt.tight_layout()
    plt.show()

## 6. Compare with Linear Baseline

In [None]:
# Load linear model results if available
# A dedicated name is used for this path: the save cell binds `meta_path` to
# the nonlinear metadata file, and re-running this cell afterwards must still
# read the linear baseline rather than this notebook's own output.
linear_meta_path = MODEL_DIR / 'model_metadata.json'
linear_meta = None
if linear_meta_path.exists():
    with open(linear_meta_path, encoding='utf-8') as f:
        linear_meta = json.load(f)
    print('Linear baseline (from 03a):')
    print(f'  Model:     {linear_meta["model_name"]}')
    print(f'  Val RMSE:  {linear_meta["val_rmse"]:.6f}')
    print(f'  Val R²:    {linear_meta["val_r2"]:.6f}')
    print(f'  Test RMSE: {linear_meta["test_rmse"]:.6f}')
    print(f'  Test R²:   {linear_meta["test_r2"]:.6f}')
    print()

print(f'Best nonlinear model ({best_name}):')
print(f'  Val RMSE:  {results_df.loc[best_idx, "Val RMSE"]:.6f}')
print(f'  Val R²:    {results_df.loc[best_idx, "Val R²"]:.6f}')
print(f'  Test RMSE: {test_rmse:.6f}')
print(f'  Test R²:   {test_r2:.6f}')

# Decide on the loaded metadata itself rather than re-checking the file, so
# the comparison can never run against a missing or stale `linear_meta`.
if linear_meta is not None:
    # Positive percentage = the nonlinear model has the lower test RMSE.
    rmse_improvement = (linear_meta['test_rmse'] - test_rmse) / linear_meta['test_rmse'] * 100
    print(f'\nTest RMSE improvement over linear: {rmse_improvement:+.2f}%')
    r2_diff = test_r2 - linear_meta['test_r2']
    print(f'Test R² improvement over linear: {r2_diff:+.6f}')

## 7. Save Best Model & Artifacts

In [None]:
# Persist the fitted estimator
model_path = MODEL_DIR / 'best_nonlinear_model.pkl'
joblib.dump(best_model, model_path)
print(f'✓ Model saved to {model_path}')

# Metrics of the winning configuration, read once and reused below
best_val_rmse = results_df.loc[best_idx, 'Val RMSE']
best_val_r2 = results_df.loc[best_idx, 'Val R²']

# Per-model metric rows without the estimator objects, which are not JSON-serialisable
serializable_cols = [col for col in results_df.columns if col != 'model_obj']
all_results = [
    {col: row[col] for col in serializable_cols}
    for _, row in results_df.iterrows()
]

# Save metadata
nl_metadata = {
    'model_name': best_name,
    'model_type': 'nonlinear',
    'target': TARGET,
    'n_features': len(selected_features),
    'features': selected_features,
    'train_size': len(X_train),
    'val_rmse': float(best_val_rmse),
    'val_r2': float(best_val_r2),
    'test_rmse': float(test_rmse),
    'test_mae': float(test_mae),
    'test_r2': float(test_r2),
    'all_results': all_results,
}
meta_path = MODEL_DIR / 'nonlinear_model_metadata.json'
with open(meta_path, 'w') as f:
    json.dump(nl_metadata, f, indent=2)
print(f'✓ Metadata saved to {meta_path}')

# Final summary banner
banner = '=' * 60
print(f'\n{banner}')
print(f'Best Model: {best_name}')
print(f'Features: {len(selected_features)}')
print(f'Val RMSE:  {best_val_rmse:.6f}')
print(f'Val R²:    {best_val_r2:.6f}')
print(f'Test RMSE: {test_rmse:.6f}')
print(f'Test R²:   {test_r2:.6f}')
print(banner)