# 04. Nonlinear Machine Learning Models

**Objective:** Implement and evaluate nonlinear ML models for inflation forecasting

**Models:**
1. Random Forest Regression
2. Support Vector Regression (SVR)
3. XGBoost (Extreme Gradient Boosting)

**Evaluation Metrics:**
- RMSFE (Root Mean Squared Forecast Error)
- MAPE (Mean Absolute Percentage Error)

In [None]:
import sys
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Nonlinear ML models
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Custom metrics
sys.path.append('../')
from utils.metrics import rmsfe, mape, evaluate_model

# Paths
PROCESSED_DATA_PATH = Path('../data/processed')
RESULTS_PATH = Path('../results')

# Later cells write into results/figures and results/tables. Neither
# plt.savefig nor DataFrame.to_csv creates missing parent directories, so
# create every output directory up front instead of only the root.
for _output_dir in (RESULTS_PATH, RESULTS_PATH / 'figures', RESULTS_PATH / 'tables'):
    _output_dir.mkdir(parents=True, exist_ok=True)

## 1. Load Processed Data

In [None]:
# Read the pre-split train/test CSVs. The first CSV column is used as the
# index and pandas is asked to parse it as dates.
csv_options = dict(index_col=0, parse_dates=True)
df_train = pd.read_csv(PROCESSED_DATA_PATH / 'df_train.csv', **csv_options)
df_test = pd.read_csv(PROCESSED_DATA_PATH / 'df_test.csv', **csv_options)

print(f"Train set: {df_train.shape}")
print(f"Test set:  {df_test.shape}")

# The forecasting target is whatever sits in the first column of the training frame
target_col = df_train.columns[0]
print(f"\nTarget: {target_col}")

## 2. Prepare Data

In [None]:
# Create supervised learning format with lag features
def create_supervised_data(data, n_lags=12):
    """
    Build a supervised-learning table from a single time series.

    Column ``y`` holds the value at time t, and ``lag_1`` .. ``lag_<n_lags>``
    hold the values at t-1 .. t-n_lags. Any row containing a NaN is dropped,
    which removes the leading rows that lack a full lag history. The returned
    frame carries a positional integer index, not the index of ``data``.
    """
    # dicts keep insertion order, so columns come out as y, lag_1, lag_2, ...
    columns = {'y': data.values}
    columns.update(
        (f'lag_{k}', data.shift(k).values) for k in range(1, n_lags + 1)
    )

    frame = pd.DataFrame(columns)
    return frame.dropna()

# Create supervised dataset
n_lags = 12
df_train_supervised = create_supervised_data(df_train[target_col], n_lags=n_lags)

# Seed the test lags with the last n_lags training observations. Building the
# lags from the test series alone would discard its first n_lags rows (every
# row if the test set has n_lags rows or fewer). Each test row still only uses
# values that precede it, so no future information enters the features.
# NOTE(review): assumes df_test directly follows df_train in time - confirm
# against the notebook that produced the split.
test_with_history = pd.concat([df_train[target_col].tail(n_lags), df_test[target_col]])
df_test_supervised = create_supervised_data(test_with_history, n_lags=n_lags)

# Split into X (lag features) and y (value at time t)
X_train = df_train_supervised.drop('y', axis=1)
y_train = df_train_supervised['y']

X_test = df_test_supervised.drop('y', axis=1)
y_test = df_test_supervised['y']

print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape:  {X_test.shape}")

In [None]:
# Standardized copies of the features are prepared for SVR only; Random Forest
# and XGBoost are fitted on the unscaled features.
scaler = StandardScaler().fit(X_train)  # statistics come from the training features only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Features standardized")

## 3. Model 1: Random Forest Regression

Random Forest is an ensemble method that builds multiple decision trees and averages their predictions.

In [None]:
# Random Forest with hyperparameter tuning
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10]
}

# Use forward-chaining splits: every validation fold comes after the data the
# candidate was trained on. An integer cv (plain K-fold) would also train on
# observations later than the validation block, letting future values inform
# hyperparameter selection for a forecasting task.
# Relies on the rows of X_train being in chronological order.
rf_grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    rf_params,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

print("Training Random Forest... (this may take a while)")
rf_grid.fit(X_train, y_train)

print(f"Best params: {rf_grid.best_params_}")

# Best model (refitted on the full training set by GridSearchCV)
rf_model = rf_grid.best_estimator_
rf_predictions = rf_model.predict(X_test)

# Evaluate on the held-out test set
rf_results = evaluate_model(y_test.values, rf_predictions, "Random Forest")

In [None]:
# Rank the lag features by the importances reported by the fitted Random Forest
feature_importance_rf = (
    pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': rf_model.feature_importances_,
    })
    .sort_values('Importance', ascending=False)
)
top_rf = feature_importance_rf.head(10)

print("\nRandom Forest - Top 10 Important Features:")
print(top_rf)

# Horizontal bar chart of the ten highest-ranked features
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_rf['Feature'], top_rf['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Random Forest - Top 10 Feature Importance')
ax.invert_yaxis()  # highest-ranked feature at the top
fig.tight_layout()
fig.savefig(RESULTS_PATH / 'figures' / 'rf_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Model 2: Support Vector Regression (SVR)

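SVR fits a function that keeps the training points inside an ε-wide tube around its predictions wherever possible: deviations smaller than ε are not penalized, and `C` controls how strongly larger deviations are penalized. With a nonlinear kernel such as RBF, the fitted function is nonlinear in the input features.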

In [None]:
# SVR with hyperparameter tuning
svr_params = {
    'kernel': ['rbf', 'linear'],
    'C': [0.1, 1.0, 10.0],
    'epsilon': [0.01, 0.1, 0.5]
}

# Use forward-chaining splits: every validation fold comes after the data the
# candidate was trained on. An integer cv (plain K-fold) would also train on
# observations later than the validation block, letting future values inform
# hyperparameter selection for a forecasting task.
# Relies on the rows of X_train_scaled being in chronological order.
svr_grid = GridSearchCV(
    SVR(),
    svr_params,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

print("Training SVR... (this may take a while)")
svr_grid.fit(X_train_scaled, y_train)

print(f"Best params: {svr_grid.best_params_}")

# Best model (refitted on the full training set by GridSearchCV)
svr_model = svr_grid.best_estimator_
svr_predictions = svr_model.predict(X_test_scaled)

# Evaluate on the held-out test set
svr_results = evaluate_model(y_test.values, svr_predictions, "SVR")

## 5. Model 3: XGBoost

XGBoost is a gradient boosting algorithm that builds trees sequentially, with each new tree fitted to reduce the errors of the ensemble built so far.

In [None]:
# XGBoost with hyperparameter tuning
xgb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0]
}

# Use forward-chaining splits: every validation fold comes after the data the
# candidate was trained on. An integer cv (plain K-fold) would also train on
# observations later than the validation block, letting future values inform
# hyperparameter selection for a forecasting task.
# Relies on the rows of X_train being in chronological order.
xgb_grid = GridSearchCV(
    XGBRegressor(random_state=42),
    xgb_params,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

print("Training XGBoost... (this may take a while)")
xgb_grid.fit(X_train, y_train)

print(f"Best params: {xgb_grid.best_params_}")

# Best model (refitted on the full training set by GridSearchCV)
xgb_model = xgb_grid.best_estimator_
xgb_predictions = xgb_model.predict(X_test)

# Evaluate on the held-out test set
xgb_results = evaluate_model(y_test.values, xgb_predictions, "XGBoost")

In [None]:
# Rank the lag features by the importances reported by the fitted XGBoost model
feature_importance_xgb = (
    pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': xgb_model.feature_importances_,
    })
    .sort_values('Importance', ascending=False)
)
top_xgb = feature_importance_xgb.head(10)

print("\nXGBoost - Top 10 Important Features:")
print(top_xgb)

# Horizontal bar chart of the ten highest-ranked features
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_xgb['Feature'], top_xgb['Importance'])
ax.set_xlabel('Importance')
ax.set_title('XGBoost - Top 10 Feature Importance')
ax.invert_yaxis()  # highest-ranked feature at the top
fig.tight_layout()
fig.savefig(RESULTS_PATH / 'figures' / 'xgb_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Visualize Results

In [None]:
# Overlay each model's test-set forecasts on the actual series
fig, ax = plt.subplots(figsize=(14, 8))
sample_idx = range(len(y_test))

ax.plot(sample_idx, y_test.values, label='Actual', linewidth=2.5, marker='o', markersize=6)

# The three model traces share one style and differ only in label, data and marker
for model_label, model_preds, model_marker in (
    ('Random Forest', rf_predictions, 's'),
    ('SVR', svr_predictions, '^'),
    ('XGBoost', xgb_predictions, 'd'),
):
    ax.plot(sample_idx, model_preds, label=model_label, linewidth=2, marker=model_marker, alpha=0.7)

ax.set_title('Nonlinear ML Models - Forecasts vs Actual', fontsize=16)
ax.set_xlabel('Test Sample Index', fontsize=12)
ax.set_ylabel('Inflation Rate (%)', fontsize=12)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(RESULTS_PATH / 'figures' / 'nonlinear_ml_forecasts.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Compare Model Performance

In [None]:
# Gather the per-model evaluation records into one table, best (lowest) RMSFE first.
# NOTE(review): presumes evaluate_model returns a dict-like record with an
# 'RMSFE' entry - confirm against utils.metrics.
results_list = [rf_results, svr_results, xgb_results]
results_df = pd.DataFrame(results_list).sort_values('RMSFE')

banner = "=" * 50
print("\n" + banner)
print("NONLINEAR ML MODELS PERFORMANCE SUMMARY")
print(banner)
print(results_df.to_string(index=False))
print(banner)

# Persist the summary table
results_csv = RESULTS_PATH / 'tables' / 'nonlinear_ml_results.csv'
results_df.to_csv(results_csv, index=False)
print("\n✓ Results saved to results/tables/nonlinear_ml_results.csv")

In [None]:
# Store the actual values next to each model's test-set forecasts
model_forecasts = {
    'Random_Forest': rf_predictions,
    'SVR': svr_predictions,
    'XGBoost': xgb_predictions,
}
predictions_df = pd.DataFrame({'Actual': y_test.values, **model_forecasts})

# The positional index is written as the first (unnamed) CSV column
predictions_csv = RESULTS_PATH / 'tables' / 'nonlinear_ml_predictions.csv'
predictions_df.to_csv(predictions_csv)
print("✓ Predictions saved to results/tables/nonlinear_ml_predictions.csv")

## Summary

**Nonlinear ML models completed:**
- ✓ Random Forest Regression
- ✓ Support Vector Regression (SVR)
- ✓ XGBoost

**Next steps:**
- Compare all models (econometric + linear ML + nonlinear ML) in Notebook 05