# Random Forest Model - Implementation

## Features

- **RandomForestModel**: Single-target Random Forest regressor
- **RandomForestGoalPredictor**: Dual model for home/away goals
- Feature importance (impurity-based and permutation)
- Out-of-bag (OOB) scoring
- Cross-validation support
- Bootstrap aggregating (bagging)

## Hyperparameters

| Parameter | Default | Range | Impact |
|-----------|---------|-------|--------|
| n_estimators | 200 | 50-500 | More trees = lower variance (with diminishing returns) but slower training |
| max_depth | 15 | 5-30 | Deeper = more complex |
| min_samples_split | 5 | 2-20 | Higher = more regularization |
| min_samples_leaf | 2 | 1-10 | Higher = smoother predictions |
| max_features | 'sqrt' | 'sqrt', 'log2', float | Feature sampling per split |

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Notebook-wide setup: silence all warnings and choose the plotting defaults
# used by every figure below.
warnings.filterwarnings(action='ignore')
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Random Forest ready!")

## RandomForestModel Class

In [None]:
class RandomForestModel:
    """
    Random Forest regression model for hockey goal prediction.

    Thin wrapper around ``RandomForestRegressor`` that accepts pandas or
    NumPy inputs, remembers the training column names and reports
    RMSE / MAE / R^2 together with the out-of-bag score.

    Advantages:
    - Robust to overfitting via bagging
    - No feature scaling required
    - Built-in feature importance
    - Out-of-bag error estimation
    - Handles non-linear relationships
    """

    DEFAULT_PARAMS = {
        'n_estimators': 200,
        'max_depth': 15,
        'min_samples_split': 5,
        'min_samples_leaf': 2,
        'max_features': 'sqrt',
        'bootstrap': True,
        'oob_score': True,
        'random_state': 42,
        'n_jobs': -1,
    }

    def __init__(self, params=None):
        """
        Build the underlying regressor.

        Args:
            params: Optional dict of overrides merged on top of
                ``DEFAULT_PARAMS`` and passed to ``RandomForestRegressor``.
        """
        self.params = self._resolve_params(params)
        self.model = RandomForestRegressor(**self.params)
        self.feature_names = None
        self.is_fitted = False

    @classmethod
    def _resolve_params(cls, params=None):
        """Merge user overrides with the defaults into a fresh dict.

        The out-of-bag score only exists for bootstrapped forests, so when
        the caller turns ``bootstrap`` off without saying anything about
        ``oob_score`` the default ``oob_score=True`` is dropped instead of
        letting the estimator fail at fit time. An explicit ``oob_score``
        from the caller is always respected.
        """
        overrides = dict(params or {})
        resolved = {**cls.DEFAULT_PARAMS, **overrides}
        if not resolved.get('bootstrap', True) and 'oob_score' not in overrides:
            resolved['oob_score'] = False
        return resolved

    def _to_array(self, X):
        """Convert ``X`` to the array layout the model was trained on.

        Non-DataFrame inputs are returned untouched. A DataFrame that
        contains every training column is selected/reordered by name, so a
        different column order (or extra columns) cannot silently shift
        features. If the names do not match, the columns are used
        positionally.
        """
        if not isinstance(X, pd.DataFrame):
            return X
        names = self.feature_names
        if names and X.columns.is_unique and set(names).issubset(X.columns):
            X = X[names]
        return X.values

    def fit(self, X, y):
        """
        Train the Random Forest model.

        Args:
            X: Feature matrix (DataFrame or array-like). DataFrame column
                names are remembered for importance reporting and for
                aligning later predictions.
            y: Target values (Series or array-like).

        Returns:
            self, to allow call chaining.
        """
        if isinstance(X, pd.DataFrame):
            self.feature_names = list(X.columns)
            X = X.values
        else:
            # Names from an earlier DataFrame fit would no longer describe
            # this training matrix.
            self.feature_names = None
        if isinstance(y, pd.Series):
            y = y.values

        self.model.fit(X, y)
        self.is_fitted = True

        # Log OOB score if available (absent when oob_score is disabled)
        if hasattr(self.model, 'oob_score_'):
            # \u00b2 is the superscript two; written as an escape so the
            # message survives re-encoding of this file.
            print(f"OOB R\u00b2 Score: {self.model.oob_score_:.4f}")

        return self

    def predict(self, X):
        """Return predictions for ``X`` (DataFrame or array-like)."""
        return self.model.predict(self._to_array(X))

    def evaluate(self, X, y):
        """
        Evaluate model performance.

        Returns:
            dict with keys ``rmse``, ``mae``, ``r2`` and ``oob_score``
            (``None`` when the model has no out-of-bag score).
        """
        predictions = self.predict(X)
        if isinstance(y, pd.Series):
            y = y.values

        return {
            'rmse': np.sqrt(mean_squared_error(y, predictions)),
            'mae': mean_absolute_error(y, predictions),
            'r2': r2_score(y, predictions),
            'oob_score': getattr(self.model, 'oob_score_', None),
        }

    def get_feature_importance(self, method='impurity'):
        """
        Get feature importance as a Series sorted in descending order.

        Args:
            method: Only ``'impurity'`` is supported here.

        Raises:
            ValueError: For any other ``method``.
        """
        if method != 'impurity':
            raise ValueError("Use permutation_importance for other methods")

        importances = self.model.feature_importances_
        # Fall back to generic names when the model was trained on an array.
        names = self.feature_names or [f"feature_{i}" for i in range(len(importances))]
        return pd.Series(importances, index=names).sort_values(ascending=False)

## Generate Sample Data

In [None]:
def generate_hockey_data(n_games=1000, seed=42):
    """
    Generate synthetic hockey data with realistic features.

    Args:
        n_games: Number of games (rows) to generate.
        seed: Seed for the private random generator. The default (42)
            reproduces the same data on every call; pass ``None`` for a
            different sample each time.

    Returns:
        DataFrame with twelve team-level feature columns, the derived
        ``elo_diff`` column and Poisson-distributed ``home_goals`` /
        ``away_goals`` targets.
    """
    # A private RandomState keeps the global NumPy RNG untouched. The legacy
    # RandomState (not default_rng) is used on purpose: seeded with 42 it
    # yields the same stream as np.random.seed(42) followed by np.random.*.
    rng = np.random.RandomState(seed)

    # NOTE: the draw order below defines the generated values; keep it stable.
    data = {
        'home_elo': rng.normal(1500, 100, n_games),
        'away_elo': rng.normal(1500, 100, n_games),
        'home_goals_avg': rng.uniform(2.5, 3.5, n_games),
        'away_goals_avg': rng.uniform(2.5, 3.5, n_games),
        'home_goals_against_avg': rng.uniform(2.5, 3.5, n_games),
        'away_goals_against_avg': rng.uniform(2.5, 3.5, n_games),
        'home_pp_pct': rng.uniform(0.15, 0.30, n_games),
        'away_pp_pct': rng.uniform(0.15, 0.30, n_games),
        'home_pk_pct': rng.uniform(0.75, 0.90, n_games),
        'away_pk_pct': rng.uniform(0.75, 0.90, n_games),
        # randint's upper bound is exclusive: rest days are 1..4
        'home_rest_days': rng.randint(1, 5, n_games),
        'away_rest_days': rng.randint(1, 5, n_games),
    }

    df = pd.DataFrame(data)
    df['elo_diff'] = df['home_elo'] - df['away_elo']

    # Expected goals: league baseline of 3.0 shifted by the Elo gap and by
    # the team's own scoring average.
    home_base = 3.0 + 0.001 * df['elo_diff'] + 0.3 * (df['home_goals_avg'] - 3.0)
    away_base = 3.0 - 0.001 * df['elo_diff'] + 0.3 * (df['away_goals_avg'] - 3.0)

    # Floor the Poisson rate at 1.5 so extreme Elo gaps cannot push it
    # towards zero or below.
    df['home_goals'] = rng.poisson(np.maximum(home_base, 1.5))
    df['away_goals'] = rng.poisson(np.maximum(away_base, 1.5))

    return df

# Build the synthetic season that the rest of the notebook works on, then
# preview the first rows.
games_df = generate_hockey_data(n_games=1000)
print(f"Generated {len(games_df)} games")
games_df.head()

## Train Random Forest Model

In [None]:
# Model inputs, in the column order the forest is trained on
feature_cols = [
    'home_elo',
    'away_elo',
    'elo_diff',
    'home_goals_avg',
    'away_goals_avg',
    'home_goals_against_avg',
    'away_goals_against_avg',
    'home_pp_pct',
    'away_pp_pct',
    'home_pk_pct',
    'away_pk_pct',
    'home_rest_days',
    'away_rest_days',
]

# Feature matrix and single regression target (home-team goals)
X = games_df.loc[:, feature_cols]
y = games_df.loc[:, 'home_goals']

# Hold out 20% of the games for testing; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training: {len(X_train)} games, Test: {len(X_test)} games")

In [None]:
# Fit on the training split; fit() returns the model itself
rf_model = RandomForestModel().fit(X_train, y_train)

# Score both splits before printing anything
train_metrics = rf_model.evaluate(X_train, y_train)
test_metrics = rf_model.evaluate(X_test, y_test)

# One report per split; metrics that are None (unavailable) are skipped
for heading, scores in (
    ("\nTraining Metrics:", train_metrics),
    ("\nTest Metrics:", test_metrics),
):
    print(heading)
    for k, v in scores.items():
        if v is None:
            continue
        print(f"  {k}: {v:.4f}")

## Feature Importance Analysis

In [None]:
# Impurity-based importance, sorted from most to least important
importance = rf_model.get_feature_importance(method='impurity')

# Horizontal bar chart on a dedicated figure
plt.figure(figsize=(10, 6))
ax = importance.plot.barh(color='forestgreen')
ax.set_xlabel('Importance (Mean Decrease in Impurity)')
ax.set_title('Random Forest Feature Importance')
plt.tight_layout()
plt.show()

print("\nTop 5 Features:")
print(importance.head())

In [None]:
# Permutation importance (more reliable), measured on the held-out split.
# Plain arrays are passed, matching how the wrapped estimator was fitted.
perm_importance = permutation_importance(
    rf_model.model,
    X_test.values,
    y_test.values,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)

# Mean and spread of the score drop per feature, largest first
perm_imp_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': perm_importance.importances_mean,
    'std': perm_importance.importances_std,
})
perm_imp_df = perm_imp_df.sort_values('importance', ascending=False)

# Bar chart with error bars; the y axis is flipped so the top feature is first
plt.figure(figsize=(10, 6))
plt.barh(
    perm_imp_df['feature'],
    perm_imp_df['importance'],
    xerr=perm_imp_df['std'],
    color='steelblue',
    alpha=0.8,
)
plt.xlabel('Mean Decrease in Performance')
plt.title('Permutation Feature Importance')
plt.tight_layout()
plt.gca().invert_yaxis()
plt.show()

## Cross-Validation

In [None]:
# 5-fold cross-validation of a fresh forest with the default hyperparameters
from sklearn.model_selection import cross_val_score

cv_estimator = RandomForestRegressor(**RandomForestModel.DEFAULT_PARAMS)
cv_scores = cross_val_score(
    cv_estimator,
    X,
    y,
    cv=5,
    scoring='neg_mean_squared_error',
)

# The scorer returns negated MSE; flip the sign before taking the root
cv_rmse = np.sqrt(-cv_scores)
print(f"CV RMSE: {cv_rmse.mean():.4f} (+/- {cv_rmse.std():.4f})")
print(f"Fold scores: {cv_rmse}")

## RandomForestGoalPredictor (Dual Model)

In [None]:
class RandomForestGoalPredictor:
    """
    Dual Random Forest model for predicting both home and away goals.

    Two independent ``RandomForestModel`` instances are trained on the same
    feature columns, one per target (``home_goals`` and ``away_goals``).
    """

    # Target / derived-target columns never used as inputs when the feature
    # columns are auto-detected.
    _TARGET_COLUMNS = ('home_goals', 'away_goals', 'total_goals')

    def __init__(self, params=None):
        """
        Args:
            params: Optional hyperparameter overrides, forwarded unchanged
                to both underlying ``RandomForestModel`` instances.
        """
        self.params = params
        self.home_model = RandomForestModel(params)
        self.away_model = RandomForestModel(params)
        self.feature_columns = None

    @classmethod
    def _resolve_feature_columns(cls, df, feature_columns=None):
        """Return the feature columns to use as a plain list.

        Any non-empty sequence (list, tuple, ndarray, pandas Index) is
        accepted, as is a single column name. ``None`` or an empty sequence
        selects every numeric column except the goal targets.
        """
        if isinstance(feature_columns, str):
            return [feature_columns]
        # len() rather than truthiness: arrays and Index objects raise on bool()
        if feature_columns is not None and len(feature_columns) > 0:
            return list(feature_columns)
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        return [c for c in numeric_columns if c not in cls._TARGET_COLUMNS]

    @staticmethod
    def _label_winners(home_pred, away_pred, margin=0.5):
        """Label each game 'home', 'away' or 'tie' from predicted goals.

        A side wins only when its prediction exceeds the other side's by
        more than ``margin``. Returns an object array of Python strings.
        """
        home = np.asarray(home_pred, dtype=float)
        away = np.asarray(away_pred, dtype=float)
        labels = np.where(
            home > away + margin,
            'home',
            np.where(away > home + margin, 'away', 'tie'),
        )
        return labels.astype(object)

    def fit(self, df, feature_columns=None):
        """
        Fit both home and away models.

        Args:
            df: DataFrame containing the feature columns plus the
                ``home_goals`` and ``away_goals`` targets.
            feature_columns: Columns to train on; when omitted, all numeric
                non-target columns are used.

        Returns:
            self, to allow call chaining.
        """
        self.feature_columns = self._resolve_feature_columns(df, feature_columns)

        X = df[self.feature_columns]

        print("Training home goals model...")
        self.home_model.fit(X, df['home_goals'])

        print("Training away goals model...")
        self.away_model.fit(X, df['away_goals'])

        return self

    def predict_goals(self, df):
        """Predict home and away goals; returns ``(home_pred, away_pred)``."""
        X = df[self.feature_columns]
        return self.home_model.predict(X), self.away_model.predict(X)

    def predict_winner(self, df, margin=0.5):
        """
        Predict game winners.

        Args:
            df: DataFrame with the fitted feature columns.
            margin: Minimum predicted goal difference required to call a
                winner; smaller gaps are labelled ``'tie'``.

        Returns:
            Series of ``'home'`` / ``'away'`` / ``'tie'`` indexed like ``df``.
        """
        home_pred, away_pred = self.predict_goals(df)
        return pd.Series(
            self._label_winners(home_pred, away_pred, margin),
            index=df.index,
        )

    def evaluate(self, df):
        """
        Evaluate both models against the goal columns of ``df``.

        Returns:
            dict with ``home_rmse``, ``home_mae``, ``away_rmse``, ``away_mae``.
        """
        home_pred, away_pred = self.predict_goals(df)
        return {
            'home_rmse': np.sqrt(mean_squared_error(df['home_goals'], home_pred)),
            'home_mae': mean_absolute_error(df['home_goals'], home_pred),
            'away_rmse': np.sqrt(mean_squared_error(df['away_goals'], away_pred)),
            'away_mae': mean_absolute_error(df['away_goals'], away_pred),
        }

In [None]:
# Train dual predictor on a training split only. Evaluating on the rows the
# forests were fitted on would report in-sample error, which badly understates
# the error on unseen games.
dual_train_df, dual_test_df = train_test_split(
    games_df, test_size=0.2, random_state=42
)

predictor = RandomForestGoalPredictor()
predictor.fit(dual_train_df, feature_cols)

# Evaluate on the held-out games
metrics = predictor.evaluate(dual_test_df)
print("\nDual Model Metrics:")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

## Hyperparameter Tuning Effect

In [None]:
# Sweep the forest size and record held-out metrics for each setting.
# OOB scoring is switched off because only the test-set numbers are compared.
n_trees_options = [50, 100, 200, 300, 500]
results = []

for n_trees in n_trees_options:
    forest = RandomForestModel({'n_estimators': n_trees, 'oob_score': False})
    forest.fit(X_train, y_train)
    scores = forest.evaluate(X_test, y_test)
    results.append({'n_estimators': n_trees, **scores})

results_df = pd.DataFrame(results)

# Test RMSE as a function of the number of trees
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    results_df['n_estimators'],
    results_df['rmse'],
    'o-',
    color='forestgreen',
    linewidth=2,
)
ax.set_xlabel('Number of Trees')
ax.set_ylabel('Test RMSE')
ax.set_title('Effect of n_estimators on Performance')
ax.grid(True, alpha=0.3)
plt.show()

print("\nResults:")
print(results_df[['n_estimators', 'rmse', 'mae', 'r2']].to_string(index=False))

In [None]:
# Sweep the tree depth (None = unlimited) and compare train vs test error;
# a widening gap between the two curves indicates overfitting.
depth_options = [5, 10, 15, 20, None]
results = []

for depth in depth_options:
    forest = RandomForestModel(
        {'max_depth': depth, 'n_estimators': 100, 'oob_score': False}
    )
    forest.fit(X_train, y_train)
    train_scores = forest.evaluate(X_train, y_train)
    test_scores = forest.evaluate(X_test, y_test)
    results.append({
        # Unlimited depth is shown as the text 'None' on the axis
        'max_depth': depth or 'None',
        'train_rmse': train_scores['rmse'],
        'test_rmse': test_scores['rmse'],
    })

results_df = pd.DataFrame(results)

# Evenly spaced positions, since 'None' has no numeric place on the axis
positions = range(len(depth_options))
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(positions, results_df['train_rmse'], 'o-', label='Train RMSE', color='steelblue')
ax.plot(positions, results_df['test_rmse'], 's-', label='Test RMSE', color='coral')
plt.xticks(positions, results_df['max_depth'])
ax.set_xlabel('Max Depth')
ax.set_ylabel('RMSE')
ax.set_title('Effect of max_depth on Overfitting')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## Save/Load Model

In [None]:
import pickle
from pathlib import Path

# Persist the fitted estimator together with the metadata needed to reuse it
output_dir = Path('../output/models')
output_dir.mkdir(parents=True, exist_ok=True)

model_path = output_dir.joinpath('random_forest_model.pkl')

artifact = {
    'model': rf_model.model,
    'feature_names': rf_model.feature_names,
    'params': rf_model.params,
}
with model_path.open('wb') as f:
    pickle.dump(artifact, f)

print(f"Model saved to {model_path}")

# Round-trip check: reload the file just written and score it on the test split.
# NOTE: pickle is only safe for files you produced yourself.
with model_path.open('rb') as f:
    loaded = pickle.load(f)

loaded_pred = loaded['model'].predict(X_test.values)
print(f"Loaded model RMSE: {np.sqrt(mean_squared_error(y_test, loaded_pred)):.4f}")

## Summary

### Key Takeaways

1. **Random Forest** uses bagging (bootstrap aggregating) for robust predictions
2. **OOB Score** provides a nearly free validation estimate without a separate holdout set
3. **Feature Importance** can be measured via impurity or permutation
4. **max_depth** is a key regularization parameter, alongside `min_samples_split` and `min_samples_leaf`
5. **No scaling required** - tree splits depend only on the ordering of feature values, so rescaling a feature does not change the model

### Best Practices

- Start with 100-200 trees, increase if OOB score improves
- Use `max_depth` to control overfitting
- Use permutation importance for reliable feature ranking
- Compare train vs test RMSE to detect overfitting