# Neural Network Model - Implementation

## Features

- **NeuralNetworkModel**: MLP regressor with automatic scaling
- **NeuralNetworkGoalPredictor**: Dual NN for home/away goals
- Training loss curve visualization
- Early stopping for regularization
- Architecture configuration

## Critical: Feature Scaling

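⚠️ **Neural networks REQUIRE feature scaling!** Unlike tree-based models, NNs are very sensitive to feature magnitudes. In this notebook the `NeuralNetworkModel` class applies scaling automatically: it fits a scaler inside `fit` and reuses that fitted scaler inside `predict`.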

## Hyperparameters

| Parameter | Default | Range | Impact |
|-----------|---------|-------|--------|
| hidden_layer_sizes | (64, 32) | Various | Architecture depth/width |
| activation | 'relu' | relu/tanh | Non-linearity type |
| alpha | 0.001 | 0.0001-0.1 | L2 regularization |
| learning_rate_init | 0.001 | 0.0001-0.01 | Step size |
| max_iter | 500 | 100-2000 | Training epochs |

In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides scikit-learn's convergence warnings, so a run
# that stops at max_iter gives no warning output - confirm this is intended.
warnings.filterwarnings('ignore')
# Plot appearance: seaborn dark grid and a 12x6 inch default figure size.
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Marker showing that the setup cell finished running.
print("Neural Network ready!")

## NeuralNetworkModel Class

In [None]:
class NeuralNetworkModel:
    """
    MLP regressor bundled with its own feature scaler.

    The scaler is fitted on the training features inside ``fit`` and the
    same fitted scaler is applied inside ``predict``, so callers always
    pass raw (unscaled) features.

    Attributes:
        params: ``DEFAULT_PARAMS`` overlaid with the caller's overrides;
            passed verbatim to ``MLPRegressor``.
        scaler: ``StandardScaler`` or ``RobustScaler`` instance.
        model: The wrapped ``MLPRegressor``.
        feature_names: Column names seen by the last ``fit`` when it was
            given a DataFrame, otherwise ``None``.
        is_fitted: ``True`` once ``fit`` has completed.
    """

    DEFAULT_PARAMS = {
        'hidden_layer_sizes': (64, 32),
        'activation': 'relu',
        'solver': 'adam',
        'alpha': 0.001,
        'learning_rate': 'adaptive',
        'learning_rate_init': 0.001,
        'max_iter': 500,
        'early_stopping': True,
        'validation_fraction': 0.1,
        'n_iter_no_change': 20,
        'random_state': 42,
    }

    def __init__(self, params=None, scaler_type='standard'):
        """
        Build the scaler and the (unfitted) network.

        Args:
            params: Optional dict of ``MLPRegressor`` keyword overrides,
                merged on top of ``DEFAULT_PARAMS``.
            scaler_type: ``'standard'`` or ``'robust'``.

        Raises:
            ValueError: If ``scaler_type`` is not one of the two names.
        """
        self.params = {**self.DEFAULT_PARAMS, **(params or {})}

        if scaler_type == 'standard':
            self.scaler = StandardScaler()
        elif scaler_type == 'robust':
            self.scaler = RobustScaler()
        else:
            raise ValueError(f"Unknown scaler: {scaler_type}")

        self.model = MLPRegressor(**self.params)
        self.feature_names = None
        self.is_fitted = False

    def _align_columns(self, X):
        """Return DataFrame ``X`` with columns in the order seen by ``fit``."""
        names = self.feature_names
        if names is None or list(X.columns) == names:
            return X
        # Reorder only when it is unambiguous; otherwise keep the positional
        # behaviour so inputs that worked before still work.
        if X.columns.is_unique and set(names).issubset(X.columns):
            return X[names]
        return X

    def fit(self, X, y):
        """
        Fit the scaler and the network on raw features.

        Args:
            X: DataFrame or array-like of features. DataFrame column names
                are remembered in ``feature_names``.
            y: Series or array-like of targets.

        Returns:
            ``self``, to allow chaining.
        """
        if isinstance(X, pd.DataFrame):
            self.feature_names = list(X.columns)
            X = X.values
        else:
            # Drop names left over from an earlier DataFrame fit; they no
            # longer describe the data this model was trained on.
            self.feature_names = None
        if isinstance(y, pd.Series):
            y = y.values

        X_scaled = self.scaler.fit_transform(X)
        self.model.fit(X_scaled, y)
        self.is_fitted = True

        n_iter = self.model.n_iter_
        max_iter = self.params.get('max_iter')
        if max_iter is not None and n_iter >= max_iter:
            # Hitting the iteration cap is not convergence; say so here
            # because warnings may be silenced elsewhere.
            print(f"Reached max_iter={max_iter} without converging")
        else:
            print(f"Converged in {n_iter} iterations")

        # scikit-learn sets best_loss_ to None when early_stopping=True;
        # formatting None with ':.6f' would raise TypeError.
        best_loss = getattr(self.model, 'best_loss_', None)
        if best_loss is not None:
            print(f"Best loss: {best_loss:.6f}")
        else:
            best_score = getattr(self.model, 'best_validation_score_', None)
            if best_score is not None:
                print(f"Best validation score: {best_score:.6f}")

        return self

    def predict(self, X):
        """
        Predict from raw features using the fitted scaler and network.

        When the model was fitted on a DataFrame and ``X`` is a DataFrame
        containing all of those columns, the columns are selected and put
        in training order first, so column order in ``X`` does not matter.

        Args:
            X: DataFrame or array-like of raw features.

        Returns:
            The array returned by the wrapped model's ``predict``.
        """
        if isinstance(X, pd.DataFrame):
            X = self._align_columns(X).values

        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def evaluate(self, X, y):
        """
        Score predictions for ``X`` against ``y``.

        Returns:
            Dict with keys ``'rmse'``, ``'mae'`` and ``'r2'``.
        """
        predictions = self.predict(X)
        if isinstance(y, pd.Series):
            y = y.values

        return {
            'rmse': np.sqrt(mean_squared_error(y, predictions)),
            'mae': mean_absolute_error(y, predictions),
            'r2': r2_score(y, predictions),
        }

    def get_loss_curve(self):
        """Return the model's ``loss_curve_``, or ``None`` if it has none."""
        return getattr(self.model, 'loss_curve_', None)

    def get_architecture(self):
        """
        Summarise the hidden-layer layout.

        ``hidden_layer_sizes`` may be a sequence of widths or a single
        integer (one hidden layer); both are handled.

        Returns:
            Dict with ``'layers'``, ``'hidden_layer_sizes'`` (as configured),
            ``'total_neurons'`` and ``'activation'``.
        """
        sizes = self.params['hidden_layer_sizes']
        widths = list(sizes) if hasattr(sizes, '__iter__') else [sizes]
        return {
            'layers': len(widths),
            'hidden_layer_sizes': sizes,
            'total_neurons': sum(widths),
            'activation': self.params['activation'],
        }

## Generate Sample Data

In [None]:
def generate_hockey_data(n_games=1000, seed=42):
    """
    Generate a synthetic table of hockey games.

    Draws use a private ``RandomState`` so the global NumPy random state is
    left untouched. With the default seed the values are the same as those
    produced by seeding the global generator with 42.

    Args:
        n_games: Number of rows (games) to generate.
        seed: Seed for the private generator. ``None`` gives a different
            dataset on every call.

    Returns:
        DataFrame with ten sampled team-strength columns, a derived
        ``elo_diff`` column, and Poisson-sampled ``home_goals`` and
        ``away_goals`` targets.
    """
    rng = np.random.RandomState(seed)

    # Draw order matters for reproducibility: keep it exactly as listed.
    data = {
        'home_elo': rng.normal(1500, 100, n_games),
        'away_elo': rng.normal(1500, 100, n_games),
        'home_goals_avg': rng.uniform(2.5, 3.5, n_games),
        'away_goals_avg': rng.uniform(2.5, 3.5, n_games),
        'home_goals_against_avg': rng.uniform(2.5, 3.5, n_games),
        'away_goals_against_avg': rng.uniform(2.5, 3.5, n_games),
        'home_pp_pct': rng.uniform(0.15, 0.30, n_games),
        'away_pp_pct': rng.uniform(0.15, 0.30, n_games),
        'home_pk_pct': rng.uniform(0.75, 0.90, n_games),
        'away_pk_pct': rng.uniform(0.75, 0.90, n_games),
    }

    df = pd.DataFrame(data)
    df['elo_diff'] = df['home_elo'] - df['away_elo']

    # Expected goals: a 3.0 baseline shifted by the Elo gap and by how far
    # the team's scoring average sits from 3.0.
    home_base = 3.0 + 0.001 * df['elo_diff'] + 0.3 * (df['home_goals_avg'] - 3.0)
    away_base = 3.0 - 0.001 * df['elo_diff'] + 0.3 * (df['away_goals_avg'] - 3.0)

    # Floor the Poisson rate at 1.5 so it can never be negative.
    df['home_goals'] = rng.poisson(np.maximum(home_base, 1.5))
    df['away_goals'] = rng.poisson(np.maximum(away_base, 1.5))

    return df

# Build the synthetic dataset used by the rest of the notebook.
games_df = generate_hockey_data(1000)
print(f"Generated {len(games_df)} games")

# Every column except the two goal targets is a model input;
# the single-model cells predict home goals.
target_cols = ('home_goals', 'away_goals')
feature_cols = [col for col in games_df.columns if col not in target_cols]
X, y = games_df[feature_cols], games_df['home_goals']

# Fixed-seed 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

## Why Scaling Matters

In [None]:
# Per-feature min / max / mean / std of the raw inputs, one row per feature.
print("Feature Value ranges (unscaled):".replace("ranges", "Ranges"))
range_summary = X.describe().loc[['min', 'max', 'mean', 'std']].T
print(range_summary)

# Point out the magnitude gap that motivates scaling.
for note in (
    "\n⚠️ Notice: ELO values (~1500) are 1000x larger than percentages (~0.2)",
    "Without scaling, the network would be dominated by large-magnitude features.",
):
    print(note)

In [None]:
# Compare scaled vs unscaled training.
# Both runs use the same hyperparameters (the wrapper's defaults with a
# 100-iteration cap), so feature scaling is the only difference between them.
comparison_params = {**NeuralNetworkModel.DEFAULT_PARAMS, 'max_iter': 100}

print("Training WITHOUT scaling (will likely fail or converge poorly):")
try:
    unscaled_model = MLPRegressor(**comparison_params)
    unscaled_model.fit(X_train.values, y_train.values)
    print(f"  Iterations: {unscaled_model.n_iter_}")
    pred = unscaled_model.predict(X_test.values)
    print(f"  Test RMSE: {np.sqrt(mean_squared_error(y_test, pred)):.4f}")
except Exception as e:
    # Deliberately broad: this is a demonstration of a run that is expected
    # to go wrong, so any failure is reported and the notebook carries on.
    print(f"  Error: {e}")

print("\nTraining WITH scaling (proper approach):")
scaled_model = NeuralNetworkModel({'max_iter': 100})
scaled_model.fit(X_train, y_train)
metrics = scaled_model.evaluate(X_test, y_test)
print(f"  Test RMSE: {metrics['rmse']:.4f}")

## Train Neural Network Model

In [None]:
# Fit a model with the wrapper's default hyperparameters (fit returns self).
nn_model = NeuralNetworkModel()
nn_model.fit(X_train, y_train)

# Report the hidden-layer layout.
arch = nn_model.get_architecture()
print(f"\nArchitecture: {arch['hidden_layer_sizes']}")
print(f"Total hidden neurons: {arch['total_neurons']}")
print(f"Activation: {arch['activation']}")

# Score on both splits, then print each metric table in turn.
train_metrics = nn_model.evaluate(X_train, y_train)
test_metrics = nn_model.evaluate(X_test, y_test)

for heading, scores in (
    ("\nTraining Metrics:", train_metrics),
    ("\nTest Metrics:", test_metrics),
):
    print(heading)
    for k, v in scores.items():
        print(f"  {k}: {v:.4f}")

## Training Loss Curve

In [None]:
# Plot the per-iteration training loss of the default model.
loss_curve = nn_model.get_loss_curve()

# Explicit None/length test: works for lists and arrays alike.
if loss_curve is not None and len(loss_curve) > 0:
    plt.figure(figsize=(10, 5))
    plt.plot(loss_curve, color='steelblue', linewidth=2)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training Loss Curve')
    plt.grid(True, alpha=0.3)

    # Mark the stopping point only when training really ended before the
    # iteration cap; a run that used all of max_iter did not stop early.
    stopped_early = (
        nn_model.params['early_stopping']
        and nn_model.model.n_iter_ < nn_model.params['max_iter']
    )
    if stopped_early:
        plt.axvline(x=len(loss_curve)-1, color='red', linestyle='--',
                    label=f'Early stop @ iter {len(loss_curve)}')
        plt.legend()

    plt.tight_layout()
    plt.show()
else:
    print("No loss curve available")

## Architecture Comparison

In [None]:
# Hidden-layer layouts to compare, roughly from smallest to largest.
architectures = [
    (32,),               # Shallow, narrow
    (64,),               # Single layer
    (64, 32),            # Two layers
    (128, 64),           # Wider
    (128, 64, 32),       # Three layers
    (256, 128, 64),      # Deep and wide
]

# One result row per layout: size, train/test RMSE and iterations used.
results = []
for arch in architectures:
    model = NeuralNetworkModel({'hidden_layer_sizes': arch, 'max_iter': 200}).fit(X_train, y_train)

    train_m = model.evaluate(X_train, y_train)
    test_m = model.evaluate(X_test, y_test)

    row = dict(
        architecture=str(arch),
        n_layers=len(arch),
        total_neurons=sum(arch),
        train_rmse=train_m['rmse'],
        test_rmse=test_m['rmse'],
        iterations=model.model.n_iter_,
    )
    results.append(row)

results_df = pd.DataFrame(results)
print("\nArchitecture Comparison:")
print(results_df.to_string(index=False))

In [None]:
# Two panels: RMSE per layout (left) and the train/test gap (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
rmse_ax, gap_ax = axes[0], axes[1]

# Left panel: train and test bars side by side for each layout.
x = range(len(results_df))
test_positions = [i+0.4 for i in x]
tick_positions = [i+0.2 for i in x]
rmse_ax.bar(x, results_df['train_rmse'], width=0.4, label='Train', color='steelblue', alpha=0.7)
rmse_ax.bar(test_positions, results_df['test_rmse'], width=0.4, label='Test', color='coral', alpha=0.7)
rmse_ax.set_xticks(tick_positions)
rmse_ax.set_xticklabels(results_df['architecture'], rotation=45, ha='right')
rmse_ax.set_ylabel('RMSE')
rmse_ax.set_title('RMSE by Architecture')
rmse_ax.legend()

# Right panel: test minus train RMSE, red when the gap exceeds 0.1.
overfit_gap = results_df['test_rmse'] - results_df['train_rmse']
colors = []
for g in overfit_gap:
    colors.append('red' if g > 0.1 else 'green')
gap_ax.bar(results_df['architecture'], overfit_gap, color=colors)
gap_ax.axhline(y=0.1, color='orange', linestyle='--', label='Warning threshold')
gap_ax.set_ylabel('Test RMSE - Train RMSE')
gap_ax.set_title('Overfitting Gap')
gap_ax.tick_params(axis='x', rotation=45)
gap_ax.legend()

plt.tight_layout()
plt.show()

## NeuralNetworkGoalPredictor (Dual Model)

In [None]:
class NeuralNetworkGoalPredictor:
    """
    Pair of neural networks predicting home goals and away goals.

    Both networks are ``NeuralNetworkModel`` instances built from the same
    ``params``; each one owns and fits its own scaler. Input frames must
    contain ``'home_goals'`` and ``'away_goals'`` columns for ``fit`` and
    ``evaluate``.
    """

    def __init__(self, params=None):
        """
        Args:
            params: Optional dict of network overrides forwarded to both
                ``NeuralNetworkModel`` instances.
        """
        self.params = params
        self.home_model = NeuralNetworkModel(params)
        self.away_model = NeuralNetworkModel(params)
        self.feature_columns = None

    def fit(self, df, feature_columns=None):
        """
        Fit the home and away models on the same feature columns.

        Args:
            df: DataFrame holding the features plus ``'home_goals'`` and
                ``'away_goals'``.
            feature_columns: Column names to use as features. Any iterable
                of names (list, tuple, pandas Index, array) or a single
                name is accepted. When omitted or empty, every numeric
                column except the two targets is used.

        Returns:
            ``self``, to allow chaining.
        """
        # Normalise to a plain list: truth-testing an Index/array raises,
        # and indexing a frame with a tuple is treated as one key.
        if feature_columns is None:
            selected = []
        elif isinstance(feature_columns, str):
            selected = [feature_columns]
        else:
            selected = list(feature_columns)

        if not selected:
            exclude = ['home_goals', 'away_goals']
            selected = [c for c in df.select_dtypes(include=[np.number]).columns
                        if c not in exclude]
        self.feature_columns = selected

        X = df[self.feature_columns]

        print("Training home goals model...")
        self.home_model.fit(X, df['home_goals'])

        print("\nTraining away goals model...")
        self.away_model.fit(X, df['away_goals'])

        return self

    def predict_goals(self, df):
        """
        Predict home and away goals for every row of ``df``.

        Returns:
            Tuple ``(home_predictions, away_predictions)``.

        Raises:
            RuntimeError: If called before ``fit`` has chosen the feature
                columns.
        """
        if self.feature_columns is None:
            raise RuntimeError(
                "NeuralNetworkGoalPredictor must be fitted before predicting"
            )
        X = df[self.feature_columns]
        return self.home_model.predict(X), self.away_model.predict(X)

    def predict_winner(self, df):
        """
        Label each game ``'home'``, ``'away'`` or ``'tie'``.

        A side wins only when its predicted goals exceed the other side's
        by more than 0.5; anything closer is a tie.

        Returns:
            Series of labels sharing ``df``'s index.
        """
        home_pred, away_pred = self.predict_goals(df)
        home_pred = np.asarray(home_pred)
        away_pred = np.asarray(away_pred)
        labels = np.select(
            [home_pred > away_pred + 0.5, away_pred > home_pred + 0.5],
            ['home', 'away'],
            default='tie',
        )
        # tolist() hands pandas plain Python strings, as a hand-built list would.
        return pd.Series(labels.tolist(), index=df.index)

    def evaluate(self, df):
        """
        Score both models against the goal columns of ``df``.

        Returns:
            Dict with ``'home_rmse'``, ``'away_rmse'``, ``'home_mae'`` and
            ``'away_mae'``.
        """
        home_pred, away_pred = self.predict_goals(df)
        return {
            'home_rmse': np.sqrt(mean_squared_error(df['home_goals'], home_pred)),
            'away_rmse': np.sqrt(mean_squared_error(df['away_goals'], away_pred)),
            'home_mae': mean_absolute_error(df['home_goals'], home_pred),
            'away_mae': mean_absolute_error(df['away_goals'], away_pred),
        }

    def get_loss_curves(self):
        """Return ``{'home': ..., 'away': ...}`` training loss curves."""
        return {
            'home': self.home_model.get_loss_curve(),
            'away': self.away_model.get_loss_curve(),
        }

In [None]:
# Train dual predictor.
# Hold out 20% of the games so the reported metrics are out-of-sample;
# scoring the same rows the models were trained on gives optimistic numbers.
games_train, games_test = train_test_split(games_df, test_size=0.2, random_state=42)

predictor = NeuralNetworkGoalPredictor({'hidden_layer_sizes': (128, 64)})
predictor.fit(games_train, feature_cols)

# Evaluate on the held-out games only
metrics = predictor.evaluate(games_test)
print("\nDual Model Metrics (held-out games):")
for k, v in metrics.items():
    print(f"  {k}: {v:.4f}")

In [None]:
# One panel per goal model, showing its training loss per iteration.
loss_curves = predictor.get_loss_curves()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pair each panel with its (name, curve); a missing or empty curve leaves
# the panel blank.
for panel, (name, curve) in zip(axes, loss_curves.items()):
    if not curve:
        continue
    panel.plot(curve, color='steelblue', linewidth=2)
    panel.set_xlabel('Iteration')
    panel.set_ylabel('Loss')
    panel.set_title(f'{name.title()} Goals Model - Training Loss')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Hyperparameter Effects

In [None]:
# Sweep the L2 penalty (alpha) and record train/test RMSE for each value.
alphas = [0.0001, 0.001, 0.01, 0.1]
results = []

for alpha in alphas:
    model = NeuralNetworkModel({'alpha': alpha, 'max_iter': 200}).fit(X_train, y_train)

    train_m = model.evaluate(X_train, y_train)
    test_m = model.evaluate(X_test, y_test)

    results.append(dict(
        alpha=alpha,
        train_rmse=train_m['rmse'],
        test_rmse=test_m['rmse'],
    ))

results_df = pd.DataFrame(results)

# RMSE against alpha on a log-scaled x axis, train line first, then test.
plt.figure(figsize=(10, 5))
alpha_values = results_df['alpha']
plt.semilogx(alpha_values, results_df['train_rmse'], 'o-', label='Train', color='steelblue')
plt.semilogx(alpha_values, results_df['test_rmse'], 's-', label='Test', color='coral')
plt.xlabel('Alpha (L2 Regularization)')
plt.ylabel('RMSE')
plt.title('Effect of Regularization on Performance')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print("\nRegularization Results:")
print(results_df.to_string(index=False))

## Save/Load Model (with Scaler)

In [None]:
import pickle
from pathlib import Path

# The network is useless without the scaler it was trained with, so both
# are stored together in one pickle.
output_dir = Path('../output/models')
output_dir.mkdir(parents=True, exist_ok=True)
model_path = output_dir / 'neural_network_model.pkl'

bundle = {
    'model': nn_model.model,
    'scaler': nn_model.scaler,  # required to reproduce predictions
    'feature_names': nn_model.feature_names,
    'params': nn_model.params,
}
with model_path.open('wb') as f:
    pickle.dump(bundle, f)

print(f"Model saved to {model_path}")

# Read the bundle back. Only unpickle files you created yourself:
# unpickling untrusted data can execute arbitrary code.
with model_path.open('rb') as f:
    loaded = pickle.load(f)

# Score the restored pair on the test split, scaling with the restored scaler.
X_scaled = loaded['scaler'].transform(X_test.values)
loaded_pred = loaded['model'].predict(X_scaled)
print(f"Loaded model RMSE: {np.sqrt(mean_squared_error(y_test, loaded_pred)):.4f}")

## Summary

### Key Takeaways

1. **Feature scaling is REQUIRED** for neural networks
2. **Architecture matters**: Start simple, add complexity if needed
3. **Early stopping** helps limit overfitting by halting training once the validation score stops improving
4. **Alpha** controls L2 regularization strength
5. **Save both model AND scaler** for deployment

### Best Practices

- Use StandardScaler for normally distributed features
- Use RobustScaler if outliers are present
- Start with (64, 32) architecture, adjust based on performance
- Monitor loss curve to detect convergence issues
- Use early_stopping=True to reduce the risk of overfitting (it holds out part of the training data for validation)