In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read the raw table; 'area' is the regression target, every other column is a feature.
data = pd.read_csv('forestfires.csv')

# Replace the 'month' and 'day' columns with integer codes.
# NOTE(review): LabelEncoder numbers labels in sorted order, so if these
# columns hold month/day names the codes will not follow calendar order —
# confirm that is acceptable for the downstream models.
for column in ('month', 'day'):
    data[column] = LabelEncoder().fit_transform(data[column])

# Separate the feature matrix from the target vector.
y = data['area'].to_numpy()
X = data.drop(columns=['area']).to_numpy()

# Zero-mean / unit-variance version of the features (statistics taken over all rows).
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Outer splitter of the nested cross-validation: 10 shuffled folds, fixed seed.
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)

# Candidate regressors keyed by display name. Each entry pairs an unfitted
# estimator ('model') with the hyperparameter grid ('params') that the inner
# grid search explores for it.
models_params = dict(
    RandomForestRegressor=dict(
        model=RandomForestRegressor(random_state=1),
        params=dict(
            n_estimators=[50, 100, 150],
            max_depth=[10, 20, 30],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
        ),
    ),
    GradientBoostingRegressor=dict(
        model=GradientBoostingRegressor(random_state=1),
        params=dict(
            n_estimators=[50, 100, 150],
            learning_rate=[0.01, 0.1, 0.2],
            max_depth=[3, 5, 7],
            min_samples_split=[2, 5],
            min_samples_leaf=[1, 3],
        ),
    ),
)

# Nested cross-validation: for every candidate model, each outer fold runs an
# inner grid search on its training part and scores the refit winner on the
# held-out part. The per-fold MSEs are then summarized per model.

# Configure the inner cross-validation procedure once: the splitter is seeded
# and carries no per-fold state, so every search can share the same instance.
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)

# Outer loop
for name, mp in models_params.items():
    outer_mse = []
    print(f"Evaluating {name}")
    for train_ix, test_ix in cv_outer.split(X):
        # Split the raw (unscaled) data
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]

        # Standardize with statistics from the training fold only, so that
        # nothing about the held-out rows leaks into model fitting.
        fold_scaler = StandardScaler().fit(X_train)
        X_train = fold_scaler.transform(X_train)
        X_test = fold_scaler.transform(X_test)

        # Define search (GridSearchCV clones the estimator, so the shared
        # template in models_params is never fitted itself)
        search = GridSearchCV(
            mp['model'],
            mp['params'],
            scoring='neg_mean_squared_error',
            cv=cv_inner,
            refit=True,
        )

        # Execute search
        result = search.fit(X_train, y_train)

        # Get the best performing model fit on the whole training set
        best_model = result.best_estimator_

        # Evaluate model on the hold out dataset
        yhat = best_model.predict(X_test)
        mse = mean_squared_error(y_test, yhat)

        # Store the result
        outer_mse.append(mse)

        # Report progress. 'est' is the inner-CV score under
        # 'neg_mean_squared_error', i.e. the NEGATED MSE (higher is better),
        # whereas 'mse' is the plain positive MSE on the outer test fold.
        print(f'>mse={mse:.3f}, est={result.best_score_:.3f}, cfg={result.best_params_}')

    # Summarize the estimated performance of the model
    print(f"{name} Mean MSE: {np.mean(outer_mse):.3f} (std: {np.std(outer_mse):.3f})")


Evaluating RandomForestRegressor
>mse=11132.567, est=-3448.867, cfg={'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
>mse=1561.467, est=-4705.215, cfg={'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
>mse=613.307, est=-4698.607, cfg={'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 150}
>mse=23165.546, est=-2060.178, cfg={'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 150}
>mse=308.830, est=-4956.732, cfg={'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
>mse=730.909, est=-4775.501, cfg={'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 150}
>mse=1001.482, est=-4756.966, cfg={'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 150}
>mse=1378.946, est=-4701.893, cfg={'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
>ms