In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoLarsCV, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

from sklearn.model_selection import TimeSeriesSplit

# --- Load the SCADA export and index it by its timestamp column -------------
tpot_data = pd.read_csv('SCADA-FINAL.csv')
tpot_data['Timestamp'] = pd.to_datetime(tpot_data['Timestamp'], format='%Y-%m-%d %H:%M:%S')
tpot_data = tpot_data.set_index('Timestamp')

# --- Build the model inputs --------------------------------------------------
# 'Forecast' is the regression target; the other four columns are left out of
# the model inputs as well.
excluded_columns = ['Forecast', 'day_sin', 'day_cos', 'D2_TEMPERATURE', 'Q-influent_MGD']
feature_frame = tpot_data.drop(columns=excluded_columns)

# Keep the column names around: the array handed to the pipeline has no labels.
feature_list = feature_frame.columns.tolist()
features = np.array(feature_frame)

# --- Hold-out split ----------------------------------------------------------
# shuffle=False preserves row order, so the 20% test portion is taken from the
# end of the data rather than sampled at random.
training_features, testing_features, training_target, testing_target = train_test_split(
    features,
    tpot_data['Forecast'],
    test_size=0.2,
    shuffle=False,
)

# Not referenced again in this cell.
# NOTE(review): presumably the splitter used during the TPOT search - confirm.
cv = TimeSeriesSplit(gap=300)

# --- Pipeline exported by TPOT -----------------------------------------------
# The first and third stacked estimators share exactly the same SGD settings.
sgd_params = dict(
    alpha=0.01,
    eta0=1.0,
    fit_intercept=False,
    l1_ratio=0.25,
    learning_rate="invscaling",
    loss="huber",
    penalty="elasticnet",
    power_t=0.5,
)

# Average CV score on the training set was: -329.51976744763334
# NOTE(review): `normalize` is no longer accepted by LassoLarsCV in recent
# scikit-learn releases (removed in 1.4, to the best of my knowledge) - confirm
# the pinned version before re-running this cell.
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=SGDRegressor(**sgd_params)),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.8500000000000001, tol=0.1)),
    StackingEstimator(estimator=SGDRegressor(**sgd_params)),
    LassoLarsCV(normalize=False),
)

# Pin random_state=1 on every step that exposes it so refits are repeatable.
set_param_recursive(exported_pipeline.steps, 'random_state', 1)

exported_pipeline.fit(training_features, training_target)

In [2]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import math

# Predict using the fitted model
y_train_ml = exported_pipeline.predict(training_features)
y_test_ml = exported_pipeline.predict(testing_features)

# Training-set metrics. RMSE is derived from MSE so both describe the same fit.
# Note: scikit-learn's MAPE is a fraction (0.15 means 15%), not a value
# already multiplied by 100.
mse_train_ml = mean_squared_error(training_target, y_train_ml)
r2_train_ml = r2_score(training_target, y_train_ml)
mape_train_ml = mean_absolute_percentage_error(training_target, y_train_ml)
rmse_train = math.sqrt(mse_train_ml)

# Same metrics on the held-out test portion.
mse_test_ml = mean_squared_error(testing_target, y_test_ml)
r2_test_ml = r2_score(testing_target, y_test_ml)
mape_test_ml = mean_absolute_percentage_error(testing_target, y_test_ml)
rmse_test = math.sqrt(mse_test_ml)

# Print training and testing error (all values rounded to 3 decimals).
# The RMSE lines are labelled "(RMSE)"; they previously reused the "(MSE)"
# abbreviation, which made the report ambiguous.
print(f"Training Mean Squared Error (MSE): {round(mse_train_ml,3)}")
print(f"Training Root Mean Squared Error (RMSE): {round(rmse_train,3)}")
print(f"Training Mean Absolute Percentage Error (MAPE): {round(mape_train_ml,3)}")
print(f"Training R-squared (R2) Score: {round(r2_train_ml,3)}")
print(f"Testing Mean Squared Error (MSE): {round(mse_test_ml,3)}")
print(f"Testing Root Mean Squared Error (RMSE): {round(rmse_test,3)}")
print(f"Testing Mean Absolute Percentage Error (MAPE): {round(mape_test_ml,3)}")
print(f"Testing R-squared (R2) Score: {round(r2_test_ml,3)}")

def adjusted_r2(r2, n, k):
    """Return the adjusted R-squared for a model with ``k`` predictors.

    Computes ``1 - (1 - r2) * (n - 1) / (n - k - 1)``.

    Parameters
    ----------
    r2 : float
        Plain (unadjusted) R-squared score.
    n : int
        Number of samples the score was computed on.
    k : int
        Number of predictors in the model.

    Returns
    -------
    float
        The adjusted R-squared.

    Raises
    ------
    ValueError
        If ``n - k - 1`` is not positive. The formula then either divides by
        zero or flips sign, so no meaningful value exists.
    """
    residual_dof = n - k - 1
    if residual_dof <= 0:
        raise ValueError(
            f"adjusted R-squared needs n > k + 1 (got n={n}, k={k})"
        )
    return 1 - ((1 - r2) * (n - 1)) / residual_dof

# Sample count comes from the test split; predictor count from the feature
# columns fed to the pipeline.
n = len(testing_features)
k = len(feature_list)

# Adjusted R-squared of the test-set fit.
adjusted_r2_value = adjusted_r2(r2=r2_test_ml, n=n, k=k)
print("Adjusted R-squared:", round(adjusted_r2_value,3))

# TeEI = MAPE^2 * RMSE / adjusted R^2, all taken from the test set.
mape_test_squared = mape_test_ml*mape_test_ml
teei_numerator = mape_test_squared*rmse_test
TeEI = teei_numerator/adjusted_r2_value
print("TeEI:", round(TeEI,3))

Training Mean Squared Error (MSE): 331.003
Training Root Mean Squared Error (MSE): 18.193
Training Mean Absolute Percentage Error (MAPE): 0.152
Training R-squared (R2) Score: 0.75
Testing Mean Squared Error (MSE): 518.35
Testing Root Mean Squared Error (MSE): 22.767
Testing Mean Absolute Percentage Error (MAPE): 0.15
Testing R-squared (R2) Score: 0.718
Adjusted R-squared: 0.718
TeEI: 0.714
