In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.export_utils import set_param_recursive

from sklearn.model_selection import TimeSeriesSplit

# ---------------------------------------------------------------------------
# Load the lagged dataset and index it by its parsed 'Timestamp' column.
# ---------------------------------------------------------------------------
tpot_data = (
    pd.read_csv('LABS-LAGGED.csv')
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], format='%Y-%m-%d'))
    .set_index('Timestamp')
)

# Predictor columns that are excluded from the model (the target column
# 'Forecast' is removed separately below).
excluded_columns = [
    'DayOfWeek',
    'HSW-COD-load', 'HSW-COD-load_1', 'HSW-COD-load_2', 'HSW-COD-load_3',
    'HSW-COD-load_4', 'HSW-COD-load_5', 'HSW-COD-load_6',
    'Dig-stability', 'Biogas_6', 'TWAS-VS-load_2', 'PS-VS-load_3',
    'PS-VS-load_5', 'PS-VS-load_6', 'TWAS-VS-load_4', 'TWAS-VS-load_3',
    'SRT', 'PS-VS-load_1', 'HSW-VS-load_2',
    'Biogas_4', 'HSW-VS-load_5', 'PS-VS-load', 'Biogas_3', 'PS-VS-load_2',
    'TWAS-VS-load_5', 'TWAS-VS-load_6',
    'PS-VS-load_4', 'Biogas_2', 'HSW-VS-load_3', 'HSW-VS-load_4',
    'TWAS-VS-load', 'TWAS-VS-load_1', 'BOD-load',
]

# Everything that is neither the target nor an excluded column is a predictor.
feature_frame = tpot_data.drop(columns='Forecast').drop(columns=excluded_columns)
feature_list = list(feature_frame.columns)  # predictor names, kept for later reporting
features = np.array(feature_frame)

# shuffle=False keeps the rows in file order, so the final 20 % of the rows
# become the test split.
training_features, testing_features, training_target, testing_target = train_test_split(
    features,
    tpot_data['Forecast'],
    test_size=0.2,
    shuffle=False,
)

# Time-ordered CV splitter with a 5-sample gap between train and test folds.
# NOTE(review): not referenced anywhere in the visible cells — confirm it is used later.
cv = TimeSeriesSplit(gap=5)

# Average CV score on the training set was: -562.0504895488633
exported_pipeline = make_pipeline(
    MinMaxScaler(),
    LinearSVR(
        C=20.0,
        dual=True,
        epsilon=0.0001,
        loss="squared_epsilon_insensitive",
        tol=0.1,
    ),
)

# Pin random_state=42 on every pipeline step that exposes that parameter.
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)



In [2]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error
import math
# Predict on both splits with the already-fitted pipeline.
y_train_ml = exported_pipeline.predict(training_features)
y_test_ml = exported_pipeline.predict(testing_features)

# Training-set errors.
# RMSE is the square root of MSE. scikit-learn returns MAPE as a fraction
# (e.g. 0.25 means 25 %), it is not multiplied by 100.
mse_train_ml = mean_squared_error(training_target, y_train_ml)
r2_train_ml = r2_score(training_target, y_train_ml)
mape_train_ml = mean_absolute_percentage_error(training_target, y_train_ml)
rmse_train = math.sqrt(mse_train_ml)

# Test-set errors (same metrics, held-out rows).
mse_test_ml = mean_squared_error(testing_target, y_test_ml)
r2_test_ml = r2_score(testing_target, y_test_ml)
mape_test_ml = mean_absolute_percentage_error(testing_target, y_test_ml)
rmse_test = math.sqrt(mse_test_ml)

# Report every metric rounded to three decimals. The root-mean-squared-error
# lines are tagged (RMSE) so they cannot be mistaken for the MSE lines.
print(f"Training Mean Squared Error (MSE): {round(mse_train_ml,3)}")
print(f"Training Root Mean Squared Error (RMSE): {round(rmse_train,3)}")
print(f"Training Mean Absolute Percentage Error (MAPE): {round(mape_train_ml,3)}")
print(f"Training R-squared (R2) Score: {round(r2_train_ml,3)}")
print(f"Testing Mean Squared Error (MSE): {round(mse_test_ml,3)}")
print(f"Testing Root Mean Squared Error (RMSE): {round(rmse_test,3)}")
print(f"Testing Mean Absolute Percentage Error (MAPE): {round(mape_test_ml,3)}")
print(f"Testing R-squared (R2) Score: {round(r2_test_ml,3)}")

def adjusted_r2(r2, n, k):
    """Return R-squared adjusted for the number of predictors.

    Computes ``1 - (1 - r2) * (n - 1) / (n - k - 1)``.

    Args:
        r2: Ordinary coefficient of determination.
        n: Number of samples the ``r2`` value was computed on.
        k: Number of predictors in the model.

    Returns:
        The adjusted R-squared value.

    Raises:
        ValueError: If ``n - k - 1 <= 0``. The statistic is undefined there:
            a zero denominator divides by zero (or silently yields inf/nan
            with NumPy scalars) and a negative one flips the sign of the
            penalty term.
    """
    residual_dof = n - k - 1
    if residual_dof <= 0:
        raise ValueError(
            f"adjusted R-squared needs n > k + 1 (got n={n}, k={k})"
        )
    return 1 - ((1 - r2) * (n - 1)) / residual_dof

# n: rows in the test split; k: predictors kept for the model.
n, k = len(testing_features), len(feature_list)

# Test-set R-squared corrected for the predictor count.
adjusted_r2_value = adjusted_r2(r2_test_ml, n, k)
print("Adjusted R-squared:", round(adjusted_r2_value, 3))
print(k)

# TeEI = MAPE^2 * RMSE / adjusted R-squared, all taken from the test split.
TeEI = mape_test_ml * mape_test_ml * rmse_test / adjusted_r2_value
print("TeEI:", round(TeEI, 3))

Training Mean Squared Error (MSE): 504.741
Training Root Mean Squared Error (MSE): 22.466
Training Mean Absolute Percentage Error (MAPE): 0.331
Training R-squared (R2) Score: 0.699
Testing Mean Squared Error (MSE): 597.86
Testing Root Mean Squared Error (MSE): 24.451
Testing Mean Absolute Percentage Error (MAPE): 0.271
Testing R-squared (R2) Score: 0.618
Adjusted R-squared: 0.607
6
TeEI: 2.957
