In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
import matplotlib.pyplot as plt
from datetime import datetime

import numpy as np
import math

# Read the SCADA export, parse the 'Timestamp' column with an explicit format,
# use it as the index, and drop four columns — all in one non-mutating chain.
data = (
    pd.read_csv('SCADA-FINAL.csv')
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'],
                                                   format='%Y-%m-%d %H:%M:%S'))
    .set_index('Timestamp')
    .drop(columns=['day_sin', 'day_cos', 'D2_TEMPERATURE', 'Q-influent_MGD'])
)

# Show the remaining column labels
data.columns

Index(['D1_TEMPERATURE', 'Q-PS_MGD', 'Q-TWAS_GPM', 'V-Boiler_FT3', 'H-HSW_ft',
       'H-Dig1_FT', 'H-Dig2_FT', 'Biogas_burner', 'Biogas_boiler', 'Q-HSW_GPM',
       'Biogas_prev_hour_avg', 'hour_sin', 'hour_cos', 'H-Dig1-deriv_FTpermin',
       'H-Dig2-deriv_FTpermin', 'V-burner_FT3-2', 'Q-PS_MGD_prev_24h',
       'Q-TWAS_GPM_prev_24h', 'Q-HSW_GPM_prev_24h', 'Biogas_prev_24h',
       'Biogas_prev_24h_1', 'Biogas_prev_24h_2', 'Biogas_prev_24h_3',
       'Biogas_prev_24h_4', 'Q-HSW_GPM_prev_24h_1', 'Q-HSW_GPM_prev_24h_2',
       'Q-HSW_GPM_prev_24h_3', 'Q-HSW_GPM_prev_24h_4', 'Q-PS_MGD_prev_24h_1',
       'Q-PS_MGD_prev_24h_2', 'Q-PS_MGD_prev_24h_3', 'Q-PS_MGD_prev_24h_4',
       'Q-TWAS_GPM_prev_24h_1', 'Q-TWAS_GPM_prev_24h_2',
       'Q-TWAS_GPM_prev_24h_3', 'Q-TWAS_GPM_prev_24h_4', 'Forecast'],
      dtype='object')

In [6]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

# Ordered 80/20 split: shuffle=False keeps the row order, so the test set is
# the last 20% of rows. 'Forecast' is the target; every other column is a
# predictor.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns='Forecast'),
    data['Forecast'],
    test_size=0.2,
    shuffle=False,
)

# Learn the scaling statistics from the training rows only
scaler = StandardScaler().fit(x_train)

# transform() returns a bare array, so re-attach the original index and the
# column labels while building each scaled frame.
x_train_s = pd.DataFrame(scaler.transform(x_train),
                         index=x_train.index, columns=x_train.columns)
x_test_s = pd.DataFrame(scaler.transform(x_test),
                        index=x_test.index, columns=x_test.columns)

# Display the scaled training frame as a sanity check
x_train_s

Unnamed: 0_level_0,D1_TEMPERATURE,Q-PS_MGD,Q-TWAS_GPM,V-Boiler_FT3,H-HSW_ft,H-Dig1_FT,H-Dig2_FT,Biogas_burner,Biogas_boiler,Q-HSW_GPM,...,Q-HSW_GPM_prev_24h_3,Q-HSW_GPM_prev_24h_4,Q-PS_MGD_prev_24h_1,Q-PS_MGD_prev_24h_2,Q-PS_MGD_prev_24h_3,Q-PS_MGD_prev_24h_4,Q-TWAS_GPM_prev_24h_1,Q-TWAS_GPM_prev_24h_2,Q-TWAS_GPM_prev_24h_3,Q-TWAS_GPM_prev_24h_4
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-03-23 00:00:00,-0.568071,-0.191843,-0.339807,3.033354,-0.518552,0.467235,1.786058,0.959180,1.270919,0.848766,...,-0.795087,0.298485,-0.485030,-0.429640,-0.427304,-0.380711,0.621969,0.564729,0.600907,0.720826
2022-03-23 00:01:00,-0.469175,5.150879,-0.339807,-1.359258,-0.508768,0.483399,1.786058,0.939302,1.304285,0.848766,...,-0.795089,0.297501,-0.462671,-0.429839,-0.438358,-0.389442,0.621969,0.564733,0.588226,0.733499
2022-03-23 00:02:00,-0.644991,5.042593,-0.339807,-1.354836,-0.498985,0.483399,1.786058,0.931350,1.316394,0.882829,...,-0.795090,0.296514,-0.440125,-0.435584,-0.447302,-0.394109,0.621973,0.564733,0.588230,0.733491
2022-03-23 00:03:00,-0.491152,-0.191843,-0.339807,-1.350406,-0.479417,0.483399,1.786058,0.890267,1.313434,0.847550,...,-0.795089,0.295528,-0.453339,-0.442693,-0.426987,-0.394113,0.621977,0.564729,0.588234,0.733487
2022-03-23 00:04:00,-0.568071,-0.191843,-0.339807,-1.345978,-0.489201,0.483399,1.786058,0.927374,1.316394,0.847550,...,-0.795090,0.294542,-0.453339,-0.460705,-0.408983,-0.394113,0.621977,0.564733,0.588234,0.733483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-21 12:47:00,-0.084576,-0.192523,-0.340818,0.917007,0.684853,1.097632,2.144065,1.673486,1.881204,0.851199,...,-0.780391,-0.483564,-0.088183,-0.075908,-0.008870,0.050406,0.293713,0.483171,0.375190,0.150280
2022-12-21 12:48:00,-0.117542,-0.192523,-0.339807,0.922629,0.684853,1.097632,2.144065,1.665535,1.939864,0.848766,...,-0.780361,-0.485536,-0.088183,-0.075911,-0.008867,0.050406,0.293721,0.483167,0.375186,0.150284
2022-12-21 12:49:00,-0.051611,-0.192523,-0.340818,0.928309,0.675069,1.097632,2.144065,1.693365,1.953050,0.885262,...,-0.780362,-0.487509,-0.088183,-0.075911,-0.008870,0.050409,0.293717,0.473164,0.385194,0.150284
2022-12-21 12:50:00,-0.062599,-0.191843,-0.339807,0.933717,0.675069,1.081468,2.144065,1.617827,1.945515,0.847550,...,-0.780334,-0.489478,-0.088183,-0.075911,-0.008870,0.050406,0.293717,0.463024,0.395336,0.150280


In [7]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import TimeSeriesSplit

# Ridge regression baseline. RidgeCV selects the regularisation strength using
# time-ordered folds; gap=300 excludes 300 samples between the end of each
# training fold and the start of its validation fold.
cv = TimeSeriesSplit(gap=300)
model_lm = RidgeCV(cv=cv)

model_lm.fit(x_train_s, y_train)

# Predict using the fitted model
y_train_lm = model_lm.predict(x_train_s)
y_test_lm = model_lm.predict(x_test_s)

# Error metrics for the ridge model on both partitions.
# NOTE: rmse_train / rmse_test carry no model suffix and are reassigned by the
# hybrid-model evaluation cell further down.
mse_train_lm = mean_squared_error(y_train, y_train_lm)
mape_train_lm = mean_absolute_percentage_error(y_train, y_train_lm)
r2_train_lm = r2_score(y_train, y_train_lm)
rmse_train = math.sqrt(mse_train_lm)
mse_test_lm = mean_squared_error(y_test, y_test_lm)
mape_test_lm = mean_absolute_percentage_error(y_test, y_test_lm)
r2_test_lm = r2_score(y_test, y_test_lm)
rmse_test = math.sqrt(mse_test_lm)

# Print training and testing error.
# The RMSE lines are labelled "(RMSE)"; they previously read "(MSE)", which
# duplicated the label of the true MSE lines.
print(f"Training Mean Squared Error (MSE): {round(mse_train_lm,3)}")
print(f"Training Root Mean Squared Error (RMSE): {round(rmse_train,3)}")
print(f"Training Mean Absolute Percentage Error (MAPE): {round(mape_train_lm,3)}")
print(f"Training R-squared (R2) Score: {round(r2_train_lm,3)}")
print(f"Testing Mean Squared Error (MSE): {round(mse_test_lm,3)}")
print(f"Testing Root Mean Squared Error (RMSE): {round(rmse_test,3)}")
print(f"Testing Mean Absolute Percentage Error (MAPE): {round(mape_test_lm,3)}")
print(f"Testing R-squared (R2) Score: {round(r2_test_lm,3)}")

def adjusted_r2(r2, n, k):
    """Return R-squared adjusted for the number of predictors.

    Computes ``1 - (1 - r2) * (n - 1) / (n - k - 1)``.

    Parameters
    ----------
    r2 : float
        Unadjusted coefficient of determination.
    n : int
        Number of samples the score was computed on.
    k : int
        Number of predictors in the model.

    Returns
    -------
    float
        The adjusted R-squared value.

    Raises
    ------
    ValueError
        If ``n - k - 1`` is not positive; the adjustment is undefined there
        (it would divide by zero or flip the sign of the penalty).
    """
    residual_dof = n - k - 1
    if residual_dof <= 0:
        raise ValueError(
            f"adjusted R-squared needs n > k + 1 (got n={n}, k={k})"
        )
    return 1 - ((1 - r2) * (n - 1)) / residual_dof

# Sample count is taken from the test split, predictor count from the
# scaled feature columns.
n = x_test_s.shape[0]
k = x_train_s.shape[1]

# Adjusted R-squared of the ridge model on the test set
adjusted_r2_value = adjusted_r2(r2_test_lm, n, k)
print("Adjusted R-squared:", round(adjusted_r2_value, 3))

# TeEI = MAPE^2 * RMSE / adjusted R^2, all computed on the test set
TeEI = (mape_test_lm * mape_test_lm * rmse_test) / adjusted_r2_value
print("TeEI:", round(TeEI, 3))

Training Mean Squared Error (MSE): 299.782
Training Root Mean Squared Error (MSE): 17.314
Training Mean Absolute Percentage Error (MAPE): 0.143
Training R-squared (R2) Score: 0.774
Testing Mean Squared Error (MSE): 438.53
Testing Root Mean Squared Error (MSE): 20.941
Testing Mean Absolute Percentage Error (MAPE): 0.137
Testing R-squared (R2) Score: 0.761
Adjusted R-squared: 0.761
TeEI: 0.515


In [8]:
# Residuals of the ridge model (actual minus predicted) for each partition
y_train_err, y_test_err = (
    y_train - y_train_lm,
    y_test - y_test_lm,
)

In [9]:
from sklearn.neural_network import MLPRegressor
# Fixed seed, handed to the MLP as its random_state
random_seed = 1

# Single-hidden-layer MLP (60 units) fitted to the ridge residuals
# (y_train_err) rather than to the raw target.
# NOTE(review): alpha=500 is a very strong L2 penalty, and the hybrid metrics
# recorded further down are almost identical to the ridge-only ones — confirm
# this setting is intentional.
model_ann1 = MLPRegressor(
    hidden_layer_sizes=(60,),
    solver='sgd',
    learning_rate='invscaling',
    alpha=500,
    max_iter=200,
    early_stopping=True,
    random_state=random_seed,
).fit(x_train_s.to_numpy(), y_train_err)

# Residual corrections predicted for both partitions
y_train_ann1 = model_ann1.predict(x_train_s.to_numpy())
y_test_ann1 = model_ann1.predict(x_test_s.to_numpy())

In [10]:
# Hybrid prediction = ridge prediction + MLP-predicted residual
y_train_hybrid = y_train_lm + y_train_ann1
y_test_hybrid = y_test_lm + y_test_ann1

# Calculate training and testing error.
# NOTE: rmse_train / rmse_test are reassigned here, replacing the values the
# ridge evaluation cell stored under the same names.
mse_train_hyb = mean_squared_error(y_train, y_train_hybrid)
mape_train_hyb = mean_absolute_percentage_error(y_train, y_train_hybrid)
rmse_train = math.sqrt(mse_train_hyb)
r2_train_hyb = r2_score(y_train, y_train_hybrid)
mse_test_hyb = mean_squared_error(y_test, y_test_hybrid)
mape_test_hyb = mean_absolute_percentage_error(y_test, y_test_hybrid)
r2_test_hyb = r2_score(y_test, y_test_hybrid)
rmse_test = math.sqrt(mse_test_hyb)

# Print training and testing error.
# The RMSE lines are labelled "(RMSE)"; they previously read "(MSE)", which
# duplicated the label of the true MSE lines.
print(f"Training Mean Squared Error (MSE): {round(mse_train_hyb,3)}")
print(f"Training Root Mean Squared Error (RMSE): {round(rmse_train,3)}")
print(f"Training Mean Absolute Percentage Error (MAPE): {round(mape_train_hyb,3)}")
print(f"Training R-squared (R2) Score: {round(r2_train_hyb,3)}")
print(f"Testing Mean Squared Error (MSE): {round(mse_test_hyb,3)}")
print(f"Testing Root Mean Squared Error (RMSE): {round(rmse_test,3)}")
print(f"Testing Mean Absolute Percentage Error (MAPE): {round(mape_test_hyb,3)}")
print(f"Testing R-squared (R2) Score: {round(r2_test_hyb,3)}")

def adjusted_r2(r2, n, k):
    """Return R-squared adjusted for the number of predictors.

    Computes ``1 - (1 - r2) * (n - 1) / (n - k - 1)``.

    Parameters
    ----------
    r2 : float
        Unadjusted coefficient of determination.
    n : int
        Number of samples the score was computed on.
    k : int
        Number of predictors in the model.

    Returns
    -------
    float
        The adjusted R-squared value.

    Raises
    ------
    ValueError
        If ``n - k - 1`` is not positive; the adjustment is undefined there
        (it would divide by zero or flip the sign of the penalty).
    """
    residual_dof = n - k - 1
    if residual_dof <= 0:
        raise ValueError(
            f"adjusted R-squared needs n > k + 1 (got n={n}, k={k})"
        )
    return 1 - ((1 - r2) * (n - 1)) / residual_dof

n = len(x_test_s)  # Number of samples in test data
k = len(x_train_s.columns)  # Number of predictors in the model

# Calculate Adjusted R-squared for the hybrid model on the test set
adjusted_r2_value = adjusted_r2(r2_test_hyb, n, k)
print("Adjusted R-squared:", round(adjusted_r2_value,3))
print("Number of variables:",k)

# TeEI = MAPE^2 * RMSE / adjusted R^2, all computed on the test set.
# Rounded to 3 decimals so the value is directly comparable with the TeEI
# printed by the ridge-only evaluation cell (this was 2 decimals before).
TeEI = (mape_test_hyb*mape_test_hyb*rmse_test)/adjusted_r2_value
print("TeEI:", round(TeEI,3))

Training Mean Squared Error (MSE): 299.797
Training Root Mean Squared Error (MSE): 17.315
Training Mean Absolute Percentage Error (MAPE): 0.143
Training R-squared (R2) Score: 0.774
Testing Mean Squared Error (MSE): 439.59
Testing Root Mean Squared Error (MSE): 20.966
Testing Mean Absolute Percentage Error (MAPE): 0.137
Testing R-squared (R2) Score: 0.761
Adjusted R-squared: 0.761
Number of variables: 36
TeEI: 0.52
