In [47]:
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
import pandas as pd
from tqdm import tqdm
import joblib
import datetime as dt
from sklearn.model_selection import TimeSeriesSplit
from skimpy import skim
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

In [48]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF

In [49]:
import warnings
warnings.filterwarnings('ignore')

In [50]:
import sys
sys.path.append('..')
import fx

In [51]:
data = fx.pull_data(days=90)

In [52]:
data.head(5)

Unnamed: 0_level_0,ANM,Non-ANM,Total,Direction,Lead_hours,Source_time,Speed
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-11-25 09:00:00+00:00,16.825317,16.509628,33.334944,S,1,1669360000.0,11.176
2022-11-25 12:00:00+00:00,16.38664,17.59085,33.97749,S,1,1669370000.0,12.96416
2022-11-25 15:00:00+00:00,15.765571,18.399217,34.164788,SSW,1,1669381000.0,8.9408
2022-11-25 18:00:00+00:00,13.754039,18.151967,31.906006,SW,1,1669392000.0,12.07008
2022-11-25 21:00:00+00:00,12.167921,18.977528,31.145449,WSW,1,1669403000.0,12.96416


In [53]:
anm_pipeline = Pipeline(steps=[
    ("col_transformer", ColumnTransformer(transformers=[
        ("time", None, []),
        ("Speed", None, ["Speed"]),
        ("Direction", None, ["Direction"]),
        ], remainder="drop")),
    ("model", None)
])

anm_params = {
    'col_transformer__time' : ["drop", None, fx.TimestampTransformer()],
    'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
    'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
    'model': [
        LinearRegression(), 
        MLPRegressor(hidden_layer_sizes=(150, 150), activation='tanh', solver='sgd'), 
        SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1),
        HuberRegressor(epsilon=1.35, alpha=0.0001),
        RANSACRegressor(min_samples=0.1, max_trials=100),
        GaussianProcessRegressor(alpha=0.1, kernel=RBF()) 
    ]
}

In [54]:
non_anm_pipeline = Pipeline(steps=[
    ("col_transformer", ColumnTransformer(transformers=[
        ("Speed", None, ["Speed"]),
        ("Direction", None, ["Direction"]),
        ], remainder="drop")),
    ("model", None)
])

non_anm_params = {
    'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
    'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
    'model': [
        LinearRegression(), 
        MLPRegressor(hidden_layer_sizes=(150, 150), activation='tanh', solver='sgd'), 
        SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1),
        HuberRegressor(epsilon=1.35, alpha=0.0001),
        RANSACRegressor(min_samples=0.1, max_trials=100),
        GaussianProcessRegressor(alpha=0.1, kernel=RBF()) 
    ]
}

In [55]:
tscv = TimeSeriesSplit(n_splits=5)

In [56]:
anm_gridsearch = GridSearchCV(anm_pipeline, anm_params, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
non_anm_gridsearch = GridSearchCV(non_anm_pipeline, non_anm_params, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

In [57]:
ANM_X_train, ANM_y_train, ANM_X_test, ANM_y_test = fx.data_splitting(data, output_val="ANM")
non_ANM_X_train, non_ANM_y_train, non_ANM_X_test, non_ANM_y_test = fx.data_splitting(data, output_val="Non-ANM")
total_X_train, total_y_train, total_X_test, total_y_test = fx.data_splitting(data, output_val="Total")

In [58]:
# check that all test sets are same length
len(ANM_y_test) == len(non_ANM_y_test) == len(total_y_test)

True

In [59]:
def train_models(X_train, y_train, X_test, y_test, gridsearch):
    gridsearch.fit(X_train, y_train)
    print("Best params: ", gridsearch.best_params_)
    print("Best score: ", gridsearch.best_score_)
    print("Test score: ", gridsearch.score(X_test, y_test))
    return gridsearch

In [60]:
anm_gridsearch = train_models(ANM_X_train, ANM_y_train, ANM_X_test, ANM_y_test, anm_gridsearch)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best params:  {'col_transformer__Direction': CompassToCartesianTransformer(), 'col_transformer__Speed': StandardScaler(), 'col_transformer__time': TimestampTransformer(), 'model': MLPRegressor(activation='tanh', hidden_layer_sizes=(150, 150), solver='sgd')}
Best score:  -12.036068520350797
Test score:  -9.60592446888633


In [61]:
non_anm_gridsearch = train_models(non_ANM_X_train, non_ANM_y_train, non_ANM_X_test, non_ANM_y_test, non_anm_gridsearch)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best params:  {'col_transformer__Direction': CompassToCartesianTransformer(), 'col_transformer__Speed': EmpiricalWaveletTransform(), 'model': MLPRegressor(activation='tanh', hidden_layer_sizes=(150, 150), solver='sgd')}
Best score:  -13.926715032253156
Test score:  -18.503316684263513


In [62]:
def predict_and_combine(ANM_X_test, non_ANM_X_test, y_test, anm_gridsearch, non_anm_gridsearch):
    anm_pred = anm_gridsearch.predict(ANM_X_test)
    non_anm_pred = non_anm_gridsearch.predict(non_ANM_X_test)
    pred = anm_pred + non_anm_pred
    print(f"Overall test score: {fx.MSE(pred, y_test)}")
    return pred

In [63]:
combined_pred = predict_and_combine(ANM_X_test, non_ANM_X_test, total_y_test, anm_gridsearch, non_anm_gridsearch)

Overall test score: 35.85336273741996


In [64]:
def save_models(anm_gridsearch, non_anm_gridsearch):
    joblib.dump(anm_gridsearch, rf"models\anm_gridsearch_{dt.date.today()}.pkl")
    joblib.dump(non_anm_gridsearch, rf"models\non_anm_gridsearch_{dt.date.today()}.pkl")
    return

In [65]:
save_models(anm_gridsearch, non_anm_gridsearch)

In [66]:
def load_models_and_train_on_all_data(data, anm_gridsearch, non_anm_gridsearch):
    # naming schemes is not my strong suit
    anm_X_train, anm_y_train = fx.final_data_splitting(data, output_val="ANM")
    non_anm_X_train, non_anm_y_train = fx.final_data_splitting(data, output_val="Non-ANM")
    anm_gridsearch.fit(anm_X_train, anm_y_train)
    non_anm_gridsearch.fit(non_anm_X_train, non_anm_y_train)
    return anm_gridsearch, non_anm_gridsearch

In [67]:
anm_model, non_anm_model = load_models_and_train_on_all_data(data, anm_gridsearch, non_anm_gridsearch)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [187]:
forecast = fx.load_forecasts()

In [188]:
forecast

Unnamed: 0_level_0,Direction,Lead_hours,Source_time,Speed
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-02-23 09:00:00+00:00,SW,2,1677132000,12.96416
2023-02-23 09:00:00+00:00,W,100,1676779200,9.83488
2023-02-23 09:00:00+00:00,W,63,1676912400,8.94080
2023-02-23 09:00:00+00:00,W,64,1676908800,8.94080
2023-02-23 09:00:00+00:00,W,65,1676905200,8.94080
...,...,...,...,...
2023-02-27 21:00:00+00:00,NNE,114,1677117600,4.02336
2023-02-27 21:00:00+00:00,NNE,115,1677114000,4.02336
2023-02-27 21:00:00+00:00,W,118,1677103200,3.12928
2023-02-27 21:00:00+00:00,WNW,116,1677110400,3.12928


In [189]:
total_X_train, total_y_train = fx.final_data_splitting(data, output_val="Total")
anm_pred = anm_model.predict(forecast)
non_anm_pred = non_anm_model.predict(forecast)

In [190]:
def create_forecast_df(forecast, anm_pred, non_anm_pred):
    future = anm_pred + non_anm_pred
    forecast["Power Generation Forecast"] = future
    forecast = forecast.resample("3H").mean()
    forecast.drop(columns=["Source_time"], inplace=True)
    return forecast

In [194]:
forecast_df = create_forecast_df(forecast, anm_pred, non_anm_pred)

In [197]:
def create_final_plotting_df(forecast_df, data):
    # this code is just for plotting the final graph
    ANM_X_train, ANM_y_train, ANM_X_test, ANM_y_test = fx.data_splitting(data, output_val="ANM")
    non_ANM_X_train, non_ANM_y_train, non_ANM_X_test, non_ANM_y_test = fx.data_splitting(data, output_val="Non-ANM")
    total_X_train, total_y_train, total_X_test, total_y_test = fx.data_splitting(data, output_val="Total")

    test_anm_pred = anm_model.predict(ANM_X_test)
    test_non_anm_pred = non_anm_model.predict(non_ANM_X_test)

    test_prediction = test_anm_pred + test_non_anm_pred
    test_data = fx.create_timestamps(test_prediction, total_X_test, total_y_test)

    # slice total_x_test data to only get data up to the forecast datapoint
    total_X_test = total_X_test.loc[:forecast_df.index[0]]

    wind_speed_data = pd.concat([forecast_df["Speed"], total_X_test["Speed"]], axis=0)

    # combine testdata and forecastdf for easy plotting
    final_df = pd.concat([test_data[["predict", "actual"]], forecast_df["Power Generation Forecast"]], axis=0)
    final_df.columns = ["Model", "Actual", "Forecast"]
    return final_df, wind_speed_data

In [202]:
final_df, wind_speed_data = create_final_plotting_df(forecast_df, data)
final_df

time
2023-02-23 09:00:00+00:00    10.520341
2023-02-23 12:00:00+00:00    12.317016
2023-02-23 15:00:00+00:00    13.768832
2023-02-23 18:00:00+00:00    13.879528
2023-02-23 21:00:00+00:00    13.798635
                               ...    
2023-02-22 18:00:00+00:00     8.940800
2023-02-22 21:00:00+00:00     4.023360
2023-02-23 00:00:00+00:00     5.811520
2023-02-23 03:00:00+00:00     1.788160
2023-02-23 06:00:00+00:00     9.834880
Name: Speed, Length: 156, dtype: float64

In [203]:
# plot final_df using plotly
fig = px.line(final_df, x=final_df.index, y=["Model", "Actual", "Forecast"], title="Power Generation Forecast (Test Data and Forecasted Future)")
# add x and y axis labels
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Power Generation (MW)")
# change legend heading
fig.update_layout(legend_title_text="")
# add wind speed info to hover
customdata = wind_speed_data
fig.update_traces(hovertemplate="<b>Power Generation: %{y:.2f} MW </b><br> Wind Speed: %{customdata:.2f} m/s <extra></extra>", customdata=customdata)
fig.show()
