# LSDA Assignment 1 Submission by Jannik Elsäßer \
Please run all cells in order, from top to bottom.

In [147]:
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
import pandas as pd
from tqdm import tqdm
import joblib
import datetime as dt
from sklearn.model_selection import TimeSeriesSplit
from skimpy import skim
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

In [148]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF

In [149]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' would also hide scikit-learn's fit-failure and
# convergence warnings raised during the grid searches below — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [150]:
import sys
# Add the parent directory to the import path so the project helper module
# `fx` can be imported. '..' is resolved against the current working
# directory — presumably the notebook's own folder; TODO confirm.
sys.path.append('..')
import fx

In [151]:
# Fetch the dataset through the project helper, requesting a 90-day window
# (the exact meaning of `days` is defined in fx.pull_data, not visible here).
data = fx.pull_data(days=90)

In [152]:
# Preview the first five rows to check the columns and the time index.
data.head(5)

Unnamed: 0_level_0,ANM,Non-ANM,Total,Direction,Lead_hours,Source_time,Speed
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-11-24 18:00:00+00:00,16.997392,17.6511,34.648492,SSE,1,1669306000.0,13.85824
2022-11-24 21:00:00+00:00,13.939188,16.848844,30.788033,SSE,1,1669316000.0,12.07008
2022-11-25 00:00:00+00:00,17.813118,16.657667,34.470785,SSE,1,1669327000.0,9.83488
2022-11-25 03:00:00+00:00,15.732884,16.246978,31.979861,S,1,1669338000.0,13.85824
2022-11-25 06:00:00+00:00,16.856965,16.369333,33.226298,S,1,1669349000.0,9.83488


In [153]:
# Pipeline for the ANM target. Every transformer slot and the model are
# placeholders: GridSearchCV replaces each of them with a candidate from
# `anm_params` before anything is fitted.
anm_pipeline = Pipeline(steps=[
    ("col_transformer", ColumnTransformer(transformers=[
        ("time", None, []),
        ("Speed", None, ["Speed"]),
        ("Direction", None, ["Direction"]),
        ], remainder="drop")),
    ("model", None)
])

# Search space, keyed by `<pipeline step>__<transformer name>`.
# "passthrough" (not None) is used for the "leave the columns untouched"
# candidates: ColumnTransformer only accepts 'drop', 'passthrough' or an
# estimator, so a None candidate fails to fit and is scored NaN — silently,
# because warnings are suppressed in this notebook.
# NOTE(review): the "time" slot selects no columns ([]); ColumnTransformer
# appears to skip transformers with an empty column selection, so verify that
# fx.TimestampTransformer really contributes features.
anm_params = {
    'col_transformer__time': ["drop", "passthrough", fx.TimestampTransformer()],
    'col_transformer__Speed': ["passthrough", StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
    'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
    'model': [
        LinearRegression(),
        MLPRegressor(hidden_layer_sizes=(150, 150), activation='tanh', solver='sgd'),
        SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1),
        HuberRegressor(epsilon=1.35, alpha=0.0001),
        RANSACRegressor(min_samples=0.1, max_trials=100),
        GaussianProcessRegressor(alpha=0.1, kernel=RBF())
    ]
}

In [154]:
# Pipeline for the Non-ANM target. The transformer slots and the model are
# placeholders: GridSearchCV replaces each of them with a candidate from
# `non_anm_params` before anything is fitted.
non_anm_pipeline = Pipeline(steps=[
    ("col_transformer", ColumnTransformer(transformers=[
        ("Speed", None, ["Speed"]),
        ("Direction", None, ["Direction"]),
        ], remainder="drop")),
    ("model", None)
])

# Search space, keyed by `<pipeline step>__<transformer name>`.
# "passthrough" (not None) is used for the raw-Speed candidate:
# ColumnTransformer only accepts 'drop', 'passthrough' or an estimator, so a
# None candidate fails to fit and is scored NaN — silently, because warnings
# are suppressed in this notebook.
non_anm_params = {
    'col_transformer__Speed': ["passthrough", StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
    'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
    'model': [
        LinearRegression(),
        MLPRegressor(hidden_layer_sizes=(150, 150), activation='tanh', solver='sgd'),
        SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1),
        HuberRegressor(epsilon=1.35, alpha=0.0001),
        RANSACRegressor(min_samples=0.1, max_trials=100),
        GaussianProcessRegressor(alpha=0.1, kernel=RBF())
    ]
}

In [155]:
# Time-ordered 5-fold splitter: each validation fold comes after the data it
# was trained on, so the searches are never validated on the past.
tscv = TimeSeriesSplit(n_splits=5)

In [156]:
# One exhaustive search per target. Both use the time-ordered splitter, rank
# candidates by negated MSE (higher is better) and run on all available cores.
anm_gridsearch = GridSearchCV(
    estimator=anm_pipeline,
    param_grid=anm_params,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
non_anm_gridsearch = GridSearchCV(
    estimator=non_anm_pipeline,
    param_grid=non_anm_params,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

In [157]:
# Split the data once per target column. As unpacked here, fx.data_splitting
# yields (X_train, y_train, X_test, y_test) — note that this ordering differs
# from sklearn's train_test_split.
ANM_X_train, ANM_y_train, ANM_X_test, ANM_y_test = fx.data_splitting(data, output_val="ANM")
non_ANM_X_train, non_ANM_y_train, non_ANM_X_test, non_ANM_y_test = fx.data_splitting(data, output_val="Non-ANM")
total_X_train, total_y_train, total_X_test, total_y_test = fx.data_splitting(data, output_val="Total")

In [158]:
# Sanity check: the three test targets must have the same length, because the
# ANM and Non-ANM predictions are later summed and scored against Total.
len(ANM_y_test) == len(non_ANM_y_test) == len(total_y_test)

True

In [159]:
def train_models(X_train, y_train, X_test, y_test, gridsearch):
    """Fit a search object on the training split and report how it did.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``gridsearch.fit``.
    X_test, y_test : held-out features and target passed to ``gridsearch.score``.
    gridsearch : object exposing ``fit``, ``best_params_``, ``best_score_`` and
        ``score`` (a GridSearchCV in this notebook).

    Returns
    -------
    The same ``gridsearch`` object, fitted in place.
    """
    gridsearch.fit(X_train, y_train)

    # Winning parameter combination found by the search.
    winning_params = gridsearch.best_params_
    print("Best params: ", winning_params)

    # Cross-validated score of that combination.
    cv_score = gridsearch.best_score_
    print("Best score: ", cv_score)

    # Score of the refit best estimator on the held-out split.
    holdout_score = gridsearch.score(X_test, y_test)
    print("Test score: ", holdout_score)

    return gridsearch

In [176]:
# Run the ANM grid search on its training split; prints the best parameters,
# the cross-validated score and the held-out test score.
anm_gridsearch = train_models(ANM_X_train, ANM_y_train, ANM_X_test, ANM_y_test, anm_gridsearch)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best params:  {'col_transformer__Direction': CompassToCartesianTransformer(), 'col_transformer__Speed': StandardScaler(), 'col_transformer__time': TimestampTransformer(), 'model': MLPRegressor(activation='tanh', hidden_layer_sizes=(150, 150), solver='sgd')}
Best score:  -11.910435755481538
Test score:  -9.661580434384579


In [161]:
# Run the Non-ANM grid search on its training split; prints the best
# parameters, the cross-validated score and the held-out test score.
non_anm_gridsearch = train_models(non_ANM_X_train, non_ANM_y_train, non_ANM_X_test, non_ANM_y_test, non_anm_gridsearch)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best params:  {'col_transformer__Direction': CompassToCartesianTransformer(), 'col_transformer__Speed': StandardScaler(), 'model': MLPRegressor(activation='tanh', hidden_layer_sizes=(150, 150), solver='sgd')}
Best score:  -16.46528164184328
Test score:  -6.044395923015864


In [162]:
def predict_and_combine(ANM_X_test, non_ANM_X_test, y_test, anm_gridsearch, non_anm_gridsearch):
    """Predict both components, add them up and report the combined error.

    Parameters
    ----------
    ANM_X_test, non_ANM_X_test : feature sets handed to the respective model's
        ``predict``.
    y_test : target the summed prediction is scored against via ``fx.MSE``.
    anm_gridsearch, non_anm_gridsearch : fitted objects exposing ``predict``.

    Returns
    -------
    The element-wise sum of the two models' predictions.
    """
    anm_part = anm_gridsearch.predict(ANM_X_test)
    non_anm_part = non_anm_gridsearch.predict(non_ANM_X_test)
    combined = anm_part + non_anm_part

    overall_mse = fx.MSE(combined, y_test)
    print(f"Overall test score: {overall_mse}")
    return combined

In [163]:
# Sum the two component predictions on the test split and score the result
# against the measured Total.
combined_pred = predict_and_combine(ANM_X_test, non_ANM_X_test, total_y_test, anm_gridsearch, non_anm_gridsearch)

Overall test score: 25.278479349283906


In [164]:
def save_models(anm_gridsearch, non_anm_gridsearch, model_dir="models"):
    """Persist both fitted searches as date-stamped pickle files.

    The files are written to ``<model_dir>/anm_gridsearch_<YYYY-MM-DD>.pkl`` and
    ``<model_dir>/non_anm_gridsearch_<YYYY-MM-DD>.pkl``.

    Parameters
    ----------
    anm_gridsearch, non_anm_gridsearch : objects to serialise with ``joblib.dump``.
    model_dir : directory to write into (default ``"models"``, relative to the
        current working directory). It is created if it does not exist.

    Returns
    -------
    None
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Build paths with pathlib instead of a hard-coded "models\\..." string: on
    # non-Windows systems the backslash would become part of the file name.
    target_dir = Path(model_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Read the date once so both files always carry the same stamp.
    stamp = dt.date.today()
    named_searches = (
        ("anm_gridsearch", anm_gridsearch),
        ("non_anm_gridsearch", non_anm_gridsearch),
    )
    for prefix, search in named_searches:
        joblib.dump(search, target_dir / f"{prefix}_{stamp}.pkl")

In [165]:
# Persist both fitted searches as date-stamped .pkl files.
save_models(anm_gridsearch, non_anm_gridsearch)

In [166]:
def load_models_and_train_on_all_data(data, anm_gridsearch, non_anm_gridsearch):
    """Fit copies of both searches on the full dataset.

    Despite the name, nothing is loaded from disk: the searches are received as
    arguments. Each one is copied before fitting, so the objects passed in
    (already evaluated on the held-out split) keep their fitted state instead
    of being silently overwritten.

    Parameters
    ----------
    data : dataset handed to ``fx.final_data_splitting`` once per target.
    anm_gridsearch, non_anm_gridsearch : search objects exposing ``fit``.

    Returns
    -------
    tuple
        ``(anm_search, non_anm_search)`` — the freshly fitted copies.
    """
    from copy import deepcopy  # local import keeps this cell self-contained

    anm_X_train, anm_y_train = fx.final_data_splitting(data, output_val="ANM")
    non_anm_X_train, non_anm_y_train = fx.final_data_splitting(data, output_val="Non-ANM")

    # Fit copies rather than the caller's objects (see docstring).
    anm_full = deepcopy(anm_gridsearch)
    non_anm_full = deepcopy(non_anm_gridsearch)
    anm_full.fit(anm_X_train, anm_y_train)
    non_anm_full.fit(non_anm_X_train, non_anm_y_train)
    return anm_full, non_anm_full

In [167]:
# Fit both searches again, this time on the data returned by
# fx.final_data_splitting, to obtain the models used on the forecasts below.
anm_model, non_anm_model = load_models_and_train_on_all_data(data, anm_gridsearch, non_anm_gridsearch)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [168]:
# Load the forecast inputs that the refit models are applied to below
# (source and schema are defined in fx.load_forecasts, not visible here).
forecast = fx.load_forecasts()

In [169]:
# NOTE(review): total_X_train / total_y_train are not read again in the
# visible cells — this split looks removable; confirm before deleting.
total_X_train, total_y_train = fx.final_data_splitting(data, output_val="Total")
# Predict each component separately on the forecast inputs.
anm_pred = anm_model.predict(forecast)
non_anm_pred = non_anm_model.predict(forecast)

In [170]:
def create_forecast_df(forecast, anm_pred, non_anm_pred):
    """Build the 3-hourly power-generation forecast frame.

    The two component predictions are summed, stored as the column
    ``"Power Generation Forecast"``, averaged into 3-hour bins, and the
    ``"Speed"`` and ``"Source_time"`` input columns are removed.

    Parameters
    ----------
    forecast : pandas.DataFrame with a datetime-like index (required by
        ``resample``) containing at least ``"Speed"`` and ``"Source_time"``.
        It is NOT modified; a new frame is returned.
    anm_pred, non_anm_pred : per-row predictions, added element-wise.

    Returns
    -------
    pandas.DataFrame
        The resampled frame without ``"Speed"`` and ``"Source_time"``.

    Raises
    ------
    KeyError
        If ``"Speed"`` or ``"Source_time"`` is missing after resampling.
    """
    forecast_col = "Power Generation Forecast"

    # assign() returns a copy, so the caller's frame is left untouched and the
    # cell can be re-run safely.
    with_total = forecast.assign(**{forecast_col: anm_pred + non_anm_pred})

    # numeric_only=True skips text columns (e.g. compass directions), which
    # otherwise make mean() raise on recent pandas. A Timedelta is used as the
    # bin width because the "3H" string alias is deprecated in newer pandas.
    binned = with_total.resample(pd.Timedelta(hours=3)).mean(numeric_only=True)

    return binned.drop(columns=["Speed", "Source_time"])

In [171]:
# Combine the two component predictions into the 3-hourly forecast frame.
forecast_df = create_forecast_df(forecast, anm_pred, non_anm_pred)

In [172]:
def create_final_plotting_df(forecast_df, data, anm_model=None, non_anm_model=None):
    """Assemble test-window predictions, actuals and the forecast for plotting.

    Parameters
    ----------
    forecast_df : frame holding the future forecast; appended below the
        test-window rows.
    data : dataset handed to ``fx.data_splitting`` to recover the test split.
    anm_model, non_anm_model : fitted objects exposing ``predict``. When left
        as ``None`` the notebook-level variables of the same names are used, so
        existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the test-window frame and ``forecast_df`` with the
        columns renamed to ``["Model", "Actual", "Forecast"]``.

    Notes
    -----
    NOTE(review): if the supplied models were refit on the full dataset, the
    "Model" curve over the test window is an in-sample fit — confirm that this
    is what the plot is meant to show.
    """
    # Fall back to the notebook globals rather than silently depending on them.
    if anm_model is None:
        anm_model = globals()["anm_model"]
    if non_anm_model is None:
        non_anm_model = globals()["non_anm_model"]

    # Only the held-out part of each split is needed here.
    _, _, ANM_X_test, _ = fx.data_splitting(data, output_val="ANM")
    _, _, non_ANM_X_test, _ = fx.data_splitting(data, output_val="Non-ANM")
    _, _, total_X_test, total_y_test = fx.data_splitting(data, output_val="Total")

    test_prediction = anm_model.predict(ANM_X_test) + non_anm_model.predict(non_ANM_X_test)
    test_data = fx.create_timestamps(test_prediction, total_X_test, total_y_test)

    # Stack test data and forecast for easy plotting.
    final_df = pd.concat([test_data, forecast_df], axis=0)

    # Assumes the concatenation yields exactly three columns in this order
    # (prediction, actual, forecast); a different count raises here.
    final_df.columns = ["Model", "Actual", "Forecast"]
    return final_df

In [173]:
# Assemble test-window predictions, actuals and the forecast into one frame
# for the plot below.
final_df = create_final_plotting_df(forecast_df, data)

In [175]:
# Interactive Plotly line chart of the three series against the frame's index.
fig = px.line(
    final_df,
    x=final_df.index,
    y=["Model", "Actual", "Forecast"],
    title="Power Generation Forecast (Test Data and Forecasted Future)",
)
# Label both axes and blank out the legend heading; each update call returns
# the figure, so the three calls can be chained.
(
    fig.update_xaxes(title_text="Date")
    .update_yaxes(title_text="Power Generation (MW)")
    .update_layout(legend_title_text="")
)
fig.show()
