In [176]:
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
import pandas as pd
from tqdm import tqdm
import joblib
import datetime as dt
from sklearn.model_selection import TimeSeriesSplit
from skimpy import skim
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

In [177]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF

In [178]:
import warnings
warnings.filterwarnings('ignore')

In [179]:
import sys

# Make the parent directory importable so the project-local `fx` module can be
# found. The membership check keeps the cell idempotent: re-running it no
# longer appends a duplicate '..' entry to sys.path each time.
if '..' not in sys.path:
    sys.path.append('..')
import fx

In [180]:
# Load the working dataset through the project helper. The value is forwarded
# as the `days` keyword of fx.pull_data -- presumably the look-back window in
# days; confirm against the fx module.
HISTORY_DAYS = 90
data = fx.pull_data(days=HISTORY_DAYS)

In [181]:
# Pipeline template for the model that is later fitted on the ANM training
# split. The "Speed" / "Direction" transformers and the final "model" step are
# placeholders: GridSearchCV replaces them with the candidates listed in
# `anm_params`.
#
# ColumnTransformer only accepts an estimator, "drop" or "passthrough" as a
# transformer. `None` is rejected at fit time, so the previous `None` entries
# made every "raw Speed" candidate fail (and be scored NaN) instead of being
# evaluated. "passthrough" expresses the intended "use the column unchanged".
anm_pipeline = Pipeline(steps=[
    ("col_transformer", ColumnTransformer(transformers=[
        # NOTE(review): the column list is empty, so this transformer appears
        # to receive no columns at all -- confirm whether a timestamp column
        # was meant to be listed here.
        ("time", fx.TimestampTransformer(), []),
        ("Speed", "passthrough", ["Speed"]),
        # Placeholder only; always overwritten by the grid below.
        ("Direction", "drop", ["Direction"]),
        ], remainder="drop")),
    ("model", None)
])

# Search space: 4 Speed options x 3 Direction options x 6 models = 72 candidates.
# NOTE: MLPRegressor and RANSACRegressor are given no random_state, so their
# results can differ between runs.
anm_params = {
    'col_transformer__Speed': ["passthrough", StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
    'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
    'model': [
        LinearRegression(),
        MLPRegressor(hidden_layer_sizes=(150, 150), activation='tanh', solver='sgd'),
        SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1),
        HuberRegressor(epsilon=1.35, alpha=0.0001),
        RANSACRegressor(min_samples=0.1, max_trials=100),
        GaussianProcessRegressor(alpha=0.1, kernel=RBF())
    ]
}

In [182]:
# Pipeline template for the model that is later fitted on the Non-ANM training
# split. Both column transformers and the final "model" step are placeholders
# that GridSearchCV replaces with the candidates in `non_anm_params`.
#
# ColumnTransformer only accepts an estimator, "drop" or "passthrough" as a
# transformer. `None` is rejected at fit time, so the previous `None` entries
# made every "raw Speed" candidate fail (and be scored NaN) instead of being
# evaluated. "passthrough" expresses the intended "use the column unchanged".
non_anm_pipeline = Pipeline(steps=[
    ("col_transformer", ColumnTransformer(transformers=[
        ("Speed", "passthrough", ["Speed"]),
        # Placeholder only; always overwritten by the grid below.
        ("Direction", "drop", ["Direction"]),
        ], remainder="drop")),
    ("model", None)
])

# Search space: 4 Speed options x 3 Direction options x 6 models = 72 candidates.
# NOTE: MLPRegressor and RANSACRegressor are given no random_state, so their
# results can differ between runs.
non_anm_params = {
    'col_transformer__Speed': ["passthrough", StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
    'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
    'model': [
        LinearRegression(),
        MLPRegressor(hidden_layer_sizes=(150, 150), activation='tanh', solver='sgd'),
        SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1),
        HuberRegressor(epsilon=1.35, alpha=0.0001),
        RANSACRegressor(min_samples=0.1, max_trials=100),
        GaussianProcessRegressor(alpha=0.1, kernel=RBF())
    ]
}

In [183]:
# Cross-validation splitter handed to both grid searches as `cv`.
# TimeSeriesSplit is used rather than a shuffled K-fold, so rows are split in
# their existing order.
N_CV_SPLITS = 5
tscv = TimeSeriesSplit(n_splits=N_CV_SPLITS)

In [184]:
# Both searches share identical settings; only the pipeline and the parameter
# grid differ. Scoring is negated MSE, and n_jobs=-1 requests all available
# cores.
search_settings = dict(
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
anm_gridsearch = GridSearchCV(anm_pipeline, anm_params, **search_settings)
non_anm_gridsearch = GridSearchCV(non_anm_pipeline, non_anm_params, **search_settings)

In [185]:
# One train/test split per target series. fx.data_splitting is unpacked as
# (X_train, y_train, X_test, y_test) -- note this differs from the order
# returned by sklearn's train_test_split.
# NOTE(review): the three test sets are later combined row-for-row, which
# assumes every call yields identically ordered test rows -- confirm in fx.
(ANM_X_train, ANM_y_train,
 ANM_X_test, ANM_y_test) = fx.data_splitting(data, output_val="ANM")
(non_ANM_X_train, non_ANM_y_train,
 non_ANM_X_test, non_ANM_y_test) = fx.data_splitting(data, output_val="Non-ANM")
(total_X_train, total_y_train,
 total_X_test, total_y_test) = fx.data_splitting(data, output_val="Total")

In [186]:
# Run the grid search for the ANM target on its training split.
anm_gridsearch.fit(X=ANM_X_train, y=ANM_y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [187]:
# Display the parameter combination with the best cross-validated score
# (negated MSE) for the ANM search.
anm_gridsearch.best_params_

{'col_transformer__Direction': CompassToCartesianTransformer(),
 'col_transformer__Speed': StandardScaler(),
 'model': MLPRegressor(activation='tanh', hidden_layer_sizes=(150, 150), solver='sgd')}

In [188]:
# Run the grid search for the Non-ANM target on its training split.
non_anm_gridsearch.fit(X=non_ANM_X_train, y=non_ANM_y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [189]:
# Display the parameter combination with the best cross-validated score
# (negated MSE) for the Non-ANM search.
non_anm_gridsearch.best_params_

{'col_transformer__Direction': CompassToCartesianTransformer(),
 'col_transformer__Speed': StandardScaler(),
 'model': MLPRegressor(activation='tanh', hidden_layer_sizes=(150, 150), solver='sgd')}

In [190]:
# Predict each target on its own test split, using the estimator held by each
# fitted search (GridSearchCV refits the best candidate by default).
anm_pred, non_anm_pred = (
    anm_gridsearch.predict(ANM_X_test),
    non_anm_gridsearch.predict(non_ANM_X_test),
)

In [191]:
# The total prediction is the element-wise sum of the two component
# predictions. Guard the shapes first so that mismatched arrays (for example
# (n,) vs (n, 1)) cannot silently broadcast into a wrong-shaped result.
assert np.shape(anm_pred) == np.shape(non_anm_pred), (
    f"prediction shapes differ: {np.shape(anm_pred)} vs {np.shape(non_anm_pred)}"
)
combined_pred = anm_pred + non_anm_pred

In [192]:
# Score the summed component predictions against the actual "Total" test
# target using the project's MSE helper (called as: prediction, actual).
fx.MSE(combined_pred, total_y_test)


25.535402510189304

In [193]:
# Plot combined_pred against total_y_test over the test index.
# The title now describes both series, the axes are labelled (px would
# otherwise show the generic "x" / "y"), and the actual series gets a legend
# entry so the two lines can be told apart.
index_name = total_y_test.index.name
x_axis_label = str(index_name) if index_name is not None else "index"
fig = px.line(
    x=total_y_test.index,
    y=total_y_test.values,
    title="Total: Actual vs Predicted",
    labels={"x": x_axis_label, "y": "Power Generation (MW)"},
)
# Applied before the second trace is added, so it only names the actual series.
fig.update_traces(name="Total Actual", showlegend=True)
fig.add_scatter(x=total_y_test.index, y=combined_pred, mode='lines', name='Total Predicted')
fig.show()

In [194]:
# Gather predicted total, actual total and the Speed feature into one frame,
# indexed like total_y_test. `.values` is used so the extra columns are
# attached positionally rather than aligned on index labels.
combined_pred_df = pd.DataFrame(
    combined_pred,
    index=total_y_test.index,
    columns=["Total Predicted"],
).assign(**{
    "Total Actual": total_y_test.values,
    "Speed": total_X_test["Speed"].values,
})

In [195]:
# Scatter of predicted and actual total against Speed.
# With a list passed as `y`, plotly express reshapes the data into "variable"
# (which series) and "value" (the plotted number). The "variable" label is
# therefore the legend title, not the x axis: the unit label for speed has to
# be keyed by the "Speed" column instead.
px.scatter(
    data_frame=combined_pred_df,
    x="Speed",
    y=["Total Predicted", "Total Actual"],
    title="Speed vs Total",
    labels={
        "value": "Power Generation (MW)",
        "Speed": "Speed (m/s)",
        "variable": "Series",
    },
    color="variable"
    )