In [1]:
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
import pandas as pd
from tqdm import tqdm
import joblib
import datetime as dt
from sklearn.model_selection import TimeSeriesSplit
from skimpy import skim
import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import sys
sys.path.append('..')
import fx

In [5]:
data = fx.pull_data(days=90)

In [6]:
X_train, y_train, X_test, y_test = fx.data_splitting(data)

In [7]:
pipeline = Pipeline(steps=[
    ("col_transformer", ColumnTransformer(transformers=[
        ("Speed", None, ["Speed"]),
        ("Direction", None, ["Direction"]),
        ], remainder="drop")),
    ("model", None)
])


In [8]:
param_grid = {
    'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
    'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
    'model': [
        LinearRegression(), 
        MLPRegressor(hidden_layer_sizes=(150, 150), activation='tanh', solver='sgd'), 
        SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1),
        HuberRegressor(epsilon=1.35, alpha=0.0001),
        RANSACRegressor(min_samples=0.1, max_trials=100),
        GaussianProcessRegressor(alpha=0.1, kernel=RBF()) 
    ]
}

# mlp_param_grid = {
#     'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
#     'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
#     'model': [MLPRegressor()],
#     'model__hidden_layer_sizes': [(150, 150),(250, 250)],
#     'model__activation': ['tanh'],
#     'model__solver': ['sgd']
# }

# svr_param_grid = {
#     'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
#     'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
#     'model': [SVR(kernel='rbf')],
# }

# huber_param_grid = {
#     'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
#     'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
#     'model': [HuberRegressor()],
#     'model__epsilon': [1.35, 1.5, 1.75],
#     'model__alpha': [0.0001, 0.001, 0.01],
# }

# ransac_param_grid = {
#     'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
#     'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
#     'model': [RANSACRegressor()],
#     'model__min_samples': [0.1, 0.2, 0.3],
#     'model__max_trials': [100, 200, 300],
# }

# gpr_param_grid = {
#     'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
#     'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
#     'model': [GaussianProcessRegressor()],
#     'model__alpha': [0.1, 0.5, 1.0],
#     'model__kernel': [RBF(), DotProduct(), WhiteKernel()],
# }

# create transformer which converts wind direction and speed to imaginary number

In [9]:
# param_grids = {"Linear Regression": param_grid, "MLP Regressor": mlp_param_grid, "SVR": svr_param_grid, "Huber Regressor": huber_param_grid, "RANSAC Regressor": ransac_param_grid, "Gaussian Process Regressor": gpr_param_grid}

In [10]:
tscv = TimeSeriesSplit(n_splits=5)

In [11]:
GS = GridSearchCV(pipeline, param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
GS.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [12]:
pd.DataFrame(GS.cv_results_).sort_values(by='mean_test_score', ascending=False).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_col_transformer__Direction,param_col_transformer__Speed,param_model,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
55,0.628642,0.349181,0.006398,0.002059,CompassToCartesianTransformer(),StandardScaler(),"MLPRegressor(activation='tanh', hidden_layer_s...",{'col_transformer__Direction': CompassToCartes...,-48.254271,-55.591201,-56.712185,-35.842986,-36.419924,-46.564114,9.001949,1
60,0.011107,0.000805,0.0074,0.00049,CompassToCartesianTransformer(),PolynomialFeatures(),LinearRegression(),{'col_transformer__Direction': CompassToCartes...,-45.894601,-48.605776,-63.788977,-33.353922,-42.519145,-46.832484,9.917897,2
63,0.034409,0.002562,0.006604,0.000801,CompassToCartesianTransformer(),PolynomialFeatures(),HuberRegressor(),{'col_transformer__Direction': CompassToCartes...,-41.228956,-47.382744,-68.539847,-36.514543,-44.057961,-47.54481,11.086323,3
64,0.163976,0.01131,0.008401,0.00102,CompassToCartesianTransformer(),PolynomialFeatures(),RANSACRegressor(min_samples=0.1),{'col_transformer__Direction': CompassToCartes...,-48.812906,-48.381595,-78.164942,-40.81862,-34.314149,-50.098442,15.017241,4
38,0.024213,0.008732,0.014901,0.002762,WindDirectionMapper(),PolynomialFeatures(),SVR(),{'col_transformer__Direction': WindDirectionMa...,-102.605281,-53.737719,-55.571862,-38.049298,-32.717463,-56.536325,24.658546,5
56,0.019198,0.00656,0.013507,0.001794,CompassToCartesianTransformer(),StandardScaler(),SVR(),{'col_transformer__Direction': CompassToCartes...,-108.223203,-48.765419,-55.311042,-39.432097,-33.09535,-56.965422,26.737745,6
11,0.473537,0.393725,0.009209,0.00192,drop,StandardScaler(),"GaussianProcessRegressor(alpha=0.1, kernel=RBF...","{'col_transformer__Direction': 'drop', 'col_tr...",-86.899389,-67.399882,-52.287437,-47.372775,-35.205931,-57.833083,17.824332,7
12,0.0104,0.001356,0.004202,0.000399,drop,PolynomialFeatures(),LinearRegression(),"{'col_transformer__Direction': 'drop', 'col_tr...",-92.46032,-59.553436,-47.862719,-48.030832,-41.714888,-57.924439,18.205779,8
54,0.0106,0.00102,0.006603,0.000493,CompassToCartesianTransformer(),StandardScaler(),LinearRegression(),{'col_transformer__Direction': CompassToCartes...,-42.209545,-55.367073,-63.537988,-43.326286,-86.773858,-58.24295,16.308879,9
14,0.018302,0.006571,0.01201,0.0051,drop,PolynomialFeatures(),SVR(),"{'col_transformer__Direction': 'drop', 'col_tr...",-91.034091,-63.578149,-50.537012,-50.174732,-37.640141,-58.592825,18.177263,10


In [13]:
def run_pipelines(param_grids, X_train, y_train, X_test, y_test):
    results = []
    models = []
    for name, params in tqdm(param_grids.items()):
        grid_search = GridSearchCV(pipeline, params, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        results.append([name, fx.MSE(grid_search.predict(X_train), y_train), fx.MSE(grid_search.predict(X_test), y_test), grid_search.best_params_])
        models.append(grid_search.best_estimator_)
    return results, models

In [14]:
def gridsearch_df(param_grids, X_train, y_train, X_test, y_test):
    results = pd.DataFrame()
    for name, params in tqdm(param_grids.items()):
        grid_search = GridSearchCV(pipeline, params, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        results = pd.concat([results, pd.DataFrame(grid_search.cv_results_)])
    return results

In [15]:
# results = gridsearch_df(param_grids, X_train, y_train, X_test, y_test)

In [16]:
# results, models = run_pipelines(param_grids, X_train, y_train, X_test, y_test)

In [17]:
def choose_best_model(results, models):
    # choose best model and save it
    df = pd.DataFrame(results, columns=["Model", "Train MSE", "Test MSE", "Best Params"])
    df = df.sort_values(by="Test MSE")
    best_model = models[df.index[0]]
    name = df["Model"][df.index[0]]
    joblib.dump(best_model, f"{name}-{dt.date.today()}.pkl")
    return df

In [18]:
# choose_best_model(results, models)

In [19]:
# best_model = joblib.dump(GS.best_estimator_, f"{dt.date.today()}.pkl")
best_model = GS.best_estimator_

In [20]:
fx.MSE(best_model.predict(X_test), y_test)

34.06704700477288

In [21]:
forecast = fx.load_forecasts()

# load best model
best_model = joblib.load(r"C:\Users\janni\OneDrive - ITU\Documents\UNI\4Semester\LSDA\Assignments\A1\2023-02-18.pkl")

future = best_model.predict(forecast)
forecast["Power Generation Forecast"] = future
forecast = forecast.resample("3H").mean()
forecast.drop(columns=["Speed", "Source_time"], inplace=True)

test_prediction = best_model.predict(X_test)
test_data = fx.create_timestamps(test_prediction, X_test, y_test)


# combine testforecast and gen_forecast
final_df = pd.concat([test_data, forecast], axis=0)

final_df.columns = ["Model", "Actual", "Forecast"]
final_df

Unnamed: 0,Model,Actual,Forecast
2023-02-04 18:00:00+00:00,26.178810,22.127652,
2023-02-04 21:00:00+00:00,26.178810,22.833597,
2023-02-05 00:00:00+00:00,24.915292,20.463497,
2023-02-05 03:00:00+00:00,24.085332,20.181322,
2023-02-05 06:00:00+00:00,21.053886,16.238313,
...,...,...,...
2023-02-23 09:00:00+00:00,,,18.193600
2023-02-23 12:00:00+00:00,,,20.415093
2023-02-23 15:00:00+00:00,,,24.837403
2023-02-23 18:00:00+00:00,,,25.121895


In [22]:
best_model.fit(X_train, y_train)

In [23]:
fx.MSE(best_model.predict(X_test), y_test)

27.356064404601025

In [24]:
# plot train data and model using px.line
fig = px.line(x=X_train.index, y=y_train, title="Train Data")
fig.add_scatter(x=X_train.index, y=best_model.predict(X_train), mode="lines", name="Model")
fig.show()


In [25]:
# plot final_df
fig = px.line(final_df, x=final_df.index, y=["Model", "Actual", "Forecast"], title="Power Generation Forecast")
fig.update_xaxes(title_text="Time")
fig.update_yaxes(title_text="Power Generation")
fig.update_layout(legend_title_text="Legend")
fig.show()