In [1]:
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
import pandas as pd
from tqdm import tqdm
import joblib
import datetime as dt
from sklearn.model_selection import TimeSeriesSplit
from skimpy import skim
import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import sys
sys.path.append('..')
import fx

In [5]:
data = fx.pull_data(days=90)

In [6]:
X_train, y_train, X_test, y_test = fx.data_splitting(data, output_val="Non-ANM")

In [1]:
pipeline = Pipeline(steps=[
    ("col_transformer", ColumnTransformer(transformers=[
        ("Speed", None, ["Speed"]),
        ("Direction", None, ["Direction"]),
        ], remainder="drop")),
    ("model", None)
])


NameError: name 'Pipeline' is not defined

In [8]:
X_test

Unnamed: 0_level_0,ANM,Non-ANM,Total,Direction,Lead_hours,Source_time,Speed
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-04 12:00:00+00:00,11.439659,11.481322,22.920981,S,1,1.675505e+09,11.17600
2023-02-04 15:00:00+00:00,11.556457,11.412967,22.969424,W,1,1.675516e+09,13.85824
2023-02-04 18:00:00+00:00,10.810869,11.316783,22.127652,WNW,1,1.675526e+09,12.96416
2023-02-04 21:00:00+00:00,11.411241,11.422356,22.833597,WNW,1,1.675537e+09,12.96416
2023-02-05 00:00:00+00:00,9.339153,11.124344,20.463497,WNW,1,1.675548e+09,12.07008
...,...,...,...,...,...,...,...
2023-02-18 21:00:00+00:00,2.097053,4.954194,7.051247,W,1,1.676747e+09,4.91744
2023-02-19 00:00:00+00:00,4.987375,10.023011,15.010387,W,1,1.676758e+09,8.94080
2023-02-19 03:00:00+00:00,5.683093,10.547728,16.230821,WSW,1,1.676768e+09,11.17600
2023-02-19 06:00:00+00:00,7.619722,10.629117,18.248839,SW,1,1.676779e+09,11.17600


In [9]:
param_grid = {
    'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
    'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
    'model': [
        LinearRegression(), 
        MLPRegressor(hidden_layer_sizes=(150, 150), activation='tanh', solver='sgd'), 
        SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1),
        HuberRegressor(epsilon=1.35, alpha=0.0001),
        RANSACRegressor(min_samples=0.1, max_trials=100),
        GaussianProcessRegressor(alpha=0.1, kernel=RBF()) 
    ]
}

# mlp_param_grid = {
#     'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
#     'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
#     'model': [MLPRegressor()],
#     'model__hidden_layer_sizes': [(150, 150),(250, 250)],
#     'model__activation': ['tanh'],
#     'model__solver': ['sgd']
# }

# svr_param_grid = {
#     'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
#     'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
#     'model': [SVR(kernel='rbf')],
# }

# huber_param_grid = {
#     'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
#     'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
#     'model': [HuberRegressor()],
#     'model__epsilon': [1.35, 1.5, 1.75],
#     'model__alpha': [0.0001, 0.001, 0.01],
# }

# ransac_param_grid = {
#     'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
#     'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
#     'model': [RANSACRegressor()],
#     'model__min_samples': [0.1, 0.2, 0.3],
#     'model__max_trials': [100, 200, 300],
# }

# gpr_param_grid = {
#     'col_transformer__Speed': [None, StandardScaler(), PolynomialFeatures(), fx.EmpiricalWaveletTransform(level=5)],
#     'col_transformer__Direction': ["drop", fx.WindDirectionMapper(), fx.CompassToCartesianTransformer()],
#     'model': [GaussianProcessRegressor()],
#     'model__alpha': [0.1, 0.5, 1.0],
#     'model__kernel': [RBF(), DotProduct(), WhiteKernel()],
# }

# create transformer which converts wind direction and speed to imaginary number

In [10]:
# param_grids = {"Linear Regression": param_grid, "MLP Regressor": mlp_param_grid, "SVR": svr_param_grid, "Huber Regressor": huber_param_grid, "RANSAC Regressor": ransac_param_grid, "Gaussian Process Regressor": gpr_param_grid}

In [11]:
tscv = TimeSeriesSplit(n_splits=5)

In [12]:
GS = GridSearchCV(pipeline, param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
GS.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [13]:
pd.DataFrame(GS.cv_results_).sort_values(by='mean_test_score', ascending=False).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_col_transformer__Direction,param_col_transformer__Speed,param_model,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
55,0.950269,0.55554,0.009399,0.00372,CompassToCartesianTransformer(),StandardScaler(),"MLPRegressor(activation='tanh', hidden_layer_s...",{'col_transformer__Direction': CompassToCartes...,-17.812618,-10.880214,-23.03607,-15.872964,-12.862455,-16.092864,4.214767,1
60,0.017198,0.006941,0.009402,0.004927,CompassToCartesianTransformer(),PolynomialFeatures(),LinearRegression(),{'col_transformer__Direction': CompassToCartes...,-23.054782,-11.517416,-25.134515,-15.253234,-13.943289,-17.780647,5.333612,2
63,0.0378,0.002639,0.007801,0.001327,CompassToCartesianTransformer(),PolynomialFeatures(),HuberRegressor(),{'col_transformer__Direction': CompassToCartes...,-29.334378,-11.559704,-25.984217,-16.866358,-14.910012,-19.730934,6.775629,3
12,0.01,0.001898,0.004999,0.000895,drop,PolynomialFeatures(),LinearRegression(),"{'col_transformer__Direction': 'drop', 'col_tr...",-35.698316,-15.686535,-17.656734,-19.538522,-13.90674,-20.497369,7.831193,4
11,0.475027,0.436317,0.0084,0.002245,drop,StandardScaler(),"GaussianProcessRegressor(alpha=0.1, kernel=RBF...","{'col_transformer__Direction': 'drop', 'col_tr...",-34.828515,-18.311913,-18.534014,-18.98753,-13.154804,-20.763355,7.346453,5
56,0.025997,0.011023,0.0128,0.00204,CompassToCartesianTransformer(),StandardScaler(),SVR(),{'col_transformer__Direction': CompassToCartes...,-38.007182,-12.856233,-24.637864,-17.017648,-12.792784,-21.062342,9.507863,6
17,0.255827,0.382948,0.011802,0.004534,drop,PolynomialFeatures(),"GaussianProcessRegressor(alpha=0.1, kernel=RBF...","{'col_transformer__Direction': 'drop', 'col_tr...",-35.483965,-19.246511,-18.848818,-19.32264,-12.935779,-21.167542,7.552382,7
66,0.023,0.007669,0.0112,0.001832,CompassToCartesianTransformer(),EmpiricalWaveletTransform(),LinearRegression(),{'col_transformer__Direction': CompassToCartes...,-29.437625,-16.732053,-24.353134,-17.765636,-17.952915,-21.248273,4.900817,8
7,0.778428,0.576586,0.006201,0.001167,drop,StandardScaler(),"MLPRegressor(activation='tanh', hidden_layer_s...","{'col_transformer__Direction': 'drop', 'col_tr...",-38.248946,-17.238489,-17.622277,-19.807867,-14.895027,-21.562521,8.487565,9
69,0.049802,0.020088,0.018802,0.016618,CompassToCartesianTransformer(),EmpiricalWaveletTransform(),HuberRegressor(),{'col_transformer__Direction': CompassToCartes...,-27.605741,-17.91983,-27.716696,-16.903052,-18.138343,-21.656732,4.920464,10


In [14]:
def run_pipelines(param_grids, X_train, y_train, X_test, y_test):
    results = []
    models = []
    for name, params in tqdm(param_grids.items()):
        grid_search = GridSearchCV(pipeline, params, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        results.append([name, fx.MSE(grid_search.predict(X_train), y_train), fx.MSE(grid_search.predict(X_test), y_test), grid_search.best_params_])
        models.append(grid_search.best_estimator_)
    return results, models

In [15]:
def gridsearch_df(param_grids, X_train, y_train, X_test, y_test):
    results = pd.DataFrame()
    for name, params in tqdm(param_grids.items()):
        grid_search = GridSearchCV(pipeline, params, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        results = pd.concat([results, pd.DataFrame(grid_search.cv_results_)])
    return results

In [16]:
# results = gridsearch_df(param_grids, X_train, y_train, X_test, y_test)

In [17]:
# results, models = run_pipelines(param_grids, X_train, y_train, X_test, y_test)

In [18]:
def choose_best_model(results, models):
    # choose best model and save it
    df = pd.DataFrame(results, columns=["Model", "Train MSE", "Test MSE", "Best Params"])
    df = df.sort_values(by="Test MSE")
    best_model = models[df.index[0]]
    name = df["Model"][df.index[0]]
    joblib.dump(best_model, f"{name}-{dt.date.today()}.pkl")
    return df

In [19]:
# choose_best_model(results, models)

In [20]:
# best_model = joblib.dump(GS.best_estimator_, f"{dt.date.today()}.pkl")
best_model = GS.best_estimator_

In [21]:
forecast = fx.load_forecasts()

# load best model
# best_model = joblib.load(r"C:\Users\janni\OneDrive - ITU\Documents\UNI\4Semester\LSDA\Assignments\A1\2023-02-19.pkl")

future = best_model.predict(forecast)
forecast["Power Generation Forecast"] = future
forecast = forecast.resample("3H").mean()
forecast.drop(columns=["Speed", "Source_time"], inplace=True)

test_prediction = best_model.predict(X_test)
test_data = fx.create_timestamps(test_prediction, X_test, y_test)


# combine testforecast and gen_forecast
final_df = pd.concat([test_data, forecast], axis=0)

final_df.columns = ["Model", "Actual", "Forecast"]
final_df

Unnamed: 0,Model,Actual,Forecast
2023-02-04 12:00:00+00:00,14.853033,11.481322,
2023-02-04 15:00:00+00:00,12.103725,11.412967,
2023-02-04 18:00:00+00:00,11.989028,11.316783,
2023-02-04 21:00:00+00:00,11.989028,11.422356,
2023-02-05 00:00:00+00:00,11.808380,11.124344,
...,...,...,...
2023-02-23 09:00:00+00:00,,,9.925750
2023-02-23 12:00:00+00:00,,,10.885034
2023-02-23 15:00:00+00:00,,,11.994987
2023-02-23 18:00:00+00:00,,,11.855169


In [22]:
best_model.fit(X_train, y_train)

In [23]:
fx.MSE(best_model.predict(X_test), y_test)

7.204722221715

In [24]:
# plot train data and model using px.line
fig = px.line(x=X_train.index, y=y_train, title="Train Data")
fig.add_scatter(x=X_train.index, y=best_model.predict(X_train), mode="lines", name="Model")
fig.show()


In [25]:
# plot final_df
fig = px.line(final_df, x=final_df.index, y=["Model", "Actual", "Forecast"], title="Power Generation Forecast")
fig.update_xaxes(title_text="Time")
fig.update_yaxes(title_text="Power Generation")
fig.update_layout(legend_title_text="Legend")
fig.show()