In [32]:
import requests
import pandas as pd
import hopsworks
import datetime
import matplotlib.pyplot as plt
import json
import re
import os
import sys
from dotenv import load_dotenv
from datetime import datetime, timedelta
import warnings
sys.path.append(os.path.abspath(os.path.join('..', 'functions')))
import util
sys.path.append(os.path.abspath(os.path.join('..', 'model')))
import nn
import torch
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split


warnings.filterwarnings("ignore")

In [33]:
load_dotenv()
proj = hopsworks.login()
fs = proj.get_feature_store("KTH_ID2223") 

2024-12-31 14:51:19,950 INFO: Closing external client and cleaning up certificates.
Connection closed.
2024-12-31 14:51:20,038 INFO: Initializing external client
2024-12-31 14:51:20,038 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-31 14:51:21,344 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1164449


In [42]:
el_prices_fg = fs.get_feature_group(
    name='el_prices',
    version=1,
)
weather_fg = fs.get_feature_group(
    name='weather',
    version=1,
)
power_fg = fs.get_feature_group(
    name='power',
    version=1,
)

In [43]:
el_prices_fg.show(10)
# weather_fg.show(10)
# power_fg.show(10)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.39s) 


Unnamed: 0,date,price
0,2023-01-23 00:00:00+00:00,1.662196
1,2023-12-20 00:00:00+00:00,0.535002
2,2023-05-27 00:00:00+00:00,0.064113
3,2023-03-15 00:00:00+00:00,0.975358
4,2022-11-01 00:00:00+00:00,0.655705
5,2023-07-27 00:00:00+00:00,0.381928
6,2023-01-26 00:00:00+00:00,1.101363
7,2024-06-02 00:00:00+00:00,0.097737
8,2023-03-22 00:00:00+00:00,0.488279
9,2024-09-04 00:00:00+00:00,0.209347


In [1034]:
weather_df = weather_fg.read()
power_df = power_fg.read()
el_prices_df = el_prices_fg.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.43s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.69s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.63s) 


In [1102]:
merged_df = (
    weather_df
    .merge(power_df, on="date", how="inner")
    .merge(el_prices_df, on="date", how="inner")
)

merged_df['date'] = pd.to_datetime(merged_df['date'])
merged_df = merged_df.sort_values(by="date").reset_index(drop=True)

dates = merged_df['date']
features = merged_df.drop(columns=['date', 'price']).iloc[:-1]
target = merged_df['price'].iloc[1:]


In [188]:
# X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=100)

In [1103]:
from sklearn.preprocessing import MinMaxScaler

normalizer = MinMaxScaler()
normalized_continuous_features = normalizer.fit_transform(features)
normalized_features_df = pd.DataFrame(normalized_continuous_features, columns=features.columns)
normalized_features_df

Unnamed: 0,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,sunshine_duration,hydro_mw,nuclear_mw,other_mw,wind_mw
0,0.653366,0.036923,0.261506,0.584641,0.000000,0.675887,0.706623,0.369720,0.208402
1,0.676117,0.027692,0.264039,0.521290,0.329079,0.588815,0.704314,0.371038,0.331940
2,0.631770,0.000000,0.315841,0.536078,0.423145,0.582232,0.703846,0.390193,0.380272
3,0.642915,0.000000,0.550507,0.403879,0.378280,0.463541,0.705697,0.363238,0.457186
4,0.625419,0.172308,0.474518,0.439128,0.000000,0.459842,0.706593,0.303004,0.489668
...,...,...,...,...,...,...,...,...,...
775,0.393059,0.049231,0.754938,0.773998,0.000000,0.601049,0.736722,0.207456,0.649949
776,0.506352,0.046154,0.947839,0.771849,0.226924,0.481619,0.735498,0.219950,0.893869
777,0.437348,0.000000,0.622481,0.793877,0.225706,0.591902,0.735926,0.340284,0.672421
778,0.443700,0.160000,0.571780,0.519946,0.000000,0.664265,0.735169,0.275513,0.616507


In [1030]:
target

1      0.607735
2      0.550615
3      0.292413
4      0.422040
5      0.351532
         ...   
776    0.270241
777    0.735706
778    0.384965
779    0.230300
780    0.324242
Name: price, Length: 780, dtype: float64

Testing Lagged Prices

In [1093]:
normalized_features_df['price_lag_1'] = target.shift(1).values
normalized_features_df['price_lag_2'] = target.shift(2).values
normalized_features_df['price_lag_3'] = target.shift(3).values

In [1123]:
X = torch.from_numpy(normalized_features_df.values)
y = torch.from_numpy(target.values)

train_num = int(0.85 * X.shape[0])
X_train = X[:train_num]
X_test = X[train_num:]
y_train = y[:train_num]
y_test = y[train_num:]

# val_num = int(0.2 * train_num)  
# X_val = X_train[-val_num:]
# y_val = y_train[-val_num:]
# X_train = X_train[:-val_num]
# y_train = y_train[:-val_num]


In [1025]:
X_train.size()


torch.Size([624, 13])

In [1095]:
input_features = [
        "temperature_2m_mean", "precipitation_sum", "wind_speed_10m_max",
        "wind_direction_10m_dominant", "sunshine_duration",
        "Hydro Water Reservoir", "Nuclear", "Other", "Wind Onshore", 
]
input_size = len(input_features)
# model = nn.EnergyPricePredictorLSTM(input_size=input_size)

In [1105]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
class SequenceDataset(Dataset):
    def __init__(self, X, y, seq_length):
        self.X = X
        self.y = y
        self.seq_length = seq_length

    def __len__(self):
        return len(self.X) // self.seq_length

    def __getitem__(self, idx):
        x_seq = self.X[idx:idx + self.seq_length]
        y_hist = self.y[idx:idx + self.seq_length].reshape(-1, 1) 
        x_combined = torch.tensor(np.hstack((x_seq, y_hist)), dtype=torch.float32)
        y_target = torch.tensor(self.y[idx + self.seq_length], dtype=torch.float32)
        return x_combined, y_target


        # y_seq = self.y[idx:idx + self.seq_length].unsqueeze(-1)
        # y_target = self.y[idx + self.seq_length]
        # return torch.tensor(y_seq, dtype=torch.float32), torch.tensor(y_target, dtype=torch.float32)



In [1124]:
seq_length = 3
input_size = 10
batch_size = 16
train_dataset = SequenceDataset(X_train, y_train, seq_length)
test_dataset = SequenceDataset(X_test, y_test, seq_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# val_dataset = SequenceDataset(X_val, y_val, seq_length)
# val_loader = DataLoader(val_dataset, batch_size=batch_size)


In [1125]:
for i, (X, y) in enumerate(train_loader):
    print(X.size())
    print(y.size())
    break

for i, (X, y) in enumerate(test_loader):
    print(X.size())
    print(y.size())
    break

torch.Size([16, 3, 10])
torch.Size([16])
torch.Size([16, 3, 10])
torch.Size([16])


In [1126]:
hidden_size = 64        
num_layers = 2     
output_size = 1

In [1127]:
import torch.optim as optim
import torch.nn as nn

class PriceLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(PriceLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p=0.4)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)  
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)  
        out, _ = self.lstm(x, (h0, c0))  
        out = self.dropout(out)
        out = self.fc(out[:, -1, :]) 
        return out

num_epochs = 1000
model = PriceLSTM(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
train_loss_list = []
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred.squeeze(), y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    

    train_loss_list.append(train_loss/len(train_loader))
    if epoch % 25 == 0:
        print(f"Epoch {epoch+1}, Training Loss: {train_loss/len(train_loader):.4f}")

    # if epoch % 20 == 0:
    #     model.eval()

    #     val_loss = 0
    #     with torch.no_grad():
    #         for X_batch, y_batch in val_loader:
    #             y_pred = model(X_batch)
    #             val_loss += loss.item()
        
    #     print(f"Epoch {epoch+1}, Training Loss: {train_loss/len(train_loader):.4f}, Validation Loss: {val_loss/len(val_loader)}")

from sklearn.metrics import mean_squared_error, r2_score

model.eval()
test_loss = 0
y_preds = []
y_true = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        y_pred = model(X_batch)
        y_preds += y_pred.squeeze().tolist()
        y_true += y_batch.tolist()

mse = mean_squared_error(y_true, y_preds)
print("MSE:", mse)

r2 = r2_score(y_true, y_preds)
print("R squared:", r2)
# 0.15 0.12 0.24 / 3x64 
#      0.13          / 3x128
#      0.16          / 3x32
#      0.10   0.19    0.35 0.274  / 2x64
# 0.102 

Epoch 1, Training Loss: 2.3059
Epoch 26, Training Loss: 0.8655
Epoch 51, Training Loss: 0.3890
Epoch 76, Training Loss: 0.3131
Epoch 101, Training Loss: 0.3076
Epoch 126, Training Loss: 0.2963
Epoch 151, Training Loss: 0.2800
Epoch 176, Training Loss: 0.2913
Epoch 201, Training Loss: 0.2568
Epoch 226, Training Loss: 0.2615
Epoch 251, Training Loss: 0.2411
Epoch 276, Training Loss: 0.2493
Epoch 301, Training Loss: 0.2004
Epoch 326, Training Loss: 0.1991
Epoch 351, Training Loss: 0.1987
Epoch 376, Training Loss: 0.1653
Epoch 401, Training Loss: 0.1773
Epoch 426, Training Loss: 0.1710
Epoch 451, Training Loss: 0.1548
Epoch 476, Training Loss: 0.1626
Epoch 501, Training Loss: 0.1601
Epoch 526, Training Loss: 0.1582
Epoch 551, Training Loss: 0.1663
Epoch 576, Training Loss: 0.1680
Epoch 601, Training Loss: 0.1613
Epoch 626, Training Loss: 0.1471
Epoch 651, Training Loss: 0.1563
Epoch 676, Training Loss: 0.1694
Epoch 701, Training Loss: 0.1634
Epoch 726, Training Loss: 0.1377
Epoch 751, Trai

In [1117]:
target_variance = np.var(y_true)
print("Target Variance:", target_variance)


Target Variance: 0.012120836186714877


In [1118]:
target_mean = np.mean(y_true)
print("Target Mean:", target_mean)


Target Mean: 0.11867289241546622


In [1129]:
print(y_preds)
print(y_true)

[0.32315653562545776, 0.25238165259361267, 0.25038230419158936, 0.22440654039382935, 0.28922802209854126, 0.36193370819091797, 0.19421449303627014, 0.2526513338088989, 0.21887004375457764, 0.14466416835784912, 0.12356337904930115, 0.10380619019269943, 0.1399623602628708, 0.2858304977416992, 0.5071783661842346, 0.3986421823501587, 0.47549405694007874, 0.20019099116325378, 0.2068873792886734, 0.1932094246149063, 0.19209063053131104, 0.1959439516067505, 0.08877623826265335, 0.09596601128578186, 0.11146038770675659, 0.17437508702278137, 0.15514889359474182, 0.20192950963974, 0.17841124534606934, 0.3363396227359772, 0.13258112967014313, 0.16613391041755676, 0.19559583067893982, 0.29330918192863464, 0.22525247931480408, 0.4076455235481262, 0.4698227643966675, 0.4820707142353058, 0.2088499367237091]
[0.04659541696310043, 0.024965833872556686, 0.014926666393876076, 0.10545500367879868, 0.46143290400505066, 0.28074583411216736, 0.20934750139713287, 0.2059720903635025, 0.11346250027418137, 0.076

In [1096]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.01)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

r2 = r2_score(y_test, y_pred)
print("R squared:", r2)

eval_dict = { 
    "MSE": str(mse),
    "R squared": str(r2),
}

MSE: 0.07517904554145154
R squared: 0.5644445593877336


In [1097]:
print(y_pred.tolist())
print(y_test.tolist())

[0.1578804850578308, 0.16040122509002686, 0.15100912749767303, 0.22404193878173828, 0.3151703476905823, 0.2218434363603592, 0.26445090770721436, 0.24221839010715485, 0.3177732527256012, 0.24816302955150604, 0.23305310308933258, 0.21571038663387299, 0.29243433475494385, 0.38481080532073975, 0.21342813968658447, 0.28036263585090637, 0.2965145707130432, 0.36680641770362854, 0.11406612396240234, 0.3845432996749878, 0.2515608072280884, 0.2049407809972763, 0.1070210263133049, 0.15198741853237152, 0.16504955291748047, 0.25059863924980164, 0.29119640588760376, 0.2879432439804077, 0.23039545118808746, 0.12243174016475677, 0.11377958953380585, 0.1857910454273224, 0.12101051211357117, 0.16305316984653473, 0.1647341102361679, 0.0660918802022934, 0.1433643400669098, 0.04634382575750351, 0.17173747718334198, 0.07695344090461731, 0.18522004783153534, 0.17565558850765228, 0.18228211998939514, 0.13385161757469177, 0.061195164918899536, 0.22207963466644287, 0.11822661757469177, 0.32753443717956543, 0.19

Upload model to Hopsworks

In [1098]:
model_dir = "price_prediction_model"
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

In [1099]:
xgb_model.save_model(model_dir + "/model.json")

In [1100]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

input_schema = Schema(normalized_features_df)
output_schema = Schema(target)

model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

mr = proj.get_model_registry()

price_prediction_model = mr.python.create_model(
    name="price_prediction_model", 
    metrics=eval_dict,
    model_schema=model_schema,
    input_example=target.sample().values, 
    description="Electricity Price Predicition Model",
)

price_prediction_model.save(model_dir)

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/3921325 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/20 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/1162 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1164449/models/price_prediction_model/3


Model(name: 'price_prediction_model', version: 3)