In [1]:

import requests
import pandas as pd
import hopsworks
import datetime
import matplotlib.pyplot as plt
import json
import re
import os
import sys
from dotenv import load_dotenv
from datetime import datetime, timedelta
import warnings
# Put the sibling ../functions directory on the import path so the
# project-local `util` module can be imported from this notebook.
functions_dir = os.path.abspath(os.path.join('..', 'functions'))
sys.path.append(functions_dir)
import util

# Blanket-suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [2]:
# Load variables from a local .env file into the process environment
# (presumably where the Hopsworks credentials live — TODO confirm).
load_dotenv()
# Log in to Hopsworks and keep a handle to the project.
proj = hopsworks.login()
# Feature store handle used by every feature-group cell below.
fs = proj.get_feature_store() 

2024-12-23 13:33:57,767 INFO: Initializing external client
2024-12-23 13:33:57,768 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-23 13:33:59,160 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1164449


In [3]:
# Read the electricity price data from the repository's data directory.
el_prices_df = pd.read_csv('../data/prices.csv')

The `el_prices_df` DataFrame loaded above has already been cleaned.

In [4]:
# Preview the first rows to check the columns that were loaded.
el_prices_df.head()

Unnamed: 0,date,price
0,2022-11-01,0.655705
1,2022-11-02,0.607735
2,2022-11-03,0.550615
3,2022-11-04,0.292413
4,2022-11-05,0.42204


In [5]:
# Parse the `date` column into datetimes, then print the frame summary
# to confirm the resulting dtypes and non-null counts.
el_prices_df = el_prices_df.assign(date=pd.to_datetime(el_prices_df["date"]))
el_prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781 entries, 0 to 780
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    781 non-null    datetime64[ns]
 1   price   781 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 12.3 KB


In [6]:
# Fetch the `el_prices` feature group (version 1), creating it if it does not
# exist yet. `date` is both the primary key and the event-time column.
# NOTE(review): "electricty" in the description is misspelled; it is kept
# unchanged here because it is metadata sent to the feature store.
el_prices_fg = fs.get_or_create_feature_group(
    name='el_prices',
    version=1,
    primary_key=['date'],
    event_time="date",
    description='Average electricty price of each day',
)

In [7]:
# Repeats the load_dotenv() call already made in the login cell.
load_dotenv()
# NOTE(review): the environment variable name is spelled "ELECTRICTY" (missing
# an "I") — confirm it matches the key in .env. The token is not used by any
# other visible cell.
ELECTRICITY_API_TOKEN = os.getenv("ELECTRICTY_MAP_API_KEY")

# Location passed to the weather helper (presumably central Stockholm — TODO confirm).
latitude = 59.329323
longitude = 18.068581
# Not referenced by any other visible cell.
today = datetime.now()
# Date range requested from the weather helper.
start_date = "2022-11-01"
end_date = "2024-12-20"

# Fetch historical weather for the range and location above via the
# project-local helper in `util`.
weather_data = util.get_historical_weather(start_date, end_date, latitude, longitude)

# Preview the first rows of the returned frame.
weather_data.head()

Coordinates 59.29701232910156°N 18.163265228271484°E
Elevation 24.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,sunshine_duration
0,2022-11-01,8.792083,1.2,13.202726,210.351074,0.0
1,2022-11-02,9.612916,0.9,13.276144,187.686081,19597.978516
2,2022-11-03,8.012917,0.0,14.77755,192.976761,25200.0
3,2022-11-04,8.415,0.0,21.578989,145.680145,22528.138672
4,2022-11-05,7.78375,5.6,19.376562,158.291138,0.0


In [8]:
# Print column dtypes and non-null counts for the weather frame.
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781 entries, 0 to 780
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         781 non-null    datetime64[ns]
 1   temperature_2m_mean          781 non-null    float32       
 2   precipitation_sum            781 non-null    float32       
 3   wind_speed_10m_max           781 non-null    float32       
 4   wind_direction_10m_dominant  781 non-null    float32       
 5   sunshine_duration            781 non-null    float32       
dtypes: datetime64[ns](1), float32(5)
memory usage: 21.5 KB


In [9]:
# Fetch the `weather` feature group (version 1), creating it if it does not
# exist yet. `date` is both the primary key and the event-time column.
weather_fg = fs.get_or_create_feature_group(
    name='weather',
    version=1,
    primary_key=['date'],
    event_time="date",
    description='Weather characteristics of each day',
)

In [10]:
# Short snake_case names for the verbose column headers that
# util.process_energy_data() returns.
power_column_names = {
    "Date": "date",
    "Hydro Water Reservoir - Actual Aggregated [MW]": "hydro_mw",
    "Nuclear - Actual Aggregated [MW]": "nuclear_mw",
    "Other - Actual Aggregated [MW]": "other_mw",
    "Wind Onshore - Actual Aggregated [MW]": "wind_mw",
}

# Load the power-generation frame and apply the renaming in one step.
power_data = util.process_energy_data().rename(columns=power_column_names)
power_data.head()

Unnamed: 0,date,hydro_mw,nuclear_mw,other_mw,wind_mw
0,"""01.11.2022",8805.291667,5699.208333,897.958333,2831.708333
1,"""01.12.2022",11297.375,5742.666667,1544.708333,602.208333
2,"""02.11.2022",7981.458333,5689.541667,900.416667,4226.875
3,"""02.12.2022",11041.541667,5753.958333,1778.0,1336.625
4,"""03.11.2022",7919.166667,5687.583333,936.125,4772.708333


In [11]:
# The raw date strings carry a stray double quote; strip the quote characters
# before parsing them as day.month.year.
unquoted_dates = power_data["date"].str.strip('"')
power_data["date"] = pd.to_datetime(unquoted_dates, format="%d.%m.%Y")

# The rows arrive ordered by the raw string (01.11, 01.12, 02.11, ...), so
# put them in chronological order.
power_data.sort_values(by="date", inplace=True)
power_data.head()

Unnamed: 0,date,hydro_mw,nuclear_mw,other_mw,wind_mw
0,2022-11-01,8805.291667,5699.208333,897.958333,2831.708333
2,2022-11-02,7981.458333,5689.541667,900.416667,4226.875
4,2022-11-03,7919.166667,5687.583333,936.125,4772.708333
6,2022-11-04,6796.166667,5695.333333,885.875,5641.333333
8,2022-11-05,6761.166667,5699.083333,773.583333,6008.166667


In [12]:
# Keep only the days up to `end_date`, the last day requested for the weather
# frame, so all three frames cover the same range.
# The previous positional trim (`iloc[:-2]`) removed two more rows every time
# the cell was re-run; filtering on the date is idempotent.
# NOTE(review): assumes the two rows previously dropped were the days after
# `end_date` (daily, gap-free data) — confirm against the raw export.
power_data = power_data[power_data["date"] <= pd.Timestamp(end_date)]
power_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 781 entries, 0 to 665
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        781 non-null    datetime64[ns]
 1   hydro_mw    781 non-null    float64       
 2   nuclear_mw  781 non-null    float64       
 3   other_mw    781 non-null    float64       
 4   wind_mw     781 non-null    float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 36.6 KB


In [13]:
# Fetch the `power` feature group (version 1), creating it if it does not
# exist yet. `date` is both the primary key and the event-time column.
power_fg = fs.get_or_create_feature_group(
    name='power',
    version=1,
    primary_key=['date'],
    event_time="date",
    description='Power generation by source',
)

In [14]:
# Upload each frame to its feature group, one after another, in the same
# order as before: prices, weather, power. Each call gets its own options
# dict with "wait_for_job" enabled.
ingestion_plan = [
    (el_prices_fg, el_prices_df),
    (weather_fg, weather_data),
    (power_fg, power_data),
]
for feature_group, frame in ingestion_plan:
    insert_result = feature_group.insert(frame, write_options={"wait_for_job": True})

# Display the return value of the last insert, as the original cell did.
insert_result

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164449/fs/1155152/fg/1393149


Uploading Dataframe: 100.00% |██████████| Rows 781/781 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: el_prices_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164449/jobs/named/el_prices_1_offline_fg_materialization/executions
2024-12-23 13:34:20,344 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-12-23 13:34:23,516 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-12-23 13:36:47,723 INFO: Waiting for execution to finish. Current state: SUCCEEDING. Final status: UNDEFINED
2024-12-23 13:36:50,888 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-12-23 13:36:51,043 INFO: Waiting for log aggregation to finish.
2024-12-23 13:37:09,783 INFO: Execution finished successfully.
2024-12-23 13:37:09,784 INFO: Materialisation job was not scheduled.
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164449/fs/1155152/fg/1393150


Uploading Dataframe: 100.00% |██████████| Rows 781/781 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164449/jobs/named/weather_1_offline_fg_materialization/executions
2024-12-23 13:37:25,410 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-12-23 13:37:28,579 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-12-23 13:39:29,857 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-12-23 13:39:30,010 INFO: Waiting for log aggregation to finish.
2024-12-23 13:39:55,270 INFO: Execution finished successfully.
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1164449/fs/1155152/fg/1393152


Uploading Dataframe: 100.00% |██████████| Rows 781/781 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: power_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1164449/jobs/named/power_1_offline_fg_materialization/executions
2024-12-23 13:40:11,349 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-12-23 13:40:14,548 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-12-23 13:42:06,194 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-12-23 13:42:06,358 INFO: Waiting for log aggregation to finish.
2024-12-23 13:42:28,237 INFO: Execution finished successfully.


(Job('power_1_offline_fg_materialization', 'SPARK'), None)