In [None]:
from pathlib import Path
from dotenv import load_dotenv
import warnings
import pandas as pd
import hopsworks
import datetime
import os

import openmeteo_requests

import requests_cache
from retry_requests import retry

warnings.filterwarnings("ignore")


In [None]:
# Load credentials from the project-level .env file. With override=True the
# values from the file replace variables already set in the environment.
load_dotenv("../.env", override=True)
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")
# Treat an empty value the same as a missing one so the failure is reported
# here rather than as an opaque login error further down.
if not HOPSWORKS_API_KEY:
    raise RuntimeError(
        "HOPSWORKS_API_KEY not found."
    )
# Confirm the key was loaded without echoing any part of it: notebook outputs
# are saved with the file and easily end up in version control.
print("HOPSWORKS_API_KEY loaded.")

# Connect to the Hopsworks project and grab a handle to its feature store.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY, project="project_scalable")
fs = project.get_feature_store()

In [None]:
# Display the name of the project returned by the login above, as a quick
# confirmation of which project this notebook is connected to.
project.name

In [None]:
def clean_hist_data(df_raw, target_areas=("SE3",)):
    """Extract solar production per area from a raw production export.

    The frame is expected to come from ``pd.read_csv(..., header=None)`` so
    that columns are labelled ``0..n-1``. Column 0 holds timestamps formatted
    as ``%d.%m.%Y %H:%M``. Rows 0, 1 and 2 describe every column (energy
    source, production type and area); rows 3 and 4 are not used and the
    measurements start at row 5.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw export, read without a header row.
    target_areas : str or iterable of str, optional
        Area codes to extract. Defaults to ``("SE3",)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``datetime`` (parsed timestamps), ``MWh`` (float, summed over
        all matching solar production columns) and ``area``, sorted by
        ``datetime`` then ``area`` with a fresh integer index. If none of the
        requested areas has a solar production column, an empty frame with
        the same columns is returned.

    Raises
    ------
    ValueError
        If a timestamp does not match the expected format or a production
        cell cannot be interpreted as a number.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(target_areas, str):
        target_areas = [target_areas]

    # Metadata rows describing every column of the export.
    energy_sources = df_raw.iloc[0]
    prod_type = df_raw.iloc[1]
    area_row = df_raw.iloc[2]

    # Keep only the measurement rows and parse the timestamp column.
    df = df_raw.iloc[5:].reset_index(drop=True).rename(columns={0: "datetime"})
    df["datetime"] = pd.to_datetime(df["datetime"], format="%d.%m.%Y %H:%M")

    dfs = []
    for area in target_areas:
        # Column 0 is the timestamp, so candidate data columns start at 1.
        solar_cols = [
            i
            for i in range(1, df.shape[1])
            if energy_sources[i] == "Solkraft"
            and prod_type[i] == "produktion"
            and area_row[i] == area
        ]
        if not solar_cols:
            print(f"No solar data found for {area}")
            continue

        # Cells are text because the metadata rows share the columns with the
        # measurements. Convert before summing: adding strings would
        # concatenate them ("1.5" + "2.5" -> "1.52.5") instead of adding.
        production = df[solar_cols].apply(pd.to_numeric).sum(axis=1)
        dfs.append(
            pd.DataFrame({"datetime": df["datetime"], "MWh": production, "area": area})
        )

    if not dfs:
        # pd.concat raises on an empty list; hand back an empty, correctly
        # typed frame so callers can concatenate results without special cases.
        return pd.DataFrame(
            {
                "datetime": pd.Series(dtype="datetime64[ns]"),
                "MWh": pd.Series(dtype="float64"),
                "area": pd.Series(dtype="object"),
            }
        )

    return (
        pd.concat(dfs)
        .sort_values(["datetime", "area"])
        .reset_index(drop=True)
    )


In [None]:
# Directory with the raw CSV exports, relative to the notebook's working directory.
data_dir = Path("../data")
# Show the resolved absolute path and the CSV files found there, to confirm
# the notebook is pointed at the intended folder.
print(data_dir.resolve())     
print(list(data_dir.glob("*.csv"))) 
# Sort the paths so the files are always processed in the same order.
csv_files = sorted(data_dir.glob("*.csv"))
csv_files

In [None]:
# Clean every CSV export and collect the per-file results.
all_years = []
for csv_file in csv_files:
    print(f"Processing {csv_file.name}")
    # The files carry several metadata rows, so read them without a header.
    df_raw = pd.read_csv(csv_file, header=None)
    df_clean = clean_hist_data(df_raw)
    all_years.append(df_clean)

# Stack all files into one chronologically ordered frame with a fresh index.
solar_hist = pd.concat(all_years).sort_values("datetime", ignore_index=True)

solar_hist.head()

In [None]:
# Sanity checks on the combined history: schema, missing values and size.
solar_hist.info()
# Print explicitly: only the last expression of a cell is displayed, so a bare
# `solar_hist.isna().sum()` in the middle of the cell would show nothing.
print(solar_hist.isna().sum())
print(solar_hist.shape)
print(solar_hist.dtypes)
print(solar_hist.head())


In [None]:
# Weather data
# Taken from Open-Meteo

def hist_weather_data(latitude, longitude, start_date, end_date):
    """Download hourly archive weather for one location from Open-Meteo.

    Requests ``temperature_2m``, ``shortwave_radiation``, ``cloud_cover``,
    ``wind_speed_10m`` and ``precipitation`` from the archive endpoint for the
    given coordinates and date range, prints the location metadata reported
    by the API and the resulting frame, and returns the data.

    Parameters
    ----------
    latitude, longitude
        Coordinates passed to the API as-is.
    start_date, end_date
        Bounds of the requested period, passed to the API as-is.

    Returns
    -------
    pandas.DataFrame
        A timezone-aware (UTC) ``datetime`` column followed by one column per
        requested variable. Rows containing any missing value are dropped.
    """
    # Hourly variables are read back by position, so this single list drives
    # both the request and the column assignment further down.
    hourly_variables = [
        "temperature_2m",
        "shortwave_radiation",
        "cloud_cover",
        "wind_speed_10m",
        "precipitation",
    ]

    # Open-Meteo client on top of a cached session ('.cache', expire_after=-1)
    # that retries failed requests up to 5 times with backoff.
    session = retry(
        requests_cache.CachedSession('.cache', expire_after=-1),
        retries=5,
        backoff_factor=0.2,
    )
    openmeteo = openmeteo_requests.Client(session=session)

    responses = openmeteo.weather_api(
        "https://archive-api.open-meteo.com/v1/archive",
        params={
            "latitude": latitude,
            "longitude": longitude,
            "start_date": start_date,
            "end_date": end_date,
            "hourly": hourly_variables,
        },
    )

    # Only one location is requested, so only the first response is processed.
    response = responses[0]
    print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
    print(f"Elevation: {response.Elevation()} m asl")
    print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

    hourly = response.Hourly()

    # Rebuild the timestamp axis from the reported start, end and step; the
    # end bound is excluded.
    hourly_data = {
        "datetime": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left",
        )
    }
    for position, variable in enumerate(hourly_variables):
        hourly_data[variable] = hourly.Variables(position).ValuesAsNumpy()

    hourly_data_df = pd.DataFrame(data=hourly_data).dropna()

    print("\nHourly data\n", hourly_data_df)

    return hourly_data_df
    

In [None]:
# Coordinates used to query weather for each area; only SE3 is enabled.
AREAS = {
    #"SE1": {"lat": 65.58, "lon": 22.15},
    #"SE2": {"lat": 62.39, "lon": 17.30},
    "SE3": {"lat": 59.33, "lon": 18.06},
    #"SE4": {"lat": 55.60, "lon": 13.00},
}

# Download one weather frame per area and tag each with its area code.
weather_dfs = []
for area, coords in AREAS.items():
    df_weather = hist_weather_data(
        latitude=coords["lat"],
        longitude=coords["lon"],
        start_date="2023-01-01",
        end_date="2025-11-30",
    ).assign(area=area)
    weather_dfs.append(df_weather)

# Combine all areas into one chronologically ordered frame with a fresh index.
weather_all = pd.concat(weather_dfs).sort_values("datetime", ignore_index=True)

# Quick look at the schema and the per-column missing-value counts.
weather_all.info()
weather_all.isna().sum()


In [None]:
# Normalise the column dtypes in one pass, then discard rows that still
# contain missing values.
solar_hist = (
    solar_hist
    .assign(
        MWh=lambda frame: frame["MWh"].astype(float),
        datetime=lambda frame: pd.to_datetime(frame["datetime"]),
        area=lambda frame: frame["area"].astype(str),
    )
    .dropna()
)


In [None]:
# Get version 1 of the `solar_energy` feature group (created on first use, as
# the method name suggests) and write the cleaned solar history to it.
# NOTE(review): the primary key is `datetime` alone although the frame also
# carries an `area` column; if more than one area is ever loaded, rows would
# share a key — confirm whether `area` should be part of the key.
# NOTE(review): `solar_enery_fg` looks like a typo for `solar_energy_fg`; it is
# left unchanged here because later cells may reference the name.
solar_enery_fg = fs.get_or_create_feature_group(
    name='solar_energy',
    description='historical solar energy data',
    version=1,
    primary_key=['datetime'],
    event_time="datetime",
)
solar_enery_fg.insert(solar_hist)

In [None]:
# Get version 1 of the `weather_data` feature group (created on first use, as
# the method name suggests). This cell only obtains the handle; it does not
# insert any rows.
# NOTE(review): as with the solar group, the key is `datetime` only while the
# weather frame also has an `area` column — confirm this is intended.
weather_fg = fs.get_or_create_feature_group(
    name='weather_data',
    description='historical weather data',
    version=1,
    primary_key=['datetime'],
    event_time="datetime",
)
