In [None]:
import hopsworks
import sys
import os
from pathlib import Path
import pandas as pd

In [None]:
# Absolute path of the directory the notebook is executed from.
root_dir = str(Path().absolute())
print(f"Root dir: {root_dir}")

# Input CSVs are expected under <parent of root_dir>/data/feature_groups.
INPUT_DIR = Path(root_dir).parent.joinpath("data", "feature_groups")
TRAFFIC_CSV = INPUT_DIR.joinpath("traffic_features.csv")
WEATHER_CSV = INPUT_DIR.joinpath("weather_features.csv")
CALENDAR_CSV = INPUT_DIR.joinpath("calendar_features.csv")

# Make modules next to the notebook importable; skipped when the entry
# is already on sys.path so re-running the cell does not add duplicates.
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# API key is read from the environment; this is None when the variable is unset.
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")

In [None]:
print("\n--- Connecting to Hopsworks ---")

# Log in to the Hopsworks instance and open the target project.
project = hopsworks.login(
    api_key_value=HOPSWORKS_API_KEY,    # Hopsworks API key value
    host="eu-west.cloud.hopsworks.ai",  # DNS of your Hopsworks instance
    project="occupancy",                # Name of your Hopsworks project
)

# Get the project's default feature store
fs = project.get_feature_store()

In [None]:
def prepare_event_time(df: pd.DataFrame, date_col: str = "date", hour_col: str = "hour") -> pd.DataFrame:
    """
    Derive an `event_time` column and rewrite the date column as text.

    Args:
        df: Input frame. It is left untouched; a copy is returned.
        date_col: Name of the column holding values parseable by `pd.to_datetime`.
        hour_col: Name of an optional column of hour offsets. When the column
            exists in `df`, its values are added (as hours) to the parsed date.

    Returns:
        A copy of `df` where `event_time` is the parsed date (plus the hour
        offset when available) and `date_col` holds "%Y-%m-%d" strings.

    The date column is converted back to a string because Online Feature
    Groups do not support Timestamp types as Primary Keys.
    """
    parsed_dates = pd.to_datetime(df[date_col])

    # Hourly data gets an intra-day timestamp; daily data keeps midnight of the date.
    event_time = (
        parsed_dates + pd.to_timedelta(df[hour_col], unit="h")
        if hour_col in df.columns
        else parsed_dates
    )

    prepared = df.copy()
    prepared["event_time"] = event_time
    prepared[date_col] = parsed_dates.dt.strftime("%Y-%m-%d")
    return prepared

## Traffic data

In [None]:
# Load the traffic CSV and add the `event_time` column in one chained step.
traffic_df = pd.read_csv(TRAFFIC_CSV).pipe(prepare_event_time)

# Print column dtypes and non-null counts as a quick schema check.
traffic_df.info()

In [None]:
# Traffic feature group: one row per (route_id, date, hour), with the
# `event_time` column registered as the event-time feature.
traffic_fg = fs.get_or_create_feature_group(
    name="skane_traffic",
    version=1,
    primary_key=["route_id", "date", "hour"],
    event_time="event_time",
    description="Aggregated hourly occupancy data for Skånetrafiken routes",
)

In [None]:
# Write the prepared traffic rows into the `skane_traffic` feature group.
# NOTE(review): the message below is printed as soon as insert() returns —
# confirm whether that implies the ingestion has fully completed.
traffic_fg.insert(traffic_df)
print("Traffic data inserted.")

In [None]:
# Human-readable documentation for each traffic feature, applied in the order listed.
traffic_feature_docs = {
    "date": "Date of the traffic measurement",
    "hour": "Hour of the day (0-23)",
    "route_id": "Unique identifier for the bus route/line",
    "label_grouped": "Target variable: Categorical crowding level (EMPTY, MANY_SEATS_AVAILABLE, CROWDED)",
    "label_mode": "Raw most frequent occupancy label observed in this hour",
    "avg_occupancy_score": "Continuous score representing average occupancy (used to derive labels)",
    "n_obs": "Total number of bus observations recorded in this specific hour/route",
    "n_snapshots_used": "Count of valid data snapshots used to aggregate this record",
    "event_time": "Timestamp used for point-in-time correct joins",
}

for feature_name, feature_doc in traffic_feature_docs.items():
    traffic_fg.update_feature_description(feature_name, feature_doc)

## Calendar data

In [None]:
# Load the calendar CSV and add the `event_time` column in one chained step.
calendar_df = pd.read_csv(CALENDAR_CSV).pipe(prepare_event_time)

# Print column dtypes and non-null counts as a quick schema check.
calendar_df.info()

In [None]:
# Calendar feature group: one row per date, with the `event_time`
# column registered as the event-time feature.
calendar_fg = fs.get_or_create_feature_group(
    name="sweden_calendar",
    version=1,
    primary_key=["date"],
    event_time="event_time",
    description="Calendar and holiday data for Sweden",
)

In [None]:
# Write the prepared calendar rows into the `sweden_calendar` feature group.
# NOTE(review): the message below is printed as soon as insert() returns —
# confirm whether that implies the ingestion has fully completed.
calendar_fg.insert(calendar_df)
print("Calendar data inserted.")

In [None]:
# Human-readable documentation for each calendar feature, applied in the order listed.
calendar_feature_docs = {
    "date": "Date in YYYY-MM-DD format",
    "year": "Year (e.g., 2024)",
    "month": "Month of the year (1-12)",
    "day": "Day of the month (1-31)",
    "weekday": "Day of the week (Monday, Tuesday, etc.)",
    "is_weekend": "Boolean flag indicating if the day is Saturday or Sunday",
    "is_holiday_se": "Boolean flag indicating if the day is a Swedish public holiday",
    "is_workday_se": "Boolean flag indicating if the day is a standard working day in Sweden",
    "event_time": "Timestamp used for point-in-time correct joins",
}

for feature_name, feature_doc in calendar_feature_docs.items():
    calendar_fg.update_feature_description(feature_name, feature_doc)

## Weather data

In [None]:
# Load the weather CSV and add the `event_time` column in one chained step.
weather_df = pd.read_csv(WEATHER_CSV).pipe(prepare_event_time)

# Print column dtypes and non-null counts as a quick schema check.
weather_df.info()

In [None]:
# Weather feature group: one row per (date, hour), with the `event_time`
# column registered as the event-time feature.
weather_fg = fs.get_or_create_feature_group(
    name="skane_weather",
    version=1,
    primary_key=["date", "hour"],
    event_time="event_time",
    description="Hourly weather data for Skåne (OpenMeteo)",
)

In [None]:
# Write the prepared weather rows into the `skane_weather` feature group.
# NOTE(review): the message below is printed as soon as insert() returns —
# confirm whether that implies the ingestion has fully completed.
weather_fg.insert(weather_df)
print("Weather data inserted.")

In [None]:
# Human-readable documentation for each weather feature, applied in the order listed.
weather_feature_docs = {
    "temperature_2m": "Air temperature at 2 meters above ground (°C)",
    "precipitation": "Total precipitation (rain/snow) in mm",
    "windspeed_10m": "Wind speed at 10 meters above ground (m/s)",
    "cloudcover": "Cloud cover percentage (0-100%)",
    "date": "Date of the weather observation",
    "hour": "Hour of the day (0-23)",
    "event_time": "Timestamp used for point-in-time correct joins",
}

for feature_name, feature_doc in weather_feature_docs.items():
    weather_fg.update_feature_description(feature_name, feature_doc)