In [1]:
import hopsworks
import pandas as pd
from zoneinfo import ZoneInfo


In [2]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the environment
# before reading the settings below.
load_dotenv()

# The project name falls back to the default given here; the API key has no
# fallback, so os.getenv returns None when HOPSWORKS_API_KEY is not set.
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")
PROJECT_NAME = os.getenv("HOPSWORKS_PROJECT", "Flight_Predictor_JFK")

# Log in once; `project` is the handle the rest of the notebook works from.
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY, project=PROJECT_NAME)

print(f"Connected to Hopsworks project: {project.name}")

2026-01-06 04:23:57,305 INFO: Initializing external client
2026-01-06 04:23:57,306 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-06 04:23:59,831 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1338517
Connected to Hopsworks project: Flight_Predictor_JFK


In [4]:
# Feature store handle for the logged-in project; the cells below use it to
# look up and create feature groups and feature views.
fs = project.get_feature_store()

In [None]:


# Fetch version 1 of the BTS JFK flights feature group and load its contents
# into `df_flights`, the working frame for the cells that follow.
bts_fg = fs.get_feature_group("bts_jfk_selected_features_fg1", version=1)
df_flights = bts_fg.read()

# Sanity check: (rows, columns) of what was read.
print(df_flights.shape)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (12.07s) 
(107589, 11)


In [9]:
# Normalise the scheduled departure column to timezone-aware UTC timestamps.
# Values that cannot be parsed become NaT (errors="coerce") and those rows are
# then discarded, so every remaining row has a usable departure time.
df_flights = (
    df_flights
    .assign(
        sched_dep_local=lambda frame: pd.to_datetime(
            frame["sched_dep_local"],
            utc=True,
            errors="coerce",
        )
    )
    .dropna(subset=["sched_dep_local"])
)

In [10]:
# Truncate each departure timestamp to the start of its hour. The column is
# parsed with utc=True earlier in the notebook, so the result is a UTC hour.
# Lowercase "h" is the current pandas alias for hourly frequency; the uppercase
# "H" spelling is deprecated since pandas 2.2 and emits a FutureWarning. The
# later sched_hour_local computation in this notebook already uses "h".
df_flights["sched_hour_utc"] = df_flights["sched_dep_local"].dt.floor("h")

In [11]:
# Preview the departure timestamp next to its hourly bucket for the first rows.
df_flights.loc[:, ["sched_dep_local", "sched_hour_utc"]].head(10)

Unnamed: 0,sched_dep_local,sched_hour_utc
0,2024-10-06 20:29:00+00:00,2024-10-06 20:00:00+00:00
1,2024-10-15 16:00:00+00:00,2024-10-15 16:00:00+00:00
2,2024-10-03 14:30:00+00:00,2024-10-03 14:00:00+00:00
3,2024-10-03 10:30:00+00:00,2024-10-03 10:00:00+00:00
4,2024-10-14 18:30:00+00:00,2024-10-14 18:00:00+00:00
5,2024-10-06 23:00:00+00:00,2024-10-06 23:00:00+00:00
6,2024-10-27 12:35:00+00:00,2024-10-27 12:00:00+00:00
7,2024-10-31 12:35:00+00:00,2024-10-31 12:00:00+00:00
8,2024-10-25 11:40:00+00:00,2024-10-25 11:00:00+00:00
9,2024-10-04 02:45:00+00:00,2024-10-04 02:00:00+00:00


In [12]:
# Register version 2 of the flights feature group: same name as v1, keyed by
# flight_id, with the parsed departure timestamp as event time. The frame being
# inserted also carries the sched_hour_utc column computed above.
bts_fg_v2 = fs.get_or_create_feature_group(
    name="bts_jfk_selected_features_fg1",
    version=2,
    primary_key=["flight_id"],
    event_time="sched_dep_local",
    description="BTS JFK flights with hourly-rounded departure time (UTC)"
)

# Wait for the offline materialization job before returning. A later cell reads
# version 2 back from the feature store; without waiting, a "Restart & Run All"
# would issue that read while the job is still running.
bts_fg_v2.insert(df_flights, write_options={"wait_for_job": True})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1338517/fs/1329219/fg/1893823


Uploading Dataframe: 100.00% |██████████| Rows 107589/107589 | Elapsed Time: 00:06 | Remaining Time: 00:00


Launching job: bts_jfk_selected_features_fg1_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1338517/jobs/named/bts_jfk_selected_features_fg1_2_offline_fg_materialization/executions


(Job('bts_jfk_selected_features_fg1_2_offline_fg_materialization', 'SPARK'),
 None)

In [13]:
# Row count plus the earliest and latest hourly bucket present in the data.
sched_hours = df_flights["sched_hour_utc"]
print("Rows:", len(df_flights))
print("Min hour:", sched_hours.min())
print("Max hour:", sched_hours.max())

Rows: 107589
Min hour: 2024-10-01 10:00:00+00:00
Max hour: 2025-10-01 02:00:00+00:00


In [None]:
# Build version 3 of the flights feature group: version 2 plus an hourly key
# (sched_hour_local) intended for joining against hourly weather.
fs = project.get_feature_store()

TZ_NY = ZoneInfo("America/New_York")

flights_fg_v2 = fs.get_feature_group("bts_jfk_selected_features_fg1", version=2)

# Read through Feature Query Service (avoid Hive path issues)
df_flights = flights_fg_v2.select_all().read()

# Re-parse after the round trip through the feature store and drop any row
# whose departure time could not be parsed (coerced to NaT).
df_flights["sched_dep_local"] = pd.to_datetime(df_flights["sched_dep_local"], utc=True, errors="coerce")
df_flights = df_flights.dropna(subset=["sched_dep_local"])

# Convert UTC -> NY local time, then floor to hour
# NOTE(review): New York's UTC offset is a whole number of hours, so this marks
# the same instant as flooring in UTC, and the saved feature-view output shows
# this column rendered with a +00:00 offset. Confirm the weather key follows
# the same convention before relying on the join.
df_flights["sched_hour_local"] = df_flights["sched_dep_local"].dt.tz_convert(TZ_NY).dt.floor("h")

# Create v3 feature group with the added join key
flights_fg_v3 = fs.get_or_create_feature_group(
    name="bts_jfk_selected_features_fg1",
    version=3,
    primary_key=["flight_id"],
    event_time="sched_dep_local",
    description="BTS JFK flights with sched_hour_utc and sched_hour_local (NY) for weather joins"
)

# Wait for the offline materialization job: a later cell reads version 3 back,
# and under "Restart & Run All" that read would otherwise race the job.
flights_fg_v3.insert(df_flights, write_options={"wait_for_job": True})

print("Created flights FG v3 with sched_hour_local")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.74s) 
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1338517/fs/1329219/fg/1908094


Uploading Dataframe: 100.00% |██████████| Rows 107589/107589 | Elapsed Time: 00:09 | Remaining Time: 00:00


Launching job: bts_jfk_selected_features_fg1_3_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1338517/jobs/named/bts_jfk_selected_features_fg1_3_offline_fg_materialization/executions
✅ Created flights FG v3 with sched_hour_local


In [14]:
# Copy version 3 of the flights feature group into version 4, whose only
# difference is the primary key: sched_hour_local is added next to flight_id.
flights_fg_v3 = fs.get_feature_group("bts_jfk_selected_features_fg1", version=3)
df_flights = flights_fg_v3.select_all().read()

# Create v4 with composite PK so joins are supported
flights_fg_v4 = fs.get_or_create_feature_group(
    name="bts_jfk_selected_features_fg1",
    version=4,
    primary_key=["flight_id", "sched_hour_local"],
    event_time="sched_dep_local",
    description="Flights FG with sched_hour_local included in primary key for joining with hourly weather"
)

# Wait for the offline materialization job: the feature view built later in the
# notebook reads version 4, and under "Restart & Run All" that read would
# otherwise start while the job is still running.
flights_fg_v4.insert(df_flights, write_options={"wait_for_job": True})

print("✅ Flights FG v4 inserted with PK = [flight_id, sched_hour_local]")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (25.49s) 
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1338517/fs/1329219/fg/1911144


Uploading Dataframe: 100.00% |██████████| Rows 107589/107589 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: bts_jfk_selected_features_fg1_4_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1338517/jobs/named/bts_jfk_selected_features_fg1_4_offline_fg_materialization/executions
✅ Flights FG v4 inserted with PK = [flight_id, sched_hour_local]


In [10]:
# Build version 2 of the hourly weather feature group, re-keyed on a column
# named sched_hour_local so it shares a key name with the flights feature group.
weather_fg_v1 = fs.get_feature_group("weather_jfk_hourly_fg", version=1)

# Read using Feature Query Service (avoid Hive)
df_weather = weather_fg_v1.select_all().read()

# Create a join key with the SAME name as flights FG
# NOTE(review): the flights cells parse their timestamps with utc=True while
# this call does not; confirm both keys end up in the same timezone convention.
df_weather["sched_hour_local"] = pd.to_datetime(df_weather["weather_hour_local"], errors="coerce")

# errors="coerce" turns unparseable values into NaT. This column becomes both
# the primary key and the event time below, so rows without a valid key are
# dropped here, mirroring how the flights cells treat sched_dep_local.
df_weather = df_weather.dropna(subset=["sched_hour_local"])

# Keep only what we need (optional)
df_weather_v2 = df_weather[[
    "sched_hour_local",
    "station",
    "latitude",
    "longitude",
    "weather_code",
    "wind_speed_ms",
    "wind_gust_ms",
    "temp_c",
    "precip_mm",
    "snowfall_cm",
    "visibility_m",
]].copy()

# Create v2 where sched_hour_local is the PRIMARY KEY
weather_fg_v2 = fs.get_or_create_feature_group(
    name="weather_jfk_hourly_fg",
    version=2,
    primary_key=["sched_hour_local"],
    event_time="sched_hour_local",
    description="Hourly JFK weather with sched_hour_local as PK for joining with flights"
)

# Wait for the offline materialization job: the feature view built later in the
# notebook reads this version, and under "Restart & Run All" that read would
# otherwise start while the job is still running.
weather_fg_v2.insert(df_weather_v2, write_options={"wait_for_job": True})

print("Weather FG v2 created with sched_hour_local as primary key")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.92s) 
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1338517/fs/1329219/fg/1911143


Uploading Dataframe: 100.00% |██████████| Rows 8784/8784 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: weather_jfk_hourly_fg_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1338517/jobs/named/weather_jfk_hourly_fg_2_offline_fg_materialization/executions
Weather FG v2 created with sched_hour_local as primary key


In [15]:
# ---- Get Feature Groups ----
flights_fg = fs.get_feature_group(
    name="bts_jfk_selected_features_fg1",
    version=4
)

weather_fg = fs.get_feature_group(
    name="weather_jfk_hourly_fg",
    version=2
)

In [6]:
# List every feature of each feature group as "name:type" to compare the
# columns (and their types) available on both sides of the join.
print([":".join((feat.name, feat.type)) for feat in flights_fg.features])
print([":".join((feat.name, feat.type)) for feat in weather_fg.features])

['flight_id:string', 'sched_dep_local:timestamp', 'quarter:bigint', 'month:bigint', 'day_of_month:bigint', 'day_of_week:bigint', 'crs_dep_time:string', 'reporting_airline:string', 'dest:string', 'distance:double', 'dep_delay:double', 'sched_hour_utc:timestamp', 'sched_hour_local:timestamp']
['weather_id:string', 'weather_hour_local:timestamp', 'station:string', 'latitude:double', 'longitude:double', 'weather_code:bigint', 'wind_speed_ms:double', 'wind_gust_ms:double', 'temp_c:double', 'precip_mm:double', 'snowfall_cm:double', 'visibility_m:double']


In [7]:
# Print the full feature definitions held by the weather feature group handle.
# NOTE(review): the saved output lists weather_id / weather_hour_local and no
# sched_hour_local, which does not match the version 2 definition created in
# this notebook. It looks like it was captured while weather_fg pointed at a
# different version; re-run on a fresh kernel to confirm.
print(weather_fg.schema)

[Feature('weather_id', 'string', None, True, False, False, None, None, 1893821), Feature('weather_hour_local', 'timestamp', None, False, False, False, None, None, 1893821), Feature('station', 'string', None, False, False, False, None, None, 1893821), Feature('latitude', 'double', None, False, False, False, None, None, 1893821), Feature('longitude', 'double', None, False, False, False, None, None, 1893821), Feature('weather_code', 'bigint', None, False, False, False, None, None, 1893821), Feature('wind_speed_ms', 'double', None, False, False, False, None, None, 1893821), Feature('wind_gust_ms', 'double', None, False, False, False, None, None, 1893821), Feature('temp_c', 'double', None, False, False, False, None, None, 1893821), Feature('precip_mm', 'double', None, False, False, False, None, None, 1893821), Feature('snowfall_cm', 'double', None, False, False, False, None, None, 1893821), Feature('visibility_m', 'double', None, False, False, False, None, None, 1893821)]


In [16]:
# Flight-side columns: identifiers, schedule fields, route attributes and the
# observed departure delay.
fl = flights_fg.select([
    "flight_id",
    "sched_dep_local",
    "sched_hour_local",
    "month",
    "day_of_week",
    "reporting_airline",
    "dest",
    "distance",
    "dep_delay",
])

# Weather-side columns: the hourly key plus the weather measurements.
wx = weather_fg.select([
    "sched_hour_local",
    "weather_code",
    "wind_speed_ms",
    "wind_gust_ms",
    "temp_c",
    "precip_mm",
    "snowfall_cm",
    "visibility_m",
])

# Join on the shared hourly key by name. `Query.join` documents `on` as a list
# of feature names present in both feature groups. The previous call passed a
# comparison expression (`fl.sched_hour_local == wx.sched_hour_local`) in that
# position, which is not a documented form and appears to be ignored, leaving
# the join to fall back to primary-key matching. Naming the key makes the join
# condition explicit instead of implicit.
join_query = fl.join(wx, on=["sched_hour_local"])

In [17]:
# Register the flights/weather join as version 1 of a feature view, or fetch
# the existing one if it has already been created.
fv = fs.get_or_create_feature_view(
    query=join_query,
    name="jfk_delay_weather_fv",
    version=1,
    description="JFK delay FV: flights joined with hourly weather on sched_hour_local",
)

print(f"✅ Feature View created: {fv.name} v{fv.version}")

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1338517/fs/1329219/fv/jfk_delay_weather_fv/version/1
✅ Feature View created: jfk_delay_weather_fv v1


In [18]:
# Re-acquire the feature store and the feature view by name, so this cell does
# not depend on the `fv` object created in the cell above.
fs = project.get_feature_store()
fv = fs.get_feature_view(name="jfk_delay_weather_fv", version=1)

# Materialise the joined data as a DataFrame, then show its shape and the
# first few rows.
df_sample = fv.get_batch_data()
print(df_sample.shape)
df_sample.head(5)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (8.53s) 
(107589, 16)


Unnamed: 0,flight_id,sched_dep_local,sched_hour_local,month,day_of_week,reporting_airline,dest,distance,dep_delay,weather_jfk_hourly_fg_weather_code,weather_jfk_hourly_fg_wind_speed_ms,weather_jfk_hourly_fg_wind_gust_ms,weather_jfk_hourly_fg_temp_c,weather_jfk_hourly_fg_precip_mm,weather_jfk_hourly_fg_snowfall_cm,weather_jfk_hourly_fg_visibility_m
0,JFK_PBI_B6_2024-10-01 14:00:00-04:00,2024-10-01 18:00:00+00:00,2024-10-01 18:00:00+00:00,10,2,B6,PBI,1028.0,3.0,3,17.8,33.5,21.0,0.0,0.0,
1,JFK_PIT_YX_2024-10-01 14:00:00-04:00,2024-10-01 18:00:00+00:00,2024-10-01 18:00:00+00:00,10,2,YX,PIT,340.0,-7.0,3,17.8,33.5,21.0,0.0,0.0,
2,JFK_BOS_B6_2024-10-01 14:00:00-04:00,2024-10-01 18:00:00+00:00,2024-10-01 18:00:00+00:00,10,2,B6,BOS,187.0,49.0,3,17.8,33.5,21.0,0.0,0.0,
3,JFK_BOS_YX_2024-10-01 14:20:00-04:00,2024-10-01 18:20:00+00:00,2024-10-01 18:00:00+00:00,10,2,YX,BOS,187.0,-3.0,3,17.8,33.5,21.0,0.0,0.0,
4,JFK_PIT_9E_2024-10-01 14:25:00-04:00,2024-10-01 18:25:00+00:00,2024-10-01 18:00:00+00:00,10,2,9E,PIT,340.0,-5.0,3,17.8,33.5,21.0,0.0,0.0,
