In [147]:
import hopsworks
import pandas as pd
import os
from zoneinfo import ZoneInfo
import requests
import hsfs, hopsworks
print("hopsworks:", hopsworks.__version__)
print("hsfs:", hsfs.__version__)


hopsworks: 4.2.9
hsfs: 4.2.9


Login and create a hopworks project with api_key

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()


PROJECT_NAME = os.getenv("HOPSWORKS_PROJECT", "Flight_Predictor_JFK")
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")

project = hopsworks.login(
    project=PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY
)

print(f"Connected to Hopsworks project: {project.name}")

2026-01-06 03:11:14,704 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-06 03:11:14,711 INFO: Initializing external client
2026-01-06 03:11:14,711 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-06 03:11:16,122 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1338517
Connected to Hopsworks project: Flight_Predictor_JFK


In [149]:
TZ = ZoneInfo("America/New_York")

In [150]:
path_dataset = "data/09_2025.csv"
df = pd.read_csv(path_dataset, low_memory=False)

# Filter to JFK only
df = df[df["ORIGIN"] == "JFK"].copy()

print("Rows after JFK filter:", len(df))
print("Columns available:", len(df.columns))


Rows after JFK filter: 8567
Columns available: 16


In [151]:
# 2) Drop cancelled/diverted flights (recommended)
df["CANCELLED"] = pd.to_numeric(df["CANCELLED"], errors="coerce").fillna(0)
df["DIVERTED"] = pd.to_numeric(df["DIVERTED"], errors="coerce").fillna(0)
df = df[(df["CANCELLED"] == 0) & (df["DIVERTED"] == 0)].copy()
print("Rows after removing cancelled/diverted:", len(df))

Rows after removing cancelled/diverted: 8486


In [152]:
keep_cols = [
    "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "FL_DATE", "CRS_DEP_TIME",
    "OP_UNIQUE_CARRIER", "DEST", "DISTANCE",
    "DEP_DELAY"
]
missing = [c for c in keep_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

df = df[keep_cols].copy()

In [153]:
# ----------------------------
# TIME CLEANING
# ----------------------------
# CRS_DEP_TIME -> 4-digit string "HHMM"
df["CRS_DEP_TIME"] = pd.to_numeric(df["CRS_DEP_TIME"], errors="coerce")
df = df.dropna(subset=["FL_DATE", "CRS_DEP_TIME"])

df["CRS_DEP_TIME"] = df["CRS_DEP_TIME"].astype(int).astype(str).str.zfill(4)

# FL_DATE -> date (ignore time portion)
df["FL_DATE"] = pd.to_datetime(df["FL_DATE"], errors="coerce")
df = df.dropna(subset=["FL_DATE"])

# Build scheduled departure local timestamp
df["sched_dep_local"] = pd.to_datetime(
    df["FL_DATE"].dt.strftime("%Y-%m-%d") + " " + df["CRS_DEP_TIME"],
    format="%Y-%m-%d %H%M",
    errors="coerce"
)
df = df.dropna(subset=["sched_dep_local"])
df["sched_dep_local"] = df["sched_dep_local"].dt.tz_localize(TZ)




In [154]:
# ----------------------------
# TARGET CLEANING (keep raw dep_delay as requested)
# ----------------------------
df["DEP_DELAY"] = pd.to_numeric(df["DEP_DELAY"], errors="coerce")

In [155]:
# ----------------------------
# PRIMARY KEY
# ----------------------------
df["flight_id"] = (
    "JFK_" +
    df["DEST"].astype(str) + "_" +
    df["OP_UNIQUE_CARRIER"].astype(str) + "_" +
    df["sched_dep_local"].astype(str)
)

In [156]:
# ----------------------------
# FINAL DATAFRAME (rename to clean names)
# ----------------------------
df_fg = df.rename(columns={
    "QUARTER": "quarter",
    "MONTH": "month",
    "DAY_OF_MONTH": "day_of_month",
    "DAY_OF_WEEK": "day_of_week",
    "CRS_DEP_TIME": "crs_dep_time",
    "OP_UNIQUE_CARRIER": "reporting_airline",
    "DEST": "dest",
    "DISTANCE": "distance",
    "DEP_DELAY": "dep_delay",
})

df_fg = df_fg[
    [
        "flight_id",
        "sched_dep_local",
        "quarter",
        "month",
        "day_of_month",
        "day_of_week",
        "crs_dep_time",
        "reporting_airline",
        "dest",
        "distance",
        "dep_delay",
    ]
].copy()

print(df_fg.head())
print("Final rows:", len(df_fg))
print("Unique flight_id:", df_fg["flight_id"].nunique())

                               flight_id           sched_dep_local  quarter  \
58  JFK_AUS_AA_2025-09-01 11:16:00-04:00 2025-09-01 11:16:00-04:00        3   
59  JFK_AUS_AA_2025-09-01 20:30:00-04:00 2025-09-01 20:30:00-04:00        3   
60  JFK_CLT_AA_2025-09-01 06:30:00-04:00 2025-09-01 06:30:00-04:00        3   
61  JFK_CLT_AA_2025-09-01 11:58:00-04:00 2025-09-01 11:58:00-04:00        3   
62  JFK_CLT_AA_2025-09-01 15:25:00-04:00 2025-09-01 15:25:00-04:00        3   

    month  day_of_month  day_of_week crs_dep_time reporting_airline dest  \
58      9             1            1         1116                AA  AUS   
59      9             1            1         2030                AA  AUS   
60      9             1            1         0630                AA  CLT   
61      9             1            1         1158                AA  CLT   
62      9             1            1         1525                AA  CLT   

    distance  dep_delay  
58    1521.0       -5.0  
59    1521.0    

In [157]:
# ----------------------------
# HOPSWORKS
# ----------------------------
project = hopsworks.login(project=PROJECT_NAME, api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

bts_fg = fs.get_or_create_feature_group(
    name="bts_jfk_selected_features_fg1",
    version=1,
    primary_key=["flight_id"],
    event_time="sched_dep_local",
    description="BTS JFK departures with selected predictors and dep_delay target"
)

bts_fg.insert(df_fg, write_options={"wait_for_job": True})

print("Feature group created: bts_jfk_selected_features_fg v1")

2026-01-06 03:11:17,792 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-06 03:11:17,794 INFO: Initializing external client
2026-01-06 03:11:17,795 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-06 03:11:19,264 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1338517


Uploading Dataframe: 100.00% |██████████| Rows 8486/8486 | Elapsed Time: 00:03 | Remaining Time: 00:00


Launching job: bts_jfk_selected_features_fg1_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1338517/jobs/named/bts_jfk_selected_features_fg1_1_offline_fg_materialization/executions
2026-01-06 03:11:41,399 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2026-01-06 03:11:44,622 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2026-01-06 03:11:47,900 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED


%6|1767665513.684|FAIL|rdkafka#consumer-14| [thrd:GroupCoordinator]: GroupCoordinator: 51.161.81.208:9093: Disconnected (after 50149ms in state UP, 1 identical error(s) suppressed)
%6|1767665540.932|FAIL|rdkafka#producer-13| [thrd:ssl://51.161.81.188:9093/bootstrap]: ssl://51.161.81.188:9093/1: Disconnected (after 93105ms in state UP, 1 identical error(s) suppressed)
%6|1767665563.822|FAIL|rdkafka#consumer-14| [thrd:ssl://51.161.81.188:9093/bootstrap]: ssl://51.161.81.188:9093/1: Disconnected (after 150598ms in state UP, 1 identical error(s) suppressed)
%6|1767665564.466|FAIL|rdkafka#consumer-14| [thrd:GroupCoordinator]: GroupCoordinator: 51.161.81.208:9093: Disconnected (after 50138ms in state UP, 1 identical error(s) suppressed)
%6|1767665592.280|FAIL|rdkafka#producer-13| [thrd:ssl://51.161.81.208:9093/bootstrap]: ssl://51.161.81.208:9093/2: Disconnected (after 50133ms in state UP, 1 identical error(s) suppressed)
%6|1767665615.020|FAIL|rdkafka#consumer-14| [thrd:GroupCoordinator]: G

2026-01-06 03:14:06,668 INFO: Waiting for execution to finish. Current state: SUCCEEDING. Final status: UNDEFINED
2026-01-06 03:14:09,886 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2026-01-06 03:14:10,061 INFO: Waiting for log aggregation to finish.


%6|1767665665.149|FAIL|rdkafka#consumer-14| [thrd:ssl://51.161.81.188:9093/bootstrap]: ssl://51.161.81.188:9093/1: Disconnected (after 100138ms in state UP, 1 identical error(s) suppressed)


2026-01-06 03:14:25,546 INFO: Execution finished successfully.
Feature group created: bts_jfk_selected_features_fg v1


Get Weather data from Open Mateo

In [159]:
fs = project.get_feature_store()

bts_fg = fs.get_feature_group(name="bts_jfk_selected_features_fg1", version=1)

# Read only the event time column
df_times = bts_fg.read(online=False)[["sched_dep_local"]].copy()
df_times["sched_dep_local"] = pd.to_datetime(df_times["sched_dep_local"], errors="coerce")
df_times = df_times.dropna(subset=["sched_dep_local"])

min_ts = df_times["sched_dep_local"].min()
max_ts = df_times["sched_dep_local"].max()

START_DATE = min_ts.date().isoformat()
END_DATE = max_ts.date().isoformat()

print("✅ BTS range:")
print("min:", min_ts)
print("max:", max_ts)
print("START_DATE:", START_DATE)
print("END_DATE:", END_DATE)

Reading data from Hopsworks, using Hopsworks Feature Query Service      

%6|1767666070.183|FAIL|rdkafka#consumer-14| [thrd:ssl://51.161.80.189:9093/bootstrap]: ssl://51.161.80.189:9093/0: Disconnected (after 99998ms in state UP, 1 identical error(s) suppressed)
%6|1767666070.581|FAIL|rdkafka#consumer-14| [thrd:GroupCoordinator]: GroupCoordinator: 51.161.81.208:9093: Disconnected (after 50139ms in state UP, 1 identical error(s) suppressed)


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (18.46s) 
✅ BTS range:
min: 2024-10-01 10:00:00+00:00
max: 2025-10-01 02:55:00+00:00
START_DATE: 2024-10-01
END_DATE: 2025-10-01


In [None]:
LAT = 40.6413
LON = -73.7781
TZ_NAME = "UTC"
TZ = ZoneInfo(TZ_NAME)

OPEN_METEO_HIST_URL = "https://archive-api.open-meteo.com/v1/archive"

HOURLY_VARS = [
    "weathercode",
    "windspeed_10m",
    "windgusts_10m",
    "temperature_2m",
    "precipitation",
    "snowfall",
    "visibility",
]

params = {
    "latitude": LAT,
    "longitude": LON,
    "start_date": START_DATE,
    "end_date": END_DATE,
    "hourly": ",".join(HOURLY_VARS),
    "timezone": TZ_NAME,
}

resp = requests.get(OPEN_METEO_HIST_URL, params=params, timeout=180)
resp.raise_for_status()
data = resp.json()
hourly = data["hourly"]

# Parse times safely (works whether Open-Meteo returns tz-aware or naive strings)
times = pd.to_datetime(hourly["time"], errors="coerce")

if times.tz is None:
    # Naive timestamps → localize
    weather_hour_local = times.tz_localize(
        TZ, ambiguous="infer", nonexistent="shift_forward"
    )
else:
    # Already tz-aware → convert
    weather_hour_local = times.tz_convert(TZ)

df_weather = pd.DataFrame({
    "weather_hour_local": weather_hour_local,
    "weather_code": pd.to_numeric(hourly.get("weathercode"), errors="coerce"),
    "wind_speed_ms": pd.to_numeric(hourly.get("windspeed_10m"), errors="coerce"),
    "wind_gust_ms": pd.to_numeric(hourly.get("windgusts_10m"), errors="coerce"),
    "temp_c": pd.to_numeric(hourly.get("temperature_2m"), errors="coerce"),
    "precip_mm": pd.to_numeric(hourly.get("precipitation"), errors="coerce"),
    "snowfall_cm": pd.to_numeric(hourly.get("snowfall"), errors="coerce"),
    "visibility_m": pd.to_numeric(hourly.get("visibility"), errors="coerce"),
}).dropna(subset=["weather_hour_local"])

# Primary key
df_weather["weather_id"] = df_weather["weather_hour_local"].astype(str)

# Optional metadata
df_weather["station"] = "JFK"
df_weather["latitude"] = float(LAT)
df_weather["longitude"] = float(LON)

df_weather_fg = df_weather[
    [
        "weather_id",
        "weather_hour_local",
        "station",
        "latitude",
        "longitude",
        "weather_code",
        "wind_speed_ms",
        "wind_gust_ms",
        "temp_c",
        "precip_mm",
        "snowfall_cm",
        "visibility_m",
    ]
].copy()

print(df_weather_fg.head())
print("Rows:", len(df_weather_fg))
print("Min:", df_weather_fg["weather_hour_local"].min())
print("Max:", df_weather_fg["weather_hour_local"].max())

                  weather_id        weather_hour_local station  latitude  \
0  2024-10-01 00:00:00+00:00 2024-10-01 00:00:00+00:00     JFK   40.6413   
1  2024-10-01 01:00:00+00:00 2024-10-01 01:00:00+00:00     JFK   40.6413   
2  2024-10-01 02:00:00+00:00 2024-10-01 02:00:00+00:00     JFK   40.6413   
3  2024-10-01 03:00:00+00:00 2024-10-01 03:00:00+00:00     JFK   40.6413   
4  2024-10-01 04:00:00+00:00 2024-10-01 04:00:00+00:00     JFK   40.6413   

   longitude  weather_code  wind_speed_ms  wind_gust_ms  temp_c  precip_mm  \
0   -73.7781             3           15.5          30.2    17.6        0.0   
1   -73.7781             3           10.3          17.6    17.0        0.0   
2   -73.7781             3            9.8          16.6    16.7        0.0   
3   -73.7781             3           10.8          17.6    16.4        0.0   
4   -73.7781             3           10.7          18.0    15.9        0.0   

   snowfall_cm  visibility_m  
0          0.0           NaN  
1          0

%6|1767666677.451|FAIL|rdkafka#consumer-14| [thrd:ssl://51.161.80.189:9093/bootstrap]: ssl://51.161.80.189:9093/0: Disconnected (after 99909ms in state UP, 1 identical error(s) suppressed)
%6|1767666678.004|FAIL|rdkafka#consumer-14| [thrd:GroupCoordinator]: GroupCoordinator: 51.161.81.208:9093: Disconnected (after 50133ms in state UP, 1 identical error(s) suppressed)
%6|1767666728.572|FAIL|rdkafka#consumer-14| [thrd:GroupCoordinator]: GroupCoordinator: 51.161.81.208:9093: Disconnected (after 50011ms in state UP, 1 identical error(s) suppressed)
%6|1767666728.786|FAIL|rdkafka#consumer-14| [thrd:ssl://51.161.80.189:9093/bootstrap]: ssl://51.161.80.189:9093/0: Disconnected (after 50254ms in state UP, 1 identical error(s) suppressed)


In [None]:
weather_fg = fs.get_or_create_feature_group(
    name="weather_jfk_hourly_fg",
    version=1,
    primary_key=["weather_id"],
    event_time="weather_hour_local",
    description="Hourly JFK weather from Open-Meteo archive (weathercode, wind, temp, precip, snowfall, visibility)"
)

weather_fg.insert(df_weather_fg)

print("Weather feature group inserted.")
print("Explore in Hopsworks UI:", weather_fg.url)

%6|1767666728.995|FAIL|rdkafka#consumer-14| [thrd:ssl://51.161.81.188:9093/bootstrap]: ssl://51.161.81.188:9093/1: Disconnected (after 50026ms in state UP, 1 identical error(s) suppressed)


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1338517/fs/1329219/fg/1893821


Uploading Dataframe: 2.97% |▎         | Rows 261/8784 | Elapsed Time: 00:01 | Remaining Time: 00:35%6|1767666740.944|FAIL|rdkafka#producer-13| [thrd:ssl://51.161.81.208:9093/bootstrap]: ssl://51.161.81.208:9093/2: Disconnected (after 93782ms in state UP, 1 identical error(s) suppressed)
Uploading Dataframe: 100.00% |██████████| Rows 8784/8784 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: weather_jfk_hourly_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1338517/jobs/named/weather_jfk_hourly_fg_1_offline_fg_materialization/executions
Weather feature group inserted.


AttributeError: 'FeatureGroup' object has no attribute 'url'. If you are trying to access a feature, fall back on using the `get_feature` method.

%6|1767666778.706|FAIL|rdkafka#consumer-14| [thrd:ssl://51.161.81.208:9093/bootstrap]: ssl://51.161.81.208:9093/2: Disconnected (after 99792ms in state UP, 1 identical error(s) suppressed)
%6|1767666779.259|FAIL|rdkafka#consumer-14| [thrd:GroupCoordinator]: GroupCoordinator: 51.161.81.208:9093: Disconnected (after 50136ms in state UP, 1 identical error(s) suppressed)
%6|1767666791.661|FAIL|rdkafka#producer-13| [thrd:ssl://51.161.81.188:9093/bootstrap]: ssl://51.161.81.188:9093/1: Disconnected (after 50125ms in state UP, 1 identical error(s) suppressed)
%6|1767666829.779|FAIL|rdkafka#consumer-14| [thrd:GroupCoordinator]: GroupCoordinator: 51.161.81.208:9093: Disconnected (after 50001ms in state UP, 1 identical error(s) suppressed)
%6|1767666842.769|FAIL|rdkafka#producer-13| [thrd:ssl://51.161.80.189:9093/bootstrap]: ssl://51.161.80.189:9093/0: Disconnected (after 50122ms in state UP, 1 identical error(s) suppressed)
%6|1767666879.900|FAIL|rdkafka#consumer-14| [thrd:ssl://51.161.80.189:9