In [1]:
# ============================================================
# LIVE INFERENCE NOTEBOOK (Aviationstack + Open-Meteo + Two-stage models from Hopsworks)
# Set AVIATIONSTACK_API_KEY and HOPSWORKS_API_KEY in your environment or a .env file (they are read in the CONFIG cell).
#
# What this notebook does:
#  1) Login to Hopsworks
#  2) Load Stage1 + Stage2 + metadata (threshold + feature_cols) from Model Registry
#  3) Build a DEST -> distance lookup from your Feature View batch (BTS distance)
#  4) Call Aviationstack for scheduled JFK departures
#  5) Call Open-Meteo hourly forecast for JFK
#  6) Build EXACT 11 feature columns (matching metadata)
#  7) Run two-stage predictions
#  8) Show results table
# ============================================================

import os
import json
import glob
import joblib
import numpy as np
import pandas as pd
import requests
from datetime import datetime
import hopsworks
import shutil

In [2]:
import os
from dotenv import load_dotenv
# Populate os.environ from a .env file before the os.getenv() calls below run.
load_dotenv()

# ============================================================
# 1) CONFIG
# ============================================================

# ---- Hopsworks ----
# Project name falls back to "Flight_Predictor_JFK" when HOPSWORKS_PROJECT is unset.
PROJECT_NAME = os.getenv("HOPSWORKS_PROJECT", "Flight_Predictor_JFK")
# None when the variable is unset; passed as-is to hopsworks.login().
HOPSWORKS_API_KEY = os.getenv("HOPSWORKS_API_KEY")

# ---- Aviationstack ----
# None when the variable is unset; sent as the `access_key` query parameter.
AVIATIONSTACK_API_KEY = os.getenv("AVIATIONSTACK_API_KEY")
AVIATIONSTACK_BASE_URL = "https://api.aviationstack.com/v1"

# ---- Open-Meteo (no key needed) ----
OPEN_METEO_URL = "https://api.open-meteo.com/v1/forecast"
# Coordinates sent to Open-Meteo as latitude / longitude for the JFK forecast.
JFK_LAT = 40.6413
JFK_LON = -73.7781
# Timezone used both for the Open-Meteo request and for converting scheduled
# departure times before the hourly weather join.
TZ = "America/New_York"

# ---- Models in Hopsworks Model Registry ----
STAGE1_NAME = "jfk_delay_stage1_classifier"
STAGE2_NAME = "jfk_delay_stage2_regressor"
# Registry entry whose JSON artifact carries the threshold and feature_cols.
META_NAME   = "jfk_delay_two_stage_metadata"

# Local folder for downloaded artifacts. Its stage1/stage2/metadata subfolders
# are deleted and recreated on every load (see _download_model_artifact).
DOWNLOAD_DIR = "hs_models_download"

# How many upcoming flights to request
FLIGHT_LIMIT = 100

# Weather forecast horizon
FORECAST_DAYS = 2

In [3]:
# ============================================================
# 2) LOGIN TO HOPSWORKS
# ============================================================

# Authenticate once; the feature store and model registry handles are reused
# by the cells that follow.
project = hopsworks.login(
    project=PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY,
)
fs = project.get_feature_store()
mr = project.get_model_registry()

print(f"Logged into Hopsworks project: {project.name}")

2026-01-06 17:57:32,429 INFO: Initializing external client
2026-01-06 17:57:32,429 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-06 17:57:33,959 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1338517
Logged into Hopsworks project: Flight_Predictor_JFK


In [4]:
# ============================================================
# 3) TWO-STAGE WRAPPER (loads Stage1 + Stage2 + metadata from Model Registry)
# ============================================================

from importlib.resources import path


class TwoStageDelayPredictor:
    """Two-stage delay model: a classifier gates a regressor.

    Stage 1 must expose ``predict_proba``. Rows whose positive-class
    probability is at least ``threshold`` are passed to stage 2's ``predict``
    to estimate the delay in minutes; every other row gets a delay of 0.

    Attributes:
        stage1_model: Fitted classifier exposing ``predict_proba``.
        stage2_model: Fitted regressor exposing ``predict``.
        threshold: Probability cut-off applied to the stage 1 output.
        feature_cols: Ordered feature names fed to both models, or ``None`` to
            pass every column of the input frame.
    """

    # Columns cast to ``str`` before being handed to the models.
    _STRING_FEATURES = ("reporting_airline", "dest")

    def __init__(self, stage1_model, stage2_model, threshold: float, feature_cols=None):
        self.stage1_model = stage1_model
        self.stage2_model = stage2_model
        self.threshold = float(threshold)
        self.feature_cols = feature_cols

    @staticmethod
    def _download_model_artifact(model_obj, local_dir):
        """Download a registry model into a freshly emptied ``local_dir``.

        An existing ``local_dir`` is removed first so files left over from an
        earlier download cannot be picked up by the file search that follows.

        Returns:
            Whatever ``model_obj.download(local_dir)`` returns; callers use it
            as the folder to search for the artifact.
        """
        if os.path.exists(local_dir):
            shutil.rmtree(local_dir)
        os.makedirs(local_dir, exist_ok=True)
        return model_obj.download(local_dir)

    @staticmethod
    def _find_first_pkl(folder: str) -> str:
        """Return the most recently modified model file under ``folder``.

        ``.pkl`` files are preferred; ``.joblib`` files are only considered
        when no ``.pkl`` file exists anywhere below ``folder``.

        Raises:
            FileNotFoundError: If neither kind of file is found.
        """
        for pattern in ("*.pkl", "*.joblib"):
            candidates = glob.glob(os.path.join(folder, "**", pattern), recursive=True)
            if candidates:
                return max(candidates, key=os.path.getmtime)
        raise FileNotFoundError(f"No .pkl/.joblib found under: {folder}")

    @staticmethod
    def _load_metadata_from_folder(folder: str):
        """Load the metadata JSON found under ``folder``.

        Files whose name contains "metadata" win over other ``.json`` files.
        Candidates are sorted so the choice does not depend on directory
        listing order.

        Returns:
            The parsed JSON content, or ``None`` when no ``.json`` file exists.
        """
        candidates = sorted(glob.glob(os.path.join(folder, "**", "*.json"), recursive=True))
        if not candidates:
            return None
        preferred = [p for p in candidates if "metadata" in os.path.basename(p).lower()]
        chosen = preferred[0] if preferred else candidates[0]
        with open(chosen, "r", encoding="utf-8") as fh:
            return json.load(fh)

    @staticmethod
    def _get_registry_model(mr, name, version=None):
        """Fetch ``name`` from the registry, pinning ``version`` only when one is given."""
        # NOTE(review): confirm which version the registry returns when no
        # version is passed; pin versions explicitly for reproducible runs.
        if version is None:
            return mr.get_model(name)
        return mr.get_model(name, version=version)

    @classmethod
    def load_from_hopsworks(
        cls,
        project,
        stage1_name: str,
        stage2_name: str,
        metadata_name: str = None,
        stage1_version: int = None,
        stage2_version: int = None,
        metadata_version: int = None,
        fallback_threshold: float = None,
        fallback_feature_cols=None,
        download_dir: str = "hs_models_download",
    ):
        """Build a predictor from artifacts stored in the project's model registry.

        Args:
            project: Hopsworks project handle providing ``get_model_registry()``.
            stage1_name: Registry name of the stage 1 classifier.
            stage2_name: Registry name of the stage 2 regressor.
            metadata_name: Optional registry name of the entry holding the JSON
                metadata (threshold and feature columns).
            stage1_version: Optional version to pin for stage 1.
            stage2_version: Optional version to pin for stage 2.
            metadata_version: Optional version to pin for the metadata entry.
            fallback_threshold: Used when the metadata provides no threshold.
            fallback_feature_cols: Used when the metadata provides no feature list.
            download_dir: Local folder; its ``stage1``, ``stage2`` and
                ``metadata`` subfolders are wiped and re-downloaded.

        Returns:
            A configured ``TwoStageDelayPredictor``.

        Raises:
            ValueError: If no threshold is available from metadata or fallback.
            FileNotFoundError: If a downloaded artifact contains no model file.
        """
        mr = project.get_model_registry()

        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load artifacts from a registry you trust.
        stage1_obj = cls._get_registry_model(mr, stage1_name, stage1_version)
        stage1_path = cls._download_model_artifact(stage1_obj, os.path.join(download_dir, "stage1"))
        stage1_file = cls._find_first_pkl(stage1_path)
        stage1_model = joblib.load(stage1_file)

        stage2_obj = cls._get_registry_model(mr, stage2_name, stage2_version)
        stage2_path = cls._download_model_artifact(stage2_obj, os.path.join(download_dir, "stage2"))
        stage2_file = cls._find_first_pkl(stage2_path)
        stage2_model = joblib.load(stage2_file)

        meta = None
        if metadata_name:
            meta_obj = cls._get_registry_model(mr, metadata_name, metadata_version)
            meta_path = cls._download_model_artifact(meta_obj, os.path.join(download_dir, "metadata"))
            meta = cls._load_metadata_from_folder(meta_path)

        threshold = None
        feature_cols = None

        if isinstance(meta, dict):
            # The threshold may be nested under "decision_rule" or sit at top level.
            decision_rule = meta.get("decision_rule")
            if isinstance(decision_rule, dict) and "best_threshold" in decision_rule:
                threshold = decision_rule["best_threshold"]
            elif "best_threshold" in meta:
                threshold = meta["best_threshold"]
            if "feature_cols" in meta:
                feature_cols = meta["feature_cols"]

        if threshold is None:
            if fallback_threshold is None:
                raise ValueError("Threshold not found in metadata; pass fallback_threshold.")
            threshold = fallback_threshold

        if feature_cols is None:
            feature_cols = fallback_feature_cols

        print("Loaded TwoStageDelayPredictor")
        print("   Stage 1:", stage1_file)
        print("   Stage 2:", stage2_file)
        print("   Threshold:", float(threshold))
        print("   Feature cols:", feature_cols if feature_cols else "(not set)")

        return cls(stage1_model=stage1_model, stage2_model=stage2_model, threshold=threshold, feature_cols=feature_cols)

    def _prepare_X(self, df: pd.DataFrame) -> pd.DataFrame:
        """Select the model features from ``df`` and cast string features to ``str``.

        Raises:
            ValueError: If ``feature_cols`` is set and any of them is missing.
        """
        if self.feature_cols is not None:
            missing = [c for c in self.feature_cols if c not in df.columns]
            if missing:
                raise ValueError(f"Missing required feature columns: {missing}")
            X = df[self.feature_cols].copy()
        else:
            X = df.copy()

        for col in self._STRING_FEATURES:
            if col in X.columns:
                X[col] = X[col].astype(str)
        return X

    def predict_dataframe(self, df: pd.DataFrame, include_proba: bool = True) -> pd.DataFrame:
        """Score ``df`` and return a copy with the prediction columns appended.

        Added columns:
            p_delayed: Stage 1 positive-class probability (only when
                ``include_proba`` is true).
            pred_is_delayed: 1 where ``p_delayed >= threshold``, else 0.
            pred_delay_min: Stage 2 prediction for flagged rows, clipped at 0;
                0 for all other rows.

        An empty ``df`` yields an empty result with the same added columns and
        neither model is called.
        """
        X = self._prepare_X(df)

        if len(X) == 0:
            # Most estimators reject zero-row input, so skip the models entirely.
            proba = np.zeros(0, dtype=float)
        else:
            proba = np.asarray(self.stage1_model.predict_proba(X))[:, 1]

        mask = proba >= self.threshold
        pred = np.zeros(len(X), dtype=float)
        if np.any(mask):
            pred[mask] = self.stage2_model.predict(X.loc[mask])

        # A regressor can output negative minutes; a delay is never below zero.
        pred = np.clip(pred, 0, None)

        out = df.copy()
        if include_proba:
            out["p_delayed"] = proba
        out["pred_is_delayed"] = mask.astype(int)
        out["pred_delay_min"] = pred
        return out

In [5]:
# ============================================================
# 4) LOAD MODELS + METADATA FROM HOPSWORKS
# ============================================================

predictor = TwoStageDelayPredictor.load_from_hopsworks(
    project,
    STAGE1_NAME,
    STAGE2_NAME,
    metadata_name=META_NAME,
    download_dir=DOWNLOAD_DIR,
    fallback_threshold=0.8,       # consulted only when the metadata has no threshold
    fallback_feature_cols=None,   # consulted only when the metadata has no feature list
)

# Thresholding needs class probabilities, so fail early if stage 1 cannot give them.
assert hasattr(predictor.stage1_model, "predict_proba"), "Stage 1 must support predict_proba()"
print("Stage 1 supports predict_proba()")




Downloading: 0.000%|          | 0/5370 elapsed<00:00 remaining<?




Downloading: 0.000%|          | 0/539186 elapsed<00:00 remaining<?




Downloading: 0.000%|          | 0/1198 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/539186 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/5370 elapsed<00:00 remaining<?

Loaded TwoStageDelayPredictor dirs, 3 files)... DONE
   Stage 1: hs_models_download/stage1/stage1_classifier.pkl
   Stage 2: hs_models_download/stage2/stage2_regressor.pkl
   Threshold: 0.7999999999999999
   Feature cols: ['month', 'day_of_week', 'reporting_airline', 'dest', 'distance', 'weather_jfk_hourly_fg_weather_code', 'weather_jfk_hourly_fg_wind_speed_ms', 'weather_jfk_hourly_fg_wind_gust_ms', 'weather_jfk_hourly_fg_temp_c', 'weather_jfk_hourly_fg_precip_mm', 'weather_jfk_hourly_fg_snowfall_cm']
Stage 1 supports predict_proba()


In [6]:
# ============================================================
# 5) BUILD DEST -> DISTANCE LOOKUP FROM YOUR FEATURE VIEW
#    (So live inference can fill 'distance' even if Aviationstack doesn't provide it)
# ============================================================

FV_NAME = "jfk_delay_weather_fv"
FV_VERSION = 1

fv = fs.get_feature_view(FV_NAME, version=FV_VERSION)

# Historical batch from the feature view; here it only serves as the source
# of per-destination distances.
df_fv = fv.get_batch_data()
print("FV batch shape:", df_fv.shape)

# Median distance per destination, skipping rows where either field is missing.
dist_lookup = (
    df_fv.dropna(subset=["dest", "distance"])
    .groupby("dest")["distance"]
    .median()
    .to_dict()
)

# Applied to destinations that never appear in the historical batch.
global_distance_fallback = float(df_fv["distance"].median())
print("Distance lookup size:", len(dist_lookup))
print("Global distance fallback:", global_distance_fallback)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (6.38s) 
FV batch shape: (107589, 16)
Distance lookup size: 74
Global distance fallback: 1069.0


In [7]:
# ============================================================
# 6) AVIATIONSTACK: FETCH SCHEDULED JFK DEPARTURES
# ============================================================

def fetch_aviationstack_scheduled_jfk(limit=100):
    """Fetch scheduled departures from JFK via the Aviationstack ``/flights`` endpoint.

    Args:
        limit: Maximum number of flight records to request.

    Returns:
        The list of flight records from the response's ``data`` field
        (an empty list when the API returns ``data: null``).

    Raises:
        ValueError: If the API key is not configured, or the response has no
            ``data`` field.
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    # requests silently drops params whose value is None, so a missing key
    # would otherwise only show up as an opaque HTTP error from the server.
    if not AVIATIONSTACK_API_KEY:
        raise ValueError(
            "AVIATIONSTACK_API_KEY is not set; add it to your environment or .env file."
        )

    url = f"{AVIATIONSTACK_BASE_URL}/flights"
    params = {
        "access_key": AVIATIONSTACK_API_KEY,
        "dep_iata": "JFK",
        "flight_status": "scheduled",
        "limit": limit,
    }
    r = requests.get(url, params=params, timeout=30)
    try:
        r.raise_for_status()
    except requests.HTTPError:
        # The stock message embeds the full request URL, including access_key,
        # which would leak the secret into saved notebook output.
        raise requests.HTTPError(
            f"Aviationstack request failed: HTTP {r.status_code} {r.reason}",
            response=r,
        ) from None

    payload = r.json()
    if not isinstance(payload, dict) or "data" not in payload:
        raise ValueError(f"Unexpected response: {payload}")
    return payload["data"] or []

flights_raw = fetch_aviationstack_scheduled_jfk(limit=FLIGHT_LIMIT)
print("Aviationstack flights returned:", len(flights_raw))

# Peek at one record (optional). Guarded so that an empty result simply shows
# nothing instead of raising IndexError.
flights_raw[0] if flights_raw else None

Aviationstack flights returned: 100


{'flight_date': '2026-01-06',
 'flight_status': 'scheduled',
 'departure': {'airport': 'John F Kennedy International',
  'timezone': 'America/New_York',
  'iata': 'JFK',
  'icao': 'KJFK',
  'terminal': '4',
  'gate': 'A14',
  'delay': None,
  'scheduled': '2026-01-06T12:59:00+00:00',
  'estimated': '2026-01-06T12:59:00+00:00',
  'actual': None,
  'estimated_runway': None,
  'actual_runway': None},
 'arrival': {'airport': 'Austin-bergstrom International',
  'timezone': 'America/Chicago',
  'iata': 'AUS',
  'icao': 'KAUS',
  'terminal': None,
  'gate': '8',
  'baggage': None,
  'scheduled': '2026-01-06T16:41:00+00:00',
  'delay': None,
  'estimated': None,
  'actual': None,
  'estimated_runway': None,
  'actual_runway': None},
 'airline': {'name': 'Virgin Atlantic', 'iata': 'VS', 'icao': 'VIR'},
 'flight': {'number': '4761',
  'iata': 'VS4761',
  'icao': 'VIR4761',
  'codeshared': {'airline_name': 'delta air lines',
   'airline_iata': 'dl',
   'airline_icao': 'dal',
   'flight_number': '

In [8]:
# ============================================================
# 7) PARSE AVIATIONSTACK -> BASE FLIGHT TABLE
#    We derive: sched_dep_local, month, day_of_week
#    And map airline/dest
# ============================================================

def parse_flights(flights_json, tz=None):
    """Turn raw Aviationstack flight records into the base flight table.

    Args:
        flights_json: Iterable of flight dicts as returned by Aviationstack.
        tz: Timezone name used for the local-time columns. Defaults to the
            notebook-level ``TZ`` constant.

    Returns:
        DataFrame with one row per usable flight and the columns
        ``flight_iata``, ``reporting_airline``, ``dest``, ``sched_dep_utc``,
        ``sched_dep_local``, ``sched_hour_local``, ``month``, ``day_of_week``.
        Records that are not dicts, or that lack a parseable scheduled
        departure time, are skipped. The columns are present even when no
        record is usable, so downstream joins on ``sched_hour_local`` still work.
    """
    local_tz = TZ if tz is None else tz
    columns = [
        "flight_iata",
        "reporting_airline",
        "dest",
        "sched_dep_utc",
        "sched_dep_local",
        "sched_hour_local",
        "month",
        "day_of_week",
    ]

    rows = []
    for f in flights_json or []:
        if not isinstance(f, dict):
            continue

        # Airline code: prefer IATA, else ICAO, else name
        airline = None
        if isinstance(f.get("airline"), dict):
            airline = f["airline"].get("iata") or f["airline"].get("icao") or f["airline"].get("name")
        airline = airline or "UNK"

        # Destination airport IATA
        dest = None
        if isinstance(f.get("arrival"), dict):
            dest = f["arrival"].get("iata")
        dest = dest or "UNK"

        # Scheduled departure timestamp (ISO string with an explicit offset,
        # e.g. "2026-01-06T18:30:00+00:00").
        sched = None
        if isinstance(f.get("departure"), dict):
            sched = f["departure"].get("scheduled")
        if not sched:
            continue

        sched_dt = pd.to_datetime(sched, errors="coerce", utc=True)
        if pd.isna(sched_dt):
            continue

        # NOTE(review): the sample record shown in this notebook has a JFK
        # departure of 12:59+00:00 and an AUS arrival of 16:41+00:00. Confirm
        # whether Aviationstack's `scheduled` values are true UTC or local
        # wall-clock times labelled +00:00; the conversion below assumes true UTC.
        sched_local = sched_dt.tz_convert(local_tz)

        month = int(sched_local.month)
        # pandas uses Monday=0..Sunday=6; shift to 1..7.
        # Presumably this matches the BTS day_of_week convention used in
        # training -- verify against the feature view.
        day_of_week = int(sched_local.dayofweek) + 1

        # Local hour floored, used as the join key for hourly weather.
        # Lowercase "h": the uppercase "H" alias is deprecated since pandas 2.2.
        sched_hour_local = sched_local.floor("h")

        # Flight number for UI (optional)
        flight_iata = None
        if isinstance(f.get("flight"), dict):
            flight_iata = f["flight"].get("iata") or f["flight"].get("number")

        rows.append({
            "flight_iata": flight_iata,
            "reporting_airline": str(airline),
            "dest": str(dest),
            "sched_dep_utc": sched_dt,
            "sched_dep_local": sched_local,
            "sched_hour_local": sched_hour_local,
            "month": month,
            "day_of_week": day_of_week,
        })

    return pd.DataFrame(rows, columns=columns)

df_live_flights = parse_flights(flights_json=flights_raw)
print(f"Parsed flights: {df_live_flights.shape}")
df_live_flights.head(5)

Parsed flights: (100, 8)


Unnamed: 0,flight_iata,reporting_airline,dest,sched_dep_utc,sched_dep_local,sched_hour_local,month,day_of_week
0,VS4761,VS,AUS,2026-01-06 12:59:00+00:00,2026-01-06 07:59:00-05:00,2026-01-06 07:00:00-05:00,1,2
1,VS4613,VS,PIT,2026-01-06 15:50:00+00:00,2026-01-06 10:50:00-05:00,2026-01-06 10:00:00-05:00,1,2
2,MU8756,MU,PIT,2026-01-06 15:50:00+00:00,2026-01-06 10:50:00-05:00,2026-01-06 10:00:00-05:00,1,2
3,KQ7127,KQ,PIT,2026-01-06 15:50:00+00:00,2026-01-06 10:50:00-05:00,2026-01-06 10:00:00-05:00,1,2
4,KL6680,KL,PIT,2026-01-06 15:50:00+00:00,2026-01-06 10:50:00-05:00,2026-01-06 10:00:00-05:00,1,2


In [9]:
# ============================================================
# 8) OPEN-METEO: FETCH HOURLY FORECAST FOR JFK (LOCAL TIME)
#    We will join by sched_hour_local.
# ============================================================

def fetch_openmeteo_hourly_forecast(lat, lon, timezone="America/New_York", forecast_days=2):
    """Request the hourly forecast for one location from Open-Meteo.

    Args:
        lat: Latitude sent as ``latitude``.
        lon: Longitude sent as ``longitude``.
        timezone: Timezone name sent as ``timezone``.
        forecast_days: Number of forecast days sent as ``forecast_days``.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: If the service answers with an HTTP error status.
    """
    hourly_variables = (
        "weathercode",
        "windspeed_10m",
        "windgusts_10m",
        "temperature_2m",
        "precipitation",
        "snowfall",
    )
    query = dict(
        latitude=lat,
        longitude=lon,
        hourly=",".join(hourly_variables),
        timezone=timezone,
        forecast_days=forecast_days,
    )
    response = requests.get(OPEN_METEO_URL, params=query, timeout=30)
    response.raise_for_status()
    return response.json()

wx = fetch_openmeteo_hourly_forecast(JFK_LAT, JFK_LON, timezone=TZ, forecast_days=FORECAST_DAYS)

hourly = wx.get("hourly", {})
times = pd.to_datetime(hourly.get("time", []), errors="coerce")  # Open-Meteo returns in requested timezone (naive strings)
# Localize naive times to TZ; ambiguous DST hours become NaT and are dropped below.
times = times.tz_localize(TZ, ambiguous="NaT", nonexistent="shift_forward")

# Open-Meteo documents hourly snowfall in centimetres, so no scaling is
# needed by default. The unit reported by the response is checked so the
# feature stays in cm even if the API ever reports mm.
# NOTE(review): the previous code divided by 10 ("mm -> cm"), which understated
# snowfall tenfold. Confirm the training pipeline stored snowfall in cm.
snowfall_unit = wx.get("hourly_units", {}).get("snowfall", "cm")
snowfall_to_cm = 0.1 if snowfall_unit == "mm" else 1.0

df_wx = pd.DataFrame({
    "sched_hour_local": times,
    "weather_jfk_hourly_fg_weather_code": pd.to_numeric(hourly.get("weathercode", []), errors="coerce"),
    "weather_jfk_hourly_fg_wind_speed_ms": pd.to_numeric(hourly.get("windspeed_10m", []), errors="coerce") / 3.6,  # km/h -> m/s
    "weather_jfk_hourly_fg_wind_gust_ms": pd.to_numeric(hourly.get("windgusts_10m", []), errors="coerce") / 3.6,  # km/h -> m/s
    "weather_jfk_hourly_fg_temp_c": pd.to_numeric(hourly.get("temperature_2m", []), errors="coerce"),
    "weather_jfk_hourly_fg_precip_mm": pd.to_numeric(hourly.get("precipitation", []), errors="coerce"),
    "weather_jfk_hourly_fg_snowfall_cm": pd.to_numeric(hourly.get("snowfall", []), errors="coerce") * snowfall_to_cm,
})

# keep valid timestamps only
df_wx = df_wx.dropna(subset=["sched_hour_local"])
print("Weather hourly rows:", df_wx.shape)
df_wx.head()

Weather hourly rows: (48, 7)


Unnamed: 0,sched_hour_local,weather_jfk_hourly_fg_weather_code,weather_jfk_hourly_fg_wind_speed_ms,weather_jfk_hourly_fg_wind_gust_ms,weather_jfk_hourly_fg_temp_c,weather_jfk_hourly_fg_precip_mm,weather_jfk_hourly_fg_snowfall_cm
0,2026-01-06 00:00:00-05:00,3,1.444444,6.305556,3.1,0.0,0.0
1,2026-01-06 01:00:00-05:00,2,1.583333,6.0,3.0,0.0,0.0
2,2026-01-06 02:00:00-05:00,3,0.666667,4.305556,1.4,0.0,0.0
3,2026-01-06 03:00:00-05:00,3,1.194444,3.111111,1.0,0.0,0.0
4,2026-01-06 04:00:00-05:00,3,1.5,3.0,0.5,0.0,0.0


In [10]:
# ============================================================
# 9) JOIN FLIGHTS + WEATHER (LOCAL HOUR)
# ============================================================

# Left join keeps every flight; flights whose hour is not in the forecast
# end up with NaN weather values.
df_live = pd.merge(df_live_flights, df_wx, how="left", on="sched_hour_local")

print(f"Live joined shape: {df_live.shape}")
df_live.head()[[
    "flight_iata",
    "reporting_airline",
    "dest",
    "sched_dep_local",
    "sched_hour_local",
    "weather_jfk_hourly_fg_weather_code",
    "weather_jfk_hourly_fg_wind_speed_ms",
]]

Live joined shape: (100, 14)


Unnamed: 0,flight_iata,reporting_airline,dest,sched_dep_local,sched_hour_local,weather_jfk_hourly_fg_weather_code,weather_jfk_hourly_fg_wind_speed_ms
0,VS4761,VS,AUS,2026-01-06 07:59:00-05:00,2026-01-06 07:00:00-05:00,3,0.694444
1,VS4613,VS,PIT,2026-01-06 10:50:00-05:00,2026-01-06 10:00:00-05:00,3,1.083333
2,MU8756,MU,PIT,2026-01-06 10:50:00-05:00,2026-01-06 10:00:00-05:00,3,1.083333
3,KQ7127,KQ,PIT,2026-01-06 10:50:00-05:00,2026-01-06 10:00:00-05:00,3,1.083333
4,KL6680,KL,PIT,2026-01-06 10:50:00-05:00,2026-01-06 10:00:00-05:00,3,1.083333


In [11]:
# ============================================================
# 10) ADD DISTANCE VIA LOOKUP (BTS-derived)
# ============================================================

def add_distance(df, lookup_dict, fallback):
    """Return a copy of ``df`` with a float ``distance`` column.

    The distance is looked up from ``lookup_dict`` by the ``dest`` column;
    destinations missing from the lookup receive ``fallback``. The input
    frame is left untouched.
    """
    looked_up = df["dest"].map(lookup_dict).astype(float)
    return df.assign(distance=looked_up.fillna(fallback))

df_live = add_distance(df=df_live, lookup_dict=dist_lookup, fallback=global_distance_fallback)

print(f"Distance filled. Missing: {df_live['distance'].isna().sum()}")
df_live.head()[["dest", "distance"]]

Distance filled. Missing: 0


Unnamed: 0,dest,distance
0,AUS,1521.0
1,PIT,340.0
2,PIT,340.0
3,PIT,340.0
4,PIT,340.0


In [12]:
# ============================================================
# 11) BUILD FINAL FEATURE DF (must match predictor.feature_cols exactly)
# ============================================================

# If metadata contains feature_cols, use them directly.
# Otherwise, define explicitly (must match training).
# Fallback feature list, applied only when the metadata did not supply one.
# The names and their order must match what the models were trained on.
if predictor.feature_cols is None:
    predictor.feature_cols = [
        "month", "day_of_week",
        "reporting_airline", "dest", "distance",
        "weather_jfk_hourly_fg_weather_code",
        "weather_jfk_hourly_fg_wind_speed_ms",
        "weather_jfk_hourly_fg_wind_gust_ms",
        "weather_jfk_hourly_fg_temp_c",
        "weather_jfk_hourly_fg_precip_mm",
        "weather_jfk_hourly_fg_snowfall_cm",
    ]

df_features = df_live.copy()

# Stop here if the live frame lacks any column the models expect.
missing = [c for c in predictor.feature_cols if c not in df_features.columns]
if len(missing) > 0:
    raise ValueError(f"Missing live feature columns: {missing}")

df_features.head()[predictor.feature_cols]

Unnamed: 0,month,day_of_week,reporting_airline,dest,distance,weather_jfk_hourly_fg_weather_code,weather_jfk_hourly_fg_wind_speed_ms,weather_jfk_hourly_fg_wind_gust_ms,weather_jfk_hourly_fg_temp_c,weather_jfk_hourly_fg_precip_mm,weather_jfk_hourly_fg_snowfall_cm
0,1,2,VS,AUS,1521.0,3,0.694444,1.0,0.7,0.0,0.0
1,1,2,VS,PIT,340.0,3,1.083333,1.305556,3.6,0.0,0.0
2,1,2,MU,PIT,340.0,3,1.083333,1.305556,3.6,0.0,0.0
3,1,2,KQ,PIT,340.0,3,1.083333,1.305556,3.6,0.0,0.0
4,1,2,KL,PIT,340.0,3,1.083333,1.305556,3.6,0.0,0.0


In [None]:
# Override the stage 1 decision threshold for live scoring.
# NOTE(review): this assignment mutates the shared `predictor` object, so the
# override persists for every later cell that calls predict_dataframe(); the
# threshold loaded from the registry metadata is no longer in effect after
# this cell runs.
# NOTE(review): the saved output below this cell prints "Threshold: 0.3" while
# the code sets 0.40 -- the output looks stale; re-run to refresh it.
LIVE_THRESHOLD = 0.40
predictor.threshold = LIVE_THRESHOLD

# Score the live feature frame; the result is df_features plus the
# p_delayed / pred_is_delayed / pred_delay_min columns.
preds_live = predictor.predict_dataframe(df_features, include_proba=True)

# Keep the display columns, ordered by scheduled local departure time.
preds_live_out = preds_live[[
    "flight_iata", "reporting_airline", "dest", "sched_dep_local",
    "p_delayed", "pred_is_delayed", "pred_delay_min",
    "weather_jfk_hourly_fg_weather_code",
    "weather_jfk_hourly_fg_wind_speed_ms",
    "weather_jfk_hourly_fg_precip_mm",
    "distance"
]].sort_values(["sched_dep_local"]).reset_index(drop=True)

print("Threshold:", LIVE_THRESHOLD)
# Share of flights flagged as delayed at this threshold.
print("Predicted delayed rate:", preds_live_out["pred_is_delayed"].mean())
# Count of flights with a strictly positive predicted delay.
print("Non-zero delay predictions:", (preds_live_out["pred_delay_min"] > 0).sum(), "/", len(preds_live_out))

preds_live_out.head(25)

Threshold: 0.3
Predicted delayed rate: 0.86
Non-zero delay predictions: 86 / 100


Unnamed: 0,flight_iata,reporting_airline,dest,sched_dep_local,p_delayed,pred_is_delayed,pred_delay_min,weather_jfk_hourly_fg_weather_code,weather_jfk_hourly_fg_wind_speed_ms,weather_jfk_hourly_fg_precip_mm,distance
0,HA4824,HA,PSP,2026-01-06 06:30:00-05:00,0.393119,1,34.0798,3,1.388889,0.0,2378.0
1,QR3853,QR,KIN,2026-01-06 07:55:00-05:00,0.341366,1,30.698492,3,0.694444,0.0,1069.0
2,VS4761,VS,AUS,2026-01-06 07:59:00-05:00,0.296611,0,0.0,3,0.694444,0.0,1521.0
3,AF2472,AF,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
4,KE7339,KE,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
5,KL6762,KL,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
6,LA8464,LA,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
7,LY4348,LY,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
8,SV6865,SV,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
9,VS1628,VS,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0


In [17]:
# ============================================================
# 12) RUN LIVE PREDICTIONS
# ============================================================

# Scores are produced with whatever threshold `predictor` currently holds.
preds = predictor.predict_dataframe(df_features, include_proba=True)

# Columns surfaced in the UI: identifiers first, then predictions, then context.
display_cols = [
    "flight_iata", "reporting_airline", "dest", "sched_dep_local",
    "p_delayed", "pred_is_delayed", "pred_delay_min",
    "weather_jfk_hourly_fg_weather_code",
    "weather_jfk_hourly_fg_wind_speed_ms",
    "weather_jfk_hourly_fg_precip_mm",
    "distance",
]

preds_out = (
    preds.loc[:, display_cols]
    .sort_values(by="sched_dep_local")
    .reset_index(drop=True)
)

print(f"Live predictions rows: {preds_out.shape}")
preds_out.head(25)

Live predictions rows: (100, 11)


Unnamed: 0,flight_iata,reporting_airline,dest,sched_dep_local,p_delayed,pred_is_delayed,pred_delay_min,weather_jfk_hourly_fg_weather_code,weather_jfk_hourly_fg_wind_speed_ms,weather_jfk_hourly_fg_precip_mm,distance
0,HA4824,HA,PSP,2026-01-06 06:30:00-05:00,0.393119,1,34.0798,3,1.388889,0.0,2378.0
1,QR3853,QR,KIN,2026-01-06 07:55:00-05:00,0.341366,1,30.698492,3,0.694444,0.0,1069.0
2,VS4761,VS,AUS,2026-01-06 07:59:00-05:00,0.296611,0,0.0,3,0.694444,0.0,1521.0
3,AF2472,AF,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
4,KE7339,KE,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
5,KL6762,KL,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
6,LA8464,LA,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
7,LY4348,LY,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
8,SV6865,SV,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0
9,VS1628,VS,MCO,2026-01-06 08:40:00-05:00,0.375674,1,31.496201,3,1.444444,0.0,944.0


In [18]:
# ============================================================
# 13) QUICK SANITY CHECKS
# ============================================================

delayed_rate = preds_out["pred_is_delayed"].mean()
print(f"Threshold: {predictor.threshold}")
print(f"Predicted delayed rate: {delayed_rate}")
print("Pred delay min summary:")
print(preds_out["pred_delay_min"].describe())

# Flights with the highest delay probability come first.
preds_out.sort_values(by="p_delayed", ascending=False).iloc[:15]

Threshold: 0.3
Predicted delayed rate: 0.86
Pred delay min summary:
count    100.000000
mean      24.855053
std       11.052588
min        0.000000
25%       24.317024
50%       25.092309
75%       33.808695
max       36.345196
Name: pred_delay_min, dtype: float64


Unnamed: 0,flight_iata,reporting_airline,dest,sched_dep_local,p_delayed,pred_is_delayed,pred_delay_min,weather_jfk_hourly_fg_weather_code,weather_jfk_hourly_fg_wind_speed_ms,weather_jfk_hourly_fg_precip_mm,distance
45,VS3150,VS,DCA,2026-01-06 09:59:00-05:00,0.465193,1,33.808695,3,0.5,0.0,213.0
46,LA6246,LA,DCA,2026-01-06 09:59:00-05:00,0.465193,1,33.808695,3,0.5,0.0,213.0
44,KQ7037,KQ,DCA,2026-01-06 09:59:00-05:00,0.465193,1,33.808695,3,0.5,0.0,213.0
71,IB4677,IB,DCA,2026-01-06 10:30:00-05:00,0.464042,1,22.827573,3,1.083333,0.0,213.0
65,MH9513,MH,DCA,2026-01-06 10:30:00-05:00,0.464042,1,22.827573,3,1.083333,0.0,213.0
66,CX7527,CX,DCA,2026-01-06 10:30:00-05:00,0.464042,1,22.827573,3,1.083333,0.0,213.0
67,JL7456,JL,DCA,2026-01-06 10:30:00-05:00,0.464042,1,22.827573,3,1.083333,0.0,213.0
68,G36415,G3,DCA,2026-01-06 10:30:00-05:00,0.464042,1,22.827573,3,1.083333,0.0,213.0
69,AT5053,AT,DCA,2026-01-06 10:30:00-05:00,0.464042,1,22.827573,3,1.083333,0.0,213.0
70,GF6626,GF,DCA,2026-01-06 10:30:00-05:00,0.464042,1,22.827573,3,1.083333,0.0,213.0


In [19]:
# ============================================================
# 14) OPTIONAL: SAVE OUTPUT (for debugging / demo fallback)
# ============================================================

# Snapshot of the sorted prediction table, written without the index column.
OUT_PATH = "data_processed_live_predictions.csv"
preds_out.to_csv(path_or_buf=OUT_PATH, index=False)
print(f"Saved: {OUT_PATH}")

Saved: data_processed_live_predictions.csv


In [20]:
# Compare a few candidate thresholds. The sweep mutates the shared predictor,
# so the original threshold is restored afterwards (even if scoring fails);
# otherwise re-running earlier cells would silently score at 0.50.
_threshold_before_sweep = predictor.threshold
try:
    for thr in [0.30, 0.40, 0.50]:
        predictor.threshold = thr
        out = predictor.predict_dataframe(df_features, include_proba=True)
        rate = out["pred_is_delayed"].mean()
        nonzero = (out["pred_delay_min"] > 0).sum()
        print(f"thr={thr:.2f} | delayed_rate={rate:.2%} | nonzero={nonzero}/{len(out)}")
finally:
    predictor.threshold = _threshold_before_sweep

thr=0.30 | delayed_rate=86.00% | nonzero=86/100
thr=0.40 | delayed_rate=19.00% | nonzero=19/100
thr=0.50 | delayed_rate=0.00% | nonzero=0/100
