# 09 - Subway Feature Sampling for Prediction

## Purpose
This notebook defines a reusable function to generate **model-ready subway features** at prediction time. It aligns exactly with the feature set used by the trained XGBoost model.

---

## Expected Inputs
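- **Datetime object** representing the prediction timestamp (hour-level)
- **Weather dictionary** with the keys `temp`, `feels_like`, `humidity`, `wind_speed`, and `weather_main`; any missing key falls back to a default value
- **Station metadata file** (CSV) with `station_complex_id`, `latitude`, `longitude`, `is_cbd`
- **Feature schema file** (`required_features.json`) listing the feature columns the model expects, in order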

---

## Output
A single DataFrame with one row per subway station, including:
- Temporal features
- Weather features
- Spatial metadata
- Cyclical encodings
- Only the columns listed in `required_features.json`, in that exact order


In [4]:
import json
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

def prepare_subway_features(timestamp, weather, station_metadata_path, feature_schema_path):
    """
    Generate model-ready features for each station based on timestamp and weather.

    Every station row receives the same temporal and weather values; only the
    columns read from the station metadata file differ between rows.

    Args:
        timestamp (datetime): The datetime for prediction (hour-level). Only
            ``hour``, ``weekday()`` and ``month`` are read.
        weather (dict | None): Weather dictionary with keys like 'temp',
            'feels_like', 'humidity', 'wind_speed', 'weather_main'. Keys that
            are absent or set to ``None`` fall back to defaults
            (temp=15, humidity=60, wind_speed=5, feels_like=temp,
            weather_main=""). ``None`` is treated as an empty dictionary.
        station_metadata_path (str | Path): Path to station metadata CSV. Must
            contain 'station_complex_id', 'latitude', 'longitude', 'is_cbd'.
        feature_schema_path (str | Path): Path to required_features.json, a
            JSON list of column names in the order the model expects.

    Returns:
        pd.DataFrame: Feature DataFrame ready for model input, one row per
        distinct station record, with exactly the columns of the schema file
        in schema order.

    Raises:
        KeyError: If the metadata CSV lacks one of the four station columns, or
            if the schema names a feature this function does not produce.
    """
    # Load feature schema
    with open(feature_schema_path, encoding="utf-8") as f:
        required_features = json.load(f)

    stations_df = pd.read_csv(Path(station_metadata_path))[
        ["station_complex_id", "latitude", "longitude", "is_cbd"]
    ].drop_duplicates()

    weather = {} if weather is None else weather

    def weather_value(key, default):
        # A key that is present but null (e.g. from a JSON API response) is
        # treated like a missing key instead of propagating None downstream.
        value = weather.get(key)
        return default if value is None else value

    def temp_category_code(t):
        # Ordinal bins: <0 freezing (0), <10 cold (1), <20 mild (2),
        # <30 warm (3), otherwise hot (4).
        for code, upper_bound in enumerate((0, 10, 20, 30)):
            if t < upper_bound:
                return code
        return 4

    # Extract temporal parts
    hour = timestamp.hour
    day_of_week = timestamp.weekday()
    month = timestamp.month

    # Weather scalars
    temp = weather_value("temp", 15)
    main_weather = str(weather_value("weather_main", "")).lower()

    two_pi = 2 * np.pi
    stations_df = stations_df.assign(
        # Temporal features
        hour=hour,
        day_of_week=day_of_week,
        month=month,
        hour_sin=np.sin(two_pi * hour / 24),
        hour_cos=np.cos(two_pi * hour / 24),
        dow_sin=np.sin(two_pi * day_of_week / 7),
        dow_cos=np.cos(two_pi * day_of_week / 7),
        month_sin=np.sin(two_pi * month / 12),
        month_cos=np.cos(two_pi * month / 12),
        is_rush_hour=int(hour in (7, 8, 9, 16, 17, 18)),
        is_weekend=int(day_of_week >= 5),
        is_holiday=0,  # TODO: use holidays package if needed
        # Weather features
        temp=temp,
        humidity=weather_value("humidity", 60),
        wind_speed=weather_value("wind_speed", 5),
        feels_like=weather_value("feels_like", temp),
        has_rain=int("rain" in main_weather),
        has_snow=int("snow" in main_weather),
        # NOTE(review): is_freezing uses <= 0 while the "freezing" category
        # uses < 0, so temp == 0 gives is_freezing=1 with category "cold".
        # Left unchanged; presumably this mirrors the training pipeline - confirm.
        is_freezing=int(temp <= 0),
        is_hot=int(temp >= 30),
        # The temperature is identical for every row, so the category is
        # computed once and broadcast; dtype=int matches the former astype(int).
        temp_category=np.full(len(stations_df), temp_category_code(temp), dtype=int),
    )

    # Fail with an actionable message when the schema asks for a column that
    # is not generated here (still a KeyError, as pandas would have raised).
    missing = [name for name in required_features if name not in stations_df.columns]
    if missing:
        raise KeyError(
            f"Features listed in {feature_schema_path} are not produced by "
            f"prepare_subway_features: {missing}"
        )

    # Selecting with the schema list returns the columns in exactly that order.
    return stations_df[required_features].copy()


In [5]:
# Project root. Set the SUBWAY_PROJECT_ROOT environment variable to run this
# notebook on another machine; the default resolves to the original locations.
PROJECT_ROOT = Path(os.environ.get("SUBWAY_PROJECT_ROOT", "C:/Users/neasa/manhattan-subway"))
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

# Inputs
timestamp = datetime(2024, 7, 25, 14, 0)
weather = {
    "temp": 27.5,
    "feels_like": 29.0,
    "humidity": 52,
    "wind_speed": 3.2,
    "weather_main": "Clear"
}
station_metadata_path = PROCESSED_DIR / "subway_stations.csv"
feature_schema_path = PROCESSED_DIR / "models" / "required_features.json"

# Generate features
features_df = prepare_subway_features(
    timestamp,
    weather,
    station_metadata_path,
    feature_schema_path
)

# Save for inspection or testing
features_df.to_csv(PROCESSED_DIR / "sample_subway_features.csv", index=False)


In [6]:
# Quick sanity check of the generated frame: its dimensions and a short preview.
feature_shape = features_df.shape
preview_rows = features_df.head()

print("Generated features shape:", feature_shape)
print("First few rows:\n", preview_rows)


Generated features shape: (121, 24)
First few rows:
     latitude  longitude  hour  day_of_week  month  hour_sin  hour_cos  \
0  40.764811 -73.973347    14            3      7      -0.5 -0.866025   
1  40.764664 -73.980658    14            3      7      -0.5 -0.866025   
2  40.759901 -73.984139    14            3      7      -0.5 -0.866025   
3  40.745494 -73.988691    14            3      7      -0.5 -0.866025   
4  40.741303 -73.989344    14            3      7      -0.5 -0.866025   

    dow_sin   dow_cos  month_sin  ...  temp  has_snow  has_rain  \
0  0.433884 -0.900969       -0.5  ...  27.5         0         0   
1  0.433884 -0.900969       -0.5  ...  27.5         0         0   
2  0.433884 -0.900969       -0.5  ...  27.5         0         0   
3  0.433884 -0.900969       -0.5  ...  27.5         0         0   
4  0.433884 -0.900969       -0.5  ...  27.5         0         0   

   temp_category  is_freezing  is_hot  humidity  wind_speed  feels_like  \
0              3            0 