# 02 - Feature Engineering

This notebook prepares features for model training by:
1. Loading vehicle positions and GTFS stop_times, then merging them
2. Extracting all features
3. Selecting relevant features for delay prediction
4. Preparing training data (X, y)
5. Saving the training data and the selected feature column names to `data/processed`


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append(str(Path('../src').resolve()))
from data_utils import (
    load_vehicle_positions,
    load_gtfs_data,
    preprocess_gtfs,
    merge_vehicle_positions_with_stop_times
)
from features import extract_all_features, create_feature_matrix

# Input (raw) and output (processed) data locations, relative to this notebook
data_dir = Path('../data/raw')
processed_dir = Path('../data/processed')
# Create the output directory up front so later save steps cannot fail on it
processed_dir.mkdir(parents=True, exist_ok=True)

# Notebook banner: rule, title, rule (one line each)
print("=" * 60, "Feature Engineering - Transit Tracker", "=" * 60, sep="\n")


## 1. Load and Process Full Dataset

Load vehicle positions and stop_times, then merge them. For faster processing during development, set `USE_SAMPLE = True` to work on a random sample of the vehicle positions.


In [None]:
# Read the raw vehicle position records from the raw data directory
vehicle_positions_file = 'vehicle_positions_rt_rows-11-13.csv'
print(f"Loading vehicle positions from {vehicle_positions_file}...")
vehicle_positions = load_vehicle_positions(data_dir / vehicle_positions_file)
print(f"Loaded {len(vehicle_positions):,} vehicle position records")

# Development shortcut: work on a random subset instead of every record.
# Set USE_SAMPLE = False to process the full dataset.
USE_SAMPLE = True
SAMPLE_SIZE = 50000  # Maximum number of sampled rows; adjust based on your needs

if USE_SAMPLE:
    # Cap the request at the number of available rows; fixed seed keeps the sample repeatable
    sample_n = min(SAMPLE_SIZE, len(vehicle_positions))
    vehicle_positions = vehicle_positions.sample(n=sample_n, random_state=42)
    print(f"Using sample of {len(vehicle_positions):,} records")


In [None]:
# Read the GTFS tables from the raw data directory and preprocess them in one step
print("\nLoading GTFS stop_times...")
gtfs_data = preprocess_gtfs(load_gtfs_data(data_dir))

# The merge step below needs the stop_times table, so fail fast if it is absent
if 'stop_times' in gtfs_data:
    print(f"Loaded {len(gtfs_data['stop_times']):,} stop_times records")
else:
    raise FileNotFoundError("stop_times.txt not found!")


In [None]:
# Join each vehicle position record with the GTFS stop_times table
print("\nMerging vehicle positions with stop_times...")
stop_times = gtfs_data['stop_times']
merged_data = merge_vehicle_positions_with_stop_times(vehicle_positions, stop_times)

# A row counts as successfully merged when its arrival_time is non-null
matched_rows = merged_data['arrival_time'].notna().sum()
merge_rate = matched_rows / len(merged_data) * 100
print(f"Merge success rate: {merge_rate:.1f}%")
print(f"Merged data shape: {merged_data.shape}")


## 2. Extract All Features


In [None]:
# Run the full feature extraction over the merged records
print("\nExtracting features...")
features_df = extract_all_features(merged_data)

n_feature_columns = features_df.shape[1]
print(f"Features shape: {features_df.shape}")
print(f"Total feature columns: {n_feature_columns}")

# List every resulting column with a 1-based position for easy reference
print("\nFeature columns:")
for position, column_name in enumerate(features_df.columns, start=1):
    print(f"{position:3d}. {column_name}")


## 3. Select Features for Model Training

Select features that are useful for predicting delays. Exclude the target variables, identifiers, timestamps, and other non-predictive columns.


In [None]:
# Define feature columns for model training
# Exclude: target variables, IDs, timestamps, and derived delay columns

exclude_cols = [
    # Target variables
    'arrival_delay_minutes', 'arrival_delay_seconds',
    'departure_delay_minutes', 'departure_delay_seconds',
    'predicted_delay_minutes',

    # IDs and identifiers
    'id', 'trip_id', 'route_id', 'vehicle_id', 'vehicle_label',
    'stop_id', 'trip_id_numeric', 'trip_date_suffix',

    # Timestamps and dates
    'timestamp', 'datetime', 'start_date',
    'scheduled_arrival', 'expected_arrival',
    'arrival_time', 'departure_time',

    # Status text (we have encoded versions)
    'current_status',

    # Other non-predictive
    'schedule_relationship', 'stop_headsign',
    'pickup_type', 'drop_off_type',
]

# Get available feature columns
available_cols = [col for col in features_df.columns if col not in exclude_cols]

# Select numeric, boolean, and categorical features.
# The check is done by dtype *kind* rather than against the literal names
# 'int64'/'float64'/'bool', so columns stored as int32, float32, uint8 or
# pandas nullable types (Int64, Float64, boolean) are not silently dropped.
# is_numeric_dtype already covers booleans and rejects datetime/timedelta/object.
feature_cols = [
    col for col in available_cols
    if pd.api.types.is_numeric_dtype(features_df[col].dtype)
    or features_df[col].dtype.name == 'category'
]

print(f"\nSelected {len(feature_cols)} features for training:")
for i, col in enumerate(feature_cols, 1):
    print(f"{i:3d}. {col}")

# Fail with a clear message instead of the opaque error describe() raises
# on a DataFrame without columns.
if not feature_cols:
    raise ValueError(
        "No numeric, boolean, or categorical feature columns remain after "
        "applying exclude_cols; check the output of extract_all_features."
    )

# Display feature statistics (last expression, so the notebook renders the table)
print("\nFeature statistics:")
features_df[feature_cols].describe()


In [None]:
# Prepare target variable (arrival delay in minutes)
# Only rows with a known arrival delay are kept; rows without one are skipped.
if 'arrival_delay_minutes' in features_df.columns:
    # Filter rows with valid delay values
    valid_mask = features_df['arrival_delay_minutes'].notna()
    training_data = features_df[valid_mask].copy()

    if training_data.empty:
        # With zero valid rows, y.min()/y.max() below would raise ValueError
        # (reduction over a zero-size array). Report the problem and fall back
        # to the same "no data" state as the missing-column branch, which the
        # save step already checks for.
        print("Warning: No rows with a valid arrival_delay_minutes value!")
        print("Check the merge success rate between vehicle positions and stop_times.")
        X, y = None, None
    else:
        y = training_data['arrival_delay_minutes'].values
        X = create_feature_matrix(training_data, feature_cols)

        print(f"Training data shape: X={X.shape}, y={y.shape}")
        print(f"\nTarget variable (y) statistics:")
        print(f"  Mean: {y.mean():.2f} minutes")
        print(f"  Median: {np.median(y):.2f} minutes")
        print(f"  Std: {y.std():.2f} minutes")
        print(f"  Min: {y.min():.2f} minutes")
        print(f"  Max: {y.max():.2f} minutes")

        # Check for any NaN or Inf values
        # NOTE(review): np.isnan/np.isinf assume X is a numeric array - confirm
        # against create_feature_matrix.
        print(f"\nData quality check:")
        print(f"  NaN in X: {np.isnan(X).sum()}")
        print(f"  Inf in X: {np.isinf(X).sum()}")
        print(f"  NaN in y: {np.isnan(y).sum()}")
else:
    print("Warning: No arrival_delay_minutes column found!")
    print("Make sure you've merged vehicle positions with stop_times data.")
    X, y = None, None


## 5. Save Training Data

Save the feature matrix, target variable, and feature column names for model training.


In [None]:
if X is None or y is None:
    # The preparation step could not build X/y, so there is nothing to persist
    print("\nCannot save training data - missing X or y")
else:
    import json

    # Combine the feature matrix and the target into a single table
    training_df = pd.DataFrame(X, columns=feature_cols)
    training_df['arrival_delay_minutes'] = y

    # Persist the training table
    training_data_path = processed_dir / 'training_data.csv'
    training_df.to_csv(training_data_path, index=False)
    print(f"\nSaved training data to: {training_data_path}")
    print(f"Shape: {training_df.shape}")

    # Persist the ordered feature names so later steps can rebuild the same columns
    feature_cols_path = processed_dir / 'feature_columns.json'
    feature_cols_path.write_text(json.dumps(feature_cols))
    print(f"Saved feature columns to: {feature_cols_path}")

    # Keep the complete (unfiltered) features table as a reference copy
    features_path = processed_dir / 'all_features.csv'
    features_df.to_csv(features_path, index=False)
    print(f"Saved full features DataFrame to: {features_path}")
