# 02 - Feature Engineering

This notebook prepares features for model training by:
1. Loading and merging vehicle positions with stop_times
2. Extracting all features
3. Selecting relevant features for delay prediction
4. Preparing training data (X, y)


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append(str(Path('../src').resolve()))
from data_utils import (
    load_vehicle_positions,
    load_gtfs_data,
    preprocess_gtfs,
    merge_vehicle_positions_with_stop_times
)
from features import extract_all_features, create_feature_matrix

# Set up paths
data_dir = Path('../data/raw')
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

print("=" * 60)
print("Feature Engineering - Transit Tracker")
print("=" * 60)


Feature Engineering - Transit Tracker


## 1. Load and Process Full Dataset

Load vehicle positions and stop_times, then merge them. For faster processing, you can use a sample.


In [None]:
# Load vehicle positions from multiple files
vehicle_positions_files = [
    'vehicle_positions_rt_rows-11-13.csv',
    'vehicle_positions_rt_rows_fri_11:14.csv'
]

vehicle_positions_list = []
for vehicle_positions_file in vehicle_positions_files:
    print(f"Loading vehicle positions from {vehicle_positions_file}...")
    df = load_vehicle_positions(data_dir / vehicle_positions_file)
    print(f"Loaded {len(df):,} vehicle position records")
    vehicle_positions_list.append(df)

# Concatenate all vehicle positions
vehicle_positions = pd.concat(vehicle_positions_list, ignore_index=True)
print(f"\nTotal vehicle position records: {len(vehicle_positions):,}")

# Option: Use a sample for faster processing during development
# Remove this line for full dataset processing
USE_SAMPLE = True
SAMPLE_SIZE = 50000  # Adjust based on your needs

if USE_SAMPLE:
    vehicle_positions = vehicle_positions.sample(n=min(SAMPLE_SIZE, len(vehicle_positions)), random_state=42)
    print(f"Using sample of {len(vehicle_positions):,} records")


Loading vehicle positions from vehicle_positions_rt_rows-11-13.csv...
Loaded 574,123 vehicle position records
Using sample of 50,000 records


In [3]:
# Load and preprocess GTFS data
print("\nLoading GTFS stop_times...")
gtfs_data = load_gtfs_data(data_dir)
gtfs_data = preprocess_gtfs(gtfs_data)

if 'stop_times' not in gtfs_data:
    raise FileNotFoundError("stop_times.txt not found!")

print(f"Loaded {len(gtfs_data['stop_times']):,} stop_times records")



Loading GTFS stop_times...
Loaded 2,098,728 stop_times records


In [4]:
# Merge vehicle positions with stop_times
print("\nMerging vehicle positions with stop_times...")
merged_data = merge_vehicle_positions_with_stop_times(
    vehicle_positions,
    gtfs_data['stop_times']
)

merge_rate = merged_data['arrival_time'].notna().sum() / len(merged_data) * 100
print(f"Merge success rate: {merge_rate:.1f}%")
print(f"Merged data shape: {merged_data.shape}")



Merging vehicle positions with stop_times...
Merge success rate: 100.0%
Merged data shape: (50235, 29)


## 2. Extract All Features


In [5]:
# Extract all features
print("\nExtracting features...")
features_df = extract_all_features(merged_data)

print(f"Features shape: {features_df.shape}")
print(f"Total feature columns: {len(features_df.columns)}")

# Display feature columns
print("\nFeature columns:")
for i, col in enumerate(features_df.columns, 1):
    print(f"{i:3d}. {col}")



Extracting features...
Features shape: (50235, 66)
Total feature columns: 66

Feature columns:
  1. id
  2. trip_id
  3. route_id
  4. start_date
  5. schedule_relationship
  6. vehicle_id
  7. vehicle_label
  8. latitude
  9. longitude
 10. bearing
 11. speed
 12. stop_id
 13. current_status
 14. timestamp
 15. current_stop_sequence
 16. datetime
 17. arrival_time
 18. departure_time
 19. stop_sequence
 20. stop_headsign
 21. pickup_type
 22. drop_off_type
 23. trip_id_event
 24. route_code
 25. destination_code
 26. timepoint
 27. bay_num
 28. arrival_time_seconds
 29. departure_time_seconds
 30. hour
 31. day_of_week
 32. day_of_month
 33. month
 34. is_weekend
 35. is_rush_hour
 36. is_morning_rush
 37. is_evening_rush
 38. time_of_day
 39. route_prefix
 40. route_suffix
 41. route_frequency
 42. trip_id_numeric
 43. trip_date_suffix
 44. vehicle_frequency
 45. has_speed
 46. is_moving
 47. speed_category
 48. has_bearing
 49. bearing_direction
 50. has_location
 51. status_stoppe

## 3. Select Features for Model Training

Select features that are useful for predicting delays. Exclude target variable and non-predictive columns.


In [6]:
# Define feature columns for model training
# Exclude: target variables, IDs, timestamps, and derived delay columns

exclude_cols = [
    # Target variables
    'arrival_delay_minutes', 'arrival_delay_seconds',
    'departure_delay_minutes', 'departure_delay_seconds',
    'predicted_delay_minutes',
    
    # IDs and identifiers
    'id', 'trip_id', 'route_id', 'vehicle_id', 'vehicle_label',
    'stop_id', 'trip_id_numeric', 'trip_date_suffix',
    
    # Timestamps and dates
    'timestamp', 'datetime', 'start_date',
    'scheduled_arrival', 'expected_arrival',
    'arrival_time', 'departure_time',
    
    # Status text (we have encoded versions)
    'current_status',
    
    # Other non-predictive
    'schedule_relationship', 'stop_headsign',
    'pickup_type', 'drop_off_type',
]

# Get available feature columns
available_cols = [col for col in features_df.columns if col not in exclude_cols]

# Select numeric and boolean features
feature_cols = []
for col in available_cols:
    dtype = features_df[col].dtype
    if dtype in ['int64', 'float64', 'bool'] or dtype.name == 'category':
        feature_cols.append(col)

print(f"\nSelected {len(feature_cols)} features for training:")
for i, col in enumerate(feature_cols, 1):
    print(f"{i:3d}. {col}")

# Display feature statistics
print("\nFeature statistics:")
features_df[feature_cols].describe()



Selected 34 features for training:
  1. latitude
  2. longitude
  3. bearing
  4. speed
  5. current_stop_sequence
  6. stop_sequence
  7. route_code
  8. timepoint
  9. arrival_time_seconds
 10. departure_time_seconds
 11. is_weekend
 12. is_rush_hour
 13. is_morning_rush
 14. is_evening_rush
 15. time_of_day
 16. route_frequency
 17. vehicle_frequency
 18. has_speed
 19. is_moving
 20. speed_category
 21. has_bearing
 22. bearing_direction
 23. has_location
 24. status_stopped
 25. status_in_transit
 26. status_incoming
 27. status_IN_TRANSIT_TO
 28. status_STOPPED_AT
 29. is_at_stop
 30. is_delayed
 31. is_early
 32. is_on_time
 33. sequence_match
 34. sequence_diff

Feature statistics:


Unnamed: 0,latitude,longitude,bearing,speed,current_stop_sequence,stop_sequence,route_code,timepoint,arrival_time_seconds,departure_time_seconds,time_of_day,route_frequency,vehicle_frequency,sequence_diff
count,50235.0,50235.0,50235.0,50235.0,50235.0,50235.0,50235.0,50235.0,50235.0,50235.0,50235.0,50235.0,50235.0,50235.0
mean,34.05454,-118.30377,94.926052,5.271894,22.727461,22.727461,205.736837,0.394227,23915.186623,23915.198567,14.46568,685.307554,39.810949,0.0
std,0.104681,0.122927,120.473949,6.404499,23.835221,23.835221,228.229345,0.488689,6063.290956,6063.313249,0.808259,321.095439,11.795187,0.0
min,33.70702,-118.86117,0.0,0.0,1.0,1.0,2.0,0.0,15360.0,15360.0,12.583333,58.0,1.0,0.0
25%,33.989147,-118.377083,0.0,0.044704,1.0,1.0,55.0,0.0,21420.0,21420.0,13.85,496.0,32.0,0.0
50%,34.052353,-118.28711,0.0,1.78816,15.0,15.0,120.0,0.0,23880.0,23880.0,14.55,640.0,41.0,0.0
75%,34.105877,-118.23788,181.7,10.013696,38.0,38.0,233.0,1.0,25920.0,25920.0,15.15,916.0,48.0,0.0
max,34.327396,-117.91002,360.0,30.085793,123.0,123.0,950.0,1.0,107040.0,107040.0,15.733333,1384.0,77.0,0.0


In [8]:
# Prepare target variable (arrival delay in minutes)
# Use actual delay if available, otherwise skip rows without delay
if 'arrival_delay_minutes' in features_df.columns:
    # Filter rows with valid delay values
    valid_mask = features_df['arrival_delay_minutes'].notna()
    training_data = features_df[valid_mask].copy()
    
    y = training_data['arrival_delay_minutes'].values
    X = create_feature_matrix(training_data, feature_cols)
    
    # Ensure X is numeric for NaN/Inf checks
    X = X.astype(np.float64, copy=False)
    
    print(f"Training data shape: X={X.shape}, y={y.shape}")
    print(f"\nTarget variable (y) statistics:")
    print(f"  Mean: {y.mean():.2f} minutes")
    print(f"  Median: {np.median(y):.2f} minutes")
    print(f"  Std: {y.std():.2f} minutes")
    print(f"  Min: {y.min():.2f} minutes")
    print(f"  Max: {y.max():.2f} minutes")
    
    # Check for any NaN or Inf values
    print(f"\nData quality check:")
    print(f"  NaN in X: {np.isnan(X).sum()}")
    print(f"  Inf in X: {np.isinf(X).sum()}")
    print(f"  NaN in y: {np.isnan(y).sum()}")
else:
    print("Warning: No arrival_delay_minutes column found!")
    print("Make sure you've merged vehicle positions with stop_times data.")
    X, y = None, None


Training data shape: X=(50235, 34), y=(50235,)

Target variable (y) statistics:
  Mean: 469.89 minutes
  Median: 479.55 minutes
  Std: 96.04 minutes
  Min: -963.83 minutes
  Max: 553.62 minutes

Data quality check:
  NaN in X: 0
  Inf in X: 0
  NaN in y: 0


## 5. Save Training Data

Save the feature matrix, target variable, and feature column names for model training.


In [9]:
if X is not None and y is not None:
    # Create DataFrame with features and target
    training_df = pd.DataFrame(X, columns=feature_cols)
    training_df['arrival_delay_minutes'] = y
    
    # Save training data
    training_data_path = processed_dir / 'training_data.csv'
    training_df.to_csv(training_data_path, index=False)
    print(f"\nSaved training data to: {training_data_path}")
    print(f"Shape: {training_df.shape}")
    
    # Save feature column names for later use
    import json
    feature_cols_path = processed_dir / 'feature_columns.json'
    with open(feature_cols_path, 'w') as f:
        json.dump(feature_cols, f)
    print(f"Saved feature columns to: {feature_cols_path}")
    
    # Also save the full features DataFrame for reference
    features_path = processed_dir / 'all_features.csv'
    features_df.to_csv(features_path, index=False)
    print(f"Saved full features DataFrame to: {features_path}")
else:
    print("\nCannot save training data - missing X or y")



Saved training data to: ../data/processed/training_data.csv
Shape: (50235, 35)
Saved feature columns to: ../data/processed/feature_columns.json
Saved full features DataFrame to: ../data/processed/all_features.csv
