# 01 - Data Exploration

This notebook explores the vehicle positions and GTFS transit data.

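## Steps:
1. Load vehicle positions CSVs (two real-time exports, combined into one DataFrame)
2. Load and preprocess GTFS stop_times data
3. Explore data structure and basic statistics
4. Merge a sample of vehicle positions with stop_times and extract features
5. Visualize delays and patterns
6. Save the extracted features for the next notebook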


In [None]:

# Third-party and standard-library imports used throughout the notebook.
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import warnings
# NOTE(review): with no category/module filter this silences *every* warning
# for the rest of the kernel session (including pandas deprecation and
# dtype warnings) — consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

# Add src to path
# '../src' is resolved against the current working directory, so the notebook
# must be run from its own folder for the project imports below to succeed.
sys.path.append(str(Path('../src').resolve()))
# Project-local helpers (presumably located in ../src — TODO confirm).
# These imports must stay *after* the sys.path change above.
from data_utils import (
    load_gtfs_data, 
    load_vehicle_positions,
    preprocess_gtfs,
    merge_vehicle_positions_with_stop_times
)
from features import extract_all_features

# Input/output locations, expressed relative to the notebook's working
# directory: raw inputs are read from data/raw, derived files go to
# data/processed (created on demand, no error if it already exists).
data_root = Path('..') / 'data'
data_dir = data_root / 'raw'
processed_dir = data_root / 'processed'
processed_dir.mkdir(parents=True, exist_ok=True)

# Three-line banner: rule, title, rule.
banner_rule = "=" * 60
print(banner_rule, "Data Exploration - Transit Tracker", banner_rule, sep="\n")


## 1. Load Vehicle Positions Data


In [None]:
# Read both real-time vehicle-position exports and stack them into a single
# DataFrame with a fresh 0..n-1 index.
# NOTE: the second file name contains ':' which is not a legal file-name
# character on Windows; the name must match the file on disk.
vehicle_positions_file1 = 'vehicle_positions_rt_rows-11-13.csv'
vehicle_positions_file2 = 'vehicle_positions_rt_rows_fri_11:14.csv'

loaded_frames = []
for position, file_name in enumerate((vehicle_positions_file1, vehicle_positions_file2)):
    # Every file after the first is separated from the previous report by a blank line.
    lead = "\n" if position else ""
    print(f"{lead}Loading {file_name}...")
    frame = load_vehicle_positions(data_dir / file_name)
    print(f"  Loaded {len(frame):,} records")
    loaded_frames.append(frame)

# Keep the per-file frames available under their original names.
vehicle_positions1, vehicle_positions2 = loaded_frames

# Row-wise concatenation; the per-file indexes are discarded.
vehicle_positions = pd.concat(loaded_frames, ignore_index=True)
print(f"\nCombined: {len(vehicle_positions):,} total records")

print(f"\nVehicle Positions Shape: {vehicle_positions.shape}")
print(f"\nColumns: {list(vehicle_positions.columns)}")
print(f"\nFirst few rows:")
vehicle_positions.head()


In [None]:
# Headline numbers for the combined vehicle-position records: row count,
# distinct-value counts for the main identifier columns, the time span
# covered, and the ten most frequent routes.
print("\n=== Vehicle Positions Statistics ===")
print(f"Total records: {len(vehicle_positions):,}")

id_columns = (
    ("routes", "route_id"),
    ("trips", "trip_id"),
    ("vehicles", "vehicle_id"),
    ("stops", "stop_id"),
)
for label, column in id_columns:
    print(f"Unique {label}: {vehicle_positions[column].nunique()}")

timestamps = vehicle_positions['datetime']
print(f"\nDate range: {timestamps.min()} to {timestamps.max()}")

print(f"\nRoute distribution (top 10):")
# Last expression of the cell, so Jupyter renders it as the cell output.
vehicle_positions['route_id'].value_counts().head(10)


## 2. Load GTFS Stop Times Data


In [None]:
# Load the GTFS tables from the raw data directory. The result is used as a
# mapping keyed by table name (see the .keys() call and the 'stop_times' lookup).
gtfs_data = load_gtfs_data(data_dir)
print("Loaded GTFS files:", list(gtfs_data.keys()))

# Preprocess GTFS (the processed mapping replaces the raw one under the same name)
gtfs_data = preprocess_gtfs(gtfs_data)

# Explore stop_times
if 'stop_times' in gtfs_data:
    stop_times = gtfs_data['stop_times']
    print(f"\nStop Times Shape: {stop_times.shape}")
    print(f"\nStop Times Columns: {list(stop_times.columns)}")
    print(f"\nUnique trips in stop_times: {stop_times['trip_id'].nunique()}")
    print(f"\nUnique stops in stop_times: {stop_times['stop_id'].nunique()}")
    # Jupyter only echoes a trailing *top-level* expression; a bare
    # `stop_times.head()` inside this `if` body would be evaluated and
    # silently discarded. `display` is injected into the namespace by the
    # IPython kernel, so no import is needed inside a notebook.
    display(stop_times.head())


## 3. Merge Vehicle Positions with Stop Times


In [None]:
# Work on a reproducible random sample (at most 10,000 rows, fixed seed) so
# the merge stays fast during exploration.
sample_size = min(10000, len(vehicle_positions))
vehicle_sample = vehicle_positions.sample(n=sample_size, random_state=42)

# Attach the scheduled stop_times to the sampled positions.
merged_data = merge_vehicle_positions_with_stop_times(vehicle_sample, gtfs_data['stop_times'])

# A row counts as successfully merged when its 'arrival_time' is non-null
# (presumably that column is contributed by stop_times — TODO confirm).
matched_rows = merged_data['arrival_time'].notna().sum()
merge_success_pct = matched_rows / len(merged_data) * 100

print(f"Merged data shape: {merged_data.shape}")
print(f"\nMerge success rate: {merge_success_pct:.1f}%")
print(f"\nMerged columns: {list(merged_data.columns)}")
merged_data.head()


## 4. Extract Features and Explore Delays


In [None]:
# Build the feature table from the merged sample.
features_df = extract_all_features(merged_data)

print(f"Features shape: {features_df.shape}")
print(f"\nFeature columns: {len(features_df.columns)} columns")

# Summarise arrival delays when the feature extractor produced that column.
if 'arrival_delay_minutes' in features_df.columns:
    delays = features_df['arrival_delay_minutes'].dropna()
    print(f"\n=== Delay Statistics ===")
    if delays.empty:
        # Without this guard every statistic would print as NaN and the
        # on-time share would be a 0/0 division.
        print("No non-null arrival delays available.")
    else:
        print(f"Mean delay: {delays.mean():.2f} minutes")
        print(f"Median delay: {delays.median():.2f} minutes")
        print(f"Std delay: {delays.std():.2f} minutes")
        print(f"Min delay: {delays.min():.2f} minutes")
        print(f"Max delay: {delays.max():.2f} minutes")
        # Share of observations within one minute of schedule, bounds inclusive.
        on_time_pct = delays.between(-1, 1).mean() * 100
        # "\u00b1" is the plus-minus sign; written as an escape so the source
        # stays ASCII and cannot be mis-decoded again.
        print(f"On-time percentage (\u00b11 min): {on_time_pct:.1f}%")


## 5. Visualizations


In [None]:
# Two-panel delay overview: histogram of arrival delays on the left and, when
# an 'hour' feature exists, the mean delay per hour on the right (the right
# panel is left empty otherwise).
if 'arrival_delay_minutes' in features_df.columns:
    fig, (hist_ax, hourly_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: distribution of non-null delays with a marker at zero.
    delays = features_df['arrival_delay_minutes'].dropna()
    hist_ax.hist(delays, bins=50, edgecolor='black', alpha=0.7)
    hist_ax.set_xlabel('Delay (minutes)')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution of Arrival Delays')
    hist_ax.axvline(x=0, color='r', linestyle='--', label='On Time')
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Right panel: average delay grouped by the 'hour' column.
    if 'hour' in features_df.columns:
        hourly_delays = features_df.groupby('hour')['arrival_delay_minutes'].mean()
        hourly_ax.plot(hourly_delays.index, hourly_delays.values, marker='o')
        hourly_ax.set_xlabel('Hour of Day')
        hourly_ax.set_ylabel('Average Delay (minutes)')
        hourly_ax.set_title('Average Delay by Hour')
        hourly_ax.grid(True, alpha=0.3)
        # One tick for each of the 24 hours, whether or not data exists for it.
        hourly_ax.set_xticks(range(24))

    fig.tight_layout()
    plt.show()


In [None]:
# Status breakdown
if 'current_status' in features_df.columns:
    status_counts = features_df['current_status'].value_counts()
    plt.figure(figsize=(8, 6))
    status_counts.plot(kind='bar')
    plt.xlabel('Status')
    plt.ylabel('Count')
    plt.title('Vehicle Status Distribution')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    print("\nStatus breakdown:")
    print(status_counts)


## 6. Save Processed Data for Next Steps

Save the features extracted from the sampled vehicle positions (at most 10,000 rows) to `data/processed/explored_features.csv` for feature engineering and model training.


In [None]:
# Save features for the next notebook.
# NOTE: features_df is built from the random sample drawn in the merge step
# (section 3). To export the full dataset, pass all of vehicle_positions to
# the merge instead of the sample and re-run the cells from there.
# The output path is built once so the file written and the path reported
# cannot drift apart.
output_path = processed_dir / 'explored_features.csv'
features_df.to_csv(output_path, index=False)
print(f"Saved features to {output_path}")
print(f"Shape: {features_df.shape}")
