In [None]:
import pandas as pd
import json
from pathlib import Path

# All input files are resolved relative to this directory.
BASE = Path(".")

# Station list (JSON) and the GTFS text tables read by main().
STATIONS_JSON = BASE.joinpath("stations.json")
STOPS_FILE = BASE.joinpath("stops.txt")
STOP_TIMES_FILE = BASE.joinpath("stop_times.txt")
TRIPS_FILE = BASE.joinpath("trips.txt")
ROUTES_FILE = BASE.joinpath("routes.txt")

# Line name -> stop-name text used to look up that line's destination stops.
DEST_NAME_FOR_LINE = {
    **dict.fromkeys(("1", "2", "3", "A", "C"), "Chambers St"),
    "E": "World Trade Center",
}

# Morning commute window (for now), expressed as whole clock hours.
# Both bounds are inclusive: the window covers 07:00:00 through 11:59:59.
MORNING_START_HOUR = 7   # 7:00
MORNING_END_HOUR = 11    # 11:59


def time_to_seconds(t_str: str):
    """Convert an ``H:MM:SS`` time string to seconds.

    Hours are not limited to 0-23, so a value such as ``"25:10:00"`` is
    converted like any other.

    Args:
        t_str: Time text with exactly three colon-separated integer fields.

    Returns:
        The total number of seconds as an ``int``, or ``None`` when *t_str*
        is not a string or cannot be parsed as three integers.
    """
    try:
        h, m, s = map(int, t_str.split(":"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a str (e.g. NaN from pandas);
        # ValueError: wrong number of fields or a non-integer field.
        return None
    return h * 3600 + m * 60 + s


def is_morning(t_str: str) -> bool:
    """Return True when *t_str* falls inside the morning commute window.

    The window is every time whose hour lies between MORNING_START_HOUR and
    MORNING_END_HOUR, both inclusive. Unparseable input is never "morning".
    """
    sec = time_to_seconds(t_str)
    if sec is None:
        return False
    hour = sec // 3600
    # Inclusive upper bound so the whole MORNING_END_HOUR hour (up to :59:59)
    # is part of the window, as the constant's comment states.
    return MORNING_START_HOUR <= hour <= MORNING_END_HOUR

def _stop_ids_matching(stops, name):
    """Return the unique stop_ids whose stop_name contains *name* (case-insensitive)."""
    # regex=False: names are literal text. Without it, characters such as
    # "(" or "." in a station name are interpreted as regex syntax, which
    # changes what matches and makes pandas emit a match-group warning.
    # NOTE(review): this is still a substring match, so a short name also
    # matches longer stop names containing it; the per-line filter applied
    # later narrows this, but exact matching may be preferable.
    mask = stops["stop_name"].str.contains(name, case=False, na=False, regex=False)
    return stops.loc[mask, "stop_id"].unique().tolist()


def _morning_durations(departures, arrivals):
    """Return trip durations in minutes for pairs that lie in the morning window.

    A pair is kept only when both values are strings, both pass is_morning(),
    and the arrival is later than the departure by less than three hours.
    """
    durations = []
    for dep, arr in zip(departures, arrivals):
        if not (isinstance(dep, str) and isinstance(arr, str)):
            continue
        if not (is_morning(dep) and is_morning(arr)):
            continue
        # is_morning() only returns True for parseable times, so both
        # conversions below yield integers.
        dt = time_to_seconds(arr) - time_to_seconds(dep)
        if 0 < dt < 3 * 3600:
            durations.append(dt / 60.0)
    return durations


def main():
    """Compute average morning travel times per station and line.

    Reads the GTFS tables and stations.json, and for every station/line pair
    averages the scheduled time from the station to the line's destination
    stop (see DEST_NAME_FOR_LINE) over trips inside the morning window.
    Pairs with no usable data get a fixed fallback value. The result is
    written to travel_times.json under BASE as
    ``{station name: {line: minutes}}``; progress is printed to stdout.
    """
    print("Loading GTFS files...")
    stops = pd.read_csv(STOPS_FILE)
    stop_times = pd.read_csv(STOP_TIMES_FILE)
    trips = pd.read_csv(TRIPS_FILE)
    routes = pd.read_csv(ROUTES_FILE)
    trips_routes = trips.merge(routes[["route_id", "route_short_name"]],
                               on="route_id", how="left")
    trips_routes["line"] = trips_routes["route_short_name"].astype(str).str.strip()

    print("Loading stations.json...")
    with open(STATIONS_JSON, "r", encoding="utf-8") as f:
        stations_data = json.load(f)
    stations = stations_data["stations"]

    print("Building origin station -> stop_id mapping...")
    station_to_stop_ids = {}
    for st in stations:
        name = st["Station Name"]
        station_to_stop_ids[name] = _stop_ids_matching(stops, name)

    print("Finding destination stop_ids...")
    line_to_dest_stop_ids = {}
    for line, dest_name in DEST_NAME_FOR_LINE.items():
        dest_ids = _stop_ids_matching(stops, dest_name)
        if not dest_ids:
            print(f"WARNING: No destination stops found for line {line} with name '{dest_name}'")
        line_to_dest_stop_ids[line] = dest_ids

    print("Merging stop_times with trips...")
    st_with_trips = stop_times.merge(trips_routes[["trip_id", "route_id", "line"]],
                                     on="trip_id", how="left")

    # Split the table by line once, instead of re-scanning every row for each
    # station/line pair. Rows without a line are dropped by groupby, which
    # matches an equality filter (a missing value never equals a line name).
    rows_by_line = {
        line: rows for line, rows in st_with_trips.groupby("line", sort=False)
    }

    results = {}

    print("Computing travel times for each station & line...")
    for st in stations:
        station_name = st["Station Name"]
        lines = st["Train Lines"]
        origin_stop_ids = station_to_stop_ids.get(station_name, [])

        results[station_name] = {}

        if not origin_stop_ids:
            print(f"  [WARN] No stop_ids found for station '{station_name}'")
            for line in lines:
                results[station_name][line] = None
            continue

        for line in lines:
            dest_ids = line_to_dest_stop_ids.get(line, [])
            if not dest_ids:
                print(f"  [WARN] No destination ids for line {line}, station {station_name}")
                results[station_name][line] = None
                continue

            line_rows = rows_by_line.get(line)
            if line_rows is None:
                origin_rows = dest_rows = None
            else:
                origin_rows = line_rows[line_rows["stop_id"].isin(origin_stop_ids)]
                dest_rows = line_rows[line_rows["stop_id"].isin(dest_ids)]

            if line_rows is None or origin_rows.empty or dest_rows.empty:
                print(f"  [INFO] No matching trips for {station_name} on line {line}")
                results[station_name][line] = None
                continue

            # Pair the origin and destination stop of the same trip, keeping
            # only trips that reach the destination after the origin.
            merged = origin_rows.merge(
                dest_rows,
                on="trip_id",
                suffixes=("_orig", "_dest")
            )
            merged = merged[merged["stop_sequence_orig"] < merged["stop_sequence_dest"]]

            durations = _morning_durations(
                merged["departure_time_orig"], merged["arrival_time_dest"]
            )

            if durations:
                avg_minutes = round(sum(durations) / len(durations))
                results[station_name][line] = int(avg_minutes)
                print(f"  {station_name} ({line}): avg {avg_minutes} min, samples={len(durations)}")
            else:
                print(f"  [INFO] No valid durations for {station_name} on {line}")
                results[station_name][line] = None

    # Pairs without a computed average are written with this placeholder.
    FALLBACK_MIN = 30
    final_output = {
        station_name: {
            line: FALLBACK_MIN if mins is None else mins
            for line, mins in line_map.items()
        }
        for station_name, line_map in results.items()
    }

    out_path = BASE / "travel_times.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(final_output, f, indent=2)

    print(f"\nSaved travel_times.json to {out_path.resolve()}")


# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Loading GTFS files...
Loading stations.json...
Building origin station -> stop_id mapping...
Finding destination stop_ids...
Merging stop_times with trips...


  mask = stops["stop_name"].str.contains(name, case=False, na=False)


Computing travel times for each station & line...
  103 St (1): avg 26 min, samples=99
  103 St (A): avg 22 min, samples=1
  103 St (C): avg 24 min, samples=58
  104 St (A): avg 33 min, samples=39
  110 St-Malcolm X Plaza (2): avg 20 min, samples=82
  110 St-Malcolm X Plaza (3): avg 20 min, samples=69
  111 St (A): avg 34 min, samples=39
  116 St (2): avg 21 min, samples=82
  116 St (3): avg 21 min, samples=69
  116 St (A): avg 57 min, samples=3
  116 St (C): avg 26 min, samples=57
  116 St-Columbia University (1): avg 28 min, samples=99
  125 St (1): avg 31 min, samples=98
  125 St (2): avg 23 min, samples=80
  125 St (3): avg 23 min, samples=69
  125 St (A): avg 21 min, samples=71
  125 St (C): avg 28 min, samples=57
  135 St (2): avg 24 min, samples=80
  135 St (3): avg 24 min, samples=67
  [INFO] No valid durations for 135 St on A
  135 St (C): avg 30 min, samples=57
  137 St-City College (1): avg 32 min, samples=97
  14 St (1): avg 7 min, samples=105
  14 St (2): avg 5 min, sample