In [2]:
import requests
import gtfs_realtime_NYCT_pb2
import gtfs_realtime_pb2
import polars as pl
from polars import col
import re
from PIL import Image
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objects as go
import pandas as pd
from PIL import Image
import pyarrow
import json

In [3]:
api_endpoints = {
    "ACE": r"https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-ace",
    "BDFM": r"https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-bdfm",
    "G": r"https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-g",
    "JZ": r"https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-jz",
    "NQRW": r"https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-nqrw",
    "L": r"https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-l",
    "1234567": r"https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs",
    "SI": r"https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-si",
}

In [4]:
response = requests.get(
    "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-bdfm"
)

In [5]:
feed = gtfs_realtime_pb2.FeedMessage()

In [6]:
feed.ParseFromString(response.content)

179295

In [7]:
stops = pl.read_csv(
    "stops.txt",
    separator=",",
    has_header=True,
    schema_overrides={"parent_station": pl.String},
)

shapes = pl.read_csv(
    "shapes.txt",
    separator=",",
    has_header=True,
)

colors = pl.read_csv("MTA_Colors_20240623.csv", separator=",", has_header=True)

In [8]:
colors = colors.filter(col("Operator") == "New York City Subway")
colors = colors.with_columns(
    col("Service").str.split(",")
)  # Split the comma-delimited values into lists
colors = colors.explode("Service")  # Explode the lists into separate rows

In [10]:
shape_unpack_re = re.compile(r"^(\w{1}).*\.+(\w+)$")


def shape_unpack(shape):
    m = re.match(shape_unpack_re, shape)
    return m.group(1), m.group(2)

In [11]:
shapes_clean = shapes.with_columns(
    [
        shapes["shape_id"]
        .map_elements(lambda x: shape_unpack(x)[0], return_dtype=str)
        .alias("Line"),
        shapes["shape_id"]
        .map_elements(lambda x: shape_unpack(x)[1], return_dtype=str)
        .alias("Line_Variation"),
    ]
)

In [12]:
longest_shapes = pl.sql(
    """
    WITH cte1 as (
        select Line, max(shape_pt_sequence) as max_points
        from shapes_clean
        group by Line
    )
    select Line, max(shape_id), max_points from shapes_clean s
    join cte1 c on s.shape_pt_sequence=c.max_points and s.Line=c.Line
    group by Line, max_points
    """
).collect()

In [14]:
longest_shapes_final = (
    shapes_clean.join(longest_shapes, on="shape_id")
    .select(["Line", "shape_pt_sequence", "shape_pt_lon", "shape_pt_lat"])
    .join(colors.select(["Service", "Hex color"]), left_on="Line", right_on="Service")
)

In [16]:
stop_removal_re = r".*[NS]$"

stops = stops.filter(~stops["stop_id"].str.contains(stop_removal_re))

In [17]:
stop_unpack_re = re.compile(r"^(\w{1})(\d{2})")


def stop_unpack(stop):
    m = re.match(stop_unpack_re, stop)
    return m.group(1), m.group(2)

In [18]:
stops_clean = stops[["stop_id", "stop_name", "stop_lat", "stop_lon"]].with_columns(
    [
        stops["stop_id"]
        .map_elements(lambda x: stop_unpack(x)[0], return_dtype=str)
        .alias("Line"),
        stops["stop_id"]
        .map_elements(lambda x: stop_unpack(x)[1], return_dtype=str)
        .alias("Order"),
    ]
)
# stops_clean = stops_clean.join(line_points, left_on="Line", right_on="Line", how="left")
stops_clean = stops_clean.join(colors, left_on="Line", right_on="Service").select(
    ["stop_name", "stop_lat", "stop_lon", "Line", "Order", "Hex color"]
)

In [19]:
stops_clean

stop_name,stop_lat,stop_lon,Line,Order,Hex color
str,f64,f64,str,str,str
"""Van Cortlandt Park-242 St""",40.889248,-73.898583,"""1""","""01""","""#EE352E"""
"""238 St""",40.884667,-73.90087,"""1""","""03""","""#EE352E"""
"""231 St""",40.878856,-73.904834,"""1""","""04""","""#EE352E"""
"""Marble Hill-225 St""",40.874561,-73.909831,"""1""","""06""","""#EE352E"""
"""215 St""",40.869444,-73.915279,"""1""","""07""","""#EE352E"""
…,…,…,…,…,…
"""Grasmere""",40.603117,-74.084087,"""S""","""27""","""#808183"""
"""Clifton""",40.621319,-74.071402,"""S""","""28""","""#808183"""
"""Stapleton""",40.627915,-74.075162,"""S""","""29""","""#808183"""
"""Tompkinsville""",40.636949,-74.074835,"""S""","""30""","""#808183"""


In [20]:
shapes_final = longest_shapes_final.join(
    stops_clean.select(["stop_lon", "stop_lat", "stop_name"]),
    left_on=("shape_pt_lon", "shape_pt_lat"),
    right_on=("stop_lon", "stop_lat"),
    how="left",
)

  shapes_final = longest_shapes_final.join(


In [21]:
shapes_final

Line,shape_pt_sequence,shape_pt_lon,shape_pt_lat,Hex color,stop_name
str,i64,f64,f64,str,str
"""1""",0,-73.898583,40.889248,"""#EE352E""","""Van Cortlandt Park-242 St"""
"""1""",1,-73.899616,40.887195,"""#EE352E""",
"""1""",2,-73.900041,40.886309,"""#EE352E""",
"""1""",3,-73.90073,40.884928,"""#EE352E""",
"""1""",4,-73.90087,40.884667,"""#EE352E""","""238 St"""
…,…,…,…,…,…
"""S""",685,-74.250493,40.513696,"""#808183""",
"""S""",686,-74.250706,40.513579,"""#808183""",
"""S""",687,-74.250917,40.513458,"""#808183""",
"""S""",688,-74.251124,40.513334,"""#808183""",


In [22]:
import math


def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in kilometers
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(math.radians(lat1))
        * math.cos(math.radians(lat2))
        * math.sin(dlon / 2) ** 2
    )
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance

In [23]:
# Define a function to calculate distance using lead coordinates
def calculate_distance_within_line(df):

    df = df.sort("shape_pt_sequence")

    # Create lead columns for lat and lon
    df = df.with_columns(
        [
            df["shape_pt_lat"].shift(-1).alias("lead_lat"),
            df["shape_pt_lon"].shift(-1).alias("lead_lon"),
        ]
    )

    # Apply the Haversine function to each row and add the result as a new column
    return df.with_columns(
        [
            pl.concat_list(["shape_pt_lat", "shape_pt_lon", "lead_lat", "lead_lon"])
            .map_elements(
                lambda row: (
                    haversine(row[0], row[1], row[2], row[3])
                    if row[2] is not None and row[3] is not None
                    else None
                ),
                return_dtype=pl.Float64,
            )
            .alias("distance")
        ]
    )


# Group by 'Line', sort each group by 'line_order', and apply the distance calculation
result = shapes_final.group_by("Line", maintain_order=True).map_groups(
    calculate_distance_within_line
)

result.write_excel("testing.xlsx")

<xlsxwriter.workbook.Workbook at 0x1427e6810>

In [24]:
result

Line,shape_pt_sequence,shape_pt_lon,shape_pt_lat,Hex color,stop_name,lead_lat,lead_lon,distance
str,i64,f64,f64,str,str,f64,f64,f64
"""1""",0,-73.898583,40.889248,"""#EE352E""","""Van Cortlandt Park-242 St""",40.887195,-73.899616,0.244241
"""1""",1,-73.899616,40.887195,"""#EE352E""",,40.886309,-73.900041,0.104797
"""1""",2,-73.900041,40.886309,"""#EE352E""",,40.884928,-73.90073,0.164121
"""1""",3,-73.90073,40.884928,"""#EE352E""",,40.884667,-73.90087,0.031317
"""1""",4,-73.90087,40.884667,"""#EE352E""","""238 St""",40.884461,-73.900991,0.025063
…,…,…,…,…,…,…,…,…
"""S""",685,-74.250493,40.513696,"""#808183""",,40.513579,-74.250706,0.022214
"""S""",686,-74.250706,40.513579,"""#808183""",,40.513458,-74.250917,0.022343
"""S""",687,-74.250917,40.513458,"""#808183""",,40.513334,-74.251124,0.022278
"""S""",688,-74.251124,40.513334,"""#808183""",,40.512764,-74.251961,0.094993


In [None]:
stops_1 = stops.filter(
    col("stop_id").map_elements(
        lambda x: True if re.search(r"^1\d{2}$", x) else False, return_dtype=pl.Boolean
    )
)
stops_1_ = stops_1.sort("stop_id").to_dicts()
stops_1a = (
    stops_1.sort("stop_id")
    .select(["stop_name", "stop_lat", "stop_lon"])
    .to_dict(as_series=False)
)

In [None]:
stops_Q = stops.filter(
    col("stop_id").map_elements(
        lambda x: True if re.search(r"^Q\d{2}$", x) else False, return_dtype=pl.Boolean
    )
)
stops_Q_ = stops_Q.sort("stop_id").to_dicts()
stops_Qa = (
    stops_Q.sort("stop_id")
    .select(["stop_name", "stop_lat", "stop_lon"])
    .to_dict(as_series=False)
)

In [None]:
stops_JJ = shapes.filter(
    col("shape_id").map_elements(
        lambda x: True if re.search(r"^1\.\.N03R", x) else False,
        return_dtype=pl.Boolean,
    )
)
stops_JJ_ = stops_JJ.sort("shape_pt_sequence", descending=False).to_dicts()
stops_JJa = (
    stops_JJ.sort("shape_pt_sequence", descending=False)
    .select(["shape_pt_sequence", "shape_pt_lat", "shape_pt_lon"])
    .to_dict(as_series=False)
)

In [None]:
stops_JJa.keys()

dict_keys(['shape_pt_sequence', 'shape_pt_lat', 'shape_pt_lon'])

In [None]:
def add_line_shape(figure: go.Figure, shape_dict: dict) -> None:
    figure.add_trace(
        go.Scattermapbox(
            mode="lines",
            name=shape_dict["Line"][0],
            lon=shape_dict["shape_pt_lon"],
            lat=shape_dict["shape_pt_lat"],
            text=shape_dict["shape_pt_sequence"],
            marker={"size": 10},
            line={"width": 4, "color": shape_dict["Hex color"][0]},
            hoverinfo="none",
        )
    )


def add_stops(figure: go.Figure, stops_dict: dict) -> None:
    figure.add_trace(
        go.Scattermapbox(
            mode="markers",
            name=stops_dict["Line"][0],
            lon=stops_dict["stop_lon"],
            lat=stops_dict["stop_lat"],
            text=stops_dict["stop_name"],
            marker={"size": 8, "color": stops_dict["Hex color"][0]},
            # hoverinfo=stops_dict["stop_name"][0],
            showlegend=False,
        )
    )

In [None]:
# Assuming stops_1a is your DataFrame
# For this example, let's recreate a similar DataFrame as your stops_1a

# Create the figure
fig = go.Figure()

# Add the scatter mapbox trace for the lines
for line in stops_clean["Line"].unique():
    d = stops_clean.filter("Line" == line).to_dict()
    fig.add_trace(
        go.Scattermapbox(
            mode="markers",
            name=d["Line"][0],
            lon=d["stop_lon"],
            lat=d["stop_lat"],
            text=d["stop_name"],
            marker={"size": 8},
            line={"width": 4, "color": "#EE352E"},
        )
    )

fig.add_trace(
    go.Scattermapbox(
        mode="lines",
        lon=stops_JJa["shape_pt_lon"],
        lat=stops_JJa["shape_pt_lat"],
        text=stops_JJa["shape_pt_sequence"],
        marker={"size": 10},
        line={"width": 4, "color": "#00933C"},
    )
)

# fig.add_trace(
#     go.Scattermapbox(
#         mode="markers",
#         lon=[-73.9851],  # Example longitude
#         lat=[40.7580],  # Example latitude
#         text=[
#             "<b>Times Square</b><br>"
#             "Location: New York<br>"
#             "Description: A major commercial intersection, "
#             "tourist destination, entertainment center, and "
#             "neighborhood in the Midtown Manhattan section of New York City."
#         ],  # Popup text with HTML
#     )
# )

# Update the layout
fig.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=10,
    mapbox_center={"lat": 40.75, "lon": -73.95},
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)

# fig.add_layout_image(
#     dict(
#         source="https://raw.githubusercontent.com/cldougl/plot_images/add_r_img/vox.png",
#         xref="paper",
#         yref="paper",
#         x=1,
#         y=1.05,
#         sizex=0.2,
#         sizey=0.2,
#         xanchor="right",
#         yanchor="bottom",
#     )
# )

# fig.add_layout_image(dict(source="Q_train.png", x=40.75, y=-73.95))

# Initialize the Dash app
app = dash.Dash(__name__)

app.layout = html.Div([dcc.Graph(id="live-map", figure=fig)])

if __name__ == "__main__":
    app.run_server(debug=True)

IndexError: index 0 is out of bounds for sequence of length 0

In [None]:
longest_shapes_final.filter(pl.col("Line") == "5")

Line,shape_pt_sequence,shape_pt_lon,shape_pt_lat,Hex color
str,i64,f64,f64,str
"""5""",0,-73.830834,40.8883,"""#00933C"""
"""5""",1,-73.831315,40.887513,"""#00933C"""
"""5""",2,-73.83155,40.887136,"""#00933C"""
"""5""",3,-73.832289,40.88611,"""#00933C"""
"""5""",4,-73.832712,40.885577,"""#00933C"""
…,…,…,…,…
"""5""",613,-73.908946,40.662549,"""#00933C"""
"""5""",614,-73.902447,40.663515,"""#00933C"""
"""5""",615,-73.894895,40.664635,"""#00933C"""
"""5""",616,-73.889395,40.665449,"""#00933C"""


In [None]:
new_fig = go.Figure()

for line in sorted(longest_shapes_final["Line"].unique()):
    line_shape = (
        longest_shapes_final.filter(pl.col("Line") == line)
        .sort("shape_pt_sequence")
        .to_dict(as_series=False)
    )
    stops = stops_clean.filter(pl.col("Line") == line).to_dict(as_series=False)

    add_line_shape(new_fig, line_shape)
    try:
        add_stops(new_fig, stops)
    except IndexError:
        continue

new_fig.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=10,
    mapbox_center={"lat": 40.75, "lon": -73.95},
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
new_fig.write_html("subway_map_plot.html")

with open("map_plot.json", "w") as f:
    json.dump(new_fig.to_plotly_json(), f)

app = dash.Dash(__name__)

app.layout = html.Div([dcc.Graph(id="live-map", figure=new_fig)])

if __name__ == "__main__":
    app.run_server(debug=True)

In [None]:
shapes_clean.filter(pl.col("Line") == "5")["shape_pt_lat"].value_counts().sort("count")

shape_pt_lat,count
f64,u32
40.665449,5
40.667449,5
40.668271,5
40.667552,5
40.667735,5
…,…
40.73069,49
40.841894,52
40.816109,96
40.815931,96
