In [19]:
import geopandas as gpd
import pandas as pd
from typing import Union

In [61]:
def load_stations():
    """Download NYC subway station locations and return them as a GeoDataFrame.

    Reads the station CSV export from NYC Open Data, parses the WKT text in
    ``the_geom`` into a ``geometry`` column and derives two join keys:

    * ``station_name`` -- ``NAME`` with the ordinal suffix removed from
      numbers (e.g. ``"50th St"`` becomes ``"50 St"``).
    * ``lines`` -- ``LINE`` with ``"-<x> Express"`` entries removed and the
      remaining codes sorted, so the key has the same canonical form that
      ``load_ridership`` builds (sorted codes joined by ``"-"``).

    Returns:
        geopandas.GeoDataFrame: the CSV columns minus ``the_geom``, plus
        ``geometry``, ``station_name`` and ``lines``, tagged as EPSG:4326.
    """
    def _sorted_lines(value):
        # Missing LINE values are passed through unchanged.
        if not isinstance(value, str):
            return value
        return "-".join(sorted(value.split("-")))

    data = pd.read_csv("https://data.cityofnewyork.us/api/views/kk4q-3rt2/rows.csv?accessType=DOWNLOAD")
    data = data.assign(
        geometry=lambda x: gpd.GeoSeries.from_wkt(x["the_geom"]),
        station_name=lambda x: x["NAME"].str.replace(r'(\d+)(st|nd|rd|th)', r"\1", regex=True),
        # Sort after dropping the express entries so the join key does not
        # depend on the order in which the source lists the lines.
        lines=lambda x: x["LINE"].str.replace(r"-. Express", "", regex=True).map(_sorted_lines),
    )
    data = data.drop(columns=["the_geom"])
    # The points are longitude/latitude degrees (e.g. POINT (-73.99 40.73)),
    # so declare WGS84 explicitly; a frame without a CRS cannot be reprojected.
    # NOTE(review): confirm EPSG:4326 against the dataset metadata.
    return gpd.GeoDataFrame(data, geometry="geometry", crs="EPSG:4326")

In [71]:
def load_ridership():
    """Loads average weekday ridership by station.

    Reads the "Avg Weekday" sheet of the MTA workbook (skipping the first
    row), drops rows without a ``Boro`` value, and keeps the station label
    and the ``2019`` column.

    Returns:
        pandas.DataFrame with columns:

        * ``station_name`` -- the label from the sheet, including the
          parenthesised line list, e.g. ``"Canal St (A,C,E)"``.
        * ``ridership`` -- the value of the sheet's ``2019`` column.
        * ``lines`` -- the codes from the first parenthesised group, stripped,
          sorted and joined with ``"-"`` (``"A-C-E"``). Missing when the label
          has no parenthesised group.
    """
    def _canonical_lines(raw):
        # ``extract`` yields NaN for labels without "(...)": keep them missing
        # instead of failing inside ``sorted``.
        if not isinstance(raw, str):
            return raw
        codes = (code.strip() for code in raw.split(","))
        return "-".join(sorted(code for code in codes if code))

    data = pd.read_excel("https://new.mta.info/document/91476", sheet_name="Avg Weekday", skiprows=1)
    return (
        data
        .dropna(subset=["Boro"])
        .rename(columns={"Station (alphabetical by borough)": "station_name", 2019: "ridership"})
        .filter(["station_name", "ridership"])
        .assign(
            lines=lambda x: x["station_name"]
            .str.extract(r"\((.+?)\)", expand=False)
            .map(_canonical_lines)
        )
    )

In [106]:
from thefuzz import process
def fuzzy_match(val: str, options: pd.Series, threshold: int = 90) -> Union[str, None]:
    """Look up the closest entry to ``val`` among ``options``.

    Delegates to ``process.extractOne`` with ``score_cutoff=threshold``.

    Returns:
        The first element of the tuple returned by ``extractOne`` (the matched
        option), or ``None`` when ``extractOne`` returns ``None``.
    """
    best = process.extractOne(val, options, score_cutoff=threshold)
    return None if best is None else best[0]


def fuzzy_match_series_station_names(
    original: pd.Series,
    canonical: pd.Series,
    threshold: int = 90,
) -> pd.Series:
    """
    Returns fuzzy match of original from canonical.

    Each value of ``original`` is passed to ``fuzzy_match`` together with the
    whole ``canonical`` series.

    Args:
        original: names to be matched.
        canonical: candidate names to match against.
        threshold: score cutoff forwarded to ``fuzzy_match``. Defaults to 90,
            the value that was previously hard-coded here.

    Returns:
        A series aligned with ``original`` holding the matched canonical name,
        or ``None`` where ``fuzzy_match`` found nothing.
    """
    return original.apply(lambda name: fuzzy_match(name, canonical, threshold))

In [107]:
# Fetch and parse the station locations (load_stations reads a remote CSV).
stations = load_stations()

In [108]:
# Fetch the average-weekday ridership table (load_ridership reads a remote workbook).
ridership = load_ridership()

In [109]:
# Preview the first rows, including the derived station_name and lines columns.
stations.head()

Unnamed: 0,URL,OBJECTID,NAME,LINE,NOTES,geometry,station_name,lines
0,http://web.mta.info/nyct/service/,1,Astor Pl,4-6-6 Express,"4 nights, 6-all times, 6 Express-weekdays AM s...",POINT (-73.99107 40.73005),Astor Pl,4-6
1,http://web.mta.info/nyct/service/,2,Canal St,4-6-6 Express,"4 nights, 6-all times, 6 Express-weekdays AM s...",POINT (-74.00019 40.71880),Canal St,4-6
2,http://web.mta.info/nyct/service/,3,50th St,1-2,"1-all times, 2-nights",POINT (-73.98385 40.76173),50 St,1-2
3,http://web.mta.info/nyct/service/,4,Bergen St,2-3-4,"4-nights, 3-all other times, 2-all times",POINT (-73.97500 40.68086),Bergen St,2-3-4
4,http://web.mta.info/nyct/service/,5,Pennsylvania Ave,3-4,"4-nights, 3-all other times",POINT (-73.89489 40.66471),Pennsylvania Ave,3-4


In [111]:
# Attach to every ridership row the station name (from `stations`) that it
# fuzzy-matches, so both tables can be joined on a common spelling.
canonical_ridership = ridership.assign(
    # Drop the parenthesised line list and normalise "St." to "St". Removing
    # "(...)" leaves a trailing space ("Canal St (1)" -> "Canal St "), hence
    # the final strip().
    clean_name=lambda x: (
        x["station_name"]
        .str.replace(r"\(.*\)", "", regex=True)
        .str.replace("St.", "St", regex=False)
        .str.strip()
    ),
    # None where no station name passes the fuzzy-match cutoff.
    station_match=lambda x: fuzzy_match_series_station_names(x["clean_name"], stations["station_name"]),
    # Disabled experiment (presumably matching on name and lines together):
    # station_full_match=lambda x: fuzzy_match_series_station_names(x["station_match"] + x["lines"], stations["station_name"] + stations["lines"])
)

In [112]:
# Spot-check: rows whose matched name contains "Canal"; na=False treats rows
# with a missing station_match as non-matching instead of propagating NaN.
canonical_ridership.query("station_match.str.contains('Canal', na=False)")

Unnamed: 0,station_name,ridership,lines,clean_name,station_match
312,Canal St (1),6830.248,1,Canal St,Canal St
313,"Canal St (A,C,E)",21205.8268,A-C-E,Canal St,Canal St
314,"Canal St (J,N,Q,R,W,Z,6)",47628.7835,6-J-N-Q-R-W-Z,Canal St,Canal St


In [113]:
# Keep only the join keys and the ridership value.
simple_ridership = canonical_ridership.filter(items=["station_match", "lines", "ridership"])

# Left join: every station row is kept, and ridership is filled in only where
# both the matched name and the line key agree.
merger = stations.merge(
    right=simple_ridership,
    left_on=["station_name", "lines"],
    right_on=["station_match", "lines"],
    how="left",
)
# Disabled follow-up from the original cell (a second left merge on the name
# only, chained onto the call above):
# .merge(
#     simple_ridership,
#     how="left",
#     left_on="station_name",
#     right_on="station_match",
#     suffixes=["", "_extra"]
# )

In [117]:
# Show the station rows for which the left merge brought in a station_match.
merger.query("station_match.notna()")

Unnamed: 0,URL,OBJECTID,NAME,LINE,NOTES,geometry,station_name,lines,station_match,ridership
5,http://web.mta.info/nyct/service/,6,238th St,1,"1-all times, exit only northbound",POINT (-73.90087 40.88467),238 St,1,238 St,3998.2717
11,http://web.mta.info/nyct/service/,12,Van Siclen Ave,J-Z,"Z-rush hours AM westbound, PM eastbound, J-all...",POINT (-73.89166 40.67803),Van Siclen Ave,J-Z,Van Siclen Ave,2614.3898
12,http://web.mta.info/nyct/service/,13,Norwood Ave,J-Z,"Z-rush hours AM westbound, PM eastbound, J-all...",POINT (-73.87963 40.68152),Norwood Ave,J-Z,Norwood Ave,3155.7087
15,http://web.mta.info/nyct/service/,16,Beach 105th St,A-S,"A-rush hours AM northbound, PM southbound, S B...",POINT (-73.82758 40.58327),Beach 105 St,A-S,Beach 105 St,243.3662
16,http://web.mta.info/nyct/service/,17,Beach 90th St,A-S,"A-rush hours AM northbound, PM southbound, S B...",POINT (-73.81365 40.58809),Beach 90 St,A-S,Beach 90 St,945.5118
...,...,...,...,...,...,...,...,...,...,...
469,http://web.mta.info/nyct/service/,469,Coney Island - Stillwell Av,D-F-N-Q,"D,F,N,Q-all times",POINT (-73.98124 40.57728),Coney Island - Stillwell Av,D-F-N-Q,Coney Island - Stillwell Av,12818.9370
470,http://web.mta.info/nyct/service/,470,34th St - Hudson Yards,7-7 Express,"7-all times, 7 Express-rush hours AM westbound...",POINT (-74.00220 40.75545),34 St - Hudson Yards,7,34 St - Hudson Yards,18875.3583
471,http://web.mta.info/nyct/service/,641,72nd St,Q,Q-all times,POINT (-73.95836 40.76880),72 St,Q,72 St,31584.6339
472,http://web.mta.info/nyct/service/,642,86th St,Q,Q-all times,POINT (-73.95177 40.77786),86 St,Q,86 St,26306.6850
