In [1]:
# Importing all requisite libraries, settings some overall parameters and formatting.
%reset -f
from sqlalchemy import create_engine
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.dates as mdates
import seaborn as sns
import plotly
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, Polygon
#from pandas_profiling import ProfileReport
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
sns.set_style('darkgrid');

In [2]:
def removeSpecialCharactersFromStationName(df):
    # Remove special characters from station names
    df.station = df.station.str.replace("/","_")
    df.station = df.station.str.replace("-","_")
    df.station = df.station.str.replace(" ","_")
    df.station = df.station.str.lower()
    return df

In [3]:
def removeSpecialCharactersFromStationTimestamps(df):
    # Remove special characters from station dates
    df['date'] = df['date'].str.replace('/','_')
    df['time'] = df['time'].str.replace(':','_')
    df['desc'] = df['desc'].str.replace(' ', '_')
    return df

In [4]:
def createPassengerCountColumnsByTime(df):
    df['date'] = df.date_time.dt.date
    df['day_of_week'] = df.date_time.dt.dayofweek
    df['month'] = df.date_time.dt.month
    df['week'] = df.date_time.dt.isocalendar().week
    df['hour'] = df.date_time.dt.hour
    return df

In [5]:
def createStationCountsByTime(df, input_col, col_name='entry'):
#     df[col_name+'_year'] = df[input_col].dt.year
#     df[col_name+'_month'] = df[input_col].dt.month
    df[col_name+'_day'] = df[input_col].dt.day
    df[col_name+'_hour'] = df[input_col].dt.hour
#     df[col_name+'_weekday'] = df[input_col].dt.weekday
    df[col_name+'_weekday'] = df[input_col].dt.day_name()
#     df[col_name+'_date'] = df[input_col].dt.date
    df[col_name+'_year_month'] = df[input_col].dt.to_period('M')
    print(f'"{input_col}" splitted into multiple columns.\n')
    return df

In [6]:
def computeTrafficRidershipCounts(df):
    # Creating the Net_Entries, Net_Exits, and Net_Traffic columns
    df['net_entries'] = df.groupby(['control_area', 'unit', 'subunit_channel_pos', 'station'])['entries'].transform(lambda x: x.diff())
    df['net_exits'] = df.groupby(['control_area', 'unit', 'subunit_channel_pos', 'station'])['exits'].transform(lambda x: x.diff())
    df['net_traffic'] = df.net_entries + df.net_exits

    # Elimating turnstiles that count in reverse by casting all values as absolutes.
    df['net_entries'] = abs(df.net_entries)
    df['net_exits'] = abs(df.net_exits)
    df['net_traffic'] = abs(df.net_traffic)

    return df

In [7]:
def removeOutliers(df):
    # Elimate outliers in the data by reducing to the 99th percentile. 
    q = np.nanquantile(df["net_entries"], .99)
    df = df[df["net_entries"] < q]

    q2 = np.nanquantile(df["net_exits"], .99)
    df = df[df["net_exits"] < q2]

    q3 = np.nanquantile(df['net_traffic'], .99)
    df=df[df['net_traffic'] < q3]
    
    return df

In [8]:
def removeSpecialCharactersFromStopName(df):
    df.stop_name = df.stop_name.str.replace(" - ","_")
    df.stop_name = df.stop_name.str.replace(" ","_")
    df.stop_name = df.stop_name.str.replace("(","")
    df.stop_name = df.stop_name.str.replace(")","")
    df.stop_name = df.stop_name.str.replace("/","_")
    df.stop_name = df.stop_name.str.replace(".","")
    df.stop_name = df.stop_name.str.replace("-","_")
    df.stop_name = df.stop_name.str.lower()
    return df

In [9]:
def matchStationNames(df,df_gtfs):
    mat1 = []
    mat2 = []
    p= []
    list1 = df.station.tolist()
    list2 = df_gtfs.stop_name.tolist()
 
    threshold = 50

    for i in list1:
        mat1.append(process.extractOne(i, list2, scorer=fuzz.ratio))
    df['matches'] = mat1

    for j in df['matches']:
        if j[1] >= threshold:
            p.append(j[0])

        mat2.append(','.join(p))
        p= []

    df['matches'] = mat2
    return df,df_gtfs

In [17]:
import rtree
import pygeos
def combineGTFSStopsAndStationData(df):
   
    df['geometry'] = [Point(xy) for xy in zip(np.array(df['gtfs_longitude']), np.array(df['gtfs_latitude']))]
    gpd.options.use_pygeos = True
    
    cdta_map = gpd.read_file("..\\data\\nycdta2020_22b\\nycdta2020.shp")
    cdta_map.to_crs(4326, inplace=True)
    
    cdta_geo_df = cdta_map[['CDTA2020', 'CDTAName','geometry', 'Shape_Leng', 'Shape_Area','BoroName']].set_index('CDTA2020', drop=True)
    
    top_station_geo_df = gpd.GeoDataFrame(df, crs=4326, geometry = df.geometry)
    top_station_geo_df.to_crs(4326, inplace=True)
    
    #df.to_csv('allstation1.csv')
    # Locate each Station Point Geometry within NTA Polygon geometry
    station_all_df = gpd.sjoin(cdta_geo_df,top_station_geo_df, how="left", op="contains")
    station_all_df = station_all_df.reset_index()
    
    print('before ctda cleanup')
    print(station_all_df.head(1))
    print(station_all_df.columns)
    print(station_all_df.shape)
    
    station_all_df = station_all_df[station_all_df['CDTA2020'].str.match('^[a-zA-Z]{2}\d{2}$')]
    print('after ctda cleanup')
    print(station_all_df.head(1))
    print(station_all_df.columns)
    print(station_all_df.shape)
    #Few stations that belong to Manhattan Burough were identified based on the CDTA code
    station_all_df['borough'] = station_all_df.borough.fillna("M")
    
    print(station_all_df.columns)   
    print(station_all_df.head(1))
    cdta_dict = cdta_map[["CDTA2020", "CDTAName"]].set_index("CDTA2020").to_dict()["CDTAName"]
    return station_all_df,cdta_dict   
       

In [11]:
#1 Load first six months of 2022 Ridership data for subway stations using Turnstile datasource.

engine = create_engine("sqlite:///C:\\Users\\panch\\capstone\\notebooks\\mta_data.db")
mta_df = pd.read_sql('SELECT * FROM mta_data;', engine)

#Cleanup data from station names
# Rename mta_df columns to make them easier to work wit
mta_df = mta_df.rename(columns={'C/A': 'control_area', 'UNIT': 'unit', 'SCP': 'subunit_channel_pos', 'STATION':'station', 'LINENAME':'subway_lines', 'DIVISION':'division', 'DATE':'date', 'TIME':'time', 'DESC':'desc', 'ENTRIES':'entries', 'EXITS':'exits'})
mta_df = removeSpecialCharactersFromStationName(mta_df)
mta_df = removeSpecialCharactersFromStationTimestamps(mta_df)

mta_df['subunit_channel_pos'] = mta_df['subunit_channel_pos'].str.replace('-', '_')

#mta_df.station.sort_values().unique()

# Create UniqueId column for grouping by 
mta_df['unique_id'] = mta_df['control_area'] + '_' + mta_df['unit'] + '_' + mta_df['subunit_channel_pos'] + '_' + mta_df['station'] + '_' + mta_df['date'] + '_' + mta_df['time'] + '_' + mta_df['desc']
mta_df['date_time'] = mta_df.date + ' ' + mta_df.time
mta_df.date_time = pd.to_datetime(mta_df['date_time'], format = '%m_%d_%Y %H_%M_%S')
mta_df = mta_df[mta_df.desc != 'RECOVR_AUD']

mta_df = computeTrafficRidershipCounts(mta_df)
mta_df.fillna(0, inplace=True)

# Elimate outliers in the data by reducing to the 99th percentile. 
mta_df = removeOutliers(mta_df)

print("Cleaned up station data.")

#mta_df = createPassengerCountColumnsByTime(mta_df)
mta_df = createStationCountsByTime(mta_df, 'date_time', col_name='Station_Readings')
#mta_df = createStationCountsByTime(mta_df, 'date_time', col_name='Exit')


#hourly_df = mta_df.groupby(['hour'])[['net_traffic']].mean().reset_index()
#print(mta_df.shape)
#print('hourly df')
#hourly_df.head(5)

# net traffic
#stations_df = mta_df.groupby('station')[['net_traffic']].sum().sort_values(by='net_traffic', ascending=False).reset_index().copy()
#stations_df['pct_total'] = stations_df['net_traffic'].apply(lambda x: x / stations_df.net_traffic.sum())
#stations_df['avg_daily'] = stations_df['net_traffic'].apply(lambda x: x/ 365)

#net_entries
#stations_df = mta_df.groupby('station')[['net_entries']].sum().sort_values(by='net_entries', ascending=False).reset_index().copy()
#stations_df['pct_total_net_entries'] = stations_df['net_entries'].apply(lambda x: x / stations_df.net_entries.sum())
#stations_df['avg_daily_net_entries'] = stations_df['net_entries'].apply(lambda x: x/ 365)

#net_exits
#stations_df = mta_df.groupby('station')[['net_exits']].sum().sort_values(by='net_exits', ascending=False).reset_index().copy()
#stations_df['pct_total_net_exits'] = stations_df['net_exits'].apply(lambda x: x / stations_df.net_exits.sum())
#stations_df['avg_daily_net_exits'] = stations_df['net_exits'].apply(lambda x: x/ 365)

print(mta_df.columns)
print(mta_df.shape)
print(mta_df.head(10))

print('Stations With passenger counts and timestamps ,net entries and net exits and net traffic(which is sum of net entry and exit')
#df['day_of_week'] = df.date_time.dt.dayofweek -Entry_weekday  
#df['month'] = df.date_time.dt.month - Entry_year_month
#df['week'] = df.date_time.dt.isocalendar().week instead of week - Entry_day
#df['hour'] = df.date_time.dt.hour -  Entry_hour

Cleaned up station data.
"date_time" splitted into multiple columns.

Index(['control_area', 'unit', 'subunit_channel_pos', 'station',
       'subway_lines', 'division', 'date', 'time', 'desc', 'entries', 'exits',
       'unique_id', 'date_time', 'net_entries', 'net_exits', 'net_traffic',
       'Station_Readings_day', 'Station_Readings_hour',
       'Station_Readings_weekday', 'Station_Readings_year_month'],
      dtype='object')
(5287402, 20)
  control_area  unit subunit_channel_pos station subway_lines division  \
0         A002  R051            02_00_00   59_st      NQR456W      BMT   
1         A002  R051            02_00_00   59_st      NQR456W      BMT   
2         A002  R051            02_00_00   59_st      NQR456W      BMT   
3         A002  R051            02_00_00   59_st      NQR456W      BMT   
4         A002  R051            02_00_00   59_st      NQR456W      BMT   
5         A002  R051            02_00_00   59_st      NQR456W      BMT   
6         A002  R051            0

In [19]:
#2 Load GTFS stop data for mapping Borough and CDTA using Lat and Long geometry to station data.
#cols_to_keep = ["station","entries", "exits","net_entries","net_exits","net_traffic","date", "time","date_time","day_of_week", "month", "week", "hour"]
cols_to_keep = ["station","entries", "exits","net_entries","net_exits","net_traffic","Station_Readings_day", "Station_Readings_hour", "Station_Readings_year_month", "Station_Readings_weekday"]
group_cols = ["net_entries","net_exits","net_traffic"]

station_data_gtfs = pd.read_csv('../data/subway/stationnames.csv')
station_data_gtfs = removeSpecialCharactersFromStopName(station_data_gtfs)

mta_df = mta_df[cols_to_keep]
                
top_stations = mta_df.groupby('station')[group_cols].sum().sort_values(by='net_traffic', ascending=False).reset_index().copy()

top_stations,station_data_gtfs = matchStationNames(top_stations,station_data_gtfs)
#print(station_data_gtfs.stop_name.sort_values().unique())

#Merge station names from GTFS and Turnstile data for mapping CDTA
top_station_df = pd.merge(top_stations, right=station_data_gtfs, left_on='matches', right_on='stop_name', how='left')

#Group by station names
#top_station_df = top_station_df.groupby(['station'])[['net_traffic', 'net_entries','net_exits','pct_total', 'pct_total_net_entries','avg_daily_net_entries','pct_total_net_exits','avg_daily_net_exits','borough', 'gtfs_longitude', 'gtfs_latitude', 'avg_daily','gtfs_stop_id','station_id',]].first().sort_values(by='net_traffic', ascending=False).reset_index()

#Merge NTA Code and Burough code into single DF
all_station_df,cdta_dict = combineGTFSStopsAndStationData(top_station_df)

print('all stations combined..')

print(all_station_df.shape)
print(all_station_df.columns)
all_station_df 


  df.stop_name = df.stop_name.str.replace("(","")
  df.stop_name = df.stop_name.str.replace(")","")
  df.stop_name = df.stop_name.str.replace(".","")
  arr = construct_1d_object_array_from_listlike(values)


before ctda cleanup
  CDTA2020                                        CDTAName  \
0     BK01  BK01 Williamsburg-Greenpoint (CD 1 Equivalent)   

                                            geometry    Shape_Leng  \
0  POLYGON ((-73.92406 40.71411, -73.92404 40.714...  65655.741577   

     Shape_Area  BoroName  index_right      station  net_entries  net_exits  \
0  1.316597e+08  Brooklyn        242.0  flushing_av     766908.0   930000.0   

   ...  division       line    stop_name  borough daytime_routes structure  \
0  ...       IND  Crosstown  flushing_av       Bk              G    Subway   

  gtfs_latitude gtfs_longitude north_direction_label south_direction_label  
0     40.700377     -73.950234                Queens             Church Av  

[1 rows x 26 columns]
Index(['CDTA2020', 'CDTAName', 'geometry', 'Shape_Leng', 'Shape_Area',
       'BoroName', 'index_right', 'station', 'net_entries', 'net_exits',
       'net_traffic', 'matches', 'ogc_fid', 'station_id', 'complex_id',
     

Unnamed: 0,CDTA2020,CDTAName,geometry,Shape_Leng,Shape_Area,BoroName,index_right,station,net_entries,net_exits,...,division,line,stop_name,borough,daytime_routes,structure,gtfs_latitude,gtfs_longitude,north_direction_label,south_direction_label
0,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),"POLYGON ((-73.92406 40.71411, -73.92404 40.714...",65655.741577,1.316597e+08,Brooklyn,242.0,flushing_av,766908.0,930000.0,...,IND,Crosstown,flushing_av,Bk,G,Subway,40.700377,-73.950234,Queens,Church Av
1,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),"POLYGON ((-73.92406 40.71411, -73.92404 40.714...",65655.741577,1.316597e+08,Brooklyn,474.0,hewes_st,176626.0,291832.0,...,BMT,Jamaica,hewes_st,Bk,J M,Elevated,40.706870,-73.953431,Jamaica - Middle Village,Manhattan
2,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),"POLYGON ((-73.92406 40.71411, -73.92404 40.714...",65655.741577,1.316597e+08,Brooklyn,223.0,marcy_av,731003.0,1211417.0,...,BMT,Jamaica,marcy_av,Bk,J M Z,Elevated,40.708359,-73.957757,Jamaica - Middle Village,Manhattan
3,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),"POLYGON ((-73.92406 40.71411, -73.92404 40.714...",65655.741577,1.316597e+08,Brooklyn,212.0,metropolitan_av,866565.0,1222713.0,...,IND,Crosstown,metropolitan_av,Bk,G,Subway,40.712792,-73.951418,Queens,Church Av
4,BK01,BK01 Williamsburg-Greenpoint (CD 1 Equivalent),"POLYGON ((-73.92406 40.71411, -73.92404 40.714...",65655.741577,1.316597e+08,Brooklyn,101.0,bedford_av,1232083.0,2995818.0,...,BMT,Canarsie,bedford_av,Bk,L,Subway,40.717304,-73.956872,Manhattan,Canarsie - Rockaway Parkway
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,SI01,SI01 North Shore (CD 1 Equivalent),"MULTIPOLYGON (((-74.15946 40.64145, -74.15975 ...",153754.773653,3.772990e+08,Staten Island,511.0,tompkinsville,64915.0,5.0,...,SIR,Staten Island,tompkinsville,SI,SIR,At Grade,40.636949,-74.074835,St George,Tottenville
514,SI01,SI01 North Shore (CD 1 Equivalent),"MULTIPOLYGON (((-74.15946 40.64145, -74.15975 ...",153754.773653,3.772990e+08,Staten Island,395.0,st._george,734063.0,73.0,...,SIR,Staten Island,st_george,SI,SIR,Open Cut,40.643748,-74.073643,,Tottenville
515,SI02,SI02 Mid-Island (CD 2 Approximation),"POLYGON ((-74.06398 40.60214, -74.06312 40.601...",145684.168999,6.061494e+08,Staten Island,,,,,...,,,,M,,,,,,
516,SI03,SI03 South Shore (CD 3 Approximation),"POLYGON ((-74.10010 40.55895, -74.10021 40.558...",179446.540623,5.954228e+08,Staten Island,,,,,...,,,,M,,,,,,


In [20]:
#cdta_subway_df = all_station_df.set_index("cdtaCode")[["CDTAName", "borough","BoroName","net_traffic","net_entries", "net_exits"]].drop_duplicates()
all_station_df = all_station_df.rename(columns={'CDTA2020': 'cdtaCode'})
#all_station_df = all_station_df[station_all_df['net_entries'].str.match('^[a-zA-Z]{2}\d{2}$')]

cdta_subway_df = all_station_df.groupby('cdtaCode')[["net_entries","net_exits","net_traffic","CDTAName"]].sum().sort_values(by='net_traffic', ascending=False).reset_index().copy()

#Filter CDTAs that do not have any traffic mapped
cdta_subway_df= cdta_subway_df[cdta_subway_df['net_traffic'] != 0]

#Map Borough to CDTA DF
cdta_subway_df['borough'] = cdta_subway_df["cdtaCode"].apply(lambda x: "EWR" if "EWR" in x else x[:2]).map(
    {
        'EWR': 'EWR',
        'QN': 'Queens',
        'BX': 'Bronx',
        'MN': 'Manhattan',
        'SI': 'Staten Island',
        'BK': 'Brooklyn'
    }
)

cdta_subway_df['CDTA_name'] = cdta_subway_df['cdtaCode'].map(cdta_dict).fillna("")

cdta_subway_df

Unnamed: 0,cdtaCode,net_entries,net_exits,net_traffic,borough,CDTA_name
0,MN05,119600805.0,172187736.0,289159891.0,Manhattan,MN05 Midtown-Flatiron-Union Square (CD 5 Appro...
1,MN01,47039233.0,62793511.0,109282874.0,Manhattan,MN01 Financial District-Tribeca (CD 1 Equivalent)
2,MN04,34119611.0,47398488.0,81125195.0,Manhattan,MN04 Chelsea-Hell's Kitchen (CD 4 Approximation)
3,MN02,32831131.0,46817960.0,78539021.0,Manhattan,MN02 Greenwich Village-SoHo (CD 2 Equivalent)
4,BK02,29064949.0,35325386.0,63846177.0,Brooklyn,BK02 Downtown Brooklyn-Fort Greene (CD 2 Appro...
...,...,...,...,...,...,...
53,BX08,1449988.0,299277.0,1749265.0,Bronx,BX08 Riverdale-Kingsbridge-Marble Hill (CD 8 A...
54,BK55,284565.0,530354.0,814919.0,Brooklyn,BK55 Prospect Park (JIA 55 Approximation)
55,SI01,798978.0,78.0,799056.0,Staten Island,SI01 North Shore (CD 1 Equivalent)
56,QN83,436814.0,346455.0,783269.0,Queens,QN83 JFK International Airport (JIA 83 Approxi...


In [22]:
cdta_subway_df

Unnamed: 0,cdtaCode,net_entries,net_exits,net_traffic,borough,CDTA_name
0,MN05,119600805.0,172187736.0,289159891.0,Manhattan,MN05 Midtown-Flatiron-Union Square (CD 5 Appro...
1,MN01,47039233.0,62793511.0,109282874.0,Manhattan,MN01 Financial District-Tribeca (CD 1 Equivalent)
2,MN04,34119611.0,47398488.0,81125195.0,Manhattan,MN04 Chelsea-Hell's Kitchen (CD 4 Approximation)
3,MN02,32831131.0,46817960.0,78539021.0,Manhattan,MN02 Greenwich Village-SoHo (CD 2 Equivalent)
4,BK02,29064949.0,35325386.0,63846177.0,Brooklyn,BK02 Downtown Brooklyn-Fort Greene (CD 2 Appro...
...,...,...,...,...,...,...
53,BX08,1449988.0,299277.0,1749265.0,Bronx,BX08 Riverdale-Kingsbridge-Marble Hill (CD 8 A...
54,BK55,284565.0,530354.0,814919.0,Brooklyn,BK55 Prospect Park (JIA 55 Approximation)
55,SI01,798978.0,78.0,799056.0,Staten Island,SI01 North Shore (CD 1 Equivalent)
56,QN83,436814.0,346455.0,783269.0,Queens,QN83 JFK International Airport (JIA 83 Approxi...
