In [None]:
# Importing all requisite libraries, setting some overall parameters and formatting.
%reset -f
from sqlalchemy import create_engine
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.dates as mdates
import seaborn as sns
import plotly
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, Polygon
#from pandas_profiling import ProfileReport
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
sns.set_style('darkgrid');

In [None]:
def removeSpecialCharactersFromStationName(df):
    """Normalize the ``station`` column to lowercase, underscore-separated names.

    Every "/", "-" and " " in a station name is replaced by "_" and the
    result is lowercased. The column is overwritten on the frame that was
    passed in, and that same frame is returned.
    """
    cleaned = df["station"]
    for separator in ("/", "-", " "):
        cleaned = cleaned.str.replace(separator, "_")
    df["station"] = cleaned.str.lower()
    return df

In [None]:
def removeSpecialCharactersFromStationTimestamps(df):
    """Replace separator characters in the ``date``, ``time`` and ``desc`` columns.

    "/" in ``date``, ":" in ``time`` and " " in ``desc`` each become "_".
    The columns are overwritten on the passed-in frame, which is returned.
    """
    # (column, character that gets swapped for an underscore)
    substitutions = (("date", "/"), ("time", ":"), ("desc", " "))
    for column, char in substitutions:
        df[column] = df[column].str.replace(char, "_")
    return df

In [None]:
def createStationCountsByTime(df, input_col, col_name='entry'):
    """Derive calendar columns from the datetime column ``input_col``.

    Adds four columns to ``df``, each named ``<col_name>_<suffix>``:
    ``day`` (day of month), ``hour``, ``weekday`` (day name) and
    ``year_month`` (monthly period). Prints a short confirmation and
    returns the same, mutated frame.
    """
    timestamps = df[input_col].dt
    derived = {
        'day': timestamps.day,
        'hour': timestamps.hour,
        'weekday': timestamps.day_name(),
        'year_month': timestamps.to_period('M'),
    }
    for suffix, values in derived.items():
        df[f'{col_name}_{suffix}'] = values

    print(f'"{input_col}" splitted into multiple columns.\n')
    return df

In [None]:
def computeTrafficRidershipCounts(df):
    """Add per-reading ``net_entries``, ``net_exits`` and ``net_traffic`` columns.

    ``entries`` and ``exits`` are treated as cumulative counters. For each
    turnstile (grouped by control_area, unit, subunit_channel_pos, station)
    the difference from the previous row is taken, so rows must already be in
    chronological order within each turnstile. The first row of every
    turnstile has no predecessor and yields NaN.

    Counters that run in reverse are handled by taking the absolute value of
    each delta. ``net_traffic`` is the sum of the two absolute deltas; summing
    the signed deltas first would let a reversed entry counter cancel out a
    forward exit counter and under-report traffic.

    The columns are written onto ``df``, which is also returned.
    """
    turnstile_keys = ['control_area', 'unit', 'subunit_channel_pos', 'station']
    by_turnstile = df.groupby(turnstile_keys)

    # Compute both deltas before touching df so the groupby sees a stable frame.
    entry_delta = by_turnstile['entries'].diff().abs()
    exit_delta = by_turnstile['exits'].diff().abs()

    df['net_entries'] = entry_delta
    df['net_exits'] = exit_delta
    df['net_traffic'] = entry_delta + exit_delta

    return df

In [None]:
def removeOutliers(df):
    """Drop rows at or above the 99th percentile of each net count column.

    The filter is applied sequentially to ``net_entries``, ``net_exits`` and
    ``net_traffic``: each percentile is computed (ignoring NaN) on the rows
    that survived the previous step. Rows whose value is NaN fail the ``<``
    comparison and are dropped as well. Returns the filtered frame; the
    input frame is not modified.
    """
    for column in ("net_entries", "net_exits", "net_traffic"):
        cutoff = np.nanquantile(df[column], .99)
        keep = df[column] < cutoff
        df = df[keep]

    return df

In [None]:
def removeSpecialCharactersFromStopName(df):
    """Normalize the ``stop_name`` column to lowercase, underscore-separated names.

    Replacements are applied in order: " - " -> "_", " " -> "_", "(" and ")"
    removed, "/" -> "_", "." removed, "-" -> "_", then the name is lowercased.
    The column is overwritten on the passed-in frame, which is returned.

    Every replacement is done with ``regex=False`` so that "(", ")" and "."
    are always treated as literal characters, regardless of which default the
    installed pandas version uses for ``Series.str.replace``.
    """
    # Order matters: " - " has to be collapsed before lone spaces and hyphens,
    # otherwise it would turn into three underscores.
    replacements = (
        (" - ", "_"),
        (" ", "_"),
        ("(", ""),
        (")", ""),
        ("/", "_"),
        (".", ""),
        ("-", "_"),
    )
    names = df["stop_name"]
    for old, new in replacements:
        names = names.str.replace(old, new, regex=False)
    df["stop_name"] = names.str.lower()
    return df

In [None]:
def matchStationNames(df, df_gtfs, threshold=50):
    """Fuzzy-match each turnstile station name to its closest GTFS stop name.

    For every value in ``df.station`` the best candidate from
    ``df_gtfs.stop_name`` is picked with ``process.extractOne`` using the
    ``fuzz.ratio`` scorer. The candidate is stored in ``df['matches']`` when
    its score is at least ``threshold``; otherwise an empty string is stored.

    Parameters
    ----------
    df : DataFrame with a ``station`` column; gains a ``matches`` column.
    df_gtfs : DataFrame with a ``stop_name`` column; returned unchanged.
    threshold : minimum score for a match to be accepted. Defaults to 50,
        the value that was previously hard-coded.

    Returns
    -------
    (df, df_gtfs)
    """
    candidates = df_gtfs.stop_name.tolist()

    matches = []
    for station in df.station.tolist():
        best = process.extractOne(station, candidates, scorer=fuzz.ratio)
        # extractOne yields a (choice, score) pair, or None when there are no
        # candidates to choose from; treat the latter as "no match".
        if best is not None and best[1] >= threshold:
            matches.append(best[0])
        else:
            matches.append('')

    df['matches'] = matches
    return df, df_gtfs

In [None]:
import rtree
import pygeos
def combineGTFSStopsAndStationData(df):
    """Spatially join station points to NYC CDTA polygons.

    A ``geometry`` column of points is built on ``df`` from ``gtfs_longitude``
    and ``gtfs_latitude`` (this mutates the passed-in frame). The CDTA
    shapefile is read, reprojected to EPSG:4326, and left-joined against the
    station points with a "contains" test, so every CDTA polygon appears at
    least once and CDTAs without a station carry NaN station columns.

    Only rows whose ``CDTA2020`` code is two letters followed by two digits
    are kept, and missing ``borough`` values are filled with "M".

    Returns
    -------
    (station_all_df, cdta_dict)
        ``station_all_df`` is the joined frame with ``CDTA2020`` as a regular
        column; ``cdta_dict`` maps each ``CDTA2020`` code to its ``CDTAName``.
    """
    df['geometry'] = [Point(xy) for xy in zip(np.array(df['gtfs_longitude']), np.array(df['gtfs_latitude']))]
    gpd.options.use_pygeos = True

    # Forward slashes work on every platform (Windows included) and match the
    # relative "../data/..." convention used for the other inputs in this notebook.
    cdta_map = gpd.read_file("../data/nycdta2020_22b/nycdta2020.shp")
    cdta_map.to_crs(4326, inplace=True)

    cdta_geo_df = cdta_map[['CDTA2020', 'CDTAName', 'geometry', 'Shape_Leng', 'Shape_Area', 'BoroName']].set_index('CDTA2020', drop=True)

    # The points are declared as EPSG:4326 directly, so no reprojection is needed.
    top_station_geo_df = gpd.GeoDataFrame(df, crs=4326, geometry=df.geometry)

    # Locate each station point within a CDTA polygon.
    # NOTE(review): newer geopandas releases spell this keyword `predicate`
    # instead of `op`; switch once the installed version is confirmed.
    station_all_df = gpd.sjoin(cdta_geo_df, top_station_geo_df, how="left", op="contains")
    station_all_df = station_all_df.reset_index()

    # Keep only codes shaped like "MN01". Raw string avoids the invalid "\d"
    # escape warning; .copy() makes the column assignment below act on an
    # independent frame rather than a slice.
    valid_code = station_all_df['CDTA2020'].str.match(r'^[a-zA-Z]{2}\d{2}$')
    station_all_df = station_all_df[valid_code].copy()

    # Few stations that belong to Manhattan borough were identified based on the CDTA code.
    # NOTE(review): 'borough' is assumed to arrive via the station frame passed in.
    station_all_df['borough'] = station_all_df.borough.fillna("M")

    print('combineGTFSStopsAndStationData ouput..')
    print(station_all_df.columns)
    print(station_all_df.head(1))
    cdta_dict = cdta_map[["CDTA2020", "CDTAName"]].set_index("CDTA2020").to_dict()["CDTAName"]
    return station_all_df, cdta_dict
       

In [None]:
#1. Load first six months of 2022 ridership data for subway stations from the turnstile datasource.

# NOTE(review): machine-specific absolute path; point this at your own copy of
# mta_data.db (ideally via a path relative to the notebook) before running.
engine = create_engine("sqlite:///C:\\Users\\panch\\capstone\\notebooks\\mta_data.db")
mta_df = pd.read_sql('SELECT * FROM mta_data;', engine)

# Rename mta_df columns to make them easier to work with, then clean up
# station names and the date/time/desc text fields.
mta_df = mta_df.rename(columns={'C/A': 'control_area', 'UNIT': 'unit', 'SCP': 'subunit_channel_pos', 'STATION':'station', 'LINENAME':'subway_lines', 'DIVISION':'division', 'DATE':'date', 'TIME':'time', 'DESC':'desc', 'ENTRIES':'entries', 'EXITS':'exits'})
mta_df = removeSpecialCharactersFromStationName(mta_df)
mta_df = removeSpecialCharactersFromStationTimestamps(mta_df)

mta_df['subunit_channel_pos'] = mta_df['subunit_channel_pos'].str.replace('-', '_')

# Create unique_id column for grouping by
mta_df['unique_id'] = mta_df['control_area'] + '_' + mta_df['unit'] + '_' + mta_df['subunit_channel_pos'] + '_' + mta_df['station'] + '_' + mta_df['date'] + '_' + mta_df['time'] + '_' + mta_df['desc']
mta_df['date_time'] = pd.to_datetime(mta_df['date'] + ' ' + mta_df['time'], format='%m_%d_%Y %H_%M_%S')

# Drop recovered-audit readings. .copy() gives an independent frame so the
# helper functions below can add columns without SettingWithCopyWarning.
mta_df = mta_df[mta_df.desc != 'RECOVR_AUD'].copy()

# The net counts are row-to-row differences within each turnstile, so the rows
# must be chronological per turnstile. "SELECT *" guarantees no ordering, so
# sort explicitly (stable sort keeps the original order of exact ties).
mta_df = mta_df.sort_values(
    ['control_area', 'unit', 'subunit_channel_pos', 'station', 'date_time'],
    kind='mergesort',
)

mta_df = computeTrafficRidershipCounts(mta_df)
# The first reading of each turnstile has no predecessor (NaN delta); count it as 0.
mta_df = mta_df.fillna(0)

# Eliminate outliers in the data by reducing to the 99th percentile.
mta_df = removeOutliers(mta_df)

# Both column families are derived from the same reading timestamp; the
# Entry/Exit prefixes only exist so later cells can group each side by name.
mta_df = createStationCountsByTime(mta_df, 'date_time', col_name='Station_Readings_Entry')
mta_df = createStationCountsByTime(mta_df, 'date_time', col_name='Station_Readings_Exit')

print('Step 1..')
print(mta_df.columns)
print(mta_df.shape)
print(mta_df.head(10))

print('Stations With passenger counts and timestamps ,net entries and net exits and net traffic(which is sum of net entry and exit')


In [None]:
#2. Load GTFS stop data and use its lat/long geometry to map borough and CDTA onto the station data.
cols_to_keep = ["station", "entries", "exits", "net_entries", "net_exits", "net_traffic"] + [
    f"Station_Readings_{side}_{part}"
    for side in ("Exit", "Entry")
    for part in ("weekday", "year_month", "day", "hour")
]
group_cols = ["net_traffic"]

station_data_gtfs = removeSpecialCharactersFromStopName(
    pd.read_csv('../data/subway/stationnames.csv')
)

# From here on only the ridership and calendar columns are needed.
mta_df = mta_df[cols_to_keep]

# Total traffic per station, busiest first.
top_stations = (
    mta_df.groupby('station')[group_cols]
    .sum()
    .sort_values(by='net_traffic', ascending=False)
    .reset_index()
    .copy()
)

top_stations, station_data_gtfs = matchStationNames(top_stations, station_data_gtfs)

# Merge station names from GTFS and turnstile data for mapping CDTA
top_station_df = top_stations.merge(
    station_data_gtfs, left_on='matches', right_on='stop_name', how='left'
)

# Merge CDTA code and borough code into a single frame, then drop the CDTA
# rows that ended up without any station.
stationWithCdta, cdta_dict = combineGTFSStopsAndStationData(top_station_df)
stationWithCdta = stationWithCdta.dropna(subset=['station'], how='all')
stationWithCdta.info()

stationWithCdta = stationWithCdta.rename(columns={'CDTA2020': 'cdtaCode'})

# Station name -> CDTA code lookup used by the aggregation cells below
cdta_station_dict = stationWithCdta.set_index("station")["cdtaCode"].to_dict()

In [None]:
# 3. Calculate CDTA NetEntries and NetExits
print(mta_df.columns)

# Total net entries / net exits per station.
net_entry_stations = mta_df.groupby(['station'])["net_entries"].sum().reset_index().copy()
net_exit_stations = mta_df.groupby(['station'])["net_exits"].sum().reset_index().copy()

# Attach the CDTA code of each station; stations without a mapping get "UN99".
# Each frame is mapped from its own 'station' column (the exit frame used to
# read the entry frame's column, which only worked because both happen to
# share the same row order).
net_entry_stations['cdtaCode_entry'] = net_entry_stations['station'].map(cdta_station_dict).fillna("UN99")
net_exit_stations['cdtaCode_exit'] = net_exit_stations['station'].map(cdta_station_dict).fillna("UN99")

# Roll the station totals up to CDTA level.
net_entry_cdta = net_entry_stations.groupby(['cdtaCode_entry']).sum()
net_exit_cdta = net_exit_stations.groupby(['cdtaCode_exit']).sum()

print(net_entry_cdta.shape)
print(net_exit_cdta.shape)


In [None]:
# 4. Calculate CDTA net entries and net exits per day of month (1 to 31)

print(mta_df.columns)

# Station-level totals for each day of the month.
day_entry_stations = (
    mta_df
    .groupby(['Station_Readings_Entry_day', 'station'])["net_entries"]
    .sum()
    .reset_index()
)
day_exit_stations = (
    mta_df
    .groupby(['Station_Readings_Exit_day', 'station'])["net_exits"]
    .sum()
    .reset_index()
)

# Tag every station with its CDTA code; unmapped stations fall into "UN99".
day_entry_stations['cdtaCode_entry'] = day_entry_stations['station'].map(cdta_station_dict).fillna("UN99")
day_exit_stations['cdtaCode_exit'] = day_exit_stations['station'].map(cdta_station_dict).fillna("UN99")

## Entry day count by CDTA: one row per CDTA, one column per day of month
entry_day_count_cdta = (
    day_entry_stations
    .groupby(['cdtaCode_entry', 'Station_Readings_Entry_day'])
    .sum()
    .reset_index()
    .pivot(index='cdtaCode_entry', columns="Station_Readings_Entry_day", values="net_entries")
)
entry_day_count_cdta.columns = [f"Entry_total_trip_count_day_{col}" for col in entry_day_count_cdta.columns]

## Exit day count by CDTA: one row per CDTA, one column per day of month
exit_day_count_cdta = (
    day_exit_stations
    .groupby(['cdtaCode_exit', 'Station_Readings_Exit_day'])
    .sum()
    .reset_index()
    .pivot(index='cdtaCode_exit', columns="Station_Readings_Exit_day", values="net_exits")
)
exit_day_count_cdta.columns = [f"Exit_total_trip_count_day_{col}" for col in exit_day_count_cdta.columns]

print(entry_day_count_cdta.shape)
print(exit_day_count_cdta.shape)

In [None]:
#5. Calculate CDTA net entries and net exits per hour of day (0 to 23)

print(mta_df.columns)

# Station-level totals for each hour of the day.
hour_entry_stations = (
    mta_df
    .groupby(['Station_Readings_Entry_hour', 'station'])["net_entries"]
    .sum()
    .reset_index()
)
hour_exit_stations = (
    mta_df
    .groupby(['Station_Readings_Exit_hour', 'station'])["net_exits"]
    .sum()
    .reset_index()
)

# Tag every station with its CDTA code; unmapped stations fall into "UN99".
hour_entry_stations['cdtaCode_entry'] = hour_entry_stations['station'].map(cdta_station_dict).fillna("UN99")
hour_exit_stations['cdtaCode_exit'] = hour_exit_stations['station'].map(cdta_station_dict).fillna("UN99")

## Entry hour count by CDTA: one row per CDTA, one column per hour
entry_hour_count_cdta = (
    hour_entry_stations
    .groupby(['cdtaCode_entry', 'Station_Readings_Entry_hour'])
    .sum()
    .reset_index()
    .pivot(index='cdtaCode_entry', columns="Station_Readings_Entry_hour", values="net_entries")
)
entry_hour_count_cdta.columns = [f"Entry_total_trip_count_hour_{col}" for col in entry_hour_count_cdta.columns]

## Exit hour count by CDTA: one row per CDTA, one column per hour
exit_hour_count_cdta = (
    hour_exit_stations
    .groupby(['cdtaCode_exit', 'Station_Readings_Exit_hour'])
    .sum()
    .reset_index()
    .pivot(index='cdtaCode_exit', columns="Station_Readings_Exit_hour", values="net_exits")
)
exit_hour_count_cdta.columns = [f"Exit_total_trip_count_hour_{col}" for col in exit_hour_count_cdta.columns]


In [None]:
#6. Calculate CDTA net entries and net exits per weekday (Sunday to Saturday)

print(mta_df.columns)

# Station-level totals for each day of the week.
weekday_entry_stations = (
    mta_df
    .groupby(['Station_Readings_Entry_weekday', 'station'])["net_entries"]
    .sum()
    .reset_index()
)
weekday_exit_stations = (
    mta_df
    .groupby(['Station_Readings_Exit_weekday', 'station'])["net_exits"]
    .sum()
    .reset_index()
)

# Tag every station with its CDTA code; unmapped stations fall into "UN99".
weekday_entry_stations['cdtaCode_entry'] = weekday_entry_stations['station'].map(cdta_station_dict).fillna("UN99")
weekday_exit_stations['cdtaCode_exit'] = weekday_exit_stations['station'].map(cdta_station_dict).fillna("UN99")

## Entry weekday count by CDTA: one row per CDTA, one column per weekday name
entry_weekday_count_cdta = (
    weekday_entry_stations
    .groupby(['cdtaCode_entry', 'Station_Readings_Entry_weekday'])
    .sum()
    .reset_index()
    .pivot(index='cdtaCode_entry', columns="Station_Readings_Entry_weekday", values="net_entries")
)
entry_weekday_count_cdta.columns = [f"Entry_total_trip_count_weekday_{col}" for col in entry_weekday_count_cdta.columns]

## Exit weekday count by CDTA: one row per CDTA, one column per weekday name
exit_weekday_count_cdta = (
    weekday_exit_stations
    .groupby(['cdtaCode_exit', 'Station_Readings_Exit_weekday'])
    .sum()
    .reset_index()
    .pivot(index='cdtaCode_exit', columns="Station_Readings_Exit_weekday", values="net_exits")
)
exit_weekday_count_cdta.columns = [f"Exit_total_trip_count_weekday_{col}" for col in exit_weekday_count_cdta.columns]


In [None]:
# 7. Build the per-CDTA summary frame and attach borough names.

# cdta_subway_df was previously used without ever being defined (its only
# definition was commented out), so this cell failed with NameError on a fresh
# kernel. It is rebuilt here from stationWithCdta, summing net_traffic, which
# is the only traffic column that frame carries.
# NOTE(review): confirm that per-CDTA net_traffic is the intended content; the
# per-CDTA net entries/exits live in net_entry_cdta / net_exit_cdta instead.
cdta_subway_df = (
    stationWithCdta
    .groupby('cdtaCode')[['net_traffic']]
    .sum()
    .sort_values(by='net_traffic', ascending=False)
    .reset_index()
)

#Filter CDTAs that do not have any traffic mapped
#cdta_subway_df= cdta_subway_df[cdta_subway_df['net_traffic'] != 0]

# Map borough to CDTA frame: the code's two-letter prefix identifies the
# borough ("EWR" is matched as a whole).
cdta_subway_df['borough'] = cdta_subway_df["cdtaCode"].apply(lambda x: "EWR" if "EWR" in x else x[:2]).map(
    {
        'EWR': 'EWR',
        'QN': 'Queens',
        'BX': 'Bronx',
        'MN': 'Manhattan',
        'SI': 'Staten Island',
        'BK': 'Brooklyn'
    }
)

#cdta_subway_df['CDTA_name'] = cdta_subway_df['cdtaCode'].map(cdta_dict).fillna("")

cdta_subway_df