In [1]:
# using data from MTA-Turnstile-181117 look at only A TRAIN

In [1]:
# import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# dataframe with all enters/exits
csv_path = 'commuters_11_2018.csv'
commuters_11_2018 = pd.read_csv(csv_path, encoding="utf-8")

commuters_11_2018.head()

Unnamed: 0,STATION,LINENAME,AM_ENTERS,AM_EXITS,PM_ENTERS,PM_EXITS
0,1 AV,L,2173.0,5427.0,17391.0,18195.0
1,103 ST,1,7888.0,1943.0,13537.0,6671.0
2,103 ST,6,8439.0,7932.0,16862.0,12606.0
3,103 ST,BC,3504.0,1019.0,4818.0,3202.0
4,103 ST-CORONA,7,22039.0,1903.0,14701.0,9735.0


In [3]:
# look at A line only
A_train_all = commuters_11_2018[commuters_11_2018['LINENAME'].str.contains('A')]

In [4]:
# look at all station names
A_train_all['STATION'].unique()

array(['104 ST', '111 ST', '125 ST', '14 ST', '145 ST', '168 ST',
       '175 ST', '181 ST', '190 ST', '34 ST-PENN STA', '42 ST-PORT AUTH',
       '59 ST COLUMBUS', '8 AV', '80 ST', '88 ST', 'AQUEDUCT N.COND',
       'AQUEDUCT RACETR', 'BEACH 105 ST', 'BEACH 25 ST', 'BEACH 36 ST',
       'BEACH 44 ST', 'BEACH 60 ST', 'BEACH 67 ST', 'BEACH 90 ST',
       'BEACH 98 ST', 'BROAD CHANNEL', 'BROADWAY JCT', 'CANAL ST',
       'CHAMBERS ST', 'DYCKMAN ST', 'EUCLID AV', 'FAR ROCKAWAY',
       'FRANKLIN AV', 'FULTON ST', 'GRANT AV', 'HIGH ST',
       'HOWARD BCH JFK', 'HOYT-SCHER', 'INWOOD-207 ST', 'JAY ST-METROTEC',
       'NOSTRAND AV', 'OZONE PK LEFFRT', 'PARK PLACE', 'ROCKAWAY BLVD',
       'ROCKAWAY PARK B', 'TIMES SQ-42 ST', 'UTICA AV', 'W 4 ST-WASH SQ',
       'WORLD TRADE CTR'], dtype=object)

In [5]:
# check station count
A_train_all['STATION'].count()

54

In [6]:
# merge and sum duplicate station data
A_train = A_train_all.groupby('STATION').sum()
A_train.head()

Unnamed: 0_level_0,AM_ENTERS,AM_EXITS,PM_ENTERS,PM_EXITS
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
104 ST,2441.0,74.0,1325.0,412.0
111 ST,6459.0,146.0,1475.0,727.0
125 ST,6472.0,5890.0,27231.0,21340.0
14 ST,3919.0,11802.0,27238.0,16013.0
145 ST,8715.0,3053.0,22567.0,19270.0


In [7]:
# order stations in exel
# save as csv
A_train.to_csv('A_train.csv')

In [8]:
# open ordered CSV
csv_path_A = 'A_train_order.csv'
A_train_order = pd.read_csv(csv_path_A, encoding="utf-8")

In [9]:
# needs further cleaning
A_train_order["STATION"].replace("TIMES SQ-42 ST", value="42 ST-PORT AUTH", inplace=True)
A_train_order["STATION"].replace("WORLD TRADE CTR", value="CHAMBERS ST", inplace=True)
A_train_order["STATION"].replace("PARK PLACE", value="CHAMBERS ST", inplace=True)

In [10]:
# now sum same name rows
A_commuters = A_train_order.groupby('STATION').sum()
A_commuters.head()

Unnamed: 0_level_0,AM_ENTERS,AM_EXITS,PM_ENTERS,PM_EXITS
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
104 ST,2441,74,1325,412
111 ST,6459,146,1475,727
125 ST,6472,5890,27231,21340
14 ST,3919,11802,27238,16013
145 ST,8715,3053,22567,19270


In [11]:
# get number of people on the subway per stop
# subtract exits from enters
# get one figure for AM and one figure for PM

A_commuters['AM_TOTALS'] = A_commuters['AM_ENTERS'] - A_commuters['AM_EXITS']

In [12]:
# repeat for PM riders
A_commuters['PM_TOTALS'] = A_commuters['PM_ENTERS'] - A_commuters['PM_EXITS']
A_commuters.head()

Unnamed: 0_level_0,AM_ENTERS,AM_EXITS,PM_ENTERS,PM_EXITS,AM_TOTALS,PM_TOTALS
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
104 ST,2441,74,1325,412,2367,913
111 ST,6459,146,1475,727,6313,748
125 ST,6472,5890,27231,21340,582,5891
14 ST,3919,11802,27238,16013,-7883,11225
145 ST,8715,3053,22567,19270,5662,3297


In [13]:
# look at only AM, so dataframe is esaier to understand
A_commuters_am = A_commuters.drop(['PM_ENTERS','PM_EXITS','PM_TOTALS'], axis=1)
A_commuters_am.head()

Unnamed: 0_level_0,AM_ENTERS,AM_EXITS,AM_TOTALS
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
104 ST,2441,74,2367
111 ST,6459,146,6313
125 ST,6472,5890,582
14 ST,3919,11802,-7883
145 ST,8715,3053,5662


In [14]:
# add / subtract riders to get total at each stop
A_commuters_am['AM_COMMUTERS'] = abs(A_commuters_am['AM_TOTALS'].shift(1) - A_commuters_am['AM_TOTALS'])

In [15]:
# add sequence range to always keep stations in order
A_commuters_am['ORDER'] = range(1, 1 + len(A_commuters_am))
A_commuters_am = A_commuters_am[['ORDER','AM_ENTERS','AM_EXITS','AM_TOTALS','AM_COMMUTERS']]
A_commuters_am.head()

Unnamed: 0_level_0,ORDER,AM_ENTERS,AM_EXITS,AM_TOTALS,AM_COMMUTERS
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
104 ST,1,2441,74,2367,
111 ST,2,6459,146,6313,3946.0
125 ST,3,6472,5890,582,5731.0
14 ST,4,3919,11802,-7883,8465.0
145 ST,5,8715,3053,5662,13545.0


In [21]:
# replace Nan value
first_value = A_commuters_am[A_commuters_am.ORDER == 1]['AM_TOTALS']
A_commuters_am["AM_COMMUTERS"].fillna(first_value, limit = 1, inplace = True) 
A_commuters_am.head()

Unnamed: 0_level_0,ORDER,AM_ENTERS,AM_EXITS,AM_TOTALS,AM_COMMUTERS
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
104 ST,1,2441,74,2367,2367.0
111 ST,2,6459,146,6313,3946.0
125 ST,3,6472,5890,582,5731.0
14 ST,4,3919,11802,-7883,8465.0
145 ST,5,8715,3053,5662,13545.0


In [26]:
# save as csv
A_commuters_am.to_csv('A_commuters_am.csv')

# import csv
csv_path_3 = 'A_commuters_am.csv'
A_train_am = pd.read_csv(csv_path_3, encoding="utf-8")