In [1]:
# MTA turnstile data for all stations
# week: November 10-16, 2018
# extracting: M-F, AM (7am + 8am), PM (3pm + 4pm)
# total (sum, not average) the entries and exits per station and line

In [1]:
# import Dependencies
import pandas as pd
import numpy as np

In [2]:
# weekly turnstile export downloaded from the MTA developer site:
# http://web.mta.info/developers/turnstile.html
# the path is relative, so the CSV must sit in the notebook's working directory
csv_path = "turnstile_181117.csv"
ts_11_2018_all = pd.read_csv(csv_path, encoding="utf-8")

In [3]:
# parse the DATE and TIME text columns into pandas datetimes, column by column
# (TIME holds only a clock time, so the parsed values also carry a placeholder date)
ts_11_2018_all[["DATE", "TIME"]] = ts_11_2018_all[["DATE", "TIME"]].apply(pd.to_datetime)

In [4]:
# add a 'day of week' column holding the day name (e.g. "Monday") derived from DATE
# - prep to remove Saturday and Sunday
ts_11_2018_all["WEEKDAY"] = ts_11_2018_all["DATE"].dt.day_name()

In [5]:
# create an independent copy so the full week (weekends included) stays
# untouched in ts_11_2018_all; plain assignment would only bind a second
# name to the same DataFrame object
ts_11_2018 = ts_11_2018_all.copy()

In [6]:
# keep weekday rows only: discard every reading taken on Saturday or Sunday
ts_11_2018 = ts_11_2018[~ts_11_2018["WEEKDAY"].isin(["Saturday", "Sunday"])]
ts_11_2018.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,WEEKDAY
12,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 03:00:00,REGULAR,6831427,2316424,Monday
13,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 07:00:00,REGULAR,6831438,2316474,Monday
14,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 11:00:00,REGULAR,6831570,2316713,Monday
15,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 15:00:00,REGULAR,6831812,2316785,Monday
16,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 19:00:00,REGULAR,6832592,2316857,Monday


In [7]:
# inspect the column labels: the raw EXITS header is padded with trailing
# whitespace, so it cannot be selected as plain 'EXITS'
ts_11_2018.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',
       'EXITS                                                               ',
       'WEEKDAY'],
      dtype='object')

In [8]:
# rename the counter columns to TOT. ENTRIES / TOT. EXITS, as these are cumulative.
# Labels are stripped of surrounding whitespace first: the raw EXITS header is
# padded with trailing spaces, and matching on the cleaned name avoids
# hard-coding the exact amount of padding (a mismatch would make rename a
# silent no-op).
ts_11_2018 = (
    ts_11_2018
    .rename(columns=str.strip)
    .rename(columns={'ENTRIES': 'TOT. ENTRIES', 'EXITS': 'TOT. EXITS'})
)
ts_11_2018.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,TOT. ENTRIES,TOT. EXITS,WEEKDAY
12,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 03:00:00,REGULAR,6831427,2316424,Monday
13,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 07:00:00,REGULAR,6831438,2316474,Monday
14,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 11:00:00,REGULAR,6831570,2316713,Monday
15,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 15:00:00,REGULAR,6831812,2316785,Monday
16,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 19:00:00,REGULAR,6832592,2316857,Monday


In [9]:
# TOT. ENTRIES is cumulative, so the count for an interval is the difference
# from the previous reading of the SAME turnstile.
# Differencing inside each (C/A, UNIT, SCP, STATION) group stops the first
# reading of a turnstile from being subtracted from the last reading of the
# unrelated turnstile on the row above; that first reading becomes NaN instead.
# abs() gives us a positive number.
# create new column to hold entries
ts_11_2018['ENTERS'] = (
    ts_11_2018
    .groupby(['C/A', 'UNIT', 'SCP', 'STATION'])['TOT. ENTRIES']
    .diff()
    .abs()
)

In [10]:
# same per-interval calculation for exits: difference each cumulative
# TOT. EXITS reading from the previous reading of the same turnstile
# (grouped by C/A, UNIT, SCP, STATION so rows of different turnstiles are
# never subtracted from each other), then take abs() for a positive number
ts_11_2018['EXITS'] = (
    ts_11_2018
    .groupby(['C/A', 'UNIT', 'SCP', 'STATION'])['TOT. EXITS']
    .diff()
    .abs()
)
ts_11_2018.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,TOT. ENTRIES,TOT. EXITS,WEEKDAY,ENTERS,EXITS
12,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 03:00:00,REGULAR,6831427,2316424,Monday,,
13,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 07:00:00,REGULAR,6831438,2316474,Monday,11.0,50.0
14,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 11:00:00,REGULAR,6831570,2316713,Monday,132.0,239.0
15,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 15:00:00,REGULAR,6831812,2316785,Monday,242.0,72.0
16,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 19:00:00,REGULAR,6832592,2316857,Monday,780.0,72.0


In [11]:
# new data frame = focus on morning commute: 7:00am AND 8:00am
# .copy() makes this an independent frame rather than a second name for ts_11_2018
ts_11_2018_am = ts_11_2018.copy()

In [12]:
# extract AM data: keep readings stamped exactly 07:00:00 or 08:00:00.
# TIME is a datetime column whose values carry a placeholder date next to the
# clock time; comparing on the formatted clock time ignores that date instead
# of relying on a bare "07:00:00" string being parsed to the very same date.
ts_11_2018_am = ts_11_2018_am.loc[
    ts_11_2018_am["TIME"].dt.strftime("%H:%M:%S").isin(["07:00:00", "08:00:00"]), :
]
ts_11_2018_am.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,TOT. ENTRIES,TOT. EXITS,WEEKDAY,ENTERS,EXITS
13,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 07:00:00,REGULAR,6831438,2316474,Monday,11.0,50.0
19,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-13,2019-08-22 07:00:00,REGULAR,6832901,2316944,Tuesday,16.0,46.0
25,A002,R051,02-00-00,59 ST,NQR456W,BMT,2018-11-14,2019-08-22 07:00:00,REGULAR,6834427,2317507,Wednesday,18.0,60.0
39,A002,R051,02-00-01,59 ST,NQR456W,BMT,2018-11-12,2019-08-22 07:00:00,REGULAR,6104322,1367964,Monday,11.0,19.0
45,A002,R051,02-00-01,59 ST,NQR456W,BMT,2018-11-13,2019-08-22 07:00:00,REGULAR,6105426,1368189,Tuesday,19.0,18.0


In [13]:
# trim the frame: the turnstile identifiers, audit description and cumulative
# counters are not needed for the station-level totals
ts_11_2018_am = ts_11_2018_am.drop(
    columns=['C/A', 'UNIT', 'SCP', 'DIVISION', 'DESC', 'TOT. ENTRIES', 'TOT. EXITS']
)
ts_11_2018_am.head()

Unnamed: 0,STATION,LINENAME,DATE,TIME,WEEKDAY,ENTERS,EXITS
13,59 ST,NQR456W,2018-11-12,2019-08-22 07:00:00,Monday,11.0,50.0
19,59 ST,NQR456W,2018-11-13,2019-08-22 07:00:00,Tuesday,16.0,46.0
25,59 ST,NQR456W,2018-11-14,2019-08-22 07:00:00,Wednesday,18.0,60.0
39,59 ST,NQR456W,2018-11-12,2019-08-22 07:00:00,Monday,11.0,19.0
45,59 ST,NQR456W,2018-11-13,2019-08-22 07:00:00,Tuesday,19.0,18.0


In [14]:
# groupby station name + line and get total (not average).
# ENTERS/EXITS are selected explicitly so the sum never touches the
# DATE/TIME/WEEKDAY columns: adding those up is meaningless, and pandas
# versions that no longer drop non-numeric columns automatically would
# otherwise fail on (or concatenate) them.
ts_11_2018_am_tot = ts_11_2018_am.groupby(['STATION', 'LINENAME'])[['ENTERS', 'EXITS']].sum()

In [15]:
# label the totals as morning figures: ENTERS -> AM_ENTERS, EXITS -> AM_EXITS
ts_11_2018_am_tot = ts_11_2018_am_tot.rename(
    columns={'ENTERS': 'AM_ENTERS', 'EXITS': 'AM_EXITS'}
)
ts_11_2018_am_tot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AM_ENTERS,AM_EXITS
STATION,LINENAME,Unnamed: 2_level_1,Unnamed: 3_level_1
1 AV,L,2173.0,5427.0
103 ST,1,7888.0,1943.0
103 ST,6,8439.0,7932.0
103 ST,BC,3504.0,1019.0
103 ST-CORONA,7,22039.0,1903.0


In [16]:
# check how many station/line groups we have
# (count() reports the number of non-null values in each column)
ts_11_2018_am_tot.count()

AM_ENTERS    441
AM_EXITS     441
dtype: int64

In [17]:
# save the AM totals as csv in the working directory
# (the STATION / LINENAME index is written out as the leading columns)
ts_11_2018_am_tot.to_csv('ts_11_2018_am_tot.csv')

In [18]:
# Repeat for PM commute
# new data frame = focus on evening commute: 3:00pm AND 4:00pm
# .copy() makes this an independent frame rather than a second name for ts_11_2018
ts_11_2018_pm = ts_11_2018.copy()

In [19]:
# look at only 3pm AND 4pm data: keep readings stamped exactly 15:00:00 or 16:00:00.
# TIME is a datetime column whose values carry a placeholder date next to the
# clock time; comparing on the formatted clock time ignores that date instead
# of relying on a bare "15:00:00" string being parsed to the very same date.
ts_11_2018_pm = ts_11_2018_pm.loc[
    ts_11_2018_pm["TIME"].dt.strftime("%H:%M:%S").isin(["15:00:00", "16:00:00"]), :
]

In [20]:
# trim the frame: the turnstile identifiers, audit description and cumulative
# counters are not needed for the station-level totals
ts_11_2018_pm = ts_11_2018_pm.drop(
    columns=['C/A', 'UNIT', 'SCP', 'DIVISION', 'DESC', 'TOT. ENTRIES', 'TOT. EXITS']
)

In [23]:
# groupby station name + line and get total (not average).
# ENTERS/EXITS are selected explicitly so the sum never touches the
# DATE/TIME/WEEKDAY columns: adding those up is meaningless, and pandas
# versions that no longer drop non-numeric columns automatically would
# otherwise fail on (or concatenate) them.
ts_11_2018_pm_tot = ts_11_2018_pm.groupby(['STATION', 'LINENAME'])[['ENTERS', 'EXITS']].sum()

In [24]:
# label the totals as evening figures: ENTERS -> PM_ENTERS, EXITS -> PM_EXITS
ts_11_2018_pm_tot = ts_11_2018_pm_tot.rename(
    columns={'ENTERS': 'PM_ENTERS', 'EXITS': 'PM_EXITS'}
)
ts_11_2018_pm_tot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PM_ENTERS,PM_EXITS
STATION,LINENAME,Unnamed: 2_level_1,Unnamed: 3_level_1
1 AV,L,17391.0,18195.0
103 ST,1,13537.0,6671.0
103 ST,6,16862.0,12606.0
103 ST,BC,4818.0,3202.0
103 ST-CORONA,7,14701.0,9735.0


In [25]:
# save the PM totals as csv in the working directory
# (the STATION / LINENAME index is written out as the leading columns)
ts_11_2018_pm_tot.to_csv('ts_11_2018_pm_tot.csv')

In [26]:
# combine the AM and PM totals into one table keyed by station and line;
# the outer join keeps station/line pairs present in only one of the two tables
commuters_11_2018 = ts_11_2018_am_tot.merge(
    ts_11_2018_pm_tot,
    how="outer",
    on=["STATION", "LINENAME"],
)
commuters_11_2018.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AM_ENTERS,AM_EXITS,PM_ENTERS,PM_EXITS
STATION,LINENAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1 AV,L,2173.0,5427.0,17391.0,18195.0
103 ST,1,7888.0,1943.0,13537.0,6671.0
103 ST,6,8439.0,7932.0,16862.0,12606.0
103 ST,BC,3504.0,1019.0,4818.0,3202.0
103 ST-CORONA,7,22039.0,1903.0,14701.0,9735.0


In [27]:
# save the combined AM + PM table as csv in the working directory
# (the STATION / LINENAME index is written out as the leading columns)
commuters_11_2018.to_csv('commuters_11_2018.csv')