In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.f' % x)

In [54]:
# read the data and remove spaces from column names
am = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_180505.txt')
am.columns = am.columns.str.strip()


# add in two new columns, one that is a datetime (from the date and time columns)
# and one that is just the day of the week
am['DATETIME'] = am['DATE'] + ' ' + am['TIME']
am['DATETIME'] = pd.to_datetime(am['DATETIME'], format="%m/%d/%Y %H:%M:%S")
am['DAY'] = am['DATETIME'].dt.day_name()

# only look at M-F and the 8AM - 12PM 
am = am[(am["TIME"] == '08:00:00') | (am["TIME"] == '12:00:00')]
am = am[(am["DAY"] != 'Saturday') & (am["DAY"] != 'Sunday')]


# make three new columns:
# 1. the difference between the number of entries of the row and the row above it
# 2. same for exits
# 3. a new column that is the sum of the first two, divided by the length to normalize

am['exits_diff'] = am.groupby(['STATION', 'UNIT'])['EXITS'].transform(lambda x: x.diff())
am['entries_diff'] = am.groupby(['STATION', 'UNIT'])['ENTRIES'].transform(lambda x: x.diff())
am['am_diff'] = am['entries_diff'] + am['exits_diff']
am['am_diff'] = am['am_diff'] / len(am)


# remove all diff values that are negative and 
# remove all diff values that are greater than 30k
# 30k is an estimation of how many people could realistically go through a turnstile
# in a 4h period

am = am[(am['am_diff'] > 0)] 
am = am[(am['am_diff'] < 30000)] 
print(am['am_diff'].describe())


# make a new dataframe that has the aggregate sum of 
# entry/exit differences for each station/unit pair
# make a new index column
# print the result
am_agg = am.groupby(['STATION', 'UNIT']).agg({'am_diff' : 'sum'})
am_agg.reset_index(inplace=True)
am_agg.head(20)

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

In [35]:
# read the data and remove spaces from column names
mid = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_180505.txt')
mid.columns = mid.columns.str.strip()


# add in two new columns, one that is a datetime (from the date and time columns)
# and one that is just the day of the week
mid['DATETIME'] = mid['DATE'] + ' ' + mid['TIME']
mid['DATETIME'] = pd.to_datetime(mid['DATETIME'], format="%m/%d/%Y %H:%M:%S")
mid['DAY'] = mid['DATETIME'].dt.day_name()

# only look at M-F and the 8AM - 12PM 
mid = mid[(mid["TIME"] == '12:00:00') | (mid["TIME"] == '16:00:00')]
mid = mid[(mid["DAY"] != 'Saturday') & (mid["DAY"] != 'Sunday')]

# make three new columns:
# 1. the difference between the number of entries of the row and the row above it
# 2. same for exits
# 3. a new column that is the sum of the first two , divided by the length to normalize

mid['exits_diff'] = mid.groupby(['STATION', 'UNIT'])['EXITS'].transform(lambda x: x.diff())
mid['entries_diff'] = mid.groupby(['STATION', 'UNIT'])['ENTRIES'].transform(lambda x: x.diff())
mid['mid_diff'] = mid['entries_diff'] + mid['exits_diff']
mid['mid_diff'] = mid['mid_diff'] / len(mid)



# remove all diff values that are negative and 
# remove all diff values that are greater than 30k
# 30k is an estimation of how many people could realistically go through a turnstile
# in a 4h period

mid = mid[(mid['mid_diff'] > 0)] 
mid = mid[(mid['mid_diff'] < 30000)] 
print(mid['mid_diff'].describe())


# make a new dataframe that has the aggregate sum of 
# entry/exit differences for each station/unit pair
# make a new index column
# print the result
mid_agg = mid.groupby(['STATION', 'UNIT']).agg({'mid_diff' : 'sum'})
mid_agg.reset_index(inplace=True)
mid_agg.head(20)

count    21041.000000
mean        40.329628
std        622.619964
min          0.000041
25%          0.015894
50%          0.032941
75%          0.082640
max      29289.434777
Name: diff2, dtype: float64


Unnamed: 0,STATION,UNIT,diff2
0,1 AV,R248,30731.075476
1,103 ST-CORONA,R208,708.42193
2,104 ST,R007,0.422877
3,104 ST,R354,161.89356
4,110 ST,R181,796.515029
5,111 ST,R008,234.887013
6,111 ST,R310,115.480483
7,121 ST,R009,75.739521
8,125 ST,R034,323.624763
9,125 ST,R102,31099.221115


In [36]:
# read the data and remove spaces from column names
pm = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_180505.txt')
pm.columns = pm.columns.str.strip()


# add in two new columns, one that is a datetime (from the date and time columns)
# and one that is just the day of the week
pm['DATETIME'] = pm['DATE'] + ' ' + pm['TIME']
pm['DATETIME'] = pd.to_datetime(pm['DATETIME'], format="%m/%d/%Y %H:%M:%S")
pm['DAY'] = pm['DATETIME'].dt.day_name()

# only look at M-F and the 4PM - 8PM 
pm = pm[(pm["TIME"] == '16:00:00') | (pm["TIME"] == '20:00:00')]
pm = pm[(pm["DAY"] != 'Saturday') & (pm["DAY"] != 'Sunday')]

# make three new columns:
# 1. the difference between the number of entries of the row and the row above it
# 2. same for exits
# 3. a new column that is the sum of the first two, divided by the length to normalize

pm['exits_diff'] = pm.groupby(['STATION', 'UNIT'])['EXITS'].transform(lambda x: x.diff())
pm['entries_diff'] = pm.groupby(['STATION', 'UNIT'])['ENTRIES'].transform(lambda x: x.diff())
pm['pm_diff'] = pm['entries_diff'] + pm['exits_diff']
pm['pm_diff'] = pm['pm_diff'] / len(pm)


# remove all diff values that are negative and 
# remove all diff values that are greater than 30k
# 30k is an estimation of how many people could realistically go through a turnstile
# in a 4h period

pm = pm[(pm['pm_diff'] > 0)] 
pm = pm[(pm['pm_diff'] < 30000)] 
print(pm['pm_diff'].describe())


# make a new dataframe that has the aggregate sum of 
# entry/exit differences for each station/unit pair
# make a new index column
# print the result
pm_agg = pm.groupby(['STATION', 'UNIT']).agg({'pm_diff' : 'sum'})
pm_agg.reset_index(inplace=True)
pm_agg.head(20)

count     9734.000000
mean       121.513959
std        914.149118
min          0.000090
25%          0.136396
50%          0.240973
75%          0.387073
max      29380.032768
Name: diff3, dtype: float64


Unnamed: 0,STATION,UNIT,diff3
0,1 AV,R248,7636.277487
1,103 ST-CORONA,R208,1552.720979
2,104 ST,R007,0.892399
3,104 ST,R354,354.896913
4,110 ST,R181,1746.027442
5,111 ST,R008,514.888247
6,111 ST,R310,252.976981
7,121 ST,R009,166.011013
8,125 ST,R034,709.294277
9,125 ST,R102,3966.906211


In [53]:
# make a new df by copying the am aggregate df
final = am_agg

# add two new columns to include the mid-day and evening commute
# diff values
final['mid_diff'] = mid_agg['mid_diff'].values
final['pm_diff'] = pm_agg['pm_diff'].values

# make two new columns comparing the mid-day traffic to
# both the morning and the evening
final['am_v_mid'] = final['am_diff'] - final['mid_diff']
final['pm_v_mid'] = final['pm_diff'] - final['mid_diff']

# sort the df by  one of the important columns (am chosen here arbitrarily)
# and print the info to see how many rows
final.sort_values('am_v_mid', ascending=False, inplace = True)
print(final.info())

# get rid of all values where either evaluation is low, aribtrarily chosen
# to be 1,000 here. That value, 1000, is quantitatively not useful to 
# intuit - it is the number of people normalized to the number of turnstiles (I think?)

final = final[(final['am_v_mid'] > 1000) & (final['pm_v_mid'] > 1000)]
print(final.info())
final.head(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 245 entries, 135 to 239
Data columns (total 7 columns):
STATION    245 non-null object
UNIT       245 non-null object
diff1      245 non-null float64
diff2      245 non-null float64
diff3      245 non-null float64
am_diff    245 non-null float64
pm_diff    245 non-null float64
dtypes: float64(5), object(2)
memory usage: 15.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21 entries, 99 to 129
Data columns (total 7 columns):
STATION    21 non-null object
UNIT       21 non-null object
diff1      21 non-null float64
diff2      21 non-null float64
diff3      21 non-null float64
am_diff    21 non-null float64
pm_diff    21 non-null float64
dtypes: float64(5), object(2)
memory usage: 1.3+ KB
None


Unnamed: 0,STATION,UNIT,diff1,diff2,diff3,am_diff,pm_diff
99,ATL AV-BARCLAY,R057,42409,11797,25863,30612,14065
123,BOWLING GREEN,R042,13833,1168,2559,12665,1391
77,59 ST COLUMBUS,R084,14300,2668,5849,11632,3181
107,BAY RIDGE-95 ST,R216,11397,2313,5070,9084,2757
182,JAY ST-METROTEC,R127,11029,2762,6055,8267,3293
87,79 ST,R371,7723,1191,2611,6532,1420
181,JAY ST-METROTEC,R089,8992,4164,9128,4828,4964
55,34 ST-PENN STA,R031,6294,2170,4757,4124,2587
173,HALSEY ST,R266,4912,984,2156,3929,1173
154,EAST BROADWAY,R257,5758,1909,4170,3850,2262
