Suggestions 

Data from [MTA](http://web.mta.info/developers/turnstile.html) <br>
Description of data [here](http://web.mta.info/developers/resources/nyct/turnstile/ts_Field_Description.txt)


[GitHub](https://github.com/huge-reality/MTA_Project_Group_5)



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.f' % x)

In [2]:
# read the data and remove spaces from column names
am = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_180505.txt')
am.columns = am.columns.str.strip()


# add in two new columns, one that is a datetime (from the date and time columns)
# and one that is just the day of the week
am['DATETIME'] = am['DATE'] + ' ' + am['TIME']
am['DATETIME'] = pd.to_datetime(am['DATETIME'], format="%m/%d/%Y %H:%M:%S")
am['DAY'] = am['DATETIME'].dt.day_name()

# only look at M-F and the 8AM - 12PM 
am = am[(am["TIME"] == '08:00:00') | (am["TIME"] == '12:00:00')]
am = am[(am["DAY"] != 'Saturday') & (am["DAY"] != 'Sunday')]


# make three new columns:
# 1. the difference between the number of entries of the row and the row above it
# 2. same for exits
# 3. a new column that is the sum of the first two, divided by the length to normalize

am['exits_diff'] = am.groupby(['STATION', 'UNIT'])['EXITS'].transform(lambda x: x.diff())
am['entries_diff'] = am.groupby(['STATION', 'UNIT'])['ENTRIES'].transform(lambda x: x.diff())
am['am_diff'] = am['entries_diff'] + am['exits_diff']
am['am_diff'] = am['am_diff']


# remove all diff values that are negative and 
# remove all diff values that are greater than 30k
# 30k is an estimation of how many people could realistically go through a turnstile
# in a 4h period

am = am[(am['am_diff'] > 0)] 
am = am[(am['am_diff'] < 30000)] 
print(am['am_diff'].describe())


# make a new dataframe that has the aggregate sum of 
# entry/exit differences for each station/unit pair
# make a new index column
# print the result
am_agg = am.groupby(['STATION', 'UNIT']).agg({'am_diff' : 'sum'})
am_agg.reset_index(inplace=True)
am_agg.head(20)

count   20073
mean     1266
std      1179
min         1
25%       477
50%       895
75%      1728
max     28474
Name: am_diff, dtype: float64


Unnamed: 0,STATION,UNIT,am_diff
0,1 AV,R248,200306
1,103 ST-CORONA,R208,141412
2,104 ST,R007,10392
3,104 ST,R354,10101
4,110 ST,R181,91490
5,111 ST,R008,19938
6,111 ST,R310,76632
7,121 ST,R009,7909
8,125 ST,R034,71462
9,125 ST,R102,215703


In [3]:
# read the data and remove spaces from column names
mid = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_180505.txt')
mid.columns = mid.columns.str.strip()


# add in two new columns, one that is a datetime (from the date and time columns)
# and one that is just the day of the week
mid['DATETIME'] = mid['DATE'] + ' ' + mid['TIME']
mid['DATETIME'] = pd.to_datetime(mid['DATETIME'], format="%m/%d/%Y %H:%M:%S")
mid['DAY'] = mid['DATETIME'].dt.day_name()

# only look at M-F and the 8AM - 12PM 
mid = mid[(mid["TIME"] == '12:00:00') | (mid["TIME"] == '16:00:00')]
mid = mid[(mid["DAY"] != 'Saturday') & (mid["DAY"] != 'Sunday')]

# make three new columns:
# 1. the difference between the number of entries of the row and the row above it
# 2. same for exits
# 3. a new column that is the sum of the first two , divided by the length to normalize

mid['exits_diff'] = mid.groupby(['STATION', 'UNIT'])['EXITS'].transform(lambda x: x.diff())
mid['entries_diff'] = mid.groupby(['STATION', 'UNIT'])['ENTRIES'].transform(lambda x: x.diff())
mid['mid_diff'] = mid['entries_diff'] + mid['exits_diff']
mid['mid_diff'] = mid['mid_diff']



# remove all diff values that are negative and 
# remove all diff values that are greater than 30k
# 30k is an estimation of how many people could realistically go through a turnstile
# in a 4h period

mid = mid[(mid['mid_diff'] > 0)] 
mid = mid[(mid['mid_diff'] < 30000)] 
print(mid['mid_diff'].describe())


# make a new dataframe that has the aggregate sum of 
# entry/exit differences for each station/unit pair
# make a new index column
# print the result
mid_agg = mid.groupby(['STATION', 'UNIT']).agg({'mid_diff' : 'sum'})
mid_agg.reset_index(inplace=True)
mid_agg.head(20)

count   20067
mean     1251
std      1305
min         1
25%       369
50%       752
75%      1750
max     28474
Name: mid_diff, dtype: float64


Unnamed: 0,STATION,UNIT,mid_diff
0,1 AV,R248,199287
1,103 ST-CORONA,R208,147335
2,104 ST,R007,10270
3,104 ST,R354,9901
4,110 ST,R181,90908
5,111 ST,R008,19776
6,111 ST,R310,75422
7,121 ST,R009,7840
8,125 ST,R034,71325
9,125 ST,R102,215057


In [4]:
# read the data and remove spaces from column names
pm = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_180505.txt')
pm.columns = pm.columns.str.strip()


# add in two new columns, one that is a datetime (from the date and time columns)
# and one that is just the day of the week
pm['DATETIME'] = pm['DATE'] + ' ' + pm['TIME']
pm['DATETIME'] = pd.to_datetime(pm['DATETIME'], format="%m/%d/%Y %H:%M:%S")
pm['DAY'] = pm['DATETIME'].dt.day_name()

# only look at M-F and the 4PM - 8PM 
pm = pm[(pm["TIME"] == '16:00:00') | (pm["TIME"] == '20:00:00')]
pm = pm[(pm["DAY"] != 'Saturday') & (pm["DAY"] != 'Sunday')]

# make three new columns:
# 1. the difference between the number of entries of the row and the row above it
# 2. same for exits
# 3. a new column that is the sum of the first two, divided by the length to normalize

pm['exits_diff'] = pm.groupby(['STATION', 'UNIT'])['EXITS'].transform(lambda x: x.diff())
pm['entries_diff'] = pm.groupby(['STATION', 'UNIT'])['ENTRIES'].transform(lambda x: x.diff())
pm['pm_diff'] = pm['entries_diff'] + pm['exits_diff']
pm['pm_diff'] = pm['pm_diff']


# remove all diff values that are negative and 
# remove all diff values that are greater than 30k
# 30k is an estimation of how many people could realistically go through a turnstile
# in a 4h period

pm = pm[(pm['pm_diff'] > 0)] 
pm = pm[(pm['pm_diff'] < 30000)] 
print(pm['pm_diff'].describe())


# make a new dataframe that has the aggregate sum of 
# entry/exit differences for each station/unit pair
# make a new index column
# print the result
pm_agg = pm.groupby(['STATION', 'UNIT']).agg({'pm_diff' : 'sum'})
pm_agg.reset_index(inplace=True)
pm_agg.head(20)

count   20084
mean     1283
std      1093
min         1
25%       511
50%       993
75%      1753
max     28474
Name: pm_diff, dtype: float64


Unnamed: 0,STATION,UNIT,pm_diff
0,1 AV,R248,204524
1,103 ST-CORONA,R208,150150
2,104 ST,R007,10646
3,104 ST,R354,9992
4,110 ST,R181,92085
5,111 ST,R008,20114
6,111 ST,R310,76759
7,121 ST,R009,7887
8,125 ST,R034,73071
9,125 ST,R102,220728


In [10]:
# make a new df by copying the am aggregate df
final = am_agg
print(final.head())

# add two new columns to include the mid-day and evening commute
# diff values
final['mid_diff'] = mid_agg['mid_diff'].values
final['pm_diff'] = pm_agg['pm_diff'].values

# make two new columns comparing the mid-day traffic to
# both the morning and the evening
final['am_v_mid'] = final['am_diff'] - final['mid_diff']
final['pm_v_mid'] = final['pm_diff'] - final['mid_diff']

# sort the df by  one of the important columns (am chosen here arbitrarily)
# and print the info to see how many rows
final.sort_values('pm_v_mid', ascending=False, inplace = True)

print(final.info())
final.head()

# get rid of all values where either evaluation is low, aribtrarily chosen
# to be 1,000 here. That value, 1000, is quantitatively not useful to 
# intuit - it is the number of people normalized to the number of turnstiles (I think?)

final = final[(final['am_v_mid'] > 2000) & (final['pm_v_mid'] > 2000)]
print(final.info())
final.head(20)

           STATION  UNIT  am_diff  mid_diff  pm_diff  am_v_mid  pm_v_mid
82           72 ST  R281    70374    633206   655689   -562832     22483
24          175 ST  R126    84085    546637   568465   -462552     21828
123  BOWLING GREEN  R042    80591    528722   550187   -448131     21465
14          145 ST  R101   195842    400075   418454   -204233     18379
235  VAN SICLEN AV  R068    27123    415702   432337   -388579     16635
<class 'pandas.core.frame.DataFrame'>
Int64Index: 245 entries, 32 to 244
Data columns (total 7 columns):
STATION     245 non-null object
UNIT        245 non-null object
am_diff     245 non-null float64
mid_diff    245 non-null float64
pm_diff     245 non-null float64
am_v_mid    245 non-null float64
pm_v_mid    245 non-null float64
dtypes: float64(5), object(2)
memory usage: 15.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 50 to 159
Data columns (total 7 columns):
STATION     20 non-null object
UNIT        20 non-null object
am_d

Unnamed: 0,STATION,UNIT,am_diff,mid_diff,pm_diff,am_v_mid,pm_v_mid
50,34 ST-HERALD SQ,R022,548004,365034,373423,182970,8389
191,LEXINGTON AV/53,R017,303767,205141,212829,98626,7688
170,GRD CNTRL-42 ST,R046,647244,190471,196476,456773,6005
70,51 ST,R049,194122,169580,174761,24542,5181
229,SUTPHIN-ARCHER,R024,192816,119367,124466,73449,5099
99,ATL AV-BARCLAY,R057,359329,102437,106983,256892,4546
9,125 ST,R102,215703,192650,197029,23053,4379
232,TIMES SQ-42 ST,R033,416855,200239,204257,216616,4018
147,COURT SQ,R359,133495,118187,122196,15308,4009
231,TIMES SQ-42 ST,R032,426334,119901,122996,306433,3095
