## Steps undertaken in this notebook:

1. Import and clean data using Abi's code
2. Split Dataframe in two, one for weekends and one for weekdays
3. Find average weekday and weekend volume for each station
4. Create new dataframe with difference between weekday and weekend avg volume
5. Sort by difference to find the stations with the largest weekday-minus-weekend gap in average daily volume (used here as a proxy for commuter traffic)




In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render every float in DataFrame/Series output with exactly three decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

In [2]:
# Combine three weeks' worth of MTA turnstile data from late April to early May
# (one file per week, downloaded straight from the MTA developer site).

df1 = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_190427.txt')
df2 = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_190504.txt')
df3 = pd.read_csv('http://web.mta.info/developers/data/nyct/turnstile/turnstile_190511.txt')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
# way to stack frames. ignore_index=True gives the combined frame a single unique RangeIndex
# instead of repeating each file's own 0..n-1 row labels.
df = pd.concat([df1, df2, df3], ignore_index=True)

In [3]:
# Data Cleaning

# Strip stray whitespace from the column labels so they can be referenced by their plain names.
df.columns = df.columns.str.strip()

# Merge the separate DATE and TIME strings into a single timestamp, then drop the originals.
# (Rebinding instead of inplace=True keeps the step free of hidden in-place mutation.)
df['DATETIME'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'], format="%m/%d/%Y %H:%M:%S")
df = df.drop(columns=['DATE', 'TIME'])

df['DAY'] = df['DATETIME'].dt.day_name()

# ENTRIES and EXITS are running counter readings, so the traffic for a row is the difference from the
# previous reading of the same turnstile. The first reading of each turnstile therefore becomes NaN.
#
# The turnstile key includes C/A (control area) as well as UNIT and SCP.
# NOTE(review): UNIT + SCP alone is not guaranteed to be unique across control areas; if two devices ever
# share a UNIT/SCP pair, their counters would be interleaved and the differences meaningless. Adding C/A
# is harmless when the pair is already unique.
# NOTE(review): diff() relies on rows being in chronological order within each turnstile - confirm for the
# source files.
TURNSTILE_KEY = ['C/A', 'UNIT', 'SCP']
per_turnstile = df.groupby(TURNSTILE_KEY)

# GroupBy.diff() is the built-in equivalent of transform(lambda x: x.diff()).
df['ENTRY COUNT'] = per_turnstile['ENTRIES'].diff()
df['EXIT COUNT'] = per_turnstile['EXITS'].diff()

In [4]:
# Keep only rows whose entry and exit counts are both non-negative.
# NaN counts compare as False, so rows without a computed count are dropped here as well.
has_valid_counts = df['ENTRY COUNT'].ge(0) & df['EXIT COUNT'].ge(0)
df = df[has_valid_counts]

In [5]:
# Upper bound on the entries or exits accepted for a single reading interval; rows at or above it are
# discarded. NOTE(review): 15000 is the original author's cutoff - presumably meant to remove counter
# glitches; confirm it is appropriate.
MAX_INTERVAL_COUNT = 15000

is_plausible = (df['ENTRY COUNT'] < MAX_INTERVAL_COUNT) & (df['EXIT COUNT'] < MAX_INTERVAL_COUNT)

# .copy() makes the filtered result an independent frame, so adding columns to it afterwards does not
# raise SettingWithCopyWarning.
df = df[is_plausible].copy()

In [6]:
# Total volume for each row: entries plus exits counted since the turnstile's previous reading
# (the readings in the sample output are four hours apart).
# assign() returns a new frame instead of writing a column into a filtered slice, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(VOLUME=df['ENTRY COUNT'] + df['EXIT COUNT'])
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,ENTRIES,EXITS,DATETIME,DAY,ENTRY COUNT,EXIT COUNT,VOLUME
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7026719,2382240,2019-04-20 04:00:00,Saturday,17.0,6.0,23.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7026737,2382269,2019-04-20 08:00:00,Saturday,18.0,29.0,47.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7026814,2382335,2019-04-20 12:00:00,Saturday,77.0,66.0,143.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7027007,2382395,2019-04-20 16:00:00,Saturday,193.0,60.0,253.0
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7027293,2382427,2019-04-20 20:00:00,Saturday,286.0,32.0,318.0


In [7]:
# Calendar date of each reading as an MM/DD/YYYY string, so that volumes can be totalled per day.
# assign() returns a new frame instead of writing a column into a filtered slice, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(DATE=df['DATETIME'].dt.strftime('%m/%d/%Y'))
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,ENTRIES,EXITS,DATETIME,DAY,ENTRY COUNT,EXIT COUNT,VOLUME,DATE
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7026719,2382240,2019-04-20 04:00:00,Saturday,17.0,6.0,23.0,04/20/2019
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7026737,2382269,2019-04-20 08:00:00,Saturday,18.0,29.0,47.0,04/20/2019
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7026814,2382335,2019-04-20 12:00:00,Saturday,77.0,66.0,143.0,04/20/2019
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7027007,2382395,2019-04-20 16:00:00,Saturday,193.0,60.0,253.0,04/20/2019
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7027293,2382427,2019-04-20 20:00:00,Saturday,286.0,32.0,318.0,04/20/2019


In [8]:
# Partition the rows by day name: Saturday/Sunday readings go to `weekends`, all others to `weekdays`.
weekend_days = ['Saturday', 'Sunday']
is_weekend = df['DAY'].isin(weekend_days)

weekends = df[is_weekend]
weekdays = df[~is_weekend]

To get the average daily volume per station, first compute the total volume for each station on each calendar day.

In [9]:
# Sum VOLUME for every (STATION, DATE) pair, separately for the weekday and the weekend rows.
# The result is a one-column frame indexed by STATION and DATE.
# NOTE(review): STATION names may not be unique across lines (LINENAME is available but unused here);
# confirm whether same-named stations should really be pooled together.
station_and_date = ['STATION', 'DATE']

weekday_totals = weekdays.groupby(station_and_date)[['VOLUME']].sum()
weekend_totals = weekends.groupby(station_and_date)[['VOLUME']].sum()

In [10]:
# Preview only: show the first rows with STATION and DATE as ordinary columns.
# reset_index() returns a new frame and nothing is assigned, so weekday_totals itself is unchanged;
# head(10) keeps the cell from rendering the whole table.
weekday_totals.reset_index().head(10)

Unnamed: 0,STATION,DATE,VOLUME
0,1 AV,04/22/2019,35706.000
1,1 AV,04/23/2019,36389.000
2,1 AV,04/24/2019,37278.000
3,1 AV,04/25/2019,38619.000
4,1 AV,04/26/2019,38401.000
5,1 AV,04/29/2019,36454.000
6,1 AV,04/30/2019,39009.000
7,1 AV,05/01/2019,40345.000
8,1 AV,05/02/2019,40741.000
9,1 AV,05/03/2019,42397.000


In [11]:
# Preview only: show the first rows with STATION and DATE as ordinary columns.
# reset_index() returns a new frame and nothing is assigned, so weekend_totals itself is unchanged;
# head(10) keeps the cell from rendering the whole table.
weekend_totals.reset_index().head(10)

Unnamed: 0,STATION,DATE,VOLUME
0,1 AV,04/20/2019,22404.000
1,1 AV,04/21/2019,24402.000
2,1 AV,04/27/2019,17079.000
3,1 AV,04/28/2019,12444.000
4,1 AV,05/04/2019,17728.000
5,1 AV,05/05/2019,13728.000
6,103 ST,04/20/2019,25214.000
7,103 ST,04/21/2019,25636.000
8,103 ST,04/27/2019,33985.000
9,103 ST,04/28/2019,28262.000


In [12]:
# Average the per-day totals within each station (the STATION level of the index),
# giving one mean weekday volume per station.
weekday_average = weekday_totals.groupby(level='STATION').mean()
weekday_average.head()

Unnamed: 0_level_0,VOLUME
STATION,Unnamed: 1_level_1
1 AV,38937.933
103 ST,52080.267
103 ST-CORONA,33784.533
104 ST,5230.6
110 ST,19662.667


In [13]:
# Average the per-day totals within each station (the STATION level of the index),
# giving one mean weekend volume per station.
weekend_average = weekend_totals.groupby(level='STATION').mean()
weekend_average.head()

Unnamed: 0_level_0,VOLUME
STATION,Unnamed: 1_level_1
1 AV,17964.167
103 ST,29322.667
103 ST-CORONA,22017.167
104 ST,923.333
110 ST,11005.333


In [14]:
# Preview only. reset_index() returns a new frame and nothing is assigned here, so both
# weekday_average and weekend_average keep STATION as their index.
# A cell renders only its last expression, so the unassigned weekday_average.reset_index() call that
# used to precede this line produced no output and had no effect; it has been removed.
weekend_average.reset_index().head(10)

Unnamed: 0,STATION,VOLUME
0,1 AV,17964.167
1,103 ST,29322.667
2,103 ST-CORONA,22017.167
3,104 ST,923.333
4,110 ST,11005.333
5,111 ST,13565.667
6,116 ST,33269.500
7,116 ST-COLUMBIA,11028.667
8,121 ST,952.333
9,125 ST,84999.167


In [15]:
# Weekday average minus weekend average for each station; the two Series are matched on their STATION index.
difference = weekday_average['VOLUME'].sub(weekend_average['VOLUME'])

In [16]:
# Preview the first five per-station differences (weekday average minus weekend average).
difference.head()

STATION
1 AV            20973.767
103 ST          22757.600
103 ST-CORONA   11767.367
104 ST           4307.267
110 ST           8657.333
Name: VOLUME, dtype: float64

In [17]:
# Promote the Series to a one-column DataFrame; the column keeps the Series name and STATION stays the index.
difference = difference.to_frame()

In [18]:
# Sanity check: confirm that difference is now a DataFrame.
type(difference)

pandas.core.frame.DataFrame

In [19]:
# Preview only: show the first rows with STATION as an ordinary column.
# reset_index() returns a new frame and nothing is assigned, so difference keeps STATION as its index;
# head(10) keeps the cell from rendering the whole table.
difference.reset_index().head(10)

Unnamed: 0,STATION,VOLUME
0,1 AV,20973.767
1,103 ST,22757.600
2,103 ST-CORONA,11767.367
3,104 ST,4307.267
4,110 ST,8657.333
5,111 ST,8912.267
6,116 ST,18285.300
7,116 ST-COLUMBIA,12799.800
8,121 ST,3587.733
9,125 ST,48702.767


In [20]:
# Redundant sanity check: difference has not been reassigned since it was converted, so it is still a DataFrame.
type(difference)

pandas.core.frame.DataFrame

In [21]:
# Preview only: show the first rows with STATION as an ordinary column.
# reset_index() returns a new frame and nothing is assigned, so difference keeps STATION as its index;
# head(10) keeps the cell from rendering the whole table.
difference.reset_index().head(10)

Unnamed: 0,STATION,VOLUME
0,1 AV,20973.767
1,103 ST,22757.600
2,103 ST-CORONA,11767.367
3,104 ST,4307.267
4,110 ST,8657.333
5,111 ST,8912.267
6,116 ST,18285.300
7,116 ST-COLUMBIA,12799.800
8,121 ST,3587.733
9,125 ST,48702.767


In [28]:
# Order stations from the largest to the smallest weekday-minus-weekend gap.
# Rebinding the sorted result (rather than inplace=True) avoids mutating an object in place.
difference = difference.sort_values('VOLUME', ascending=False)

In [31]:
# Show the ten stations at the top of the sorted table, with STATION as an ordinary column.
# reset_index() returns a new frame and nothing is assigned, so difference keeps STATION as its index.
difference.reset_index().head(10)

Unnamed: 0,STATION,VOLUME
0,GRD CNTRL-42 ST,187532.133
1,34 ST-PENN STA,155591.300
2,FULTON ST,120888.767
3,34 ST-HERALD SQ,110090.133
4,23 ST,105874.133
5,PATH NEW WTC,105547.667
6,47-50 STS ROCK,89943.367
7,14 ST-UNION SQ,85822.967
8,FLUSHING-MAIN,77887.167
9,TIMES SQ-42 ST,77014.033


In [23]:
#top_weekdays = weekday_average.loc[['GRD CNTRL-42 ST','34 ST-HERALD SQ','34 ST-PENN STA','42 ST-PORT AUTH','14 ST-UNION SQ',
 #                                  'TIMES SQ-42 ST','59 ST COLUMBUS','FULTON ST','FLUSHING-MAIN','PATH NEW WTC']]

In [24]:
#top_weekdays.head(50)

In [25]:
#top_weekend = weekend_average.loc[['GRD CNTRL-42 ST','34 ST-HERALD SQ','34 ST-PENN STA','42 ST-PORT AUTH','14 ST-UNION SQ',
                                  # 'TIMES SQ-42 ST','59 ST COLUMBUS','FULTON ST','FLUSHING-MAIN','PATH NEW WTC']]