In [1059]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

In [1060]:
engine = create_engine("sqlite:///data/mta_turnstile.db")

In [1061]:
all_tables = engine.table_names()
all_tables
some_tables = all_tables[0:3]
some_tables

  all_tables = engine.table_names()


['turnstile_2019_0105', 'turnstile_2019_0112', 'turnstile_2019_0119']

In [1062]:
dfs = []
for table in some_tables:
    dfs.append(pd.read_sql(table,engine))
    mtadf = pd.concat(dfs)
mtadf.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,03:00:00,REGULAR,6889287,2335920
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,07:00:00,REGULAR,6889299,2335936
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,11:00:00,REGULAR,6889364,2336038
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,15:00:00,REGULAR,6889605,2336101
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,19:00:00,REGULAR,6889966,2336173


In [1063]:
print('DTYPES:\n',mtadf.dtypes,'\n')
print(mtadf.columns)
print(mtadf.shape)

DTYPES:
 C/A                                                                     object
UNIT                                                                    object
SCP                                                                     object
STATION                                                                 object
LINENAME                                                                object
DIVISION                                                                object
DATE                                                                    object
TIME                                                                    object
DESC                                                                    object
ENTRIES                                                                  int64
EXITS                                                                    int64
dtype: object 

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',


In [1064]:
# fix the insane amount of space after 'EXITS'
mtadf.rename(str.strip, axis='columns',inplace=True)
mtadf.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [1065]:
# change DATE and TIME to datetime objects
mtadf['OBSERVED_AT'] = pd.to_datetime(mtadf['DATE']+ ' ' + mtadf['TIME'], 
                                      format='%m/%d/%Y %H:%M:%S')

In [1041]:
# 21 days of data AKA 3 weeks
# mtadf.DATE.value_counts().sort_index()

In [1043]:
# SAVE FOR LATER
# create column for day of week
# mtadf['WEEKDAY']=mtadf['OBSERVED_AT'].dt.dayofweek
# change WEEKDAY from int64 to string Monday-Sunday
'''daymap = {0:'Monday',
          1: 'Tuesday',
          2: 'Wednesday',
          3: 'Thursday',
          4: 'Friday',
          5: 'Saturday',
          6: 'Sunday'}
mtadf['WEEKDAY']=mtadf['WEEKDAY'].map(daymap)
'''

"daymap = {0:'Monday',\n          1: 'Tuesday',\n          2: 'Wednesday',\n          3: 'Thursday',\n          4: 'Friday',\n          5: 'Saturday',\n          6: 'Sunday'}\nmtadf['WEEKDAY']=mtadf['WEEKDAY'].map(daymap)\n"

In [1066]:
# create some other columns based on the datetime object
mtadf['MONTH'] = mtadf.OBSERVED_AT.dt.month
mtadf['DAY'] = mtadf.OBSERVED_AT.dt.day

# don't end up using
# mtadf['HOUR'] = mtadf.OBSERVED_AT.dt.hour
# mtadf['MIN'] = mtadf.OBSERVED_AT.dt.minute

In [1067]:
# create identifyer for each turnstile then drop original columns
mtadf['TURNSTILE'] = mtadf['UNIT'] + '-' + mtadf['C/A'] + '-' + mtadf['SCP']

# didn't end up needing this
# mtadf['TURNSTILE_ID'] = mtadf['UNIT'] + '-' + mtadf['C/A'] + '-' + mtadf['SCP']\
                       # + '-' + mtadf['DATE'] + '-' + mtadf['TIME']

In [1068]:
mtadf.drop(['C/A','UNIT','SCP'],axis=1,inplace=True)

In [1069]:
mtadf.columns

Index(['STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME', 'DESC', 'ENTRIES',
       'EXITS', 'OBSERVED_AT', 'MONTH', 'DAY', 'TURNSTILE'],
      dtype='object')

In [1070]:
# rearrange order of columns
col_names = ['STATION','TURNSTILE','OBSERVED_AT',
             'DATE','MONTH','DAY','WEEKDAY','TIME',
             'ENTRIES','EXITS',
             'LINENAME','DIVISION','DESC']
mtadf = mtadf.reindex(columns=col_names)
mtadf.head(1)

Unnamed: 0,STATION,TURNSTILE,OBSERVED_AT,DATE,MONTH,DAY,WEEKDAY,TIME,ENTRIES,EXITS,LINENAME,DIVISION,DESC
0,59 ST,R051-A002-02-00-00,2018-12-29 03:00:00,12/29/2018,12,29,,03:00:00,6889287,2335920,NQR456W,BMT,REGULAR


In [1071]:
# sort the dataframe by station, then month/day/hour/min
mtadf.sort_values(['STATION','MONTH','DAY'],inplace=True)

In [1072]:
mtadf.OBSERVED_AT.dt.hour.value_counts()
#some observations are at HOUR 0 aka midnight

7     55924
11    55079
23    54427
15    54375
3     54336
19    54327
8     40925
16    39071
12    39040
0     38984
4     38978
20    38898
9      4834
5      4493
13     4470
21     4433
17     4431
1      4410
10     3290
6      2783
14     2511
18     2497
2      2459
22     2445
Name: OBSERVED_AT, dtype: int64

In [1073]:
# create an hour offset
one_hour = pd.offsets.Hour()

In [1074]:
# how many rows occur from midnight to midnight:59?
print(mtadf[mtadf.OBSERVED_AT.dt.hour == 0].shape)
print(mtadf.shape)

(38984, 13)
(607420, 13)


In [1075]:
# if the observation was from midnight to midight:59
# subtract 1 hour so the log is included for the previous day
mtadf['OBSERVED_AT'] = mtadf['OBSERVED_AT'].apply(lambda x: x-one_hour if x.hour == 0 else x)

In [1076]:
# check if we have any instances of midnight to midnight:59
mtadf['HOUR'] = mtadf['OBSERVED_AT'].dt.hour
print(mtadf[mtadf['HOUR'] == 0])

Empty DataFrame
Columns: [STATION, TURNSTILE, OBSERVED_AT, DATE, MONTH, DAY, WEEKDAY, TIME, ENTRIES, EXITS, LINENAME, DIVISION, DESC, HOUR]
Index: []


In [1077]:
# get daily cumulative # per turnstile per date
mtadf_ts_daily = (mtadf.groupby(['TURNSTILE','STATION','DATE'],as_index=False)
                        [['ENTRIES','EXITS']].max())
mtadf_ts_daily

Unnamed: 0,TURNSTILE,STATION,DATE,ENTRIES,EXITS
0,R001-A058-01-00-00,WHITEHALL S-FRY,01/01/2019,1495716,3470742
1,R001-A058-01-00-00,WHITEHALL S-FRY,01/02/2019,1496890,3473304
2,R001-A058-01-00-00,WHITEHALL S-FRY,01/03/2019,1498200,3475903
3,R001-A058-01-00-00,WHITEHALL S-FRY,01/04/2019,1499417,3478394
4,R001-A058-01-00-00,WHITEHALL S-FRY,01/05/2019,1499698,3479101
...,...,...,...,...,...
100631,R572-N702A-01-03-04,96 ST-2 AVE,01/17/2019,931008,304809
100632,R572-N702A-01-03-04,96 ST-2 AVE,01/18/2019,932433,305272
100633,R572-N702A-01-03-04,96 ST-2 AVE,12/29/2018,909936,297827
100634,R572-N702A-01-03-04,96 ST-2 AVE,12/30/2018,910208,297928


In [1078]:
mtadf_ts_daily[['PREV_DATE','PREV_ENTRIES','PREV_EXITS']]=mtadf_ts_daily\
                            .groupby(['TURNSTILE','STATION'])[['DATE', 'ENTRIES','EXITS']]\
                            .apply(lambda grp: grp.shift(1))
mtadf_ts_daily            

Unnamed: 0,TURNSTILE,STATION,DATE,ENTRIES,EXITS,PREV_DATE,PREV_ENTRIES,PREV_EXITS
0,R001-A058-01-00-00,WHITEHALL S-FRY,01/01/2019,1495716,3470742,,,
1,R001-A058-01-00-00,WHITEHALL S-FRY,01/02/2019,1496890,3473304,01/01/2019,1495716.0,3470742.0
2,R001-A058-01-00-00,WHITEHALL S-FRY,01/03/2019,1498200,3475903,01/02/2019,1496890.0,3473304.0
3,R001-A058-01-00-00,WHITEHALL S-FRY,01/04/2019,1499417,3478394,01/03/2019,1498200.0,3475903.0
4,R001-A058-01-00-00,WHITEHALL S-FRY,01/05/2019,1499698,3479101,01/04/2019,1499417.0,3478394.0
...,...,...,...,...,...,...,...,...
100631,R572-N702A-01-03-04,96 ST-2 AVE,01/17/2019,931008,304809,01/16/2019,929537.0,304322.0
100632,R572-N702A-01-03-04,96 ST-2 AVE,01/18/2019,932433,305272,01/17/2019,931008.0,304809.0
100633,R572-N702A-01-03-04,96 ST-2 AVE,12/29/2018,909936,297827,01/18/2019,932433.0,305272.0
100634,R572-N702A-01-03-04,96 ST-2 AVE,12/30/2018,910208,297928,12/29/2018,909936.0,297827.0


In [1079]:
# drop the rows for the earliest date in the DF for each Turnstile
mtadf_ts_daily.dropna(subset=["PREV_DATE"], axis=0, inplace=True)

In [1080]:
mtadf_ts_daily.shape

(95814, 8)

In [1081]:
# how many entries are going backwards
mtadf_ts_daily[mtadf_ts_daily['ENTRIES'] < mtadf_ts_daily['PREV_ENTRIES']].shape

(5245, 8)

In [1082]:
# how many turnstiles per station
# mtadf.groupby('STATION')['TURNSTILE'].nunique().describe()

In [1083]:
def get_daily_entries(df):
    #max_count is 2 person per minute through the turnstile
    max_count = 120 * 24
    count = df['ENTRIES'] - df['PREV_ENTRIES']
    if count < 0:
        count = -count
        
    if count > max_count:
        count = min(df["ENTRIES"], df["PREV_ENTRIES"])
    
    if count > max_count:
        return 0
    return count

In [1084]:
mtadf_ts_daily['NET_ENTRIES']= mtadf_ts_daily.apply(get_daily_entries, axis=1)

In [1085]:
def get_daily_exits(df):
    max_count = 120 * 24
    count = df['EXITS'] - df['PREV_EXITS']
    if count < 0:
        count = -count
        
    if count > max_count:
        count = min(df['EXITS'], df['PREV_EXITS'])
    
    if count > max_count:
        return 0
    return count

In [1086]:
mtadf_ts_daily['NET_EXITS']= mtadf_ts_daily.apply(get_daily_exits, axis=1)

In [1087]:
# create a DF with daily # per STATION per DATE, sum the NET_ENTRIES NET_EXITS per STATION
mtadf_st_daily = mtadf_ts_daily.groupby(['STATION','DATE'],as_index=False)[['NET_ENTRIES','NET_EXITS']].sum()

In [1088]:
# create a columb for NET_BUSY
mtadf_st_daily['NET_BUSY'] = mtadf_st_daily['NET_ENTRIES'] + mtadf_st_daily['NET_EXITS']

In [1089]:
# we have a dataframe that shows net entries, net exits, and net busy by station by day!
mtadf_st_daily

Unnamed: 0,STATION,DATE,NET_ENTRIES,NET_EXITS,NET_BUSY
0,1 AV,01/02/2019,7309.0,5242.0,12551.0
1,1 AV,01/03/2019,7533.0,5588.0,13121.0
2,1 AV,01/04/2019,8009.0,5905.0,13914.0
3,1 AV,01/05/2019,7542.0,6590.0,14132.0
4,1 AV,01/06/2019,12021.0,10851.0,22872.0
...,...,...,...,...,...
7522,ZEREGA AV,01/17/2019,2811.0,2600.0,5411.0
7523,ZEREGA AV,01/18/2019,2678.0,2430.0,5108.0
7524,ZEREGA AV,12/29/2018,528.0,1.0,529.0
7525,ZEREGA AV,12/30/2018,1068.0,1197.0,2265.0


In [1090]:
# put DATE back as a datetime object
mtadf_st_daily['DATE'] = pd.to_datetime(mtadf_st_daily['DATE'])

In [1091]:
mtadf_st_daily['WEEKDAY']=mtadf_st_daily['DATE'].dt.dayofweek
# change WEEKDAY from int64 to string Monday-Sunday
daymap = {0:'Monday',
          1: 'Tuesday',
          2: 'Wednesday',
          3: 'Thursday',
          4: 'Friday',
          5: 'Saturday',
          6: 'Sunday'}
mtadf_st_daily['WEEKDAY']=mtadf_st_daily['WEEKDAY'].map(daymap)

In [1115]:
# get the average business on a given day of the week for that station
mtadf_st_daymean = mtadf_st_daily.groupby(['STATION','WEEKDAY'])[['NET_BUSY']].mean()