In [394]:
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine, inspect as sa_inspect

In [395]:
# Connection to the SQLite file with the turnstile tables
# (three slashes = path relative to the working directory).
MTA_DB_URL = "sqlite:///data/mta_turnstile.db"
engine = create_engine(MTA_DB_URL)

In [396]:
# List the tables available in the database.
# Engine.table_names() is deprecated (SQLAlchemy 1.4) and removed in 2.0;
# the Inspector is the supported way to enumerate tables.
all_tables = sa_inspect(engine).get_table_names()

  all_tables = engine.table_names()


In [397]:
# Load the full turnstile table into a DataFrame and preview the first rows.
turnstile_query = 'SELECT * FROM turnstile_2019_0105;'
mtadf = pd.read_sql(turnstile_query, con=engine)
mtadf.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,03:00:00,REGULAR,6889287,2335920
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,07:00:00,REGULAR,6889299,2335936
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,11:00:00,REGULAR,6889364,2336038
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,15:00:00,REGULAR,6889605,2336101
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,19:00:00,REGULAR,6889966,2336173


In [398]:
# Inspect the column dtypes and the raw column labels before any cleaning.
column_dtypes = mtadf.dtypes
print('DTYPES:\n', column_dtypes, '\n')

raw_columns = mtadf.columns
print(raw_columns)

DTYPES:
 C/A                                                                     object
UNIT                                                                    object
SCP                                                                     object
STATION                                                                 object
LINENAME                                                                object
DIVISION                                                                object
DATE                                                                    object
TIME                                                                    object
DESC                                                                    object
ENTRIES                                                                  int64
EXITS                                                                    int64
dtype: object 

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',


In [399]:
# The raw 'EXITS' header carries a long run of trailing whitespace.
# Trim every column label so columns can be addressed by their clean names.
mtadf.columns = [label.strip() for label in mtadf.columns]
mtadf.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [400]:
# Convert the DATE and TIME string columns to datetime64.
# TIME has no date part, so only its time-of-day component is meaningful.
date_strings = mtadf['DATE']
mtadf['DATE'] = pd.to_datetime(date_strings)

time_strings = mtadf['TIME']
mtadf['TIME'] = pd.to_datetime(time_strings, format='%H:%M:%S')

In [401]:
# Split the parsed reading time into separate HOUR and MIN columns.
reading_time = mtadf['TIME'].dt
mtadf['HOUR'] = reading_time.hour
mtadf['MIN'] = reading_time.minute

# TIME is redundant once HOUR and MIN exist, so remove it in place.
del mtadf['TIME']

In [402]:
# Build a unique identifier for each turnstile out of its three key columns,
# then remove those columns since the identifier replaces them.
turnstile_key_cols = ['C/A', 'UNIT', 'SCP']
mtadf['TURNSTILE'] = (
    mtadf['C/A']
    + '-' + mtadf['UNIT']
    + '-' + mtadf['SCP']
)
mtadf.drop(columns=turnstile_key_cols, inplace=True)

# Put the columns into the order used for the rest of the analysis.
col_names = [
    'STATION', 'TURNSTILE',
    'DATE', 'HOUR', 'MIN',
    'ENTRIES', 'EXITS',
    'LINENAME', 'DIVISION', 'DESC',
]
mtadf = mtadf.reindex(columns=col_names)

In [403]:
# Count missing values per column; every count is expected to be zero.
missing_per_column = mtadf.isna().sum()
missing_per_column

STATION      0
TURNSTILE    0
DATE         0
HOUR         0
MIN          0
ENTRIES      0
EXITS        0
LINENAME     0
DIVISION     0
DESC         0
dtype: int64

In [404]:
# Order the readings by station, then turnstile, then date, then time of day.
mtadfs = mtadf.sort_values(['STATION','TURNSTILE','DATE','HOUR','MIN'])

# Which 10 stations have the fewest turnstiles?
# (Exploratory lookup: a cell renders only its last expression, so run this
# line on its own to see the result.)
mtadfs.groupby(['STATION'])['TURNSTILE'].nunique().nsmallest(10)

# Build a DataFrame restricted to three of those stations on a single day.
# DATE is datetime64, so it is compared against a Timestamp: matching a
# datetime column against strings through isin() is deprecated in pandas 2.2.
target_stations = ['138/GRAND CONC','182-183 STS','190 ST']
target_day = pd.Timestamp('2019-01-04')
mtadfs_leastt = mtadfs[mtadfs['STATION'].isin(target_stations)
                       & (mtadfs['DATE'] == target_day)]

# Largest ENTRIES reading per selected station.
# CLEVELAND ST and SUTTER AV showed significantly fewer entries in this check
# and were therefore left out of target_stations.
mtadfs_leastt.groupby(['STATION'])['ENTRIES'].max()

# The resulting DataFrame holds readings
# for 3 STATIONS (138/GRAND CONC, 182-183 STS, 190 ST)
# on 1 DAY (Jan 4, 2019)
# from 9 TURNSTILES in total
mtadfs_leastt['TURNSTILE'].unique()

array(['R259-R307-00-00-00', 'R259-R307-00-00-01', 'R259-R307-00-00-02',
       'N215-R237-00-00-00', 'N215-R237-00-00-01', 'N215-R237-00-00-02',
       'N006A-R280-00-00-00', 'N006A-R280-00-00-01',
       'N006A-R280-00-00-02'], dtype=object)

In [405]:
# Per-station summary of the TURNSTILE column (3 distinct turnstiles per STATION).
station_groups = mtadfs_leastt.groupby(['STATION'])
station_groups[['TURNSTILE']].describe().reset_index()

# Number of readings per TURNSTILE (6 each).
sorted_turnstiles = mtadfs_leastt['TURNSTILE'].sort_values()
sorted_turnstiles.value_counts().reset_index()

Unnamed: 0,index,TURNSTILE
0,N006A-R280-00-00-02,6
1,N006A-R280-00-00-00,6
2,N215-R237-00-00-02,6
3,N215-R237-00-00-00,6
4,R259-R307-00-00-00,6
5,R259-R307-00-00-02,6
6,R259-R307-00-00-01,6
7,N215-R237-00-00-01,6
8,N006A-R280-00-00-01,6


In [412]:
# Keep only the columns needed for the traffic calculation and renumber the
# rows from 0 (the old index is discarded rather than kept as a column).
traffic_cols = ['STATION', 'TURNSTILE', 'HOUR', 'MIN', 'ENTRIES', 'EXITS']
mtadfs_leastt_ee = mtadfs_leastt[traffic_cols].reset_index(drop=True)

In [413]:
# Traffic during each interval = next reading minus current reading, computed
# separately for every turnstile. ENTRIES and EXITS appear to be running
# counters, so these are differences between consecutive readings, not sums;
# the *_SUM column names are kept because later cells use them.
#
# The shift is done per TURNSTILE group, so a value can never be pulled in
# from a neighbouring turnstile, whatever the row order of the frame. The last
# reading of each turnstile has no successor and therefore gets NaN.
next_reading = (
    mtadfs_leastt_ee
    .groupby('TURNSTILE')[['ENTRIES', 'EXITS']]
    .shift(-1)
)
mtadfs_leastt_ee['ENTRIES_SUM'] = next_reading['ENTRIES'] - mtadfs_leastt_ee['ENTRIES']
mtadfs_leastt_ee['EXITS_SUM'] = next_reading['EXITS'] - mtadfs_leastt_ee['EXITS']

In [414]:
# BUSY = combined exit and entry activity for the interval.
# NaN in either input propagates to BUSY.
exits_delta = mtadfs_leastt_ee['EXITS_SUM']
entries_delta = mtadfs_leastt_ee['ENTRIES_SUM']
mtadfs_leastt_ee['BUSY'] = exits_delta + entries_delta

In [415]:
# Show the finished frame, including the derived ENTRIES_SUM, EXITS_SUM and BUSY columns.
mtadfs_leastt_ee

Unnamed: 0,STATION,TURNSTILE,HOUR,MIN,ENTRIES,EXITS,ENTRIES_SUM,EXITS_SUM,BUSY
0,138/GRAND CONC,R259-R307-00-00-00,3,0,3599462,3348054,166.0,128.0,294.0
1,138/GRAND CONC,R259-R307-00-00-00,7,0,3599628,3348182,345.0,355.0,700.0
2,138/GRAND CONC,R259-R307-00-00-00,11,0,3599973,3348537,307.0,210.0,517.0
3,138/GRAND CONC,R259-R307-00-00-00,15,0,3600280,3348747,389.0,375.0,764.0
4,138/GRAND CONC,R259-R307-00-00-00,19,0,3600669,3349122,180.0,167.0,347.0
5,138/GRAND CONC,R259-R307-00-00-00,23,0,3600849,3349289,,,
6,138/GRAND CONC,R259-R307-00-00-01,3,0,4257977,3755167,122.0,82.0,204.0
7,138/GRAND CONC,R259-R307-00-00-01,7,0,4258099,3755249,216.0,188.0,404.0
8,138/GRAND CONC,R259-R307-00-00-01,11,0,4258315,3755437,141.0,158.0,299.0
9,138/GRAND CONC,R259-R307-00-00-01,15,0,4258456,3755595,251.0,187.0,438.0
