In [2]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
engine = create_engine("sqlite:///data/mta_turnstile.db")

In [4]:
all_tables = engine.table_names()
all_tables
some_tables = all_tables[0:3]
some_tables

['turnstile_2019_0105', 'turnstile_2019_0112', 'turnstile_2019_0119']

In [5]:
dfs = []
for table in some_tables:
    dfs.append(pd.read_sql(table,engine))
    mtadf = pd.concat(dfs)
mtadf.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,03:00:00,REGULAR,6889287,2335920
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,07:00:00,REGULAR,6889299,2335936
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,11:00:00,REGULAR,6889364,2336038
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,15:00:00,REGULAR,6889605,2336101
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,12/29/2018,19:00:00,REGULAR,6889966,2336173


In [6]:
print('DTYPES:\n',mtadf.dtypes,'\n')
print(mtadf.columns)
print(mtadf.shape)

DTYPES:
 C/A                                                                     object
UNIT                                                                    object
SCP                                                                     object
STATION                                                                 object
LINENAME                                                                object
DIVISION                                                                object
DATE                                                                    object
TIME                                                                    object
DESC                                                                    object
ENTRIES                                                                  int64
EXITS                                                                    int64
dtype: object 

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',


In [7]:
# fix the insane amount of space after 'EXITS'
mtadf.rename(str.strip, axis='columns',inplace=True)
mtadf.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [8]:
# change DATE and TIME to datetime objects
mtadf['OBSERVED_AT'] = pd.to_datetime(mtadf['DATE']+ ' ' + mtadf['TIME'], 
                                      format='%m/%d/%Y %H:%M:%S')

In [9]:
# 21 days of data AKA 3 weeks
# mtadf.DATE.value_counts().sort_index()

In [10]:
# create column for day of week
mtadf['WEEKDAY']=mtadf['OBSERVED_AT'].dt.dayofweek

In [11]:
# change WEEKDAY from int64 to string Monday-Sunday
daymap = {0:'Monday',
          1: 'Tuesday',
          2: 'Wednesday',
          3: 'Thursday',
          4: 'Friday',
          5: 'Saturday',
          6: 'Sunday'}
mtadf['WEEKDAY']=mtadf['WEEKDAY'].map(daymap)

In [12]:
# create some other columns based on the datetime object
mtadf['MONTH'] = mtadf.OBSERVED_AT.dt.month
mtadf['DAY'] = mtadf.OBSERVED_AT.dt.day
mtadf['HOUR'] = mtadf.OBSERVED_AT.dt.hour
mtadf['MIN'] = mtadf.OBSERVED_AT.dt.minute

In [13]:
# create identifyer for each turnstile then drop original columns
mtadf['TURNSTILE'] = mtadf['UNIT'] + '-' + mtadf['C/A'] + '-' + mtadf['SCP']

In [14]:
mtadf.drop(['C/A','UNIT','SCP'],axis=1,inplace=True)

In [15]:
mtadf.columns

Index(['STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME', 'DESC', 'ENTRIES',
       'EXITS', 'OBSERVED_AT', 'WEEKDAY', 'MONTH', 'DAY', 'HOUR', 'MIN',
       'TURNSTILE'],
      dtype='object')

In [16]:
# rearrange order of columns
col_names = ['STATION','TURNSTILE','OBSERVED_AT',
             'DATE','MONTH','DAY','WEEKDAY','TIME','HOUR','MIN',
             'ENTRIES','EXITS',
             'LINENAME','DIVISION','DESC']
mtadf = mtadf.reindex(columns=col_names)
mtadf.head(1)

Unnamed: 0,STATION,TURNSTILE,OBSERVED_AT,DATE,MONTH,DAY,WEEKDAY,TIME,HOUR,MIN,ENTRIES,EXITS,LINENAME,DIVISION,DESC
0,59 ST,R051-A002-02-00-00,2018-12-29 03:00:00,12/29/2018,12,29,Saturday,03:00:00,3,0,6889287,2335920,NQR456W,BMT,REGULAR


In [17]:
# sort the dataframe by station, then month/day/hour/min
mtadf.sort_values(['STATION','MONTH','DAY'],inplace=True)

In [18]:
# mtadf.OBSERVED_AT.dt.hour.value_counts()
#some observations are at 00:00:00 aka midnight

In [19]:
# create an hour offset
one_hour = pd.offsets.Hour()

In [20]:
# how many rows occur from midnight to midnight:59?
# print(mtadf[mtadf['HOUR'] == 0].shape)
# print(mtadf.shape)

In [21]:
# if the observation was from midnight to midight:59
# subtract 1 hour so the log is included for the previous day
mtadf['OBSERVED_AT'] = mtadf['OBSERVED_AT'].apply(lambda x: x-one_hour if x.hour == 0 else x)

In [22]:
# check if we have any instances of midnight to midnight:59
# mtadf['HOUR'] = mtadf['OBSERVED_AT'].dt.hour
# print(mtadf[mtadf['HOUR'] == 0])

In [23]:
# fix our previous columbs made from OBSERVED_AT
mtadf['MONTH'] = mtadf.OBSERVED_AT.dt.month
mtadf['DAY'] = mtadf.OBSERVED_AT.dt.day
mtadf['HOUR'] = mtadf.OBSERVED_AT.dt.hour
mtadf['MIN'] = mtadf.OBSERVED_AT.dt.minute

In [24]:
# cumulative entries per day per turnstile per station
mtadf_ts_daily = (mtadf.groupby(['TURNSTILE','STATION','DATE'],as_index=False)
                        [['ENTRIES','EXITS']].max())
mtadf_ts_daily.head(30)

Unnamed: 0,TURNSTILE,STATION,DATE,ENTRIES,EXITS
0,R001-A058-01-00-00,WHITEHALL S-FRY,01/01/2019,1495716,3470742
1,R001-A058-01-00-00,WHITEHALL S-FRY,01/02/2019,1496890,3473304
2,R001-A058-01-00-00,WHITEHALL S-FRY,01/03/2019,1498200,3475903
3,R001-A058-01-00-00,WHITEHALL S-FRY,01/04/2019,1499417,3478394
4,R001-A058-01-00-00,WHITEHALL S-FRY,01/05/2019,1499698,3479101
5,R001-A058-01-00-00,WHITEHALL S-FRY,01/06/2019,1499912,3479811
6,R001-A058-01-00-00,WHITEHALL S-FRY,01/07/2019,1501081,3482319
7,R001-A058-01-00-00,WHITEHALL S-FRY,01/08/2019,1502262,3484936
8,R001-A058-01-00-00,WHITEHALL S-FRY,01/09/2019,1503601,3487633
9,R001-A058-01-00-00,WHITEHALL S-FRY,01/10/2019,1504933,3490320


In [28]:
mtadf_ts_daily[['PREV_DATE','PREV_ENTRIES','PREV_EXITS']]=mtadf_ts_daily\
                            .groupby(['TURNSTILE','STATION'])[['DATE', 'ENTRIES','EXITS']]\
                            .apply(lambda grp: grp.shift(1))
mtadf_ts_daily            

Unnamed: 0,TURNSTILE,STATION,DATE,ENTRIES,EXITS,PREV_DATE,PREV_ENTRIES,PREV_EXITS
0,R001-A058-01-00-00,WHITEHALL S-FRY,01/01/2019,1495716,3470742,,,
1,R001-A058-01-00-00,WHITEHALL S-FRY,01/02/2019,1496890,3473304,01/01/2019,1495716.0,3470742.0
2,R001-A058-01-00-00,WHITEHALL S-FRY,01/03/2019,1498200,3475903,01/02/2019,1496890.0,3473304.0
3,R001-A058-01-00-00,WHITEHALL S-FRY,01/04/2019,1499417,3478394,01/03/2019,1498200.0,3475903.0
4,R001-A058-01-00-00,WHITEHALL S-FRY,01/05/2019,1499698,3479101,01/04/2019,1499417.0,3478394.0
...,...,...,...,...,...,...,...,...
100631,R572-N702A-01-03-04,96 ST-2 AVE,01/17/2019,931008,304809,01/16/2019,929537.0,304322.0
100632,R572-N702A-01-03-04,96 ST-2 AVE,01/18/2019,932433,305272,01/17/2019,931008.0,304809.0
100633,R572-N702A-01-03-04,96 ST-2 AVE,12/29/2018,909936,297827,01/18/2019,932433.0,305272.0
100634,R572-N702A-01-03-04,96 ST-2 AVE,12/30/2018,910208,297928,12/29/2018,909936.0,297827.0


In [29]:
# drop the rows for the earliest date in the DF for each Turnstile
mtadf_ts_daily.dropna(subset=["PREV_DATE"], axis=0, inplace=True)

In [30]:
#mtadf_ts_daily.shape

In [31]:
# how many entries are going backwards
mtadf_ts_daily[mtadf_ts_daily['ENTRIES'] < mtadf_ts_daily['PREV_ENTRIES']].shape

(5245, 8)

In [32]:
# how many turnstiles per station
mtadf.groupby('STATION')['TURNSTILE'].nunique().describe()

count    377.000000
mean      12.790451
std       12.903122
min        2.000000
25%        5.000000
50%        9.000000
75%       16.000000
max      100.000000
Name: TURNSTILE, dtype: float64

In [35]:
# get the entries per day for each turnstile
def get_daily_entries(df):
    #max_count is 2 person per minute through the turnstile
    max_count = 120 * 24
    count = df['ENTRIES'] - df['PREV_ENTRIES']
    if count < 0:
        count = -count
        
    if count > max_count:
        count = min(df["ENTRIES"], df["PREV_ENTRIES"])
    
    if count > max_count:
        return 0
    return count

In [36]:
mtadf_ts_daily['NET_ENTRIES']= mtadf_ts_daily.apply(get_daily_entries, axis=1)

In [39]:
# same for exits
def get_daily_exits(df):
    max_count = 120 * 24
    count = df['EXITS'] - df['PREV_EXITS']
    if count < 0:
        count = -count
        
    if count > max_count:
        count = min(df['EXITS'], df['PREV_EXITS'])
    
    if count > max_count:
        return 0
    return count

In [40]:
mtadf_ts_daily['NET_EXITS']= mtadf_ts_daily.apply(get_daily_exits, axis=1)

In [1009]:
mtadf_ts_daily.groupby(['STATION','DATE'],as_index=False)[['NET_ENTRIES','NET_EXITS']].sum()

Unnamed: 0,STATION,DATE,NET_ENTRIES,NET_EXITS
0,1 AV,01/02/2019,7309.0,5242.0
1,1 AV,01/03/2019,7533.0,5588.0
2,1 AV,01/04/2019,8009.0,5905.0
3,1 AV,01/05/2019,7542.0,6590.0
4,1 AV,01/06/2019,12021.0,10851.0
...,...,...,...,...
7522,ZEREGA AV,01/17/2019,2811.0,2600.0
7523,ZEREGA AV,01/18/2019,2678.0,2430.0
7524,ZEREGA AV,12/29/2018,528.0,1.0
7525,ZEREGA AV,12/30/2018,1068.0,1197.0


In [1013]:
mtadf_ts_daily['NET_BUSY'] = mtadf_ts_daily['NET_ENTRIES'] + mtadf_ts_daily['NET_EXITS']

In [1014]:
mtadf_ts_daily

Unnamed: 0,TURNSTILE,STATION,DATE,ENTRIES,EXITS,PREV_DATE,PREV_ENTRIES,PREV_EXITS,NET_ENTRIES,NET_EXITS,NET_BUSY
1,R001-A058-01-00-00,WHITEHALL S-FRY,01/02/2019,1496890,3473304,01/01/2019,1495716.0,3470742.0,1174.0,2562.0,3736.0
2,R001-A058-01-00-00,WHITEHALL S-FRY,01/03/2019,1498200,3475903,01/02/2019,1496890.0,3473304.0,1310.0,2599.0,3909.0
3,R001-A058-01-00-00,WHITEHALL S-FRY,01/04/2019,1499417,3478394,01/03/2019,1498200.0,3475903.0,1217.0,2491.0,3708.0
4,R001-A058-01-00-00,WHITEHALL S-FRY,01/05/2019,1499698,3479101,01/04/2019,1499417.0,3478394.0,281.0,707.0,988.0
5,R001-A058-01-00-00,WHITEHALL S-FRY,01/06/2019,1499912,3479811,01/05/2019,1499698.0,3479101.0,214.0,710.0,924.0
...,...,...,...,...,...,...,...,...,...,...,...
100631,R572-N702A-01-03-04,96 ST-2 AVE,01/17/2019,931008,304809,01/16/2019,929537.0,304322.0,1471.0,487.0,1958.0
100632,R572-N702A-01-03-04,96 ST-2 AVE,01/18/2019,932433,305272,01/17/2019,931008.0,304809.0,1425.0,463.0,1888.0
100633,R572-N702A-01-03-04,96 ST-2 AVE,12/29/2018,909936,297827,01/18/2019,932433.0,305272.0,0.0,0.0,0.0
100634,R572-N702A-01-03-04,96 ST-2 AVE,12/30/2018,910208,297928,12/29/2018,909936.0,297827.0,272.0,101.0,373.0


In [815]:
# check for null values in dataset -- no null values
mtadf.isna().sum()

# check for duplicates
mtadf.groupby(['TURNSTILE_ID'])\
 .ENTRIES.count()\
 .sort_values(ascending=False).head(20)

TURNSTILE_ID
R080-A011-01-00-02-12/30/2018-15:00:00    2
R044-R210-00-00-01-01/13/2019-11:00:00    2
R080-A011-01-00-02-12/30/2018-11:00:00    2
R080-A011-01-00-00-12/30/2018-15:00:00    2
R080-A011-01-00-03-12/30/2018-15:00:00    2
R080-A011-01-00-04-12/30/2018-15:00:00    2
R080-A011-01-00-01-12/30/2018-15:00:00    2
R080-A011-01-00-01-12/30/2018-11:00:00    2
R285-N196-00-03-00-01/05/2019-19:00:00    2
R084-R158-00-06-00-01/14/2019-03:00:00    2
R268-N409-00-00-03-01/17/2019-12:00:00    1
R268-N409-00-00-03-12/29/2018-20:00:00    1
R268-N409-00-00-03-01/16/2019-08:23:12    1
R268-N409-00-00-03-12/31/2018-00:00:00    1
R268-N409-00-00-03-12/30/2018-20:00:00    1
R268-N409-00-00-03-12/30/2018-16:00:00    1
R268-N409-00-00-03-01/16/2019-10:09:48    1
R268-N409-00-00-03-01/16/2019-12:00:00    1
R268-N409-00-00-03-12/30/2018-12:00:00    1
R268-N409-00-00-03-01/16/2019-16:00:00    1
Name: ENTRIES, dtype: int64

In [726]:
# we have multiple TURNSTILES that have 
# a double entry for the same TIME on the DATE
mtadf[(mtadf.duplicated(subset=['TURNSTILE_ID'],keep='first'))\
     | (mtadf.duplicated(subset='TURNSTILE_ID',keep='last'))]


# looks like it's related to a recovery audit, let's deal with this later

# mtadf[(mtadf.TURNSTILE == 'R080-A011-01-00-00') & (mtadf.OBSERVED_AT.dt.month == 12) & (mtadf.OBSERVED_AT.dt.day == 30)].tail()

Unnamed: 0,TURNSTILE_ID,STATION,TURNSTILE,OBSERVED_AT,MONTH,DAY,WEEKDAY,ENTRIES,EXITS,LINENAME,DIVISION,DESC
1440,R080-A011-01-00-00-12/30/2018-15:00:00,57 ST-7 AV,R080-A011-01-00-00,2018-12-30 15:00:00,12,30,Sunday,885834893,490418389,NQRW,BMT,REGULAR
1441,R080-A011-01-00-00-12/30/2018-15:00:00,57 ST-7 AV,R080-A011-01-00-00,2018-12-30 15:00:00,12,30,Sunday,21,15,NQRW,BMT,RECOVR AUD
1488,R080-A011-01-00-01-12/30/2018-11:00:00,57 ST-7 AV,R080-A011-01-00-01,2018-12-30 11:00:00,12,30,Sunday,16816112,18379007,NQRW,BMT,REGULAR
1489,R080-A011-01-00-01-12/30/2018-11:00:00,57 ST-7 AV,R080-A011-01-00-01,2018-12-30 11:00:00,12,30,Sunday,25,21,NQRW,BMT,RECOVR AUD
1492,R080-A011-01-00-01-12/30/2018-15:00:00,57 ST-7 AV,R080-A011-01-00-01,2018-12-30 15:00:00,12,30,Sunday,16816230,18379288,NQRW,BMT,REGULAR
1493,R080-A011-01-00-01-12/30/2018-15:00:00,57 ST-7 AV,R080-A011-01-00-01,2018-12-30 15:00:00,12,30,Sunday,27,37,NQRW,BMT,RECOVR AUD
1538,R080-A011-01-00-02-12/30/2018-11:00:00,57 ST-7 AV,R080-A011-01-00-02,2018-12-30 11:00:00,12,30,Sunday,1562636578,1293612806,NQRW,BMT,REGULAR
1539,R080-A011-01-00-02-12/30/2018-11:00:00,57 ST-7 AV,R080-A011-01-00-02,2018-12-30 11:00:00,12,30,Sunday,23,17,NQRW,BMT,RECOVR AUD
1542,R080-A011-01-00-02-12/30/2018-15:00:00,57 ST-7 AV,R080-A011-01-00-02,2018-12-30 15:00:00,12,30,Sunday,1562636818,1293613149,NQRW,BMT,REGULAR
1543,R080-A011-01-00-02-12/30/2018-15:00:00,57 ST-7 AV,R080-A011-01-00-02,2018-12-30 15:00:00,12,30,Sunday,25,28,NQRW,BMT,RECOVR AUD


In [727]:
turnstiles_daily[["PREV_DATE", "PREV_ENTRIES"]] = (turnstiles_daily
                                                       .groupby(["C/A", "UNIT", "SCP", "STATION"])["DATE", "ENTRIES"]
                                                       .apply(lambda grp: grp.shift(1)))

NameError: name 'turnstiles_daily' is not defined

In [None]:
# sort the database by station, then turnstile, then date, then time
# mtadfs = mtadf.sort_values(['STATION','TURNSTILE','OBSERVED_AT'])

# which 10 stations have the least amount of turnstiles?
# mtadfs.groupby(['STATION'])['TURNSTILE'].nunique().nsmallest(10)

# create a database with 5 stations
# mtadfs_3 = mtadfs[mtadfs['STATION'].isin(['138/GRAND CONC','182-183 STS','190 ST'])]

In [None]:
# snippet from above (when isolating 1 day)
                       # &mtadfs['DATE'].isin(['2019-01-04'])]
    
# check max amount of entries per station
# CLEVELAND ST and SUTTER AV have significantly less entries
# remove them from above .isin() statement
mtadfs_3.groupby(['STATION'])['ENTRIES'].max()

# 3 turnstiles per STATION
mtadfs_3.groupby(['STATION'])[['TURNSTILE']].describe()

# 6 entries per day per TURNSTILE
mtadfs_3['TURNSTILE'].sort_values().value_counts()

In [None]:
# we now have a database with information 
# for 3 STATIONS (138/GRAND CONC, 182-183 STS, 190 ST)
mtadfs_3['DATE'].dt.day.value_counts()

# 9 unique TURNSTILES total
# 6 entries per day per TURNSTILE
# 7 days
# 42 total entries per TURNSTILE

In [None]:
# create a database without the unnecessary columns and reset index
mtadfs_3d=mtadfs_3[['STATION',
                                'TURNSTILE',
                                'LINENAME',
                                'DATE',
                                'WEEKDAY',
                                'HOUR',
                                'MIN',
                                'ENTRIES',
                                'EXITS']].reset_index()

In [None]:
# drop the old indexes from before the sort
mtadfs_3d.drop('index',axis=1,inplace=True)

In [None]:
# calculate the sum of the entries and exits per time period
# as long as it is in order by time and day, cumulative entries should always go up
mtadfs_3d['SUM_ENTRIES'] = mtadfs_3d.groupby(['TURNSTILE'])['ENTRIES'].diff()
mtadfs_3d['SUM_EXITS'] = mtadfs_3d.groupby(['TURNSTILE'])['EXITS'].diff()

# shift the column data up by 1 row (periods=-1)
mtadfs_3d['SUM_ENTRIES'] = mtadfs_3d['SUM_ENTRIES'].shift(periods=-1, axis=0)
mtadfs_3d['SUM_EXITS'] = mtadfs_3d['SUM_EXITS'].shift(periods=-1, axis=0)

# shape = 378 rows
mtadfs_3d.shape

In [None]:
# double check for negative values in the SUM_ENTRIES 
# (if it is cumulative all values should be positive)
mtadfs_3d_nonegs = mtadfs_3d[mtadfs_3d['SUM_ENTRIES'] > 0]

# shape check = 287 rows
mtadfs_3d_nonegs.shape

# double check for negative values in the SUM_EXITS 
# (if it is cumulative all values should be positive)
mtadfs_3d_nonegs = mtadfs_3d_nonegs[mtadfs_3d_nonegs['SUM_EXITS'] > 0]

#shape check = 286 rows
mtadfs_3d_nonegs.shape

# alternate syntax
# nonegs = (mtadfs_3d['SUM_ENTRIES'] > 0) | (mtadfs_3d['SUM_EXITS'] > 0)
# mtadfs_3d_nonegs = mtadfs_3d[nonegs]

In [None]:
# create a column to indicate how 'busy a station is during that interval'
mtadfs_3d_nonegs['BUSY'] = mtadfs_3d_nonegs['SUM_ENTRIES'] + mtadfs_3d_nonegs['SUM_EXITS']

In [None]:
# check the dataframe
mtadfs_3d_nonegs.head(10)

In [None]:
# turn the lines into categorical data
# first split up the string to have one letter/number per column
splits = mtadfs_3d_nonegs['LINENAME'].str.split('',expand=True)

# look at spits
splits

# it didn't create a column per unique line

In [None]:
# double check what has happened
print(splits.loc[:,0].unique())
print(splits.loc[:,1].unique())
print(splits.loc[:,2].unique())
print(splits.loc[:,3].unique())

# column location 0 and column location 3 don't contain anything

In [None]:
# drop column location 0 and column location 3
splits.drop(labels=[0,3], axis=1,inplace=True)

In [None]:
# column names are integers, use .loc to indicate 0:3 is not the index but the column name
# get dummies
dummy=pd.get_dummies(splits.loc[:,1:2])

In [None]:
# we should have FIVE columns: A, B, D, 4, 5
# but we found an extra column called '2_'
dummy.drop(columns='2_',axis=1,inplace=True)

In [None]:
# lets rename these to make more sense
linemap = {'1_4':'LINE_4','1_A':'LINE_A','1_B':'LINE_B','2_5':'LINE_5','2_D':'LINE_D'}
dummy.rename(mapper=linemap,axis=1,inplace=True)

# add back into our dataframe
mtadfs_3d_nonegs_lines = pd.concat([mtadfs_3d_nonegs,dummy],axis=1)

In [None]:
mtadfs_3d_nonegs_lines.drop('LINENAME',axis=1,inplace=True)

In [None]:
# need the SUM_ENTRIES per STATION per DAY per HOUR
# start with clean empty DataFrame
dfdaily = pd.DataFrame()

In [None]:
# GROUPBY OBJECT: DATE, WEEKDAY (matches the day of week with the date), STATION
# RETURN: SUM_ENTRIES, SUM_EXITS, BUSY
# AGGREGATE: .sum() per day per station
# ASSIGN TO NEW COLUMN IN NEW DATAFRAME

dfdaily['DAY_SUM_ENTRIES']= mtadfs_3d_nonegs_lines.groupby([mtadfs_3d_nonegs_lines.DATE,
                                                            mtadfs_3d_nonegs_lines.WEEKDAY,
                                                            mtadfs_3d_nonegs_lines.STATION])['SUM_ENTRIES'].sum()

dfdaily['DAY_SUM_EXITS']= mtadfs_3d_nonegs_lines.groupby([mtadfs_3d_nonegs_lines.DATE,
                                                          mtadfs_3d_nonegs_lines.WEEKDAY,
                                                          mtadfs_3d_nonegs_lines.STATION])['SUM_EXITS'].sum()

dfdaily['DAY_SUM_BUSY']= mtadfs_3d_nonegs_lines.groupby([mtadfs_3d_nonegs_lines.DATE,
                                                          mtadfs_3d_nonegs_lines.WEEKDAY,
                                                          mtadfs_3d_nonegs_lines.STATION])['BUSY'].sum()

In [None]:
# show our work
dfdaily

In [None]:
# no longer needed because I kept DATE as one column instead of separating into DAY and MONTH
# dfdaily.rename_axis(index=['DAY','WEEKDAY','MONTH','STATION'],inplace=True)

In [None]:
# reset the index so they are now columns
dfdaily.reset_index(inplace=True)
dfdaily

In [None]:
fig = plt.figure(dpi=200)
axes = fig.add_axes([0,0,1,1])
sns.barplot(data=dfdaily,
            x='WEEKDAY',
            y='DAY_SUM_ENTRIES',
            hue='STATION',
            palette='viridis')
axes.set_xlabel('DAY OF WEEK')
axes.set_ylabel('TOTAL ENTRIES')
axes.set_title('TOTAL ENTRIES PER DAY OF WEEK')
axes.legend(loc=(1.02,.5))
plt.show()

In [None]:
linemap = {'Broadway 7 Av EXPRESS' : '23',
           'Broadway 7 Av LOCAL' : '1',
           'Lexington Av EXPRESS':'456',
           'Lexington Av LOCAL': '6',
           'Flushing EXPRESS':'7',
           'Flushing LOCAL':'7',
           '8 Av EXPRESS':'A',
           '8 AV LOCAL':'CE',
           '6 Av EXPRESS':'BD',
           '6 AV LOCAL':'FM',
           'Broadway EXPRESS':'NQ',
           'Broadway LOCAL':'RW',
           'Nassau St EXPRESS':'JZ',
           'Canarsie LOCAL':'L',
           'Crosstown LOCAL':'G',
           'Shuttle':'S',
          }

In [41]:
linemap = {'Broadway 7 Av EXPRESS' : '2',
           'Bradway 7 Av EXPRESS': '3',
           'Broadway 7 Av LOCAL' : '1',
           'Lexington Av EXPRESS':'4',
           'Lexington Av EXPRESS':'5',
           'Lexington Av EXPRESS':'6',
           'Lexington Av LOCAL': '6',
           'Flushing EXPRESS':'7',
           'Flushing LOCAL':'7',
           '8 Av EXPRESS':'A',
           '8 AV LOCAL':'C',
           '8 AV LOCAL': 'E',
           '6 Av EXPRESS':'B',
           '6 Av EXPRESS':'D',
           '6 AV LOCAL':'F',
           '6 AV LOCAL':'M',
           'Broadway EXPRESS':'N',
           'Broadway EXPRESS':'Q',
           'Broadway LOCAL':'R',
           'Broadway LOCAL':'W',
           'Nassau St EXPRESS':'J',
           'Nassau St EXPRESS':'Z',
           'Canarsie LOCAL':'L',
           'Crosstown LOCAL':'G',
           'Shuttle':'S',}

In [42]:
def get_line_name(x):
    linelist = []
    for key,value in linemap.items():
        if value in x:
            if key not in linelist:
                linelist.append(key)
    if linelist:
        return linelist
    else:
        return x

In [None]:
mtadf_st_daily['LINES'] = mtadf_st_daily['LINENAME'].map(get_line_name)