## Set Up

In [12]:
import pandas as pd
import calendar
import datetime

In [13]:
# Location of the turnstile_190504 text file on the MTA developer site
TURNSTILE_URL = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_190504.txt"

# Load the file untouched; cleaning happens on a copy in later cells
mta190504_raw = pd.read_csv(TURNSTILE_URL)
mta190504_raw.describe()

Unnamed: 0,ENTRIES,EXITS
count,206857.0,206857.0
mean,40575300.0,33197990.0
std,208275200.0,192669900.0
min,0.0,0.0
25%,349546.0,141334.0
50%,2176408.0,1241604.0
75%,6775342.0,4590174.0
max,2129343000.0,2124127000.0


## Data Cleaning and Variable Creation

In [14]:
# Normalize the header names by trimming leading/trailing whitespace from each one
mta190504_raw.columns = [col_name.strip() for col_name in mta190504_raw.columns]
mta190504_raw.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

In [15]:
# Go from raw DF to new DF that will have rows dropped.
# Working on a copy keeps mta190504_raw intact for re-runs and comparisons.
mta190504 = mta190504_raw.copy()

# Create variables (all vectorized; no per-row Python loops):
# "time_hour": hour of the audit as an integer, taken from the first two characters
# of TIME, used later for grouping into time-of-day buckets
mta190504["time_hour"] = pd.to_numeric(mta190504["TIME"].str[0:2])
# "DDATE": DATE ("MM/DD/YYYY" text) parsed into a real datetime column
mta190504["DDATE"] = pd.to_datetime(mta190504["DATE"], format="%m/%d/%Y")
# "DTIME": TIME validated against "HH:MM:SS" and written back out as zero-padded text
mta190504["DTIME"] = pd.to_datetime(mta190504["TIME"], format="%H:%M:%S").dt.strftime("%H:%M:%S")
# "DDAY": English day-of-week name ("Monday", ...) derived from DDATE
mta190504["DDAY"] = mta190504["DDATE"].dt.day_name()

In [16]:
# Sort before grouping.
# sort_values returns a NEW frame, so the result has to be assigned back; otherwise the
# diffs below are computed in whatever order the file happened to be in.
# DDATE (a real datetime) is used rather than the "MM/DD/YYYY" DATE string, because the
# string only sorts chronologically within a single calendar year. TIME is zero-padded
# "HH:MM:SS" text, so it sorts correctly as a string.
turnstile_keys = ["C/A","UNIT","SCP","STATION","LINENAME","DIVISION"]
mta190504 = mta190504.sort_values(turnstile_keys + ["DDATE","TIME","DESC"])

# Create difference columns: change in the cumulative ENTRIES / EXITS counters between
# each row and the previous row of the same turnstile (aka the audit before).
# The first row of every turnstile has no predecessor and therefore gets NaN.
turnstile_groups = mta190504.groupby(turnstile_keys)
mta190504["entries_diff"] = turnstile_groups["ENTRIES"].diff()
mta190504["exits_diff"] = turnstile_groups["EXITS"].diff()
# Net flow and total traffic for the interval
mta190504["entries-exits"] = mta190504["entries_diff"] - mta190504["exits_diff"]
mta190504["entries+exits"] = mta190504["entries_diff"] + mta190504["exits_diff"]
mta190504.describe()

Unnamed: 0,ENTRIES,EXITS,time_hour,entries_diff,exits_diff,entries-exits,entries+exits
count,206857.0,206857.0,206857.0,202006.0,202006.0,202006.0,202006.0
mean,40575300.0,33197990.0,10.517256,2075.907,7433.828,-5357.922,9509.735
std,208275200.0,192669900.0,6.806081,3508686.0,3415174.0,1754742.0,6698467.0
min,0.0,0.0,0.0,-839251700.0,-384349600.0,-671409900.0,-1007094000.0
25%,349546.0,141334.0,4.0,9.0,8.0,-25.0,27.0
50%,2176408.0,1241604.0,10.0,72.0,52.0,1.0,168.0
75%,6775342.0,4590174.0,16.0,239.0,166.0,97.0,455.0
max,2129343000.0,2124127000.0,23.0,1126553000.0,1078346000.0,48206440.0,2204899000.0


In [17]:
# Keep only rows where the counter differences are strictly positive and below 10**7,
# and where the cumulative ENTRIES counter itself is positive.
# Rows whose diff is NaN (first audit of a turnstile) fail the comparisons and are dropped.
plausible_rows = (
    (mta190504["entries_diff"] > 0)
    & (mta190504["exits_diff"] > 0)
    & (mta190504["ENTRIES"] > 0)
    & (mta190504["entries_diff"] < 10**7)
    & (mta190504["exits_diff"] < 10**7)
)
mta190504 = mta190504[plausible_rows]
mta190504.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165744 entries, 1 to 206772
Data columns (total 19 columns):
C/A              165744 non-null object
UNIT             165744 non-null object
SCP              165744 non-null object
STATION          165744 non-null object
LINENAME         165744 non-null object
DIVISION         165744 non-null object
DATE             165744 non-null object
TIME             165744 non-null object
DESC             165744 non-null object
ENTRIES          165744 non-null int64
EXITS            165744 non-null int64
time_hour        165744 non-null int64
DDATE            165744 non-null datetime64[ns]
DTIME            165744 non-null object
DDAY             165744 non-null object
entries_diff     165744 non-null float64
exits_diff       165744 non-null float64
entries-exits    165744 non-null float64
entries+exits    165744 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(3), object(11)
memory usage: 25.3+ MB


In [18]:
# Checks
# No negative (or zero) values may remain in entries_diff or exits_diff after filtering.
# Assert it explicitly so a violation stops the run instead of depending on someone
# reading the "min" row of the summary table below.
assert (mta190504["entries_diff"] > 0).all(), "non-positive entries_diff survived filtering"
assert (mta190504["exits_diff"] > 0).all(), "non-positive exits_diff survived filtering"
mta190504.describe()

Unnamed: 0,ENTRIES,EXITS,time_hour,entries_diff,exits_diff,entries-exits,entries+exits
count,165744.0,165744.0,165744.0,165744.0,165744.0,165744.0,165744.0
mean,27176130.0,20166160.0,10.914603,222.023162,321.2516,-99.22846,543.2748
std,159218500.0,134544200.0,6.713721,1045.876071,14706.97,13696.93,15721.71
min,2.0,9.0,0.0,1.0,1.0,-1281387.0,2.0
25%,676636.8,379818.5,5.0,31.0,24.0,-40.0,82.0
50%,2814699.0,1657634.0,12.0,116.0,78.0,12.0,246.0
75%,7036347.0,5019779.0,17.0,290.0,202.0,134.0,533.0
max,2115816000.0,2037805000.0,23.0,94944.0,1376261.0,6345.0,1471135.0


In [19]:
# Four-hour buckets: 01-04; 05-08; 09-12; 13-16; 17-20; 21-00.
# Buckets start at 01 rather than 00 on the assumption that most audits fall on the hour,
# so the midnight reading (hour 0) is grouped with the late-evening bucket.
def time_interval(x):
    """Return the four-hour bucket label for hour ``x``.

    ``x`` is compared against the whole hours 0-23; hour 0 belongs to the
    "21:00-00:59" bucket. Any value that matches none of the listed hours
    yields ``None``.
    """
    hour_buckets = (
        ((1, 2, 3, 4), "01:00-04:59"),
        ((5, 6, 7, 8), "05:00-08:59"),
        ((9, 10, 11, 12), "09:00-12:59"),
        ((13, 14, 15, 16), "13:00-16:59"),
        ((17, 18, 19, 20), "17:00-20:59"),
        ((21, 22, 23, 0), "21:00-00:59"),
    )
    for bucket_hours, bucket_label in hour_buckets:
        if x in bucket_hours:
            return bucket_label
    return None

In [20]:
# Label every row with its four-hour bucket ("time_cat") based on the audit hour
mta190504["time_cat"] = mta190504["time_hour"].map(time_interval)
mta190504.head(50)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,time_hour,DDATE,DTIME,DDAY,entries_diff,exits_diff,entries-exits,entries+exits,time_cat
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,04:00:00,REGULAR,7035269,2384840,4,2019-04-27,04:00:00,Saturday,20.0,7.0,13.0,27.0,01:00-04:59
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,08:00:00,REGULAR,7035292,2384875,8,2019-04-27,08:00:00,Saturday,23.0,35.0,-12.0,58.0,05:00-08:59
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,12:00:00,REGULAR,7035392,2384951,12,2019-04-27,12:00:00,Saturday,100.0,76.0,24.0,176.0,09:00-12:59
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,16:00:00,REGULAR,7035651,2385020,16,2019-04-27,16:00:00,Saturday,259.0,69.0,190.0,328.0,13:00-16:59
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,20:00:00,REGULAR,7035930,2385070,20,2019-04-27,20:00:00,Saturday,279.0,50.0,229.0,329.0,17:00-20:59
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,00:00:00,REGULAR,7036100,2385087,0,2019-04-28,00:00:00,Sunday,170.0,17.0,153.0,187.0,21:00-00:59
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,04:00:00,REGULAR,7036119,2385088,4,2019-04-28,04:00:00,Sunday,19.0,1.0,18.0,20.0,01:00-04:59
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,08:00:00,REGULAR,7036125,2385103,8,2019-04-28,08:00:00,Sunday,6.0,15.0,-9.0,21.0,05:00-08:59
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,12:00:00,REGULAR,7036197,2385155,12,2019-04-28,12:00:00,Sunday,72.0,52.0,20.0,124.0,09:00-12:59
10,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,16:00:00,REGULAR,7036372,2385198,16,2019-04-28,16:00:00,Sunday,175.0,43.0,132.0,218.0,13:00-16:59


In [21]:
# TODO: for outlier entry and exit differences, replace the value with the imputed mean
# of the remaining valid differences (placeholder: insert Shreyak's code here)

## Output grouped data

In [22]:
# Total entries, exits and combined traffic per station, busiest station first
mta_station = (
    mta190504
    .groupby(["STATION"])[["entries_diff","exits_diff","entries+exits"]]
    .sum()
    .sort_values(by=["entries+exits","entries_diff","exits_diff"], ascending=False)
)
mta_station

Unnamed: 0_level_0,entries_diff,exits_diff,entries+exits
STATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TWENTY THIRD ST,1854775.0,26142573.0,27997348.0
34 ST-PENN STA,1023199.0,922432.0,1945631.0
GRD CNTRL-42 ST,881117.0,769767.0,1650884.0
34 ST-HERALD SQ,703365.0,661376.0,1364741.0
14 ST-UNION SQ,650999.0,613366.0,1264365.0
TIMES SQ-42 ST,614463.0,579138.0,1193601.0
23 ST,666245.0,465315.0,1131560.0
FULTON ST,586908.0,488472.0,1075380.0
42 ST-PORT AUTH,565110.0,482997.0,1048107.0
86 ST,507234.0,475735.0,982969.0


In [23]:
# Total entries, exits and combined traffic per station and calendar date, busiest first
mta_station_date = (
    mta190504
    .groupby(["STATION","DDATE"])[["entries_diff","exits_diff","entries+exits"]]
    .sum()
    .sort_values(by=["entries+exits","entries_diff","exits_diff"], ascending=False)
)
mta_station_date

Unnamed: 0_level_0,Unnamed: 1_level_0,entries_diff,exits_diff,entries+exits
STATION,DDATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TWENTY THIRD ST,2019-04-28,570806.0,8238772.0,8809578.0
TWENTY THIRD ST,2019-04-30,484237.0,6882493.0,7366730.0
TWENTY THIRD ST,2019-04-29,388084.0,5502804.0,5890888.0
TWENTY THIRD ST,2019-04-27,381941.0,5491754.0,5873695.0
34 ST-PENN STA,2019-05-02,175657.0,157005.0,332662.0
34 ST-PENN STA,2019-05-01,175541.0,156632.0,332173.0
34 ST-PENN STA,2019-04-30,172612.0,150738.0,323350.0
34 ST-PENN STA,2019-05-03,167620.0,155012.0,322632.0
34 ST-PENN STA,2019-04-29,166379.0,144031.0,310410.0
GRD CNTRL-42 ST,2019-05-01,161753.0,137785.0,299538.0


In [24]:
# Total entries, exits and combined traffic per station and day of the week, busiest first
mta_station_day = (
    mta190504
    .groupby(["STATION","DDAY"])[["entries_diff","exits_diff","entries+exits"]]
    .sum()
    .sort_values(by=["entries+exits","entries_diff","exits_diff"], ascending=False)
)
mta_station_day

Unnamed: 0_level_0,Unnamed: 1_level_0,entries_diff,exits_diff,entries+exits
STATION,DDAY,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TWENTY THIRD ST,Sunday,570806.0,8238772.0,8809578.0
TWENTY THIRD ST,Tuesday,484237.0,6882493.0,7366730.0
TWENTY THIRD ST,Monday,388084.0,5502804.0,5890888.0
TWENTY THIRD ST,Saturday,381941.0,5491754.0,5873695.0
34 ST-PENN STA,Thursday,175657.0,157005.0,332662.0
34 ST-PENN STA,Wednesday,175541.0,156632.0,332173.0
34 ST-PENN STA,Tuesday,172612.0,150738.0,323350.0
34 ST-PENN STA,Friday,167620.0,155012.0,322632.0
34 ST-PENN STA,Monday,166379.0,144031.0,310410.0
GRD CNTRL-42 ST,Wednesday,161753.0,137785.0,299538.0


In [25]:
# Total entries, exits and combined traffic per station, day of the week and
# four-hour time bucket, busiest combination first
mta_station_day_time = (
    mta190504
    .groupby(["STATION","DDAY","time_cat"])[["entries_diff","exits_diff","entries+exits"]]
    .sum()
    .sort_values(by=["entries+exits","entries_diff","exits_diff"], ascending=False)
)
mta_station_day_time

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,entries_diff,exits_diff,entries+exits
STATION,DDAY,time_cat,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TWENTY THIRD ST,Tuesday,17:00-20:59,98621.0,1377162.0,1475783.0
TWENTY THIRD ST,Monday,17:00-20:59,98952.0,1375547.0,1474499.0
TWENTY THIRD ST,Tuesday,09:00-12:59,95714.0,1377954.0,1473668.0
TWENTY THIRD ST,Monday,09:00-12:59,95462.0,1377769.0,1473231.0
TWENTY THIRD ST,Tuesday,13:00-16:59,95494.0,1376928.0,1472422.0
TWENTY THIRD ST,Monday,13:00-16:59,95768.0,1374993.0,1470761.0
TWENTY THIRD ST,Tuesday,05:00-08:59,94997.0,1375028.0,1470025.0
TWENTY THIRD ST,Tuesday,01:00-04:59,95186.0,1374559.0,1469745.0
TWENTY THIRD ST,Saturday,17:00-20:59,96052.0,1373243.0,1469295.0
TWENTY THIRD ST,Monday,05:00-08:59,94887.0,1374314.0,1469201.0
