## Set Up

In [51]:
import pandas as pd
import calendar
import datetime

## Import Data

In [52]:
# Create function to import multiple data files

data_path = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_"

def import_data(date):
    """
    import data
    clean column names
    """
    dfname = pd.read_csv(data_path+date+".txt")
    dfname.columns = dfname.columns.str.strip()
    return dfname
          
mta190504_raw = import_data("190504")
mta190511_raw = import_data("190511")
mta190518_raw = import_data("190518")
mta190525_raw = import_data("190525")
mta190601_raw = import_data("190601")


In [53]:
# Concatenate all data
frames = [mta190504_raw, mta190511_raw, mta190518_raw, mta190525_raw, mta190601_raw]

mta19_raw = pd.concat(frames).reset_index().drop(["index"], axis=1)

mta19_raw.info()
mta19_raw.describe()
#mta19_raw.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1026784 entries, 0 to 1026783
Data columns (total 11 columns):
C/A         1026784 non-null object
UNIT        1026784 non-null object
SCP         1026784 non-null object
STATION     1026784 non-null object
LINENAME    1026784 non-null object
DIVISION    1026784 non-null object
DATE        1026784 non-null object
TIME        1026784 non-null object
DESC        1026784 non-null object
ENTRIES     1026784 non-null int64
EXITS       1026784 non-null int64
dtypes: int64(2), object(9)
memory usage: 86.2+ MB


Unnamed: 0,ENTRIES,EXITS
count,1026784.0,1026784.0
mean,41548680.0,34022820.0
std,211672400.0,194975100.0
min,0.0,0.0
25%,352555.5,144215.0
50%,2174353.0,1245738.0
75%,6789071.0,4603742.0
max,2129343000.0,2124127000.0


In [54]:
def time_interval(x):
    if x in [0,1,2,3]:
        return "0:00-03:59"
    elif x in [4,5,6,7]:
        return "04:00-07:59"
    elif x in [8,9,10,11]:
        return "08:00-11:59"
    elif x in [12,13,14,15]:
        return "12:00-15:59"
    elif x in [16,17,18,19]:
        return "16:00-19:59"
    elif x in [20,21,22,23]:
        return "20:00-23:59"

In [55]:
# Add and format new variables

def data_clean(dfname):
    """
    add and format new variables
    """
    dfname2 = dfname.copy()
    # Create variables:
    # "time_hour" that simplifies the hour for later grouping purposes
    dfname2["time_hour"] = pd.to_numeric(dfname2["TIME"].str[0:2])
    # formatted date variable "DDATE"
    dfname2['DDATE']=[datetime.datetime.strptime(x, '%m/%d/%Y') for x in dfname2['DATE']]
    # formatted time variable "DTIME"
    dfname2['DTIME']=[datetime.datetime.strptime(x, '%H:%M:%S') for x in dfname2['TIME']]
    dfname2['DDATETIME'] = pd.to_datetime(dfname2['DATE']+" "+dfname2['TIME'])
    dfname2['DDATETIME2'] = dfname2['DDATETIME'] - datetime.timedelta(seconds=1)
    dfname2['DTIME']=[format(x,"%H:%M:%S") for x in dfname2['DTIME']]
    # formatted day of the week variable "DDAY"
    dfname2['DDAY']=[calendar.day_name[datetime.datetime.weekday(x)] for x in dfname2['DDATE']]
    # create time period category "time_cat"
    dfname2["time_cat"] = dfname2["time_hour"].apply(time_interval)
    
    # Sort before grouping for difference calculating
    dfname2.sort_values(["C/A","UNIT","SCP","STATION","LINENAME","DIVISION","DATE","TIME","DESC"], inplace = True)
    # Create difference columns to calculate difference in entries and exits between the row and the row before (aka the time before)
    dfname2["entries_diff"] = dfname2.groupby(["C/A","UNIT","SCP","STATION","LINENAME","DIVISION"]).ENTRIES.diff()
    dfname2["exits_diff"] = dfname2.groupby(["C/A","UNIT","SCP","STATION","LINENAME","DIVISION"]).EXITS.diff()
    dfname2["entries-exits"] = dfname2["entries_diff"] - dfname2["exits_diff"]
    dfname2["entries+exits"] = dfname2["entries_diff"] + dfname2["exits_diff"]
    
    # PLACEHOLDER Create better flag for suspiciously high differences in exits and entries diff
    
    #keep only rows with positive entries_diff, exits_diff, and ENTRIES
    dfname2= dfname2[dfname2.entries_diff > 0]
    dfname2 = dfname2[dfname2.exits_diff > 0]
    dfname2 = dfname2[dfname2.ENTRIES > 0]
    # dropping turnstile 00-04-00 at 23rd st due to data anomaly
    dfname2 = dfname2[(dfname2["STATION"] != "TWENTY THIRD ST") & (dfname2["SCP"] != "00-04-00")]
    # exclude high entries and exits that are likely a result of a terminal reset
    dfname2 = dfname2[dfname2.entries_diff < 10**7]
    dfname2 = dfname2[dfname2.exits_diff < 10**7].reset_index().drop(["index"],axis=1)
    return dfname2

mta19 = data_clean(mta19_raw)
#data_clean(mta18_raw, mta18)
#etc. - can run it for multiple years

In [26]:
mta19.info()
mta19.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 833153 entries, 0 to 833152
Data columns (total 23 columns):
C/A              833153 non-null object
UNIT             833153 non-null object
SCP              833153 non-null object
STATION          833153 non-null object
LINENAME         833153 non-null object
DIVISION         833153 non-null object
DATE             833153 non-null object
TIME             833153 non-null object
DESC             833153 non-null object
ENTRIES          833153 non-null int64
EXITS            833153 non-null int64
time_hour        833153 non-null int64
DDATE            833153 non-null datetime64[ns]
DTIME            833153 non-null object
DDATETIME        833153 non-null datetime64[ns]
DDATETIME2       833153 non-null datetime64[ns]
DDAY             833153 non-null object
time_cat         833153 non-null object
entries_diff     833153 non-null float64
exits_diff       833153 non-null float64
entries-exits    833153 non-null float64
entries+exits    833153 n

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,...,DTIME,DDATETIME,DDATETIME2,DDAY,time_cat,entries_diff,exits_diff,entries-exits,entries+exits,suspicious
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,04:00:00,REGULAR,7035269,...,04:00:00,2019-04-27 04:00:00,2019-04-27 03:59:59,Saturday,04:00-07:59,20.0,7.0,13.0,27.0,
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,08:00:00,REGULAR,7035292,...,08:00:00,2019-04-27 08:00:00,2019-04-27 07:59:59,Saturday,08:00-11:59,23.0,35.0,-12.0,58.0,3.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,12:00:00,REGULAR,7035392,...,12:00:00,2019-04-27 12:00:00,2019-04-27 11:59:59,Saturday,12:00-15:59,100.0,76.0,24.0,176.0,77.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,16:00:00,REGULAR,7035651,...,16:00:00,2019-04-27 16:00:00,2019-04-27 15:59:59,Saturday,16:00-19:59,259.0,69.0,190.0,328.0,159.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,20:00:00,REGULAR,7035930,...,20:00:00,2019-04-27 20:00:00,2019-04-27 19:59:59,Saturday,20:00-23:59,279.0,50.0,229.0,329.0,20.0
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,00:00:00,REGULAR,7036100,...,00:00:00,2019-04-28 00:00:00,2019-04-27 23:59:59,Sunday,0:00-03:59,170.0,17.0,153.0,187.0,-109.0
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,04:00:00,REGULAR,7036119,...,04:00:00,2019-04-28 04:00:00,2019-04-28 03:59:59,Sunday,04:00-07:59,19.0,1.0,18.0,20.0,-151.0
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,08:00:00,REGULAR,7036125,...,08:00:00,2019-04-28 08:00:00,2019-04-28 07:59:59,Sunday,08:00-11:59,6.0,15.0,-9.0,21.0,-13.0
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,12:00:00,REGULAR,7036197,...,12:00:00,2019-04-28 12:00:00,2019-04-28 11:59:59,Sunday,12:00-15:59,72.0,52.0,20.0,124.0,66.0
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,16:00:00,REGULAR,7036372,...,16:00:00,2019-04-28 16:00:00,2019-04-28 15:59:59,Sunday,16:00-19:59,175.0,43.0,132.0,218.0,103.0


In [13]:
mta19.describe()

Unnamed: 0,ENTRIES,EXITS,time_hour,entries_diff,exits_diff,entries-exits,entries+exits
count,833153.0,833153.0,833153.0,833153.0,833153.0,833153.0,833153.0
mean,27211950.0,20228520.0,10.737178,213.6133,182.3627,31.25065,395.976
std,158731700.0,134528700.0,6.81871,2515.161,8782.872,8032.819,10119.47
min,2.0,1.0,0.0,1.0,1.0,-5049355.0,2.0
25%,674302.0,377388.0,5.0,31.0,24.0,-41.0,82.0
50%,2812985.0,1660193.0,12.0,113.0,78.0,11.0,242.0
75%,7058666.0,5032537.0,16.0,284.0,200.0,129.0,524.0
max,2115843000.0,2037865000.0,23.0,2091931.0,5368363.0,548439.0,5687589.0


## Ongoing checking - DO NOT RUN CELLS BELOW HERE

In [16]:
mta19.sort_values(["entries_diff"], ascending=False)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,...,DDATE,DTIME,DDATETIME,DDATETIME2,DDAY,time_cat,entries_diff,exits_diff,entries-exits,entries+exits
233899,N076,R111,00-06-01,23 ST,CE,IND,05/09/2019,20:00:00,REGULAR,2223006,...,2019-05-09,20:00:00,2019-05-09 20:00:00,2019-05-09 19:59:59,Thursday,20:00-23:59,2091931.0,1867552.0,224379.0,3959483.0
496544,PTH11,R545,00-00-00,14TH STREET,1,PTH,05/07/2019,15:35:46,REGULAR,590624,...,2019-05-07,15:35:46,2019-05-07 15:35:46,2019-05-07 15:35:45,Tuesday,12:00-15:59,585091.0,36652.0,548439.0,621743.0
476339,PTH02,R544,00-00-00,HARRISON,1,PTH,05/28/2019,11:56:37,REGULAR,343349,...,2019-05-28,11:56:37,2019-05-28 11:56:37,2019-05-28 11:56:36,Tuesday,08:00-11:59,343347.0,45468.0,297879.0,388815.0
621908,R210A,R044,03-03-01,BROOKLYN BRIDGE,456JZ,IRT,05/15/2019,20:00:00,REGULAR,9290094,...,2019-05-15,20:00:00,2019-05-15 20:00:00,2019-05-15 19:59:59,Wednesday,20:00-23:59,320118.0,5367471.0,-5047353.0,5687589.0
622117,R210A,R044,03-03-02,BROOKLYN BRIDGE,456JZ,IRT,05/15/2019,12:00:00,REGULAR,9288333,...,2019-05-15,12:00:00,2019-05-15 12:00:00,2019-05-15 11:59:59,Wednesday,12:00-15:59,319008.0,5368363.0,-5049355.0,5687371.0
509088,PTH19,R549,02-02-08,NEWARK C,1,PTH,05/13/2019,14:14:11,REGULAR,258946,...,2019-05-13,14:14:11,2019-05-13 14:14:11,2019-05-13 14:14:10,Monday,12:00-15:59,200748.0,3050.0,197698.0,203798.0
507146,PTH18,R549,01-02-00,NEWARK BM BW,1,PTH,05/17/2019,13:02:56,REGULAR,207647,...,2019-05-17,13:02:56,2019-05-17 13:02:56,2019-05-17 13:02:55,Friday,12:00-15:59,196340.0,17150.0,179190.0,213490.0
502254,PTH16,R550,01-02-00,LACKAWANNA,1,PTH,05/09/2019,14:34:42,REGULAR,195824,...,2019-05-09,14:34:42,2019-05-09 14:34:42,2019-05-09 14:34:41,Thursday,12:00-15:59,183916.0,1144868.0,-960952.0,1328784.0
502241,PTH16,R550,01-02-00,LACKAWANNA,1,PTH,05/06/2019,15:23:39,REGULAR,195170,...,2019-05-06,15:23:39,2019-05-06 15:23:39,2019-05-06 15:23:38,Monday,12:00-15:59,183471.0,1140653.0,-957182.0,1324124.0
170603,JFK03,R536,00-00-04,JFK JAMAICA CT1,E,IND,05/12/2019,01:00:00,REGULAR,83389,...,2019-05-12,01:00:00,2019-05-12 01:00:00,2019-05-12 00:59:59,Sunday,0:00-03:59,73151.0,466202.0,-393051.0,539353.0


In [18]:
mta19.loc[(mta19["STATION"]=="23 ST") & (mta19["SCP"] == "00-06-01")]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,...,DDATE,DTIME,DDATETIME,DDATETIME2,DDAY,time_cat,entries_diff,exits_diff,entries-exits,entries+exits
233898,N076,R111,00-06-01,23 ST,CE,IND,04/29/2019,16:00:00,REGULAR,2216058,...,2019-04-29,16:00:00,2019-04-29 16:00:00,2019-04-29 15:59:59,Monday,16:00-19:59,185.0,2.0,183.0,187.0
233899,N076,R111,00-06-01,23 ST,CE,IND,05/09/2019,20:00:00,REGULAR,2223006,...,2019-05-09,20:00:00,2019-05-09 20:00:00,2019-05-09 19:59:59,Thursday,20:00-23:59,2091931.0,1867552.0,224379.0,3959483.0
233900,N076,R111,00-06-01,23 ST,CE,IND,05/10/2019,00:00:00,REGULAR,2223218,...,2019-05-10,00:00:00,2019-05-10 00:00:00,2019-05-09 23:59:59,Friday,0:00-03:59,212.0,125.0,87.0,337.0
233901,N076,R111,00-06-01,23 ST,CE,IND,05/10/2019,04:00:00,REGULAR,2223228,...,2019-05-10,04:00:00,2019-05-10 04:00:00,2019-05-10 03:59:59,Friday,04:00-07:59,10.0,6.0,4.0,16.0
233902,N076,R111,00-06-01,23 ST,CE,IND,05/10/2019,08:00:00,REGULAR,2223292,...,2019-05-10,08:00:00,2019-05-10 08:00:00,2019-05-10 07:59:59,Friday,08:00-11:59,64.0,186.0,-122.0,250.0
233903,N076,R111,00-06-01,23 ST,CE,IND,05/10/2019,12:00:00,REGULAR,2223574,...,2019-05-10,12:00:00,2019-05-10 12:00:00,2019-05-10 11:59:59,Friday,12:00-15:59,282.0,804.0,-522.0,1086.0
233904,N076,R111,00-06-01,23 ST,CE,IND,05/10/2019,16:00:00,REGULAR,2223977,...,2019-05-10,16:00:00,2019-05-10 16:00:00,2019-05-10 15:59:59,Friday,16:00-19:59,403.0,411.0,-8.0,814.0
233905,N076,R111,00-06-01,23 ST,CE,IND,05/10/2019,20:00:00,REGULAR,2224661,...,2019-05-10,20:00:00,2019-05-10 20:00:00,2019-05-10 19:59:59,Friday,20:00-23:59,684.0,357.0,327.0,1041.0
233906,N076,R111,00-06-01,23 ST,CE,IND,05/11/2019,00:00:00,REGULAR,2224797,...,2019-05-11,00:00:00,2019-05-11 00:00:00,2019-05-10 23:59:59,Saturday,0:00-03:59,136.0,156.0,-20.0,292.0
233907,N076,R111,00-06-01,23 ST,CE,IND,05/11/2019,04:00:00,REGULAR,2224816,...,2019-05-11,04:00:00,2019-05-11 04:00:00,2019-05-11 03:59:59,Saturday,04:00-07:59,19.0,8.0,11.0,27.0


In [23]:
mta19.loc[(mta19["STATION"]=="14TH STREET") & (mta19["SCP"] == "00-00-00") & (mta19["DATE"] == "05/07/2019")]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,...,DDATE,DTIME,DDATETIME,DDATETIME2,DDAY,time_cat,entries_diff,exits_diff,entries-exits,entries+exits
496541,PTH11,R545,00-00-00,14TH STREET,1,PTH,05/07/2019,02:45:46,REGULAR,590494,...,2019-05-07,02:45:46,2019-05-07 02:45:46,2019-05-07 02:45:45,Tuesday,0:00-03:59,29.0,7.0,22.0,36.0
496542,PTH11,R545,00-00-00,14TH STREET,1,PTH,05/07/2019,07:11:46,REGULAR,590498,...,2019-05-07,07:11:46,2019-05-07 07:11:46,2019-05-07 07:11:45,Tuesday,04:00-07:59,4.0,1.0,3.0,5.0
496543,PTH11,R545,00-00-00,14TH STREET,1,PTH,05/07/2019,11:23:46,REGULAR,590531,...,2019-05-07,11:23:46,2019-05-07 11:23:46,2019-05-07 11:23:45,Tuesday,08:00-11:59,33.0,5.0,28.0,38.0
496544,PTH11,R545,00-00-00,14TH STREET,1,PTH,05/07/2019,15:35:46,REGULAR,590624,...,2019-05-07,15:35:46,2019-05-07 15:35:46,2019-05-07 15:35:45,Tuesday,12:00-15:59,585091.0,36652.0,548439.0,621743.0
496545,PTH11,R545,00-00-00,14TH STREET,1,PTH,05/07/2019,19:47:46,REGULAR,591302,...,2019-05-07,19:47:46,2019-05-07 19:47:46,2019-05-07 19:47:45,Tuesday,16:00-19:59,678.0,26.0,652.0,704.0
496546,PTH11,R545,00-00-00,14TH STREET,1,PTH,05/07/2019,23:59:46,REGULAR,591574,...,2019-05-07,23:59:46,2019-05-07 23:59:46,2019-05-07 23:59:45,Tuesday,20:00-23:59,272.0,8.0,264.0,280.0


In [24]:
mta19.loc[(mta19["STATION"]=="HARRISON") & (mta19["SCP"] == "00-00-00") & (mta19["DATE"] == "05/28/2019")]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,...,DDATE,DTIME,DDATETIME,DDATETIME2,DDAY,time_cat,entries_diff,exits_diff,entries-exits,entries+exits
476337,PTH02,R544,00-00-00,HARRISON,1,PTH,05/28/2019,03:32:37,REGULAR,342357,...,2019-05-28,03:32:37,2019-05-28 03:32:37,2019-05-28 03:32:36,Tuesday,0:00-03:59,1.0,1.0,0.0,2.0
476338,PTH02,R544,00-00-00,HARRISON,1,PTH,05/28/2019,07:44:37,REGULAR,342646,...,2019-05-28,07:44:37,2019-05-28 07:44:37,2019-05-28 07:44:36,Tuesday,04:00-07:59,289.0,8.0,281.0,297.0
476339,PTH02,R544,00-00-00,HARRISON,1,PTH,05/28/2019,11:56:37,REGULAR,343349,...,2019-05-28,11:56:37,2019-05-28 11:56:37,2019-05-28 11:56:36,Tuesday,08:00-11:59,343347.0,45468.0,297879.0,388815.0
476340,PTH02,R544,00-00-00,HARRISON,1,PTH,05/28/2019,16:08:37,REGULAR,343355,...,2019-05-28,16:08:37,2019-05-28 16:08:37,2019-05-28 16:08:36,Tuesday,16:00-19:59,6.0,1.0,5.0,7.0
476341,PTH02,R544,00-00-00,HARRISON,1,PTH,05/28/2019,20:20:37,REGULAR,343378,...,2019-05-28,20:20:37,2019-05-28 20:20:37,2019-05-28 20:20:36,Tuesday,20:00-23:59,23.0,14.0,9.0,37.0


In [35]:
# Identify entries_diff values that are super high absolute numbers - need to figure out how to find high ones relative to the date before
mta19["suspicious"] = mta19.groupby(["C/A","UNIT","SCP","STATION","LINENAME","DIVISION"]).entries_diff.diff()

# May need to calculate ENTRIES_diff grouped by date as well - too many gaps in days are causing issues
mta19["entries_diff2"] = mta19.groupby(["C/A","UNIT","SCP","STATION","LINENAME","DIVISION","DATE"]).ENTRIES.diff()

In [37]:
mta19.iloc[60:100]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,...,DDATETIME,DDATETIME2,DDAY,time_cat,entries_diff,exits_diff,entries-exits,entries+exits,suspicious,entries_diff2
60,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/06/2019,20:00:00,REGULAR,7047102,...,2019-05-06 20:00:00,2019-05-06 19:59:59,Monday,20:00-23:59,782.0,70.0,712.0,852.0,450.0,782.0
61,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/07/2019,00:00:00,REGULAR,7047315,...,2019-05-07 00:00:00,2019-05-06 23:59:59,Tuesday,0:00-03:59,213.0,21.0,192.0,234.0,-569.0,
62,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/07/2019,04:00:00,REGULAR,7047331,...,2019-05-07 04:00:00,2019-05-07 03:59:59,Tuesday,04:00-07:59,16.0,9.0,7.0,25.0,-197.0,16.0
63,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/07/2019,08:00:00,REGULAR,7047380,...,2019-05-07 08:00:00,2019-05-07 07:59:59,Tuesday,08:00-11:59,49.0,114.0,-65.0,163.0,33.0,49.0
64,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/07/2019,12:00:00,REGULAR,7047539,...,2019-05-07 12:00:00,2019-05-07 11:59:59,Tuesday,12:00-15:59,159.0,229.0,-70.0,388.0,110.0,159.0
65,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/07/2019,16:00:00,REGULAR,7047849,...,2019-05-07 16:00:00,2019-05-07 15:59:59,Tuesday,16:00-19:59,310.0,69.0,241.0,379.0,151.0,310.0
66,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/07/2019,20:00:00,REGULAR,7048572,...,2019-05-07 20:00:00,2019-05-07 19:59:59,Tuesday,20:00-23:59,723.0,100.0,623.0,823.0,413.0,723.0
67,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/08/2019,00:00:00,REGULAR,7048774,...,2019-05-08 00:00:00,2019-05-07 23:59:59,Wednesday,0:00-03:59,202.0,33.0,169.0,235.0,-521.0,
68,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/08/2019,04:00:00,REGULAR,7048790,...,2019-05-08 04:00:00,2019-05-08 03:59:59,Wednesday,04:00-07:59,16.0,2.0,14.0,18.0,-186.0,16.0
69,A002,R051,02-00-00,59 ST,NQR456W,BMT,05/08/2019,08:00:00,REGULAR,7048852,...,2019-05-08 08:00:00,2019-05-08 07:59:59,Wednesday,08:00-11:59,62.0,101.0,-39.0,163.0,46.0,62.0


In [43]:
mta_check = mta19.sort_values(["suspicious"], ascending=False).reset_index()
mta_check.iloc[61:90]

Unnamed: 0,index,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,...,DDATETIME,DDATETIME2,DDAY,time_cat,entries_diff,exits_diff,entries-exits,entries+exits,suspicious,entries_diff2
61,486880,PTH05,R543,00-00-03,EXCHANGE PLACE,1,PTH,05/24/2019,06:58:32,REGULAR,...,2019-05-24 06:58:32,2019-05-24 06:58:31,Friday,04:00-07:59,5139.0,808.0,4331.0,5947.0,4613.0,
62,403033,N503,R021,00-00-03,42 ST-BRYANT PK,BDFM7,IND,05/11/2019,00:00:00,RECOVR AUD,...,2019-05-11 00:00:00,2019-05-10 23:59:59,Saturday,0:00-03:59,4796.0,4435.0,361.0,9231.0,4602.0,
63,762789,R523,R147,00-00-06,61 ST WOODSIDE,7,IRT,05/04/2019,01:00:00,REGULAR,...,2019-05-04 01:00:00,2019-05-04 00:59:59,Saturday,0:00-03:59,5346.0,7108.0,-1762.0,12454.0,4594.0,
64,507017,PTH18,R549,01-01-03,NEWARK BM BW,1,PTH,05/24/2019,07:03:12,REGULAR,...,2019-05-24 07:03:12,2019-05-24 07:03:11,Friday,04:00-07:59,5310.0,359.0,4951.0,5669.0,4498.0,
65,504987,PTH17,R541,01-01-06,THIRTY THIRD ST,1,PTH,05/24/2019,06:58:17,REGULAR,...,2019-05-24 06:58:17,2019-05-24 06:58:16,Friday,04:00-07:59,4554.0,1378.0,3176.0,5932.0,4352.0,
66,491908,PTH07,R550,00-00-02,CITY / BUS,1,PTH,05/24/2019,07:50:20,REGULAR,...,2019-05-24 07:50:20,2019-05-24 07:50:19,Friday,04:00-07:59,5177.0,3465.0,1712.0,8642.0,4218.0,
67,762593,R523,R147,00-00-05,61 ST WOODSIDE,7,IRT,05/04/2019,01:00:00,REGULAR,...,2019-05-04 01:00:00,2019-05-04 00:59:59,Saturday,0:00-03:59,4941.0,8867.0,-3926.0,13808.0,4105.0,
68,402447,N503,R021,00-00-00,42 ST-BRYANT PK,BDFM7,IND,05/11/2019,12:00:00,RECOVR AUD,...,2019-05-11 12:00:00,2019-05-11 11:59:59,Saturday,12:00-15:59,4180.0,13170.0,-8990.0,17350.0,4081.0,
69,489259,PTH05,R543,00-04-03,EXCHANGE PLACE,1,PTH,05/24/2019,07:16:08,REGULAR,...,2019-05-24 07:16:08,2019-05-24 07:16:07,Friday,04:00-07:59,4633.0,1349.0,3284.0,5982.0,4037.0,
70,492643,PTH07,R550,00-00-06,CITY / BUS,1,PTH,05/24/2019,06:56:09,REGULAR,...,2019-05-24 06:56:09,2019-05-24 06:56:08,Friday,04:00-07:59,5448.0,7079.0,-1631.0,12527.0,4009.0,


In [44]:
#mta19.iloc[504100:504120]
# mta19.iloc[140900:140910]
# mta19.iloc[402635:402645]
mta19.iloc[485820:485830]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,...,DDATETIME,DDATETIME2,DDAY,time_cat,entries_diff,exits_diff,entries-exits,entries+exits,suspicious,entries_diff2
485820,PTH04,R551,00-04-04,GROVE STREET,1,PTH,05/20/2019,13:06:37,REGULAR,206772,...,2019-05-20 13:06:37,2019-05-20 13:06:36,Monday,12:00-15:59,421.0,100.0,321.0,521.0,-176.0,421.0
485821,PTH04,R551,00-04-04,GROVE STREET,1,PTH,05/20/2019,17:18:37,REGULAR,207010,...,2019-05-20 17:18:37,2019-05-20 17:18:36,Monday,16:00-19:59,238.0,115.0,123.0,353.0,-183.0,238.0
485822,PTH04,R551,00-04-04,GROVE STREET,1,PTH,05/20/2019,21:30:37,REGULAR,207200,...,2019-05-20 21:30:37,2019-05-20 21:30:36,Monday,20:00-23:59,190.0,640.0,-450.0,830.0,-48.0,190.0
485823,PTH04,R551,00-04-04,GROVE STREET,1,PTH,05/21/2019,01:42:37,REGULAR,207223,...,2019-05-21 01:42:37,2019-05-21 01:42:36,Tuesday,0:00-03:59,23.0,87.0,-64.0,110.0,-167.0,
485824,PTH04,R551,00-04-04,GROVE STREET,1,PTH,05/21/2019,05:54:37,REGULAR,207237,...,2019-05-21 05:54:37,2019-05-21 05:54:36,Tuesday,04:00-07:59,14.0,5.0,9.0,19.0,-9.0,14.0
485825,PTH04,R551,00-04-04,GROVE STREET,1,PTH,05/21/2019,10:06:37,REGULAR,208163,...,2019-05-21 10:06:37,2019-05-21 10:06:36,Tuesday,08:00-11:59,926.0,207.0,719.0,1133.0,912.0,926.0
485826,PTH04,R551,00-04-04,GROVE STREET,1,PTH,05/21/2019,14:18:37,REGULAR,208409,...,2019-05-21 14:18:37,2019-05-21 14:18:36,Tuesday,12:00-15:59,246.0,53.0,193.0,299.0,-680.0,246.0
485827,PTH04,R551,00-04-04,GROVE STREET,1,PTH,05/24/2019,06:56:37,REGULAR,212011,...,2019-05-24 06:56:37,2019-05-24 06:56:36,Friday,04:00-07:59,3602.0,3186.0,416.0,6788.0,3356.0,
485828,PTH04,R551,00-04-04,GROVE STREET,1,PTH,05/24/2019,11:08:37,REGULAR,212688,...,2019-05-24 11:08:37,2019-05-24 11:08:36,Friday,08:00-11:59,677.0,147.0,530.0,824.0,-2925.0,677.0
485829,PTH04,R551,00-04-04,GROVE STREET,1,PTH,05/24/2019,15:20:37,REGULAR,212970,...,2019-05-24 15:20:37,2019-05-24 15:20:36,Friday,12:00-15:59,282.0,152.0,130.0,434.0,-395.0,282.0


In [50]:
import matplotlib.pyplot as plt
# You can configure the format of the images: ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’.
%config InlineBackend.figure_format = 'svg'
# this statement allows the visuals to render within your Jupyter Notebook
%matplotlib inline 

In [None]:
# Function to output grouped data

def grouped_data(dfname2):
    """
    add statements to return grouped dataframes
    """
    return grouped_data


## Below is scratchwork

In [63]:
mta_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822989 entries, 0 to 822988
Data columns (total 11 columns):
C/A         822989 non-null object
UNIT        822989 non-null object
SCP         822989 non-null object
STATION     822989 non-null object
LINENAME    822989 non-null object
DIVISION    822989 non-null object
DATE        822989 non-null object
TIME        822989 non-null object
DESC        822989 non-null object
ENTRIES     822989 non-null int64
EXITS       822989 non-null int64
dtypes: int64(2), object(9)
memory usage: 69.1+ MB


In [64]:
mta_raw.sort_values(["STATION","LINENAME","DIVISION","C/A","UNIT","SCP","DATE","TIME","DESC"])
mta_raw.head(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,00:00:00,REGULAR,7035249,2384833
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,04:00:00,REGULAR,7035269,2384840
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,08:00:00,REGULAR,7035292,2384875
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,12:00:00,REGULAR,7035392,2384951
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,16:00:00,REGULAR,7035651,2385020
5,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,20:00:00,REGULAR,7035930,2385070
6,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,00:00:00,REGULAR,7036100,2385087
7,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,04:00:00,REGULAR,7036119,2385088
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,08:00:00,REGULAR,7036125,2385103
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/28/2019,12:00:00,REGULAR,7036197,2385155


In [65]:
mta1 = mta_raw.copy()
mta1["time_hour"] = pd.to_numeric(mta1["TIME"].str[0:2])
mta1.tail(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,time_hour
822979,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/23/2019,09:00:00,REGULAR,5554,378,9
822980,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/23/2019,13:00:00,REGULAR,5554,378,13
822981,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/23/2019,17:00:00,REGULAR,5554,378,17
822982,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/23/2019,21:00:00,REGULAR,5554,378,21
822983,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,01:00:00,REGULAR,5554,378,1
822984,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,05:00:00,REGULAR,5554,378,5
822985,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,09:00:00,REGULAR,5554,378,9
822986,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,13:00:00,REGULAR,5554,378,13
822987,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,17:00:00,REGULAR,5554,378,17
822988,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,21:00:00,REGULAR,5554,378,21


In [68]:
mta1["entries_diff"] = mta1.groupby(by = ["STATION","LINENAME","DIVISION","C/A","UNIT","SCP"]).ENTRIES.diff()
mta1["exits_diff"] = mta1.groupby(by = ["STATION","LINENAME","DIVISION","C/A","UNIT","SCP"]).EXITS.diff()
mta1["entries-exits"] = mta1["entries_diff"] - mta1["exits_diff"]

In [80]:
mta1.tail(10)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,time_hour,entries_diff,exits_diff,entries-exits
822979,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/23/2019,09:00:00,REGULAR,5554,378,9,0.0,0.0,0.0
822980,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/23/2019,13:00:00,REGULAR,5554,378,13,0.0,0.0,0.0
822981,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/23/2019,17:00:00,REGULAR,5554,378,17,0.0,0.0,0.0
822982,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/23/2019,21:00:00,REGULAR,5554,378,21,0.0,0.0,0.0
822983,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,01:00:00,REGULAR,5554,378,1,0.0,0.0,0.0
822984,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,05:00:00,REGULAR,5554,378,5,0.0,0.0,0.0
822985,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,09:00:00,REGULAR,5554,378,9,0.0,0.0,0.0
822986,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,13:00:00,REGULAR,5554,378,13,0.0,0.0,0.0
822987,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,17:00:00,REGULAR,5554,378,17,0.0,0.0,0.0
822988,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,05/24/2019,21:00:00,REGULAR,5554,378,21,0.0,0.0,0.0


In [69]:
mta1.describe()

Unnamed: 0,ENTRIES,EXITS,time_hour,entries_diff,exits_diff,entries-exits
count,822989.0,822989.0,822989.0,818119.0,818119.0,818119.0
mean,41374440.0,33920400.0,10.537974,6501.273,4389.621,2111.652
std,211023300.0,194599200.0,6.813749,3880945.0,2153482.0,2952274.0
min,0.0,0.0,0.0,-1437242000.0,-384349600.0,-1086304000.0
25%,355281.0,145355.0,4.0,9.0,8.0,-26.0
50%,2179688.0,1249356.0,10.0,71.0,52.0,1.0
75%,6796341.0,4610048.0,16.0,239.0,167.0,95.0
max,2129343000.0,2124127000.0,23.0,2055526000.0,1078346000.0,1888396000.0


In [70]:
mta2 = mta1.groupby(["STATION","LINENAME","DIVISION","C/A","UNIT","SCP","DATE"])[["ENTRIES","EXITS"]].sum()
mta2.head(40)

In [75]:
mta3 = mta1.groupby(["STATION","LINENAME"])[["ENTRIES","EXITS"]].sum().sort_values("ENTRIES",ascending = False)
mta3.head(40)

Unnamed: 0_level_0,Unnamed: 1_level_0,ENTRIES,EXITS
STATION,LINENAME,Unnamed: 2_level_1,Unnamed: 3_level_1
42 ST-PORT AUTH,ACENQRS1237W,1325100766251,1087383399661
TIMES SQ-42 ST,1237ACENQRSW,1050107602568,572042830342
3 AV-149 ST,25,871050714699,573974218369
34 ST-HERALD SQ,BDFMNQRW,792675667461,993997895389
183 ST,4,688797769222,440331418268
72 ST,123,595692069534,794986496814
FULTON ST,ACJZ2345,566891838239,371440694638
ELMHURST AV,MR,542939140010,785995739092
104 ST,JZ,530523541168,426588183018
CANAL ST,JNQRZ6W,523768132773,820303844992


In [77]:
mta1.sort_values("entries_diff")

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,time_hour,entries_diff,exits_diff,entries-exits
582980,R288,R275,00-00-03,183 ST,4,IRT,05/15/2019,12:00:00,REGULAR,140396510,51950488,12,-1.437242e+09,-350938302.0,-1.086304e+09
72426,N196,R285,00-05-01,FAR ROCKAWAY,A,IND,05/03/2019,16:00:00,REGULAR,65536,2,16,-8.392517e+08,-167841839.0,-6.714099e+08
167363,R258,R132,00-03-00,125 ST,456,IRT,05/03/2019,13:00:00,REGULAR,68858058,68916176,13,-3.676523e+08,-384349631.0,1.669731e+07
553728,R160A,R164,00-00-02,66 ST-LINCOLN,1,IRT,05/11/2019,17:00:00,REGULAR,1058563,262707,17,-1.013321e+08,-50788219.0,-5.054392e+07
170314,R289,R119,00-05-01,FORDHAM RD,4,IRT,05/03/2019,16:00:00,REGULAR,458752,7,16,-1.002045e+08,-16777623.0,-8.342692e+07
544936,R121,R290,01-06-00,HOUSTON ST,1,IRT,05/11/2019,17:00:00,REGULAR,196625,30,17,-6.826523e+07,-17065778.0,-5.119945e+07
551480,R151,R033,00-00-05,TIMES SQ-42 ST,1237ACENQRSW,IRT,05/15/2019,16:00:00,REGULAR,146,258,16,-3.696648e+07,-89243892.0,5.227741e+07
655279,H041,R152,00-00-01,CANARSIE-ROCKAW,L,BMT,05/18/2019,13:00:00,REGULAR,191,169,13,-1.541868e+07,-7550836.0,-7.867844e+06
399044,R601A,R108,02-00-01,BOROUGH HALL,2345R,IRT,05/06/2019,00:00:00,RECOVR AUD,7,0,0,-1.365362e+07,-8665111.0,-4.988508e+06
163490,R245A,R051,01-00-01,59 ST,456NQRW,IRT,05/02/2019,20:00:00,REGULAR,115,23,20,-1.273900e+07,-6526790.0,-6.212212e+06


In [79]:
mta1.iloc[72420:72440]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,time_hour,entries_diff,exits_diff,entries-exits
72420,N196,R285,00-05-01,FAR ROCKAWAY,A,IND,05/02/2019,16:00:00,REGULAR,839317244,167841841,16,0.0,0.0,0.0
72421,N196,R285,00-05-01,FAR ROCKAWAY,A,IND,05/02/2019,20:00:00,REGULAR,839317244,167841841,20,0.0,0.0,0.0
72422,N196,R285,00-05-01,FAR ROCKAWAY,A,IND,05/03/2019,00:00:00,REGULAR,839317244,167841841,0,0.0,0.0,0.0
72423,N196,R285,00-05-01,FAR ROCKAWAY,A,IND,05/03/2019,04:00:00,REGULAR,839317244,167841841,4,0.0,0.0,0.0
72424,N196,R285,00-05-01,FAR ROCKAWAY,A,IND,05/03/2019,08:00:00,REGULAR,839317244,167841841,8,0.0,0.0,0.0
72425,N196,R285,00-05-01,FAR ROCKAWAY,A,IND,05/03/2019,12:00:00,REGULAR,839317244,167841841,12,0.0,0.0,0.0
72426,N196,R285,00-05-01,FAR ROCKAWAY,A,IND,05/03/2019,16:00:00,REGULAR,65536,2,16,-839251708.0,-167841839.0,-671409869.0
72427,N196,R285,00-05-01,FAR ROCKAWAY,A,IND,05/03/2019,20:00:00,REGULAR,65536,2,20,0.0,0.0,0.0
72428,N202,R315,00-00-00,155 ST,BD,IND,04/27/2019,01:00:00,REGULAR,8938117,8025191,1,,,
72429,N202,R315,00-00-00,155 ST,BD,IND,04/27/2019,05:00:00,REGULAR,8938131,8025234,5,14.0,43.0,-29.0
