Data from [MTA](http://web.mta.info/developers/turnstile.html)
Description [here](http://web.mta.info/developers/resources/nyct/turnstile/ts_Field_Description.txt)

This is just a starting attempt to play around with the data. I'm only looking at the week of May 4th 2019 and Abishek and Max are looking at the two subsequent weeks.

High # of entries *per turnstile*.

[GitHub](https://github.com/huge-reality/MTA_Project_Group_5)

__Action item__: Filter based on times

Brainstorming ways to approach manipulating the data:
- only look at the 8AM - 12PM and 4PM - 8PM blocks

In [1]:
import pandas as pd
import numpy as np
import matplotlib
pd.set_option('display.float_format', lambda x: '%.1f' % x)

In [2]:
df = pd.read_csv('turnstile_190504.txt')
df.tail(30)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
206827,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,04/29/2019,01:00:00,REGULAR,5554,376
206828,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,04/29/2019,05:00:00,REGULAR,5554,376
206829,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,04/29/2019,09:00:00,REGULAR,5554,376
206830,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,04/29/2019,13:00:00,REGULAR,5554,376
206831,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,04/29/2019,17:00:00,REGULAR,5554,376
206832,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,04/29/2019,21:00:00,REGULAR,5554,376
206833,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,04/30/2019,01:00:00,REGULAR,5554,376
206834,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,04/30/2019,05:00:00,REGULAR,5554,376
206835,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,04/30/2019,09:00:00,REGULAR,5554,376
206836,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,04/30/2019,13:00:00,REGULAR,5554,376


In [3]:
df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',
       'EXITS                                                               '],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206857 entries, 0 to 206856
Data columns (total 11 columns):
C/A                                                                     206857 non-null object
UNIT                                                                    206857 non-null object
SCP                                                                     206857 non-null object
STATION                                                                 206857 non-null object
LINENAME                                                                206857 non-null object
DIVISION                                                                206857 non-null object
DATE                                                                    206857 non-null object
TIME                                                                    206857 non-null object
DESC                                                                    206857 non-null object
ENTRIES                           

First info: no null values

In [5]:
df['UNIT']

0         R051
1         R051
2         R051
3         R051
4         R051
          ... 
206852    R469
206853    R469
206854    R469
206855    R469
206856    R469
Name: UNIT, Length: 206857, dtype: object

In [6]:
df.rename(columns = {'EXITS                                                               ':'EXITS'}, inplace = True)

In [7]:
df.head(119)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,00:00:00,REGULAR,7035249,2384833
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,04:00:00,REGULAR,7035269,2384840
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,08:00:00,REGULAR,7035292,2384875
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,12:00:00,REGULAR,7035392,2384951
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,16:00:00,REGULAR,7035651,2385020
...,...,...,...,...,...,...,...,...,...,...,...
114,A002,R051,02-03-00,59 ST,NQR456W,BMT,05/01/2019,08:00:00,REGULAR,1226009,4609409
115,A002,R051,02-03-00,59 ST,NQR456W,BMT,05/01/2019,12:00:00,REGULAR,1226067,4610091
116,A002,R051,02-03-00,59 ST,NQR456W,BMT,05/01/2019,16:00:00,REGULAR,1226156,4610489
117,A002,R051,02-03-00,59 ST,NQR456W,BMT,05/01/2019,20:00:00,REGULAR,1226417,4611104


In [8]:
df.describe()

Unnamed: 0,ENTRIES,EXITS
count,206857.0,206857.0
mean,40575302.0,33197992.8
std,208275164.4,192669891.6
min,0.0,0.0
25%,349546.0,141334.0
50%,2176408.0,1241604.0
75%,6775342.0,4590174.0
max,2129342740.0,2124126951.0


In [9]:
print('Unique value counts:')
print('SCP', df.SCP.nunique())
print('UNIT', df.UNIT.nunique())
print('C/A', df['C/A'].nunique())
print('STATION', df.STATION.nunique())


Unique value counts:
SCP 220
UNIT 467
C/A 743
STATION 377


In [10]:
df['entry_count'] = df['ENTRIES'].diff()
df['exit_count'] = df['EXITS'].diff()

In [11]:
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,entry_count,exit_count
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,00:00:00,REGULAR,7035249,2384833,,
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,04:00:00,REGULAR,7035269,2384840,20.0,7.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,08:00:00,REGULAR,7035292,2384875,23.0,35.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,12:00:00,REGULAR,7035392,2384951,100.0,76.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,16:00:00,REGULAR,7035651,2385020,259.0,69.0


In [12]:
df.describe()

Unnamed: 0,ENTRIES,EXITS,entry_count,exit_count
count,206857.0,206857.0,206856.0,206856.0
mean,40575302.0,33197992.8,-34.0,-11.5
std,208275164.4,192669891.6,44082744.0,40678076.7
min,0.0,0.0,-2116018673.0,-2111174859.0
25%,349546.0,141334.0,9.0,7.0
50%,2176408.0,1241604.0,72.0,52.0
75%,6775342.0,4590174.0,245.0,171.0
max,2129342740.0,2124126951.0,2121120457.0,2118903676.0


In [13]:
df = pd.read_csv('turnstile_190504.txt')
df.rename(columns = {'EXITS                                                               ':'EXITS'}, inplace = True)
# df['entry_count'] = df['ENTRIES'].diff()
# df['exit_count'] = df['EXITS'].diff()

# df.entry_count = df.entry_count.clip(lower = 0)
# df.exit_count = df.exit_count.clip(lower = 0)
# df.entry_count = df.entry_count.clip(upper = 2000)
# df.exit_count = df.exit_count.clip(lower = 0)

In [14]:
df.describe()

Unnamed: 0,ENTRIES,EXITS
count,206857.0,206857.0
mean,40575302.0,33197992.8
std,208275164.4,192669891.6
min,0.0,0.0
25%,349546.0,141334.0
50%,2176408.0,1241604.0
75%,6775342.0,4590174.0
max,2129342740.0,2124126951.0


In [15]:
# df.entry_count.plot.hist(bins=50, grid=False, figsize=(12,8), color='#86bf91', zorder=2, rwidth=0.9)

In [16]:
df = pd.read_csv('turnstile_190504.txt')
df.rename(columns = {'EXITS                                                               ':'EXITS'}, inplace = True)

# df['entry_count'] = 0
df['entry_count'] = df['ENTRIES'].diff()


# df.groupby(['SCP']) == 0

df.head()

# for row in df.groupby(['SCP']):
#    print(row)

# def check(a):
#    if a < 0:
#        a = 0

# df.groupby('SCP')['entry_count'].transform(check)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,entry_count
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,00:00:00,REGULAR,7035249,2384833,
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,04:00:00,REGULAR,7035269,2384840,20.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,08:00:00,REGULAR,7035292,2384875,23.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,12:00:00,REGULAR,7035392,2384951,100.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,16:00:00,REGULAR,7035651,2385020,259.0


In [17]:
df.describe()

Unnamed: 0,ENTRIES,EXITS,entry_count
count,206857.0,206857.0,206856.0
mean,40575302.0,33197992.8,-34.0
std,208275164.4,192669891.6,44082744.0
min,0.0,0.0,-2116018673.0
25%,349546.0,141334.0,9.0
50%,2176408.0,1241604.0,72.0
75%,6775342.0,4590174.0,245.0
max,2129342740.0,2124126951.0,2121120457.0


In [28]:
def zoey(a):
    a = 1
    
s = df.groupby(['SCP']).entry_count.sum()
print(type(s))
print(s)

<class 'pandas.core.series.Series'>
SCP
00-00-00    1853103751.0
00-00-01    1382061716.0
00-00-02    2213271538.0
00-00-03   -4662380349.0
00-00-04    3336043671.0
                ...     
05-03-06     -10104718.0
05-05-00      33554853.0
05-05-01     -33554853.0
05-06-00       4098737.0
05-06-01      -3458358.0
Name: entry_count, Length: 220, dtype: float64


In [113]:
df = pd.read_csv('turnstile_190504.txt')
df.rename(columns = {'EXITS                                                               ':'EXITS'}, inplace = True)
# df3 = df[df.TIME == '12:00:00']

# df['DATETIME'] = df['DATE'] + ' ' + df['TIME']
df['DATE'] = pd.to_datetime(df['TIME'], format= "%H:%M:%S")
# df.drop(columns=['DATE', 'TIME'], inplace=True)

df.info()
print(df.describe())
print(df.head())

print('\n\n\n\n')


index0 = df[(df['TIME'] == '00:00:00')].index 
df = df.drop(index0)
index4 = df[(df['TIME'] == '04:00:00')].index 
df = df.drop(index4)
index16 = df[(df['TIME'] == '16:00:00')].index 
df = df.drop(index16)
index20 = df[(df['TIME'] == '20:00:00')].index 
df = df.drop(index20)
df.info()
df.describe()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206857 entries, 0 to 206856
Data columns (total 11 columns):
C/A         206857 non-null object
UNIT        206857 non-null object
SCP         206857 non-null object
STATION     206857 non-null object
LINENAME    206857 non-null object
DIVISION    206857 non-null object
DATE        206857 non-null datetime64[ns]
TIME        206857 non-null object
DESC        206857 non-null object
ENTRIES     206857 non-null int64
EXITS       206857 non-null int64
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 17.4+ MB
           ENTRIES        EXITS
count     206857.0     206857.0
mean    40575302.0   33197992.8
std    208275164.4  192669891.6
min            0.0          0.0
25%       349546.0     141334.0
50%      2176408.0    1241604.0
75%      6775342.0    4590174.0
max   2129342740.0 2124126951.0
    C/A  UNIT       SCP STATION LINENAME DIVISION                DATE  \
0  A002  R051  02-00-00   59 ST  NQR456W      BMT 1900-01-01 00:00:

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,1900-01-01 08:00:00,08:00:00,REGULAR,7035292,2384875
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,1900-01-01 12:00:00,12:00:00,REGULAR,7035392,2384951
8,A002,R051,02-00-00,59 ST,NQR456W,BMT,1900-01-01 08:00:00,08:00:00,REGULAR,7036125,2385103
9,A002,R051,02-00-00,59 ST,NQR456W,BMT,1900-01-01 12:00:00,12:00:00,REGULAR,7036197,2385155
14,A002,R051,02-00-00,59 ST,NQR456W,BMT,1900-01-01 08:00:00,08:00:00,REGULAR,7036789,2385385


In [114]:
df['entry_count'] = df['ENTRIES'].diff()
df.describe()

Unnamed: 0,ENTRIES,EXITS,entry_count
count,137839.0,137839.0,137838.0
mean,35989825.1,29094562.6,-51.0
std,192406027.4,178317112.7,54002114.1
min,0.0,0.0,-2116018673.0
25%,227137.5,99366.0,8.0
50%,1718141.0,1024323.0,91.0
75%,6072619.5,3885787.5,323.0
max,2129342740.0,2124126951.0,2121120457.0
