In [10]:
import pandas as pd
from datetime import date, timedelta
import urllib.request
import shutil
import os

In [8]:
# Batch download MTA turnstile files
start_date = date(2020, 1, 4)
end_date = date(2020, 4, 18)
while start_date <= end_date:
    filename = 'turnstile_'+start_date.strftime('%y%m%d')
    url = 'http://web.mta.info/developers/data/nyct/turnstile/'+filename+'.txt'
    print('Downloading '+filename+'.csv')
    with urllib.request.urlopen(url) as response, open('./data/'+filename+'.csv', 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    start_date += timedelta(weeks=1)

Downloading turnstile_200104.csv
Downloading turnstile_200111.csv
Downloading turnstile_200118.csv
Downloading turnstile_200125.csv
Downloading turnstile_200201.csv
Downloading turnstile_200208.csv
Downloading turnstile_200215.csv
Downloading turnstile_200222.csv
Downloading turnstile_200229.csv
Downloading turnstile_200307.csv
Downloading turnstile_200314.csv
Downloading turnstile_200321.csv
Downloading turnstile_200328.csv
Downloading turnstile_200404.csv
Downloading turnstile_200411.csv
Downloading turnstile_200418.csv


In [12]:
# Import station data with latitude and longitude
pd.options.display.float_format = '{:.6f}'.format
geocoded = pd.read_csv('geocoded.csv')

In [9]:
def filter(data):
    # Filter by regular records, remove PATH
    data_filtered = data[(data['DESC']=='REGULAR') | (data['DESC']=='RECOVR AUD')]
    data_filtered = data_filtered[data_filtered['DIVISION']!='PTH']

    # Get diff for each row
    data_filtered['ENTRIES_DIFF'] = data_filtered.groupby(['UNIT', 'SCP'])['ENTRIES'].diff()
    data_filtered['EXITS_DIFF'] = data_filtered.groupby(['UNIT', 'SCP'])['EXITS'].diff()
    data_filtered = data_filtered[(data_filtered['ENTRIES_DIFF']>=0) & (data_filtered['EXITS_DIFF']>=0)]

    # Remove some columns
    data_filtered.drop(columns=['STATION', 'LINENAME', 'DIVISION', 'DESC'], inplace=True)
    
    return data_filtered

In [13]:
# Batch process
directory = os.fsencode('./data')
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    print('Cleaning '+filename)

    # Import MTA weekly turnstile data
    data = pd.read_csv('./data/'+filename, parse_dates=['DATE'])
    data.columns = data.columns.str.strip()
    
    data_filtered = filter(data)
    data_geocoded = pd.merge(data_filtered, geocoded, how='left', left_on=['UNIT'], right_on=['UNIT'])
    
    # Group and sum 
    data_cleaned = data_geocoded.groupby(['DATE', 'STATION', 'LATITUDE', 'LONGITUDE'])['ENTRIES_DIFF', 'EXITS_DIFF'].sum()
    data_cleaned = data_cleaned.astype(int)
    
    # Export
    data_cleaned.to_csv('./cleaned/cleaned_'+filename, float_format='%.6f')

Cleaning turnstile_200104.csv
Cleaning turnstile_200111.csv
Cleaning turnstile_200118.csv
Cleaning turnstile_200125.csv
Cleaning turnstile_200201.csv
Cleaning turnstile_200208.csv
Cleaning turnstile_200215.csv
Cleaning turnstile_200222.csv
Cleaning turnstile_200229.csv
Cleaning turnstile_200307.csv
Cleaning turnstile_200314.csv
Cleaning turnstile_200321.csv
Cleaning turnstile_200328.csv
Cleaning turnstile_200404.csv
Cleaning turnstile_200411.csv
Cleaning turnstile_200418.csv
Cleaning turnstile_200425.csv


In [89]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(data_cleaned)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ENTRIES_DIFF,EXITS_DIFF
DATE,STATION,LATITUDE,LONGITUDE,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-18,1 AVE,40.730901,-73.981719,1014,2378
2020-04-18,103 ST,40.790582,-73.947473,1070,1544
2020-04-18,103 ST,40.796105,-73.961399,199,238
2020-04-18,103 ST,40.799354,-73.968329,602,701
2020-04-18,103 ST-CORONA,40.749858,-73.862672,1646,1458
2020-04-18,104 ST,40.695184,-73.844326,184,94
2020-04-18,110 ST,40.795066,-73.944297,1618,2336
2020-04-18,110 ST-CATHEDRL,40.804032,-73.966742,1190,1454
2020-04-18,110 ST-CPN,40.79911,-73.951807,487,847
2020-04-18,111 ST,40.697405,-73.836354,131,86
