In [None]:
import pandas as pd
from datetime import date, timedelta
import urllib.request
import shutil
import os

In [None]:
# Batch download MTA turnstile files
# One file is fetched per week, stepping from start_date to end_date
# (inclusive). Each file is saved as ./data/<YYYY>/turnstile_<yymmdd>.csv.
start_date = date(2019, 1, 5)
end_date = date(2020, 1, 1)

# Walk a separate cursor so start_date keeps its configured value after the
# cell has run.
current_date = start_date
while current_date <= end_date:
    filename = 'turnstile_'+current_date.strftime('%y%m%d')
    url = 'http://web.mta.info/developers/data/nyct/turnstile/'+filename+'.txt'

    # Create the per-year folder on demand; open() below would otherwise
    # fail with FileNotFoundError on a fresh checkout.
    year_dir = os.path.join('./data', current_date.strftime('%Y'))
    os.makedirs(year_dir, exist_ok=True)
    out_path = os.path.join(year_dir, filename+'.csv')

    print('Downloading '+filename+'.csv')
    # urlopen is entered first, so a failed request does not leave an empty
    # output file behind.
    with urllib.request.urlopen(url) as response, open(out_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    current_date += timedelta(weeks=1)

In [None]:
# Import station data with latitude and longitude
# Show floats with six decimals so coordinates are not truncated on display.
pd.options.display.float_format = '{:.6f}'.format
url = 'https://raw.githubusercontent.com/chriswhong/nycturnstiles/master/geocoded.csv'

# Make sure the target folder exists so this cell also works when run on a
# fresh checkout (or before the turnstile download cell).
os.makedirs('./data', exist_ok=True)
with urllib.request.urlopen(url) as response, open('./data/geocoded.csv', 'wb') as out_file:
    shutil.copyfileobj(response, out_file)

# NOTE(review): the batch-processing cell merges on 'UNIT' and reads
# 'STATION', 'LATITUDE', 'LONGITUDE' and 'LINENAME' from this frame --
# confirm the downloaded file carries a header row with those names.
geocoded = pd.read_csv('./data/geocoded.csv')

In [None]:
def filter_mta(data, min_diff=-5000, max_diff=10000):
    """Turn cumulative turnstile counters into cleaned per-record differences.

    Parameters
    ----------
    data : pandas.DataFrame
        Turnstile records with (at least) the columns 'DESC', 'DIVISION',
        'UNIT', 'SCP', 'ENTRIES', 'EXITS', 'STATION' and 'LINENAME'.
        The frame is not modified.
    min_diff : int, default -5000
        Smallest raw counter difference that is kept. Negative differences
        down to this value are kept and converted to their absolute value.
    max_diff : int, default 10000
        Exclusive upper bound on the absolute difference; larger jumps are
        discarded.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows, with 'ENTRIES_DIFF' and
        'EXITS_DIFF' added and 'STATION', 'LINENAME', 'DIVISION' and 'DESC'
        removed. The first record of every (UNIT, SCP) group has no
        predecessor, so its difference is NaN and the row is dropped.
    """
    # Filter by regular records, remove PATH and Staten Island
    is_regular = data['DESC'].isin(['REGULAR', 'RECOVR AUD'])
    is_excluded_division = data['DIVISION'].isin(['PTH', 'SRT'])
    # Explicit copy: new columns are added below, and writing into a
    # boolean-mask slice is the chained-assignment pattern pandas warns about.
    data_filtered = data[is_regular & ~is_excluded_division].copy()

    # Get diff for each row
    # NOTE(review): rows are differenced in the order they arrive; nothing
    # here sorts by time -- confirm the input is chronological per turnstile.
    # NOTE(review): groups are keyed on (UNIT, SCP) only -- confirm that pair
    # identifies a single turnstile in the input data.
    diffs = data_filtered.groupby(['UNIT', 'SCP'])[['ENTRIES', 'EXITS']].diff()
    data_filtered['ENTRIES_DIFF'] = diffs['ENTRIES']
    data_filtered['EXITS_DIFF'] = diffs['EXITS']

    # Some turnstiles count backward: keep moderately negative differences
    # and use their magnitude. NaN differences fail this comparison, which
    # is what removes the first record of each group.
    above_floor = (data_filtered['ENTRIES_DIFF'] >= min_diff) & (data_filtered['EXITS_DIFF'] >= min_diff)
    data_filtered = data_filtered[above_floor].copy()
    data_filtered['ENTRIES_DIFF'] = data_filtered['ENTRIES_DIFF'].abs()
    data_filtered['EXITS_DIFF'] = data_filtered['EXITS_DIFF'].abs()

    # Counts occasionally make huge jumps
    below_ceiling = (data_filtered['ENTRIES_DIFF'] < max_diff) & (data_filtered['EXITS_DIFF'] < max_diff)
    data_filtered = data_filtered[below_ceiling]

    # Remove some columns
    return data_filtered.drop(columns=['STATION', 'LINENAME', 'DIVISION', 'DESC'])

In [None]:
# Batch process
# Clean every weekly file in `directory`, attach station coordinates, sum
# the per-record differences per date and station, and write a single CSV.
directory = './data/2019/'
output_dir = './cleaned/'

# Only pick up CSV files, in name order: stray files in the folder would
# break read_csv, and os.listdir returns entries in arbitrary order.
filenames = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
if not filenames:
    raise FileNotFoundError('No .csv files found in '+directory)

dfs = []
for filename in filenames:
    print('Cleaning '+filename)

    # Import MTA weekly turnstile data
    data = pd.read_csv(os.path.join(directory, filename), parse_dates=['DATE'])
    # Normalise the header names by trimming surrounding whitespace.
    data.columns = data.columns.str.strip()

    # NOTE(review): filter_mta runs once per file, so the first record of
    # every turnstile in each file has no predecessor and is dropped.
    data_filtered = filter_mta(data)

    # filter_mta removes STATION and LINENAME, so after this merge those
    # columns come from `geocoded`.
    # NOTE(review): if `geocoded` has more than one row per UNIT, this left
    # merge repeats turnstile rows and inflates the sums below -- confirm.
    data_geocoded = pd.merge(data_filtered, geocoded, how='left', left_on=['UNIT'], right_on=['UNIT'])

    # Group and sum
    # NOTE(review): groupby skips rows whose key is NaN by default, so units
    # without a match in `geocoded` vanish silently here -- confirm intended.
    data_cleaned = data_geocoded.groupby(['DATE', 'STATION', 'LATITUDE', 'LONGITUDE']).agg(
        {'ENTRIES_DIFF': 'sum', 'EXITS_DIFF': 'sum', 'LINENAME': lambda x: x.iloc[0], 'UNIT': lambda x: x.iloc[0]})
    data_cleaned = data_cleaned.astype({'ENTRIES_DIFF': 'int', 'EXITS_DIFF': 'int'})

    dfs.append(data_cleaned)

# Export
# Create the output folder on demand so to_csv does not fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
pd.concat(dfs).to_csv(os.path.join(output_dir, 'cleaned_2019.csv'), float_format='%.6f')