Challenge 1
Open up a new IPython notebook
Download a few MTA turnstile data files
Open up a file, use csv reader to read it and ensure there is a column for each feature (C/A, UNIT, SCP, STATION). These are the first four columns.

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from datetime import datetime, timedelta

def mta_end_of_week(d):
    ''' Return the Saturday that closes the week containing a date, to conform
        to MTA data publication on Saturday.

        Weeks are treated as running Sunday through Saturday, so the result is
        always the first Saturday on or after d (never a date before d).

        d = date value (a datetime works too; anything with .weekday())
        return: date of the first Saturday on or after d; d itself when d is
                already a Saturday
    '''
    # weekday(): Monday == 0 ... Saturday == 5, Sunday == 6.
    # The modulo keeps the offset in 0..6, so a Sunday rolls forward six days
    # rather than slipping back one day to the Saturday that precedes it.
    # NOTE(review): a Saturday maps to itself -- confirm that the file dated
    # on a Saturday is the one callers want for that same day.
    days_until_saturday = (5 - d.weekday()) % 7
    return d + timedelta(days=days_until_saturday)

def read_mta_turnstile(start='20180501', end='20180531'):
    ''' Read MTA turnstile data and compute per-reading entry/exit counts for
        each turnstile (C/A, UNIT, SCP, STATION).

    One weekly file is downloaded for every Saturday from
    mta_end_of_week(start) through mta_end_of_week(end), and the files are
    stacked in that order. Rows are NOT trimmed to [start, end]; every row of
    every downloaded file is kept.

    start = start date for analysis in yyyymmdd format
    end = end date for analysis in yyyymmdd format

    return pd.DataFrame (same list of columns as in the MTA CSV, with 'C/A'
                        renamed to 'CA', +
                        [date_time, entries_offset, exits_offset, hourly_entries, hourly_exits])
        date_time      = DATE and TIME parsed into one timestamp
        *_offset       = the counter value of the NEXT row of the same turnstile
        hourly_entries = entries_offset - ENTRIES
        hourly_exits   = exits_offset - EXITS
        Despite the 'hourly_' prefix, each value spans the gap between two
        consecutive readings of one turnstile (described as 4-hour buckets),
        not a single hour. It is NaN for the last reading of a turnstile and
        wherever the difference is negative.
        Row index labels are the per-file row numbers, so they repeat across
        weeks.

    raises ValueError if the week of `end` is earlier than the week of `start`
    '''
    columns = ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION',
               'DATE', 'TIME', 'DESC', 'ENTRIES', 'EXITS']
    turnstile_key = ['CA', 'UNIT', 'SCP', 'STATION']

    first = mta_end_of_week(datetime.strptime(start, '%Y%m%d').date())
    last = mta_end_of_week(datetime.strptime(end, '%Y%m%d').date())
    if first > last:
        # Without this guard no file would be read and the failure would
        # surface later as an unrelated-looking pandas error.
        raise ValueError(
            'end ({}) falls in an earlier week than start ({})'.format(end, start))

    url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_'

    # Collect the weekly frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated data on every call.
    frames = []
    week = first
    while week <= last:
        data_file = url + week.strftime('%y%m%d') + '.txt'
        print('Reading:', data_file)
        frame = pd.read_csv(data_file)
        # The EXITS header arrives padded with trailing whitespace; strip the
        # names so columns can be picked by name instead of by position.
        frame.columns = frame.columns.str.strip()
        frames.append(frame)
        week += timedelta(weeks=1)

    df = pd.concat(frames, sort=False)[columns]
    df = df.rename(columns={'C/A': 'CA'})  # rename column for easier (dot) access

    df['date_time'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'])

    # Within each turnstile, pull the next row's counter up beside the current
    # one; this relies on each turnstile's rows already being in time order.
    df['entries_offset'] = df.groupby(turnstile_key)['ENTRIES'].shift(-1)
    df['exits_offset'] = df.groupby(turnstile_key)['EXITS'].shift(-1)
    df['hourly_entries'] = df['entries_offset'] - df['ENTRIES']
    df['hourly_exits'] = df['exits_offset'] - df['EXITS']

    # A negative difference cannot be a real count, so blank it out as NaN.
    df['hourly_entries'] = df['hourly_entries'].mask(df['hourly_entries'] < 0)
    df['hourly_exits'] = df['hourly_exits'].mask(df['hourly_exits'] < 0)

    return df

# Load the default date range (May 2018) and show five randomly sampled rows
# as the cell's output.
mta = read_mta_turnstile()
mta.sample(n=5)

Reading: http://web.mta.info/developers/data/nyct/turnstile/turnstile_180505.txt
Reading: http://web.mta.info/developers/data/nyct/turnstile/turnstile_180512.txt
Reading: http://web.mta.info/developers/data/nyct/turnstile/turnstile_180519.txt
Reading: http://web.mta.info/developers/data/nyct/turnstile/turnstile_180526.txt
Reading: http://web.mta.info/developers/data/nyct/turnstile/turnstile_180602.txt


Unnamed: 0,CA,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,date_time,entries_offset,exits_offset,hourly_entries,hourly_exits
79984,N324,R018,00-00-00,JKSN HT-ROOSVLT,EFMR7,IND,05/25/2018,16:00:00,REGULAR,4396781,5575183.0,2018-05-25 16:00:00,4396932,5575598.0,151,415.0
23530,C023,R213,00-06-00,BAY RIDGE AV,R,BMT,05/29/2018,00:00:00,REGULAR,13627,81347.0,2018-05-29 00:00:00,13627,81358.0,0,11.0
105847,N602,R259,00-00-00,ROOSEVELT ISLND,F,IND,05/12/2018,20:00:00,REGULAR,857917,1846036.0,2018-05-12 20:00:00,857929,1846141.0,12,105.0
131814,R147,R033,04-00-04,TIMES SQ-42 ST,1237ACENQRS,IRT,05/30/2018,20:00:00,REGULAR,15403798,7305483.0,2018-05-30 20:00:00,15404821,7305605.0,1023,122.0
16362,B021,R228,00-05-02,AVENUE J,BQ,BMT,05/22/2018,16:00:00,REGULAR,1498564,749775.0,2018-05-22 16:00:00,1498755,749898.0,191,123.0
