# Lesson 4 - Problem Set 2: Wrangling Subway Data
https://s3.amazonaws.com/content.udacity-data.com/courses/ud359/weather_underground.csv

In [2]:
import pandas as pd
import pandasql as pdsql
import os

In [3]:
def input_dir():
    return os.getcwd() + '/data/input/'

def output_dir():
    return os.getcwd() + '/data/output/'

### Useful functions

In [4]:
def read_csv_data(filename, input_dir):
    '''
    Receives a file name (csv)
    Returns a DataFrame
    '''
    data = pd.read_csv(input_dir + filename)
    
    #Rename the columns by replacing spaces with underscores and setting all characters to lowercase
    data.rename(columns = lambda x: x.replace(' ', '_').lower(), inplace=True)
    
    return data

In [5]:
def exe_sql_query(data, query):
    '''
    Receives a DataFrame and a String (SQL query)
    Execute SQL query and returns a DataFrame
    '''

    q = """
        {0}
    """.format(query)

    #Execute your SQL command against the pandas frame
    # OBS: case sensitive
    solution = pdsql.sqldf(q, locals())
    return solution

### Read data

In [6]:
weather_data = read_csv_data('weather-underground.csv',input_dir())
print (weather_data)

          date  maxpressurem  maxdewptm  maxpressurei  maxdewpti  \
0   2011-05-01          1026          6         30.31         42   
1   2011-05-02          1026         10         30.31         50   
2   2011-05-03          1021         15         30.14         59   
3   2011-05-04          1017         14         30.03         57   
4   2011-05-05          1016          4         30.01         39   
5   2011-05-06          1015          8         29.99         46   
6   2011-05-07          1011          8         29.87         47   
7   2011-05-08          1014          9         29.96         48   
8   2011-05-09          1016          9         30.01         49   
9   2011-05-10          1017          8         30.05         47   
10  2011-05-11          1019         10         30.09         50   
11  2011-05-12          1019         11         30.10         51   
12  2011-05-13          1017         11         30.03         51   
13  2011-05-14          1012         13         

In [7]:
query = """
    SELECT * FROM data;
"""

exe_sql_query(weather_data, query)

Unnamed: 0,date,maxpressurem,maxdewptm,maxpressurei,maxdewpti,since1julheatingdegreedaysnormal,heatingdegreedaysnormal,since1sepcoolingdegreedaysnormal,hail,since1julsnowfallm,...,precipi,snowfalli,since1jancoolingdegreedaysnormal,precipm,snowfallm,thunder,monthtodateheatingdegreedays,meantempi,maxvism,meantempm
0,2011-05-01,1026,6,30.31,42,4646,8,,0,157.23,...,0.0,0.0,13,0.0,0.0,0,5,60,16,16
1,2011-05-02,1026,10,30.31,50,4653,7,,0,157.23,...,0.0,0.0,14,0.0,0.0,0,13,57,16,14
2,2011-05-03,1021,15,30.14,59,4660,7,,0,157.23,...,0.0,0.0,15,0.0,0.0,0,13,65,16,18
3,2011-05-04,1017,14,30.03,57,4667,7,,0,157.23,...,0.5,0.0,16,12.7,0.0,0,23,55,16,13
4,2011-05-05,1016,4,30.01,39,4673,6,,0,157.23,...,0.0,0.0,17,0.0,0.0,0,32,56,16,13
5,2011-05-06,1015,8,29.99,46,4679,6,,0,157.23,...,0.0,0.0,18,0.0,0.0,0,36,61,16,16
6,2011-05-07,1011,8,29.87,47,4685,6,,0,157.23,...,0.0,0.0,19,0.0,0.0,0,37,64,16,18
7,2011-05-08,1014,9,29.96,48,4691,6,,0,157.23,...,0.0,0.0,20,0.0,0.0,0,39,63,16,17
8,2011-05-09,1016,9,30.01,49,4696,5,,0,157.23,...,0.0,0.0,21,0.0,0.0,0,41,63,16,17
9,2011-05-10,1017,8,30.05,47,4701,5,,0,157.23,...,0.0,0.0,22,0.0,0.0,0,42,64,16,18


In [8]:
sorted(list(weather_data.columns.values))

['coolingdegreedays',
 'coolingdegreedaysnormal',
 'date',
 'fog',
 'gdegreedays',
 'hail',
 'heatingdegreedays',
 'heatingdegreedaysnormal',
 'humidity',
 'maxdewpti',
 'maxdewptm',
 'maxhumidity',
 'maxpressurei',
 'maxpressurem',
 'maxtempi',
 'maxtempm',
 'maxvisi',
 'maxvism',
 'maxwspdi',
 'maxwspdm',
 'meandewpti',
 'meandewptm',
 'meanpressurei',
 'meanpressurem',
 'meantempi',
 'meantempm',
 'meanvisi',
 'meanvism',
 'meanwdird',
 'meanwdire',
 'meanwindspdi',
 'meanwindspdm',
 'mindewpti',
 'mindewptm',
 'minhumidity',
 'minpressurei',
 'minpressurem',
 'mintempi',
 'mintempm',
 'minvisi',
 'minvism',
 'minwspdi',
 'minwspdm',
 'monthtodatecoolingdegreedays',
 'monthtodatecoolingdegreedaysnormal',
 'monthtodateheatingdegreedays',
 'monthtodateheatingdegreedaysnormal',
 'monthtodatesnowfalli',
 'monthtodatesnowfallm',
 'precipi',
 'precipm',
 'precipsource',
 'rain',
 'since1jancoolingdegreedays',
 'since1jancoolingdegreedaysnormal',
 'since1julheatingdegreedays',
 'since1julh

## Quiz 1: Number of Rainy Days
Get the number of rainy days in DataFrame

In [9]:
def get_rainy_days(data):
    query = """
        SELECT count(rain) FROM data WHERE rain > 0;
    """

    return exe_sql_query(data, query)

In [10]:
get_rainy_days(weather_data)

Unnamed: 0,count(rain)
0,10


## Quiz 2: Temp on Foggy and Nonfoggy Days

In [11]:
def get_max_temp_aggregate_by_fog(data):
    query = """
        SELECT fog, MAX(maxtempi)  FROM data GROUP BY fog;
    """

    return exe_sql_query(data, query)

In [12]:
get_max_temp_aggregate_by_fog(weather_data)

Unnamed: 0,fog,MAX(maxtempi)
0,0,86
1,1,81


## Quiz 3: Mean Temp on Weekends

### Create day of week column

In [13]:
def get_weekday(data, date_col):
    data[date_col] = pd.to_datetime(data[date_col])
    return data[date_col].dt.weekday_name

In [14]:
weather_data['weekday'] = get_weekday(weather_data, 'date')
weather_data['weekday']

0        Sunday
1        Monday
2       Tuesday
3     Wednesday
4      Thursday
5        Friday
6      Saturday
7        Sunday
8        Monday
9       Tuesday
10    Wednesday
11     Thursday
12       Friday
13     Saturday
14       Sunday
15       Monday
16      Tuesday
17    Wednesday
18     Thursday
19       Friday
20     Saturday
21       Sunday
22       Monday
23      Tuesday
24    Wednesday
25     Thursday
26       Friday
27     Saturday
28       Sunday
29       Monday
Name: weekday, dtype: object

In [15]:
def get_avg_weekend_temp(data):
    query = """
        SELECT AVG(CAST(meantempi AS INTEGER)) FROM data WHERE weekday == 'Saturday' OR weekday == "Sunday";
    """


    return exe_sql_query(data, query)

In [16]:
get_avg_weekend_temp(weather_data)

Unnamed: 0,AVG(CAST(meantempi AS INTEGER))
0,65.111111


## Quiz 4: Mean Temp on Rainy Days
Find average minimum temperature (mintempi column) on rainy days where the minimum temperature > 55 degres

In [17]:
def get_avg_min_temp(data):
    query = """
        SELECT AVG(CAST(mintempi AS INTEGER)) FROM data WHERE rain > 0 AND mintempi > 55;
    """


    return exe_sql_query(data, query)

In [18]:
get_avg_min_temp(weather_data)

Unnamed: 0,AVG(CAST(mintempi AS INTEGER))
0,61.25


## Quiz 5: Fixing Turnstile Data

### Task
Receive a list of MTA Subway turnstile text files. A link to an example MTA Subway turnstile text file can be seen at the URL: http://web.mta.info/developers/data/nyct/turnstile/turnstile_110507.txt
    
Write a function that will update each row in the text file so there is only one entry per row. Write the updates to a different text file in the format of "updated_" + filename.

### How to update each row (example):
#### Input:
A002,R051,02-00-00,05-21-11,00:00:00,REGULAR,003169391,001097585,05-21-11,04:00:00,REGULAR,003169415,001097588,05-21-11,08:00:00,REGULAR,003169431,001097607,05-21-11,12:00:00,REGULAR,003169506,001097686,05-21-11,16:00:00,REGULAR,003169693,001097734,05-21-11,20:00:00,REGULAR,003169998,001097769,05-22-11,00:00:00,REGULAR,003170119,001097792,05-22-11,04:00:00,REGULAR,003170146,001097801

#### Should be converted to:

A002,R051,02-00-00,05-21-11,00:00:00,REGULAR,003169391,001097585
A002,R051,02-00-00,05-21-11,04:00:00,REGULAR,003169415,001097588
A002,R051,02-00-00,05-21-11,08:00:00,REGULAR,003169431,001097607
A002,R051,02-00-00,05-21-11,12:00:00,REGULAR,003169506,001097686
A002,R051,02-00-00,05-21-11,16:00:00,REGULAR,003169693,001097734
A002,R051,02-00-00,05-21-11,20:00:00,REGULAR,003169998,001097769
A002,R051,02-00-00,05-22-11,00:00:00,REGULAR,003170119,001097792
A002,R051,02-00-00,05-22-11,04:00:00,REGULAR,003170146,001097801

In [19]:
import csv

In [20]:
def split_list (lst, n):
    it = iter(lst)
    new = [[next(it) for _ in range(n)] for _ in range(len(lst) // n)]

    for i, x in enumerate(it):
        new[i].append(x)

    return new

In [45]:
def fix_turnstile_data(filenames):
    for name in filenames:
        print (name)
        f_in = open(input_dir() + name, 'r')
        f_out = open(output_dir() + 'updated_' + name, 'w')
    
        prefix = ''
        for line in f_in:
            # ignore empty lines
            if not line.strip():
                continue
            # get the prefix (first 3 elements)
            # obs -> split python 2: string.split(',')
            new_prefix = ','.join(line.split(sep= ',')[:3])
            if  new_prefix + ',' != prefix:
                prefix = new_prefix + ',' 

            # split the other part of the row into sublists of length 5 
            # each sublist content will be used to create a new row (converted output using prefix in each row)
            splited_list = split_list(line.split(sep= ',')[3:], 5)
            
            # remove \n\t\r from last element of current line
            splited_list[len(splited_list) - 1][len(splited_list[len(splited_list)-1])-1] = splited_list[len(splited_list)-1][len(splited_list[len(splited_list)-1])-1].strip(' \t\n\r')
            
            # for each subrow, write a new row using the prefix 
            for subrow in splited_list:
                print (prefix + ','.join(subrow) + '\n')
                f_out.write(prefix + ','.join(subrow) + '\n')
            


### Read and Fix Data

In [144]:
fix_turnstile_data(['turnstile-110528.txt'])

turnstile-110528.txt
A002,R051,02-00-00,05-21-11,00:00:00,REGULAR,003169391,001097585

A002,R051,02-00-00,05-21-11,04:00:00,REGULAR,003169415,001097588

A002,R051,02-00-00,05-21-11,08:00:00,REGULAR,003169431,001097607

A002,R051,02-00-00,05-21-11,12:00:00,REGULAR,003169506,001097686

A002,R051,02-00-00,05-21-11,16:00:00,REGULAR,003169693,001097734

A002,R051,02-00-00,05-21-11,20:00:00,REGULAR,003169998,001097769

A002,R051,02-00-00,05-22-11,00:00:00,REGULAR,003170119,001097792

A002,R051,02-00-00,05-22-11,04:00:00,REGULAR,003170146,001097801

A002,R051,02-00-00,05-22-11,08:00:00,REGULAR,003170164,001097820

A002,R051,02-00-00,05-22-11,12:00:00,REGULAR,003170240,001097867

A002,R051,02-00-00,05-22-11,16:00:00,REGULAR,003170388,001097912

A002,R051,02-00-00,05-22-11,20:00:00,REGULAR,003170611,001097941

A002,R051,02-00-00,05-23-11,00:00:00,REGULAR,003170695,001097964

A002,R051,02-00-00,05-23-11,04:00:00,REGULAR,003170701,001097964

A002,R051,02-00-00,05-23-11,08:00:00,REGULAR,003170746,

A002,R051,02-03-04,05-26-11,08:00:00,REGULAR,003067950,002008679

A002,R051,02-03-04,05-26-11,12:00:00,REGULAR,003068246,002008953

A002,R051,02-03-04,05-26-11,16:00:00,REGULAR,003068526,002009041

A002,R051,02-03-04,05-26-11,20:00:00,REGULAR,003069206,002009135

A002,R051,02-03-04,05-27-11,00:00:00,REGULAR,003069478,002009169

A002,R051,02-03-04,05-27-11,04:00:00,REGULAR,003069499,002009174

A002,R051,02-03-04,05-27-11,08:00:00,REGULAR,003069543,002009360

A002,R051,02-03-04,05-27-11,12:00:00,REGULAR,003069775,002009634

A002,R051,02-03-04,05-27-11,16:00:00,REGULAR,003070149,002009748

A002,R051,02-03-04,05-27-11,20:00:00,REGULAR,003070766,002009834

A002,R051,02-03-05,05-21-11,00:00:00,REGULAR,005122953,000976390

A002,R051,02-03-05,05-21-11,04:00:00,REGULAR,005123040,000976392

A002,R051,02-03-05,05-21-11,08:00:00,REGULAR,005123098,000976401

A002,R051,02-03-05,05-21-11,12:00:00,REGULAR,005123350,000976430

A002,R051,02-03-05,05-21-11,16:00:00,REGULAR,005123779,000976444

A002,R051,


A011,R080,01-00-02,05-24-11,16:00:00,REGULAR,004330547,003737900

A011,R080,01-00-02,05-24-11,20:00:00,REGULAR,004331271,003738088

A011,R080,01-00-02,05-25-11,00:00:00,REGULAR,004331503,003738146

A011,R080,01-00-02,05-25-11,04:00:00,REGULAR,004331539,003738152

A011,R080,01-00-02,05-25-11,08:00:00,REGULAR,004331585,003738312

A011,R080,01-00-02,05-25-11,12:00:00,REGULAR,004331819,003738845

A011,R080,01-00-02,05-25-11,16:00:00,REGULAR,004332076,003738997

A011,R080,01-00-02,05-25-11,20:00:00,REGULAR,004332755,003739214

A011,R080,01-00-02,05-26-11,00:00:00,REGULAR,004332980,003739266

A011,R080,01-00-02,05-26-11,04:00:00,REGULAR,004333009,003739274

A011,R080,01-00-02,05-26-11,08:00:00,REGULAR,004333048,003739454

A011,R080,01-00-02,05-26-11,12:00:00,REGULAR,004333269,003740003

A011,R080,01-00-02,05-26-11,16:00:00,REGULAR,004333542,003740202

A011,R080,01-00-02,05-26-11,20:00:00,REGULAR,004334225,003740396

A011,R080,01-00-02,05-27-11,00:00:00,REGULAR,004334449,003740463

A011,R080

A021,R032,01-00-05,05-27-11,16:00:00,REGULAR,000748760,002547144

A021,R032,01-00-05,05-27-11,20:00:00,REGULAR,000748935,002547819

A021,R032,01-00-06,05-21-11,00:00:00,REGULAR,002124516,005181513

A021,R032,01-00-06,05-21-11,04:00:00,REGULAR,002124536,005181593

A021,R032,01-00-06,05-21-11,08:00:00,REGULAR,002124543,005181660

A021,R032,01-00-06,05-21-11,12:00:00,REGULAR,002124563,005182050

A021,R032,01-00-06,05-21-11,16:00:00,REGULAR,002124630,005182779

A021,R032,01-00-06,05-21-11,20:00:00,REGULAR,002124731,005183614

A021,R032,01-00-06,05-22-11,00:00:00,REGULAR,002124774,005184017

A021,R032,01-00-06,05-22-11,04:00:00,REGULAR,002124788,005184117

A021,R032,01-00-06,05-22-11,08:00:00,REGULAR,002124795,005184149

A021,R032,01-00-06,05-22-11,12:00:00,REGULAR,002124811,005184358

A021,R032,01-00-06,05-22-11,16:00:00,REGULAR,002124859,005184883

A021,R032,01-00-06,05-22-11,20:00:00,REGULAR,002124931,005185466

A021,R032,01-00-06,05-23-11,00:00:00,REGULAR,002124967,005185735

A021,R032,


A030,R083,01-00-01,05-27-11,04:00:00,RECOVR,003670609,001278226

A030,R083,01-00-01,05-27-11,08:00:00,AUD,003670632,001278265

A030,R083,01-00-01,05-27-11,12:00:00,REGULAR,003670782,001278456

A030,R083,01-00-01,05-27-11,16:00:00,REGULAR,003671148,001278574

A030,R083,01-00-01,05-27-11,20:00:00,REGULAR,003671553,001278644

A030,R083,01-00-02,05-21-11,00:00:00,REGULAR,002631582,001232568

A030,R083,01-00-02,05-21-11,04:00:00,REGULAR,002631611,001232574

A030,R083,01-00-02,05-21-11,04:00:00,RECOVR,002631611,001232574

A030,R083,01-00-02,05-21-11,08:00:00,AUD,002631624,001232599

A030,R083,01-00-02,05-21-11,12:00:00,REGULAR,002631686,001232666

A030,R083,01-00-02,05-21-11,16:00:00,REGULAR,002631920,001232752

A030,R083,01-00-02,05-21-11,20:00:00,REGULAR,002632206,001232820

A030,R083,01-00-02,05-22-11,00:00:00,REGULAR,002632317,001232857

A030,R083,01-00-02,05-22-11,04:00:00,REGULAR,002632360,001232864

A030,R083,01-00-02,05-22-11,08:00:00,REGULAR,002632372,001232867

A030,R083,01-00-02,

A034,R170,03-06-00,05-24-11,09:00:00,REGULAR,009192602,005808823

A034,R170,03-06-00,05-24-11,13:00:00,REGULAR,009192804,005808916

A034,R170,03-06-00,05-24-11,17:00:00,REGULAR,009193190,005809086

A034,R170,03-06-00,05-24-11,21:00:00,REGULAR,009193766,005809370

A034,R170,03-06-00,05-25-11,01:00:00,REGULAR,009194035,005809390

A034,R170,03-06-00,05-25-11,05:00:00,REGULAR,009194058,005809392

A034,R170,03-06-00,05-25-11,09:00:00,REGULAR,009194163,005809457

A034,R170,03-06-00,05-25-11,13:00:00,REGULAR,009194410,005809586

A034,R170,03-06-00,05-25-11,17:00:00,REGULAR,009194865,005809866

A034,R170,03-06-00,05-25-11,21:00:00,REGULAR,009195428,005810169

A034,R170,03-06-00,05-26-11,01:00:00,REGULAR,009195676,005810192

A034,R170,03-06-00,05-26-11,05:00:00,REGULAR,009195698,005810193

A034,R170,03-06-00,05-26-11,09:00:00,REGULAR,009195801,005810240

A034,R170,03-06-00,05-26-11,13:00:00,REGULAR,009196011,005810336

A034,R170,03-06-00,05-26-11,17:00:00,REGULAR,009196423,005810508

A034,R170,

A041,R086,00-00-04,05-23-11,08:00:00,REGULAR,003263329,001542114

A041,R086,00-00-04,05-23-11,12:00:00,REGULAR,003263571,001542560

A041,R086,00-00-04,05-23-11,16:00:00,REGULAR,003264139,001542914

A041,R086,00-00-04,05-23-11,20:00:00,REGULAR,003265114,001543262

A041,R086,00-00-04,05-24-11,00:00:00,REGULAR,003265451,001543314

A041,R086,00-00-04,05-24-11,04:00:00,REGULAR,003265478,001543320

A041,R086,00-00-04,05-24-11,08:00:00,REGULAR,003265505,001543401

A041,R086,00-00-04,05-24-11,12:00:00,REGULAR,003265760,001543798

A041,R086,00-00-04,05-24-11,16:00:00,REGULAR,003266318,001544113

A041,R086,00-00-04,05-24-11,20:00:00,REGULAR,003267350,001544499

A041,R086,00-00-04,05-25-11,00:00:00,REGULAR,003267758,001544564

A041,R086,00-00-04,05-25-11,04:00:00,REGULAR,003267797,001544570

A041,R086,00-00-04,05-25-11,04:00:00,RECOVR,003267797,001544570

A041,R086,00-00-04,05-25-11,08:00:00,AUD,003267831,001544635

A041,R086,00-00-04,05-25-11,12:00:00,REGULAR,003268075,001545039

A041,R086,00-00

## Quiz 7: Filtering Irregular Data

In [145]:
def filter_by_descn(data, descn_value):
    '''
    Filter the dataframe to only rows where the 'DESCn' column has the value descn = descn_value.
    
    For example, if the pandas dataframe is as follows:
    ,C/A,UNIT,SCP,DATEn,TIMEn,DESCn,ENTRIESn,EXITSn
    0,A002,R051,02-00-00,05-01-11,00:00:00,REGULAR,3144312,1088151
    1,A002,R051,02-00-00,05-01-11,04:00:00,DOOR,3144335,1088159
    2,A002,R051,02-00-00,05-01-11,08:00:00,REGULAR,3144353,1088177
    3,A002,R051,02-00-00,05-01-11,12:00:00,DOOR,3144424,1088231
    
    The dataframe will look like below after filtering to only rows where DESCn column
    has the value 'REGULAR':
    0,A002,R051,02-00-00,05-01-11,00:00:00,REGULAR,3144312,1088151
    2,A002,R051,02-00-00,05-01-11,08:00:00,REGULAR,3144353,1088177
    '''
    
    return data[data['descn'] == descn_value.upper()]

### Read Data

In [146]:
turnstile_data = read_csv_data('updated_turnstile-110528.txt', input_dir())

In [147]:
print (turnstile_data)

       c/a  unit       scp     daten     timen    descn  entriesn   exitsn
0     A002  R051  02-00-00  05-21-11  00:00:00  REGULAR   3169391  1097585
1     A002  R051  02-00-00  05-21-11  04:00:00  REGULAR   3169415  1097588
2     A002  R051  02-00-00  05-21-11  08:00:00  REGULAR   3169431  1097607
3     A002  R051  02-00-00  05-21-11  12:00:00  REGULAR   3169506  1097686
4     A002  R051  02-00-00  05-21-11  16:00:00  REGULAR   3169693  1097734
5     A002  R051  02-00-00  05-21-11  20:00:00  REGULAR   3169998  1097769
6     A002  R051  02-00-00  05-22-11  00:00:00  REGULAR   3170119  1097792
7     A002  R051  02-00-00  05-22-11  04:00:00  REGULAR   3170146  1097801
8     A002  R051  02-00-00  05-22-11  08:00:00  REGULAR   3170164  1097820
9     A002  R051  02-00-00  05-22-11  12:00:00  REGULAR   3170240  1097867
10    A002  R051  02-00-00  05-22-11  16:00:00  REGULAR   3170388  1097912
11    A002  R051  02-00-00  05-22-11  20:00:00  REGULAR   3170611  1097941
12    A002  R051  02-00-0

In [148]:
filter_by_descn(turnstile_data, 'regular')

Unnamed: 0,c/a,unit,scp,daten,timen,descn,entriesn,exitsn
0,A002,R051,02-00-00,05-21-11,00:00:00,REGULAR,3169391,1097585
1,A002,R051,02-00-00,05-21-11,04:00:00,REGULAR,3169415,1097588
2,A002,R051,02-00-00,05-21-11,08:00:00,REGULAR,3169431,1097607
3,A002,R051,02-00-00,05-21-11,12:00:00,REGULAR,3169506,1097686
4,A002,R051,02-00-00,05-21-11,16:00:00,REGULAR,3169693,1097734
5,A002,R051,02-00-00,05-21-11,20:00:00,REGULAR,3169998,1097769
6,A002,R051,02-00-00,05-22-11,00:00:00,REGULAR,3170119,1097792
7,A002,R051,02-00-00,05-22-11,04:00:00,REGULAR,3170146,1097801
8,A002,R051,02-00-00,05-22-11,08:00:00,REGULAR,3170164,1097820
9,A002,R051,02-00-00,05-22-11,12:00:00,REGULAR,3170240,1097867


## Quiz 8: Get Hourly Entries
The data in the MTA Subway Turnstile data reports on the cumulative number of entries and exits per row. Assume that you have a dataframe called df that contains only the rows for a particular turnstile machine (i.e., unique SCP, C/A, and UNIT). This function should change these cumulative entry numbers to a count of entries since the last reading (i.e., entries since the last row in the dataframe).

       1) Create a new column called ENTRIESn_hourly
       2) Assign to the column the difference between ENTRIESn of the current row 
          and the previous row. If there is any NaN, fill/replace it with 1.

In [160]:
def get_hourly_entries(data):
    data['ENTRIESn_hourly'] = data['entriesn'] - data['entriesn'].shift(1)
    data = data.fillna(1)
    
    return data


### Read Data

In [161]:
turnstile_data = read_csv_data('updated_turnstile-110528.txt', input_dir())

In [162]:
print(turnstile_data)

       c/a  unit       scp     daten     timen    descn  entriesn   exitsn
0     A002  R051  02-00-00  05-21-11  00:00:00  REGULAR   3169391  1097585
1     A002  R051  02-00-00  05-21-11  04:00:00  REGULAR   3169415  1097588
2     A002  R051  02-00-00  05-21-11  08:00:00  REGULAR   3169431  1097607
3     A002  R051  02-00-00  05-21-11  12:00:00  REGULAR   3169506  1097686
4     A002  R051  02-00-00  05-21-11  16:00:00  REGULAR   3169693  1097734
5     A002  R051  02-00-00  05-21-11  20:00:00  REGULAR   3169998  1097769
6     A002  R051  02-00-00  05-22-11  00:00:00  REGULAR   3170119  1097792
7     A002  R051  02-00-00  05-22-11  04:00:00  REGULAR   3170146  1097801
8     A002  R051  02-00-00  05-22-11  08:00:00  REGULAR   3170164  1097820
9     A002  R051  02-00-00  05-22-11  12:00:00  REGULAR   3170240  1097867
10    A002  R051  02-00-00  05-22-11  16:00:00  REGULAR   3170388  1097912
11    A002  R051  02-00-00  05-22-11  20:00:00  REGULAR   3170611  1097941
12    A002  R051  02-00-0

### Get Hourly Entries

In [163]:
turnstile_data = get_hourly_entries(turnstile_data)

In [164]:
print(turnstile_data)

       c/a  unit       scp     daten     timen    descn  entriesn   exitsn  \
0     A002  R051  02-00-00  05-21-11  00:00:00  REGULAR   3169391  1097585   
1     A002  R051  02-00-00  05-21-11  04:00:00  REGULAR   3169415  1097588   
2     A002  R051  02-00-00  05-21-11  08:00:00  REGULAR   3169431  1097607   
3     A002  R051  02-00-00  05-21-11  12:00:00  REGULAR   3169506  1097686   
4     A002  R051  02-00-00  05-21-11  16:00:00  REGULAR   3169693  1097734   
5     A002  R051  02-00-00  05-21-11  20:00:00  REGULAR   3169998  1097769   
6     A002  R051  02-00-00  05-22-11  00:00:00  REGULAR   3170119  1097792   
7     A002  R051  02-00-00  05-22-11  04:00:00  REGULAR   3170146  1097801   
8     A002  R051  02-00-00  05-22-11  08:00:00  REGULAR   3170164  1097820   
9     A002  R051  02-00-00  05-22-11  12:00:00  REGULAR   3170240  1097867   
10    A002  R051  02-00-00  05-22-11  16:00:00  REGULAR   3170388  1097912   
11    A002  R051  02-00-00  05-22-11  20:00:00  REGULAR   317061

## Quiz 9: Get Hourly Exits

The data in the MTA Subway Turnstile data reports on the cumulative number of entries and exits per row.
Change the cumulative exit numbers to a count of exits since the last reading:

    1) Create a new column called EXITSn_hourly
    2) Assign to the column the difference between EXITSn of the current row and the previous row. If there is any NaN, fill/replace it with 0.

In [180]:
def get_hourly_exits(data):
    data['EXITSn_hourly'] = data['exitsn'] - data['exitsn'].shift(1)
    data = data.fillna(0)
    
    return data

In [181]:
turnstile_data = get_hourly_exits(turnstile_data)

In [182]:
print(turnstile_data)

       c/a  unit       scp     daten     timen    descn  entriesn   exitsn  \
0     A002  R051  02-00-00  05-21-11  00:00:00  REGULAR   3169391  1097585   
1     A002  R051  02-00-00  05-21-11  04:00:00  REGULAR   3169415  1097588   
2     A002  R051  02-00-00  05-21-11  08:00:00  REGULAR   3169431  1097607   
3     A002  R051  02-00-00  05-21-11  12:00:00  REGULAR   3169506  1097686   
4     A002  R051  02-00-00  05-21-11  16:00:00  REGULAR   3169693  1097734   
5     A002  R051  02-00-00  05-21-11  20:00:00  REGULAR   3169998  1097769   
6     A002  R051  02-00-00  05-22-11  00:00:00  REGULAR   3170119  1097792   
7     A002  R051  02-00-00  05-22-11  04:00:00  REGULAR   3170146  1097801   
8     A002  R051  02-00-00  05-22-11  08:00:00  REGULAR   3170164  1097820   
9     A002  R051  02-00-00  05-22-11  12:00:00  REGULAR   3170240  1097867   
10    A002  R051  02-00-00  05-22-11  16:00:00  REGULAR   3170388  1097912   
11    A002  R051  02-00-00  05-22-11  20:00:00  REGULAR   317061

## Quiz 10: Time to Hour
Given an input variable time that represents time in the format of: "00:00:00" (hour:minutes:seconds)

Extract the hour part from the input variable time and return it as an integer

        1) if hour is 00, your code should return 0
        2) if hour is 01, your code should return 1
        3) if hour is 21, your code should return 21

In [183]:
def time_to_hour(time):
    return int(time[:2])

In [185]:
print(time_to_hour('21:39:42'))

21


## Quiz 11: Reformat Subway Dates

The dates in our subway data are formatted in the format month-day-year. The dates in our weather underground data are formatted year-month-day.
    
Join these two data sets together, we'll want the dates formatted the same way. 

Write a function that takes as its input a date in the MTA Subway data format, and returns a date in the weather underground format.
    
Datetime Library

http://docs.python.org/2/library/datetime.html#datetime.datetime.strptime

In [206]:
import datetime

In [217]:
def reformat_date(date):
    if type(date) is str:
        date = pd.to_datetime(date)

    date = date.isoformat()

    return date[:10]

In [218]:
reformat_date('10-10-1999')

'1999-10-10'