In [1]:
import pandas as pd
from datetime import datetime
import csv
import urllib.request
import codecs

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 3)

In [157]:
def load_csv_file(date_string):
    # for multiple files date_string must be a string separated by a return 
    date_obj_lst = [datetime.strftime(datetime.strptime(line, '%A, %B %d, %Y'), '%y%m%d') for line in date_string.split('\n') if line]
    files_lst = []
    for date in date_obj_lst:
        site_url = 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_'
        turnstile_url = ['{0}{1}.txt'.format(site_url, date) for date in date_obj_lst]
        filename = 'turnstile_' + str(date)
        with open(filename, 'w') as outfile:
            writer = csv.writer(outfile, delimiter=',')
            ftpstream = urllib.request.urlopen(turnstile_url[0])
            csvfile = csv.reader(codecs.iterdecode(ftpstream, 'utf-8'))
            for line in csvfile:
                writer.writerow(line)    
            for url in turnstile_url[1:]:
                ftpstream = urllib.request.urlopen(url)
                csvfile = csv.reader(codecs.iterdecode(ftpstream, 'utf-8'))
                firstline = True
                for line in csvfile:
                    if firstline:    #skips first line
                        firstline = False
                        continue
                    writer.writerow(line)
        files_lst.append(filename)
    return files_lst

In [211]:
def make_df(files_lst):
    df = pd.read_csv(files_lst[0])
    if len(files_lst) > 1:    
        for file in files_lst[1:]:
            df.append(pd.read_csv(file))
    df.columns = [col.strip(' ') for col in list(df.columns)]
    df['c_ENTRIES'] = df['ENTRIES'].shift(-1) - df['ENTRIES']
    df['c_EXITS'] = df['EXITS'].shift(-1) - df['EXITS']
    df['obj_DATE'] = pd.to_datetime(df['DATE'] + df['TIME'], format='%m/%d/%Y%H:%M:%S')
    columns_dict = {k:v for k,v in enumerate(list(df.columns))}
    # print(columns_dict)
    drop_key = [5, 8, 9, 10]
    drop_lst = [columns_dict[key] for key in drop_key]
    df = df.drop(drop_lst, 1)
    print('{} were dropped'.format(drop_lst))
    return df

In [212]:
def drop_outliers(pd_df):
    initial_count = pd_df.count()
    pd_df = pd_df[(pd_df['c_ENTRIES'] > 0) & 
            (pd_df['c_ENTRIES'] < 14400) &  # Assuming each turnstile can do 3600 person/hr * 4hrs = 14400 persons
            (pd_df['c_EXITS'] > 0) & 
            (pd_df['c_EXITS'] < 14400)]
    final_count = pd_df.count()
    return pd_df

In [214]:
load_csv_file("""Saturday, January 12, 2019
Saturday, January 05, 2019""")

['turnstile_190112', 'turnstile_190105']

In [239]:
test_df = make_df(['turnstile_190112', 'turnstile_190105'])

['DIVISION', 'DESC', 'ENTRIES', 'EXITS'] were dropped


In [240]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402693 entries, 0 to 402692
Data columns (total 10 columns):
C/A          402693 non-null object
UNIT         402693 non-null object
SCP          402693 non-null object
STATION      402693 non-null object
LINENAME     402693 non-null object
DATE         402693 non-null object
TIME         402693 non-null object
c_ENTRIES    402692 non-null float64
c_EXITS      402692 non-null float64
obj_DATE     402693 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(7)
memory usage: 30.7+ MB


In [241]:
test_df = drop_outliers(test_df)

In [242]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 323091 entries, 0 to 402648
Data columns (total 10 columns):
C/A          323091 non-null object
UNIT         323091 non-null object
SCP          323091 non-null object
STATION      323091 non-null object
LINENAME     323091 non-null object
DATE         323091 non-null object
TIME         323091 non-null object
c_ENTRIES    323091 non-null float64
c_EXITS      323091 non-null float64
obj_DATE     323091 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(7)
memory usage: 27.1+ MB


In [255]:
def rides_per_station(pd_df):
    time_period = input('Input 4hrs, daily, weekly, monthly')
    if time_period == '4hrs':
        pass
    if time_period == 'daily':
        pass
    if time_period == 'weekly':
        pass
    if time_period == 'monthly':
        pass


In [256]:
rides_per_station(test_df)

Input 4hrs, daily, weekly, monthly daily
