# Dublin Buses - Download Data

Use this notebook to download and consolidate the daily data sets into a single parquet file.

In [1]:
import numpy as np
import pandas as pd
import zipfile
import urllib
import os
from ipywidgets import interact, interact_manual
from tqdm import tqdm_notebook as tqdm

In [2]:
# Local path of the downloaded zip archive (download target and read source).
zip_file_name = "data/sir010113-310113.zip"
# Local path of the consolidated parquet file written at the end of the notebook.
parquet_file_name = "data/sir010113-310113.parquet"

In [3]:
def download_zip():
    """Download the zip archive from the Dublin City open-data site.

    The archive is saved to ``zip_file_name``. It is first written to a
    sibling ``.part`` file and only moved into place once the transfer has
    completed, so an interrupted download never leaves a truncated archive
    at the final path (where an "already downloaded" existence check would
    mistake it for a complete file).

    Raises
    ------
    urllib.error.URLError
        If the remote file cannot be retrieved.
    OSError
        If the local file cannot be written or moved into place.
    """
    # A bare ``import urllib`` does not guarantee the ``request`` submodule is
    # loaded; import it explicitly so the call works on a fresh kernel.
    import urllib.request

    url = "http://opendata.dublincity.ie/TrafficOpenData/sir010113-310113.zip"
    partial_name = zip_file_name + ".part"
    try:
        urllib.request.urlretrieve(url, partial_name)
        # Atomic on the same filesystem: the final name appears only when complete.
        os.replace(partial_name, zip_file_name)
    finally:
        # Clean up the partial file if the transfer or the move failed.
        if os.path.exists(partial_name):
            os.remove(partial_name)

In [4]:
def read_data_frame(filename):
    """Read one headerless CSV file of bus records into a DataFrame.

    Parameters
    ----------
    filename : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per CSV record with the fifteen columns named below.
        ``TimeFrame`` is parsed as a datetime; columns without an explicit
        dtype are left to pandas' type inference.
    """
    header = ['Timestamp', 'LineID', 'Direction', 'PatternID', 'TimeFrame',
              'JourneyID', 'Operator', 'Congestion', 'Lon', 'Lat',
              'Delay', 'BlockID', 'VehicleID', 'StopID', 'AtStop']
    types = {'Timestamp': np.int64,
             'JourneyID': np.int32,
             'Congestion': np.int8,
             'Lon': np.float64,
             'Lat': np.float64,
             # int8 only spans -128..127 and read_csv narrows without a range
             # check, so larger delays were not preserved; int32 keeps them
             # intact. (Delay is presumably in seconds -- confirm against the
             # dataset description.)
             'Delay': np.int32,
             'VehicleID': np.int32,
             'AtStop': np.int8}
    # ``infer_datetime_format`` is deprecated since pandas 2.0 (format
    # inference is the default behaviour), so it is no longer passed.
    df = pd.read_csv(filename, header=None, names=header, dtype=types,
                     parse_dates=['TimeFrame'])
    return df

In [5]:
def get_day(ts):
    """Return the ``day`` attribute of ``ts`` (e.g. a ``pandas.Timestamp``)."""
    day_value = ts.day
    return day_value

def get_hour(ts):
    """Return the ``hour`` attribute of ``ts`` (e.g. a ``pandas.Timestamp``)."""
    hour_value = ts.hour
    return hour_value

def get_minute(ts):
    """Return the ``minute`` attribute of ``ts`` (e.g. a ``pandas.Timestamp``)."""
    minute_value = ts.minute
    return minute_value

def prepare_data_frame(df):
    """Fill missing line/stop identifiers and add a ``DateTime`` column.

    Missing ``LineID`` and ``StopID`` values are replaced by 0 and both
    columns are cast to 32-bit integers. ``DateTime`` is derived from
    ``Timestamp``, read as microseconds since the Unix epoch. The input
    frame is not modified; a new frame is returned.

    No separate day/hour/minute columns are added; they can be derived from
    ``DateTime`` through the ``.dt`` accessor when needed.
    """
    id_columns = ('LineID', 'StopID')
    prepared = df.fillna(value={column: 0 for column in id_columns})
    prepared = prepared.astype({column: np.int32 for column in id_columns})
    prepared['DateTime'] = pd.to_datetime(prepared['Timestamp'], unit='us')
    return prepared

Use this function to read all the files contained in the zip archive into a single DataFrame. The generator expression makes the concatenation quite swift and precludes the use of other supporting variables.

In [7]:
def read_zip_file(filename):
    """Read every file in a zip archive into one prepared DataFrame.

    Members are streamed straight from the archive, so nothing is extracted
    to disk and no temporary files are left behind if parsing fails
    part-way through.

    Parameters
    ----------
    filename : str or path-like
        Path of the zip archive to read.

    Returns
    -------
    pandas.DataFrame
        All members concatenated with a fresh 0..n-1 index and then passed
        through ``prepare_data_frame``.

    Raises
    ------
    ValueError
        If the archive contains no file members.
    """
    def read_member(archive, member):
        # Close each member handle as soon as its rows have been parsed.
        with archive.open(member) as handle:
            return read_data_frame(handle)

    with zipfile.ZipFile(filename) as z:
        # Directory entries carry no data and cannot be parsed as CSV.
        members = [info for info in z.infolist() if not info.is_dir()]
        if not members:
            raise ValueError("No files found in zip archive: {}".format(filename))
        # The generator must be consumed while the archive is still open.
        df = pd.concat((read_member(z, info) for info in tqdm(members)),
                       ignore_index=True)
    return prepare_data_frame(df)

Create the data directory if it does not yet exist.

In [8]:
# exist_ok=True keeps this cell safe to re-run and avoids the window between
# checking for the directory and creating it.
os.makedirs("data", exist_ok=True)

Conditionally download and process the data file. The download process can take a *long time*, so please make sure to retain the zip file for future use.

In [9]:
# Skip the download when a file already exists at the archive path.
if not os.path.exists(zip_file_name):
    download_zip()

In [10]:
# Load every file in the archive into one consolidated DataFrame.
df = read_zip_file(zip_file_name)

HBox(children=(IntProgress(value=0, max=31), HTML(value='')))




HBox(children=(IntProgress(value=0, max=31), HTML(value='')))




HBox(children=(IntProgress(value=0, max=31), HTML(value='')))




Save the consolidated DataFrame to a parquet-formatted file.

In [11]:
# An existing parquet file is left untouched (it is NOT refreshed from `df`);
# delete it first to regenerate the consolidated output.
if not os.path.exists(parquet_file_name):
    df.to_parquet(parquet_file_name, index=False)