In [1]:
import numpy as np
import pandas as pd
import zipfile
import urllib
import os

In [2]:
def download_zip():
    """Download the ``sir010113-310113.zip`` archive into the ``data`` directory.

    The archive is first written to a ``.part`` file and only moved to its
    final name once the transfer has finished, so an interrupted download never
    leaves a truncated zip that a later ``os.path.exists`` check would mistake
    for a complete one.

    Raises:
        urllib.error.URLError: if the server cannot be reached.
        OSError: if the ``data`` directory does not exist or is not writable.
    """
    # ``import urllib`` alone does not guarantee that the ``request`` submodule
    # has been loaded; import it explicitly so ``urllib.request`` always exists.
    import urllib.request

    url = "http://opendata.dublincity.ie/TrafficOpenData/sir010113-310113.zip"
    destination = "data/sir010113-310113.zip"
    partial = destination + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        # Publish the file under its final name only when it is complete.
        os.replace(partial, destination)
    finally:
        # After a failed transfer, drop whatever was written so far.
        if os.path.exists(partial):
            os.remove(partial)

In [3]:
def read_data_frame(filename):
    """Read one headerless CSV file of vehicle records into a DataFrame.

    Args:
        filename: path (or anything ``pandas.read_csv`` accepts) of a CSV file
            with exactly the fifteen columns listed in ``header`` and no
            header row.

    Returns:
        pandas.DataFrame with the columns named in ``header``. The columns
        listed in ``types`` have fixed numeric dtypes, ``TimeFrame`` is parsed
        as a date, and every other column keeps the dtype pandas infers.
    """
    header = ['Timestamp', 'LineID', 'Direction', 'PatternID', 'TimeFrame',
              'JourneyID', 'Operator', 'Congestion', 'Lon', 'Lat',
              'Delay', 'BlockID', 'VehicleID', 'StopID', 'AtStop']
    types = {'Timestamp': np.int64,
             'JourneyID': np.int32,
             'Congestion': np.int8,
             'Lon': np.float64,
             'Lat': np.float64,
             # int8 only covers -128..127; a delay outside that range would
             # wrap around or fail to parse, so use a wider integer.
             'Delay': np.int32,
             'VehicleID': np.int32,
             'AtStop': np.int8}
    # ``infer_datetime_format`` is deprecated in pandas 2.x (format inference
    # is now the default behaviour), so it is no longer passed.
    df = pd.read_csv(filename, header=None, names=header, dtype=types,
                     parse_dates=['TimeFrame'])
    return df

In [4]:
def get_day(ts):
    """Return the ``day`` attribute of ``ts`` (e.g. of a datetime/Timestamp)."""
    day_of_month = ts.day
    return day_of_month

def get_hour(ts):
    """Return the ``hour`` attribute of ``ts`` (e.g. of a datetime/Timestamp)."""
    hour_of_day = ts.hour
    return hour_of_day

def get_minute(ts):
    """Return the ``minute`` attribute of ``ts`` (e.g. of a datetime/Timestamp)."""
    minute_of_hour = ts.minute
    return minute_of_hour

def prepare_data_frame(df):
    """Fill missing IDs and derive date/time columns from ``Timestamp``.

    Args:
        df: DataFrame with at least the columns ``LineID``, ``StopID`` and
            ``Timestamp``. ``Timestamp`` is interpreted as microseconds since
            the Unix epoch.

    Returns:
        A new DataFrame (the input is not modified) in which

        * missing ``LineID`` / ``StopID`` values are replaced by 0 and both
          columns are cast to ``int32``;
        * ``DateTime`` holds ``Timestamp`` converted to a datetime;
        * ``Day``, ``Hour`` and ``Minute`` hold the matching integer
          components of ``DateTime``.
    """
    null_replacements = {'LineID': 0, 'StopID': 0}
    # fillna returns a new frame, so the assignments below never touch the
    # caller's object.
    df = df.fillna(value=null_replacements)
    df['LineID'] = df['LineID'].astype(np.int32)
    df['StopID'] = df['StopID'].astype(np.int32)
    df['DateTime'] = pd.to_datetime(df['Timestamp'], unit='us')
    # Use the vectorised .dt accessor instead of calling a Python function per
    # row; the values are identical but it is far cheaper on large frames.
    date_parts = df['DateTime'].dt
    df['Day'] = date_parts.day
    df['Hour'] = date_parts.hour
    df['Minute'] = date_parts.minute
    return df

In [5]:
def read_zip_file(filename, extract_dir='data'):
    """Extract every file in a zip archive, parse each one and combine them.

    Each member is extracted into ``extract_dir``, read with
    ``read_data_frame`` and the per-file frames are concatenated in archive
    order (row labels restart in every file, as no index reset is applied).
    The combined frame is then passed through ``prepare_data_frame``.

    Args:
        filename: path of the zip archive.
        extract_dir: directory the members are extracted into. Defaults to
            ``'data'``.

    Returns:
        The prepared pandas.DataFrame covering all files in the archive.

    Raises:
        ValueError: if the archive contains no files.
    """
    frames = []
    with zipfile.ZipFile(filename) as z:
        for member in z.infolist():
            # Directory entries carry no data to parse.
            if member.is_dir():
                continue
            print(member.filename)
            # extract() returns the path it actually wrote to.
            extracted_path = z.extract(member, path=extract_dir)
            frames.append(read_data_frame(extracted_path))
    if not frames:
        raise ValueError("no files found in archive: {}".format(filename))
    # A single concat replaces repeated DataFrame.append calls, which were
    # removed in pandas 2.0 and re-copied all accumulated rows on every
    # iteration.
    final_df = pd.concat(frames)
    return prepare_data_frame(final_df)

Create the data directory if it does not yet exist.

In [6]:
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the gap between a separate existence check and the creation.
os.makedirs("data", exist_ok=True)

In [7]:
# Local path of the zip archive; this is the same location that
# download_zip() writes to.
file_name = "data/sir010113-310113.zip"

Conditionally download and process the data file.

In [8]:
# Skip the download when the archive is already on disk.
if not os.path.exists(file_name):
    download_zip()

In [9]:
# Parse every file in the archive into one prepared DataFrame; the name of
# each file is printed as it is read.
# NOTE(review): the Parquet copy written in a later cell is never read back
# here, so every run re-parses the whole archive.
df = read_zip_file(file_name)

siri.20130101.csv.gz
siri.20130102.csv.gz
siri.20130103.csv.gz
siri.20130104.csv.gz
siri.20130105.csv.gz
siri.20130106.csv.gz
siri.20130107.csv.gz
siri.20130108.csv.gz
siri.20130109.csv.gz
siri.20130110.csv.gz
siri.20130111.csv.gz
siri.20130112.csv.gz
siri.20130113.csv.gz
siri.20130114.csv.gz
siri.20130115.csv.gz
siri.20130116.csv.gz
siri.20130117.csv.gz
siri.20130118.csv.gz
siri.20130119.csv.gz
siri.20130120.csv.gz
siri.20130121.csv.gz
siri.20130122.csv.gz
siri.20130123.csv.gz
siri.20130124.csv.gz
siri.20130125.csv.gz
siri.20130126.csv.gz
siri.20130127.csv.gz
siri.20130128.csv.gz
siri.20130129.csv.gz
siri.20130130.csv.gz
siri.20130131.csv.gz


In [10]:
# Write the prepared frame to Parquet (without the index) only when no such
# file exists yet; an existing file is left untouched.
parquet_path = "data/sir010113-310113.parquet"
if not os.path.exists(parquet_path):
    df.to_parquet(parquet_path, index=False)

In [11]:
# Show the first rows as a quick check of the prepared columns.
df.head()

Unnamed: 0,Timestamp,LineID,Direction,PatternID,TimeFrame,JourneyID,Operator,Congestion,Lon,Lat,Delay,BlockID,VehicleID,StopID,AtStop,DateTime,Day,Hour,Minute
0,1356998403000000,747,0,7470001.0,2012-12-31,3493,SL,0,-6.236852,53.425327,59,747006,40040,7411,0,2013-01-01 00:00:03,1,0,0
1,1356998405000000,27,0,,2012-12-31,3883,RD,0,-6.233417,53.342232,0,27017,33521,395,0,2013-01-01 00:00:05,1,0,0
2,1356998407000000,40,0,,2012-12-31,2226,HN,0,-6.27825,53.416683,0,40206,33142,6071,0,2013-01-01 00:00:07,1,0,0
3,1356998407000000,7,0,71003.0,2012-12-31,6106,D1,0,-6.231633,53.317768,0,7019,43004,3222,1,2013-01-01 00:00:07,1,0,0
4,1356998411000000,747,0,7471001.0,2012-12-31,3531,SL,0,-6.254617,53.355484,58,747007,40039,1445,0,2013-01-01 00:00:11,1,0,0
