In [3]:
%matplotlib inline
import pandas

In [1]:
# Tools for downloading dataset
def trips_basename(year, month):
    import calendar
    firstday, lastday = (1, calendar.monthrange(year, month)[1])
    return "trips-{year}.{month}.{firstday}-{year}.{month}.{lastday}".format(**locals())
def trips_url(year, month):
    base = trips_basename(year, month)
    extension = '.csv.zip'
    server_dir = 'http://oslo-citybike.s3.amazonaws.com/exports/'
    return server_dir + base + extension

def download_trip(year, month):
    import urllib.request
    from io import BytesIO
    from zipfile import ZipFile
    import os.path
    
    url = trips_url(year, month)
    filename = trips_basename(year, month) + '.csv'
    outpath = "data/"+filename
    if os.path.exists(outpath):
        print('skipping existing %s' % (url))
        return outpath
    
    print('downloading %s' % (url,))
    
    # Download ZIP to memory
    # ZipFile requires seek() which urlib does not implement
    temp = BytesIO()
    temp.write(urllib.request.urlopen(url).read())
    zipfile = ZipFile(temp)

    # Write to disk
    csvfile = open(outpath, 'wb+')
    csvfile.write(zipfile.read(filename))

    csvfile.close()
    zipfile.close()
    return outpath

def months_between(start, end):
    periods = []
    current = list(start)
    while (current != list(end)):
        periods.append(tuple(current))

        # calculate next
        if current[1] == 12:
            # end of year
            current[0] += 1
            current[1] = 1
        else:
           # just new month
           current[1] += 1
    return periods

start = (2016, 6)
end = (2017, 8)
notexisting = [
    (2017, 1), (2017, 2), (2017, 3)
]
periods = sorted(set(months_between(start, end)).difference(notexisting))
for period in periods:
    try:
        filename = download_trip(*period)
    except Exception as e:
        raise RuntimeError("Could not download %d-%d: %s" % (*period, e.msg))
"done"

skipping existing http://oslo-citybike.s3.amazonaws.com/exports/trips-2016.6.1-2016.6.30.csv.zip
skipping existing http://oslo-citybike.s3.amazonaws.com/exports/trips-2016.7.1-2016.7.31.csv.zip
skipping existing http://oslo-citybike.s3.amazonaws.com/exports/trips-2016.8.1-2016.8.31.csv.zip
skipping existing http://oslo-citybike.s3.amazonaws.com/exports/trips-2016.9.1-2016.9.30.csv.zip
skipping existing http://oslo-citybike.s3.amazonaws.com/exports/trips-2016.10.1-2016.10.31.csv.zip
skipping existing http://oslo-citybike.s3.amazonaws.com/exports/trips-2016.11.1-2016.11.30.csv.zip
skipping existing http://oslo-citybike.s3.amazonaws.com/exports/trips-2016.12.1-2016.12.31.csv.zip
skipping existing http://oslo-citybike.s3.amazonaws.com/exports/trips-2017.4.1-2017.4.30.csv.zip
skipping existing http://oslo-citybike.s3.amazonaws.com/exports/trips-2017.5.1-2017.5.31.csv.zip
skipping existing http://oslo-citybike.s3.amazonaws.com/exports/trips-2017.6.1-2017.6.30.csv.zip
skipping existing http:/

'done'

In [2]:
import json
stations = json.loads(open('data/oslo_stations.json', 'r').read())
stations['stations'][:3]

[{'bounds': [{'latitude': 59.915418602160436, 'longitude': 10.762068629264832},
   {'latitude': 59.91565254992276, 'longitude': 10.762672126293182},
   {'latitude': 59.915807169665264, 'longitude': 10.762433409690855},
   {'latitude': 59.91557994562126, 'longitude': 10.761821866035461},
   {'latitude': 59.915418602160436, 'longitude': 10.762068629264832}],
  'center': {'latitude': 59.91562, 'longitude': 10.762248},
  'id': 157,
  'in_service': True,
  'number_of_locks': 30,
  'subtitle': 'mellom Norbygata og Urtegata',
  'title': 'Nylandsveien'},
 {'bounds': [{'latitude': 59.938998693156904, 'longitude': 10.758989453315735},
   {'latitude': 59.939057810485, 'longitude': 10.759515166282652},
   {'latitude': 59.93939638951557, 'longitude': 10.759338140487671},
   {'latitude': 59.93932383715719, 'longitude': 10.758823156356812},
   {'latitude': 59.938998693156904, 'longitude': 10.758989453315735}],
  'center': {'latitude': 59.939192, 'longitude': 10.759168},
  'id': 158,
  'in_service': T

In [4]:
# NOTE: Uses about 700MB of RAM
trips = pandas.DataFrame()
for period in periods:
    filename = "data/"+ trips_basename(*period)+'.csv'
    frame = pandas.read_csv(filename, index_col=None, header=0)
    trips = pandas.concat((trips, frame))
trips[:3]

Unnamed: 0,Start station,Start time,End station,End time
0,226.0,2016-06-01 05:59:59 +0200,243.0,2016-06-01 06:02:14 +0200
1,206.0,2016-06-01 06:00:02 +0200,212.0,2016-06-01 06:18:46 +0200
2,290.0,2016-06-01 06:00:06 +0200,261.0,2016-06-01 06:02:14 +0200


In [5]:
number_trips = trips.shape[0]
number_trips

3180520

In [7]:
first = trips['Start time'].min()
last = trips['Start time'].max()
first, last

('2016-06-01 05:59:59 +0200', '2017-07-31 23:59:42 +0200')

In [8]:
from datetime import datetime
# 2013-12-21 09:12:00, ref http://strftime.org/
date_format = "%Y-%m-%d %H:%M:%S +0200"
f = datetime.strptime(first, date_format)
l = datetime.strptime(last, date_format)
delta = l - f 
delta.days

425

In [9]:
"Trips per day (average): %d" % (number_trips / delta.days)

'Trips per day (average): 7483'

In [19]:
def calculate_duration(row):
    # FIXME: normalize dataframe to have proper datetime 
    start = datetime.strptime(row['Start time'], date_format) 
    end = datetime.strptime(row['End time'], date_format)
    duration = (end - start).total_seconds()
    return duration
subs = trips[:5].copy() 
subs['Duration'] = subs.apply(calculate_duration, 'columns')
subs

Unnamed: 0,Start station,Start time,End station,End time,Duration
0,226.0,2016-06-01 05:59:59 +0200,243.0,2016-06-01 06:02:14 +0200,135.0
1,206.0,2016-06-01 06:00:02 +0200,212.0,2016-06-01 06:18:46 +0200,1124.0
2,290.0,2016-06-01 06:00:06 +0200,261.0,2016-06-01 06:02:14 +0200,128.0
3,206.0,2016-06-01 06:00:06 +0200,233.0,2016-06-01 06:02:47 +0200,161.0
4,184.0,2016-06-01 06:00:06 +0200,179.0,2016-06-01 06:15:13 +0200,907.0


In [38]:
import geopy.distance

stations_by_id = {} # id -> data
for station in stations['stations']:
    station_id = station['id']
    # sanity checking
    if not isinstance(station_id, int):
        raise ValueError("Station identifier not an integer: %s" % repr(station_id))
    if stations_by_id.get(station_id):
        raise ValueError("Duplicate station id: %d" % (station_id,))
    stations_by_id[station_id] = station
def station_location(station_id):
    point = stations_by_id[station_id]['center']
    return (point['latitude'], point['longitude'])
def calculate_distance(row):
    start = station_location(int(row['Start station']))
    end = station_location(int(row['End station']))
    dist = geopy.distance.great_circle(start, end)
    return dist.meters
 
subs['Distance'] = subs.apply(calculate_distance, 'columns')
subs

Unnamed: 0,Start station,Start time,End station,End time,Duration,Distance,Velocity
0,226.0,2016-06-01 05:59:59 +0200,243.0,2016-06-01 06:02:14 +0200,135.0,610.115142,4.519371
1,206.0,2016-06-01 06:00:02 +0200,212.0,2016-06-01 06:18:46 +0200,1124.0,2917.74692,2.59586
2,290.0,2016-06-01 06:00:06 +0200,261.0,2016-06-01 06:02:14 +0200,128.0,354.690116,2.771017
3,206.0,2016-06-01 06:00:06 +0200,233.0,2016-06-01 06:02:47 +0200,161.0,767.07241,4.764425
4,184.0,2016-06-01 06:00:06 +0200,179.0,2016-06-01 06:15:13 +0200,907.0,2986.506648,3.292731


In [37]:

def calculate_velocity(row):
    velocity = row['Distance'] / row['Duration']
    return velocity
 
subs['Velocity'] = subs.apply(calculate_velocity, 'columns')
subs

Unnamed: 0,Start station,Start time,End station,End time,Duration,Distance,Velocity
0,226.0,2016-06-01 05:59:59 +0200,243.0,2016-06-01 06:02:14 +0200,135.0,610.115142,4.519371
1,206.0,2016-06-01 06:00:02 +0200,212.0,2016-06-01 06:18:46 +0200,1124.0,2917.74692,2.59586
2,290.0,2016-06-01 06:00:06 +0200,261.0,2016-06-01 06:02:14 +0200,128.0,354.690116,2.771017
3,206.0,2016-06-01 06:00:06 +0200,233.0,2016-06-01 06:02:47 +0200,161.0,767.07241,4.764425
4,184.0,2016-06-01 06:00:06 +0200,179.0,2016-06-01 06:15:13 +0200,907.0,2986.506648,3.292731


In [80]:
# Find missing station info
def not_nan(n):
    return not math.isnan(n)
known_stations = set(stations_by_id.keys())
start_stations = set(filter(not_nan, trips['Start station'].unique()))
end_stations =  set(filter(not_nan, trips['End station'].unique()))
trip_stations = start_stations | end_stations
unknown_stations = trip_stations - known_stations
unknown_stations

{171.0, 172.0, 173.0, 186.0, 271.0, 288.0, 307.0, 310.0}

In [16]:
# Trips with missing start/end stations
missing_start = trips[pandas.isnull(trips['Start station'])]
missing_end = trips[pandas.isnull(trips['End station'])]
len(missing_start + missing_end)

219

In [43]:
len(trips['End station'].unique())

155