# A visualisation of distances traveled in Switzerland
In this notebook, we'll create a plot showing the distribution of distances traveled during the year 2017, computed from the journeys of all devices connected to the Swisscom network.
These distributions are available for every day of 2017 on the Swisscom Open Data portal `opendata.swisscom.com`.

In [None]:
import csv
import os
from collections import defaultdict
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import wget
# Larger default font so that labels, ticks and the legend stay readable.
matplotlib.rcParams['font.size'] = 18

Set this boolean to choose which distances to plot: `True` for the distances actually traveled (along the transport graph), `False` for the as-the-crow-flies distances.

In [None]:
# True  -> read the "(effective)" journey-count columns of the CSV
# False -> read the "(direct)" journey-count columns of the CSV
effective_distance = True

In [None]:
# download dataset (skipped when a copy from a previous run is already on disk)
url = 'https://opendata.swisscom.com/explore/dataset/travel-distances-per-day-in-2017-en/download/' + \
      '?format=csv&timezone=Europe/Berlin&use_labels_for_header=true'

# Fixed local file name: without `out`, re-running this cell downloads the
# data again and wget.download stores it under a new " (1)", " (2)", ...
# suffixed name each time.
dataset_path = 'travel-distances-per-day-in-2017-en.csv'

if os.path.isfile(dataset_path):
    # reuse the cached copy so that "Restart & Run All" stays cheap and offline-friendly
    fname = dataset_path
else:
    fname = wget.download(url, out=dataset_path)

## Parse the CSV file

In [None]:
# Column headers of the CSV file that are read below.
date_key = 'Travel date'
bucket_key = 'Distance group in km'

# Select the journey-count columns matching the chosen distance definition.
if effective_distance:
    rail_count_key = 'Number of journeys by train (effective)'
    road_count_key = 'Number of journeys by road (effective)'
else:
    rail_count_key = 'Number of journeys by train (direct)'
    road_count_key = 'Number of journeys by road (direct)'

# date -> list of (bucket midpoint, journey count) pairs, one dict per mode
date_to_road_hist = defaultdict(list)
date_to_rail_hist = defaultdict(list)

# every date that appears in the file, whether or not any count was kept for it
dates = set()

# newline='' is what the csv module expects from the file object it reads.
# utf-8-sig decodes UTF-8 independently of the platform default encoding and
# drops a leading byte-order mark, if any, so the first header name stays intact.
with open(fname, 'r', newline='', encoding='utf-8-sig') as f:
    reader = csv.DictReader(f, delimiter=';')
    for row in reader:
        date_str, dist_bucket = row[date_key], row[bucket_key]
        count_rail, count_road = row[rail_count_key], row[road_count_key]

        # bucket labels are "<low>-<high>"; use the middle of the bucket as value
        bounds = dist_bucket.split('-')
        mid_bucket = (int(bounds[0]) + int(bounds[1])) / 2.
        date = datetime.strptime(date_str, '%Y-%m-%d').date()
        dates.add(date)

        # counts containing '<' are not plain integers and are left out
        # (presumably values reported only as an upper bound -- TODO confirm)
        if '<' not in count_road:
            date_to_road_hist[date].append((mid_bucket, int(count_road)))
        if '<' not in count_rail:
            date_to_rail_hist[date].append((mid_bucket, int(count_rail)))

## Plot
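Here, for each distance bucket, we'll compute the average and standard deviation of the counts over all dates. This will give us the "average" daily distance distribution. The shaded band around each curve spans two standard deviations on either side of the mean, with its lower edge clipped at 10 so that it stays positive on the logarithmic axis.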

In [None]:
# bucket midpoint -> all daily counts observed for that bucket, one dict per mode
bucket_to_counts_rail = defaultdict(list)
bucket_to_counts_road = defaultdict(list)

# Regroup the per-date histograms by bucket: for every date, first the rail
# entries, then the road entries.
for date in dates:
    for hist_by_date, counts_by_bucket in (
        (date_to_rail_hist, bucket_to_counts_rail),
        (date_to_road_hist, bucket_to_counts_road),
    ):
        for midpoint, n_journeys in hist_by_date[date]:
            counts_by_bucket[midpoint].append(n_journeys)

In [None]:
# (bucket midpoint, [daily counts]) pairs, ordered by increasing midpoint
bucket_counts_rail = sorted(bucket_to_counts_rail.items())
bucket_counts_road = sorted(bucket_to_counts_road.items())

# x values: the bucket midpoints
buckets_rail = [midpoint for midpoint, _counts in bucket_counts_rail]
buckets_road = [midpoint for midpoint, _counts in bucket_counts_road]

# y values: mean and standard deviation of the daily counts of each bucket
means_rail = np.array([np.mean(counts) for _midpoint, counts in bucket_counts_rail])
means_road = np.array([np.mean(counts) for _midpoint, counts in bucket_counts_road])
stdev_rail = np.array([np.std(counts) for _midpoint, counts in bucket_counts_rail])
stdev_road = np.array([np.std(counts) for _midpoint, counts in bucket_counts_road])

plt.figure(figsize=(10, 7))

# road: mean curve on a logarithmic y-axis with a +/- 2 standard deviation band
plt.semilogy(buckets_road, means_road, lw=6, label='road', alpha=0.8)
plt.fill_between(
    buckets_road,
    means_road + 2 * stdev_road,
    np.maximum(means_road - 2 * stdev_road, 10),  # lower edge is never below 10
    facecolor='blue',
    alpha=0.3,
)

# rail: same construction, drawn on top of the road curve
plt.semilogy(buckets_rail, means_rail, lw=6, label='rail', alpha=0.8)
plt.fill_between(
    buckets_rail,
    means_rail + 2 * stdev_rail,
    np.maximum(means_rail - 2 * stdev_rail, 10),  # lower edge is never below 10
    facecolor='orange',
    alpha=0.6,
)

plt.xlabel('distance traveled [km]')
plt.ylabel('count')
plt.title('Average daily distance distribution')
plt.legend()
plt.show()