In [1]:
import folium
import pickle
import pandas as pd
import networkx as nx
import reverse_geocoder as rg

## Loading the Data

We first load the files that interest us in DataFrames:
- The file `routes` contains route information, and especially the description of the route (i.e. whether it is for trams, buses, metro, trains, etc.)
- The file `trips` contains trip information, and especially the route each trip uses
- The file `stop_times` contains all stop information for each trip, especially the stop id
- The file `stops` contains all geographical information for each stop

In [2]:
# Prefix prepended to the raw input file names when loading them
DATA_PATH = 'data/raw/'
# Earth radius, in kilometers
EARTH_RADIUS = 6373.0

In [3]:
# Load the route table and preview its first row
routes_path = '{}routes.txt'.format(DATA_PATH)
routes = pd.read_csv(routes_path, sep=',')
routes.head(1)

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type
0,91-10-A-j21-1,37,10,,Tram,900


In [4]:
# Load the trip table (each trip references its route via route_id) and preview its first row
trips_path = '{}trips.txt'.format(DATA_PATH)
trips = pd.read_csv(trips_path, sep=',')
trips.head(1)

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id
0,91-10-A-j21-1,TA+ej,1.TA.91-10-A-j21-1.1.H,"Ettingen, Bahnhof",10100,0


In [5]:
# Load the stop times; stop_id is forced to a string dtype because ids
# can carry a ':'-separated suffix (e.g. 8503088:0:21)
stop_times_path = '{}stop_times.txt'.format(DATA_PATH)
stop_times = pd.read_csv(stop_times_path, sep=',', dtype={'stop_id': 'string'})
stop_times.head(1)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type
0,120.TA.91-4-H-j21-1.9.R,13:58:00,13:58:00,8503088:0:21,1,0,0


In [6]:
# Load the stop table (name and coordinates of every stop).
# stop_id is read explicitly as a string, like in stop_times: the column mixes
# purely numeric ids (1100006) with suffixed ones (8501120:0:1), and leaving the
# dtype to inference can produce a mix of ints and strings that would neither
# match the string ids of stop_times nor support the later `.split(':')`.
stops = pd.read_csv('{}stops.txt'.format(DATA_PATH), delimiter=',', dtype={'stop_id': 'string'})
stops.head(1)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station
0,1100006,"Zell (Wiesental), Bahnhof",47.704632,7.847772,,


## Cleaning the Data

Now that the data is loaded, we're interested in keeping the stops that correspond to modes of transport that interest us, that is, railway transportation. In order to achieve this:
- We filter out the routes that do not interest us
- Subsequently, we filter out the trips using routes that do not interest us
- We remove stops of trips that do not interest us

In [7]:
# Reverse-geocode the (lat, lon) of every stop and store the resulting
# country code, so that only the stops in CH can be kept later on
coordinates = list(zip(stops.stop_lat, stops.stop_lon))
countries = [match['cc'] for match in rg.search(coordinates)]
stops['cc'] = countries

Loading formatted geocoded file...


In [8]:
# Keep trains only.
# NOTE(review): the codes are presumably GTFS extended route types — confirm against the feed documentation.
grandes_lignes = [101, 102, 103, 105]
regional_routes = [106, 107, 109]

# Every route type that is kept: the two groups above plus type 117
modes_of_interest = grandes_lignes + regional_routes + [117]

In [9]:
# Narrow the tables in cascade: routes by type, then trips by the
# remaining routes, then stop times by the remaining trips
is_mode_of_interest = routes.route_type.isin(modes_of_interest)
routes = routes[is_mode_of_interest]

kept_route_ids = routes.route_id.unique()
trips = trips[trips.route_id.isin(kept_route_ids)].copy()

kept_trip_ids = trips.trip_id.unique()
stop_times = stop_times[stop_times.trip_id.isin(kept_trip_ids)].copy()

In [13]:
# A stop is kept when it appears in the filtered stop times AND lies in CH
served_stop_ids = stop_times.stop_id.unique()
is_served = stops.stop_id.isin(served_stop_ids)
is_swiss = stops.cc == 'CH'
railway_stops = stops[is_served & is_swiss].copy()

In [14]:
# Preview the first entries named 'Lausanne' (one station, several stop ids)
railway_stops.loc[railway_stops.stop_name == 'Lausanne'].head()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station,cc
5799,8501120,Lausanne,46.516793,6.629091,,8501120P,CH
5800,8501120:0:1,Lausanne,46.516793,6.629091,,8501120P,CH
5801,8501120:0:3,Lausanne,46.516793,6.629091,,8501120P,CH
5802,8501120:0:4,Lausanne,46.516793,6.629091,,8501120P,CH
5803,8501120:0:5,Lausanne,46.516793,6.629091,,8501120P,CH


Notice that some large main stations have multiple stop identifiers: all of these ids share the same prefix. We therefore edit all ids to just include the prefix and then drop duplicates.

In [14]:
# Remove the suffix of the ids of the same stations, then clean up.
# Written as a single chain that rebinds `railway_stops` so the cell can be
# re-run safely: `errors='ignore'` avoids a KeyError when the two columns were
# already dropped by a previous run, and `str()` guards against non-string ids
# (same handling as for the stop_times ids).
railway_stops = (
    railway_stops
    # Keep only the part of the id before the first ':' (e.g. 8501120:0:1 -> 8501120)
    .assign(stop_id=railway_stops['stop_id'].apply(lambda id_: str(id_).split(':')[0]))
    # One row per station: the first occurrence of each shortened id is kept
    .drop_duplicates(subset=['stop_id'])
    .drop(columns=['location_type', 'parent_station'], errors='ignore')
)

In [15]:
# After de-duplication a single 'Lausanne' row should remain
railway_stops.loc[railway_stops.stop_name == 'Lausanne']

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,cc
5799,8501120,Lausanne,46.516793,6.629091,CH


In [16]:
# Report how many stops remain after restricting to Swiss railway stops
n_all_stops = len(stops)
n_railway_stops = len(railway_stops)
print('Initially, there are {} stops when considering all modes of transport.'.format(n_all_stops))
print('By considering only railway transport, we are left with {} stops in Switzerland.'.format(n_railway_stops))

Initially, there are 36448 stops when considering all modes of transport.
By considering only railway transport, we are left with 1083 stops in Switzerland.


In [17]:
# Lookup table from stop id to stop name, kept for ease of retrieval
stop_id_to_name = dict(zip(railway_stops['stop_id'], railway_stops['stop_name']))

In [18]:
# Keep only the part of each id before the first ':' so that the ids
# line up with the shortened ids of railway_stops
stop_times['stop_id'] = stop_times['stop_id'].apply(lambda raw_id: str(raw_id).partition(':')[0])
# Discard the stop times whose stop was removed
is_kept_stop = stop_times.stop_id.isin(railway_stops.stop_id)
stop_times = stop_times[is_kept_stop]

Now that we have all the stops corresponding to railway stops in Switzerland (i.e. the nodes), we can proceed to create the edges of the desired graph:
- First, group all stop times by the trip id and form a list of stops per trip. Keep only trips that have more than 1 stop. Trips having 1 stop exist because we removed stops outside of Switzerland.
- Second, add the route type by merging, to be able to distinguish between trips on local or national routes. 

In [20]:
# Order the stops of each trip by stop_sequence before collecting them:
# the edges are built from consecutive list entries, so the lists must follow
# the travel order rather than the row order of the input file.
ordered_stop_times = stop_times.sort_values(['trip_id', 'stop_sequence'])
# One row per trip, holding the ordered list of its stop ids
trips_grouped = ordered_stop_times.groupby('trip_id')['stop_id'].aggregate(list).reset_index()
# Keep only trips with more than one stop (a single stop cannot form an edge)
trips_grouped = trips_grouped[trips_grouped.stop_id.str.len() > 1]
# Add route type (trip -> route -> route_type), with explicit join keys
trips_grouped = (
    trips_grouped
    .merge(trips[['trip_id', 'route_id']], on='trip_id', how='inner')
    .merge(routes[['route_id', 'route_type']], on='route_id', how='inner')
)
# Lists are not hashable, so duplicates are detected on the string form of
# the stop lists; only the first trip of each distinct stop list is kept
trips_grouped = trips_grouped.loc[trips_grouped.astype(str).drop_duplicates('stop_id').index]

In [21]:
def make_stop_pairs(stops):
    '''
        Pairs each stop id of the given list with the stop id that
        follows it, and returns these (stop, next stop) tuples in order.
        A list with fewer than two stops yields an empty list.
    '''
    # Zipping the list with itself shifted by one gives consecutive pairs
    return list(zip(stops, stops[1:]))

In [22]:
# Edges: every pair of consecutive stops of every trip, flattened into
# a single Series of (stop, next stop) tuples
edges = trips_grouped['stop_id'].apply(make_stop_pairs).explode()

# Graph is undirected and unweighted for now: one node per railway stop
# (carrying its name and coordinates), then the edges computed above
G = nx.Graph()
G.add_nodes_from(
    (stop.stop_id, {'name': stop.stop_name, 'lat': stop.stop_lat, 'lon': stop.stop_lon})
    for _, stop in railway_stops.iterrows()
)
G.add_edges_from(edges)

In [23]:
# Persist the graph and the id -> name lookup. The files are opened in
# `with` blocks so that the handles are flushed and closed deterministically
# (the bare `open(...)` calls were never closed).
with open('data/processed/railgraph.pickle', 'wb') as graph_file:
    pickle.dump(G, graph_file)
with open('data/processed/stop_id_to_name', 'wb') as names_file:
    pickle.dump(stop_id_to_name, names_file)
# Persist the intermediate tables next to the graph
trips_grouped.to_pickle('data/processed/trips_grouped.pickle')
railway_stops.to_pickle('data/processed/railway_stops.pickle')

In [24]:
# Create map to visualize graph
m = folium.Map(location=[46.771413, 8.471689], zoom_start=8, tiles='CartoDB Positron', height='80%')

# Draw edges: one polyline per trip, through the coordinates of its stops.
# The per-trip list is deliberately NOT named `stops`, so that the `stops`
# DataFrame loaded earlier in the notebook is not overwritten by this cell.
for _, row in trips_grouped.iterrows():
    trip_stop_ids = row.stop_id
    points = [(G.nodes[stop_id]['lat'], G.nodes[stop_id]['lon']) for stop_id in trip_stop_ids]
    if row.route_type in grandes_lignes:
        # Route types listed in grandes_lignes: red, dashed, very transparent
        folium.PolyLine(points, color='#d7191c', opacity=0.1, dash_array='20', weight=2).add_to(m)
    else:
        # All other kept route types: orange, thinner, more opaque
        folium.PolyLine(points, color='#fdae61', opacity=0.5, weight=1).add_to(m)

# Draw nodes: a small circle per stop, with the stop name as popup
for node, attrs in G.nodes(data=True):
    folium.CircleMarker(
        location=[attrs['lat'], attrs['lon']],
        popup=attrs['name'],
        radius=1
    ).add_to(m)

# Export a standalone HTML copy, then display the map as the cell output
m.save("network.html")
m