In [84]:
import folium
import pickle5
import pandas as pd
import networkx as nx
import numpy as np
from utils import linear_scaler

# Graph Analysis

Now that the graph is partially constructed, let's perform some analysis on it.

In [2]:
# Start by loading the pickled data.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by a trusted step of this project.
# Context managers make sure the file handles are closed right after loading.
with open('data/processed/railgraph.pickle', 'rb') as graph_file:
    G = pickle5.load(graph_file)  # graph object analysed with networkx below
with open('data/processed/stop_id_to_name', 'rb') as names_file:
    stop_id_to_name = pickle5.load(names_file)  # lookup: stop id -> stop name
trips_grouped = pd.read_pickle('data/processed/trips_grouped.pickle')
railway_stops = pd.read_pickle('data/processed/railway_stops.pickle')

## Betweenness Centrality

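The betweenness centrality of a node measures how often that node lies on the shortest paths between all other pairs of nodes in the graph. A stop with a high betweenness centrality sits on many of the shortest routes between other stops, so it acts as a bridge or transfer hub of the network; a value of 0 means that no shortest path between two other stops passes through it.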

In [3]:
# Betweenness centrality of every node; edge weights are ignored (weight=None)
# and the scores are normalized -- both are the networkx defaults, spelled out.
btwn_centrality = nx.betweenness_centrality(G, normalized=True, weight=None)

In [4]:
# Rank the stops by descending centrality and keep the ids of the first 20.
# A stable sort with reverse=True leaves tied stops in their original order.
ranked_stops = sorted(btwn_centrality.items(), key=lambda pair: pair[1], reverse=True)
btwn_centrality_20 = dict(ranked_stops[:20]).keys()

print('Top 20 stops with the highest betweenness centrality:')
for stop in btwn_centrality_20:
    stop_label = stop_id_to_name[stop]
    print('{:.<20} {:.4f}'.format(stop_label, btwn_centrality[stop]))

Top 20 stops with the highest betweenness centrality:
Zürich HB........... 0.4424
Bern................ 0.3523
Olten............... 0.2309
Lausanne............ 0.2017
Fribourg/Freiburg... 0.1965
Winterthur.......... 0.1902
Landquart........... 0.1146
St. Gallen.......... 0.1071
Spiez............... 0.1000
Montreux............ 0.0919
Thun................ 0.0917
Chur................ 0.0859
Luzern.............. 0.0763
Biel/Bienne......... 0.0744
Neuchâtel........... 0.0662
Aigle............... 0.0605
Visp................ 0.0582
Arth-Goldau......... 0.0570
Yverdon-les-Bains... 0.0566
Ins................. 0.0528


We can see that the 20 most central nodes according to betweenness centrality are mostly major Swiss agglomerations. However, some smaller towns also have a fairly high centrality, e.g. Visp, Landquart and Ins. These are probably smaller cities through which many routes have to pass because of their geographic location: for example, the stop might be at the intersection of multiple lines.

In [5]:
# Store each node's betweenness score on the graph under the 'centrality' key
nx.set_node_attributes(G, values=btwn_centrality, name='centrality')

In [86]:
# Base map on which the graph's stops are drawn
m = folium.Map(
    location=[46.771413, 8.471689],
    zoom_start=8,
    tiles='CartoDB Positron',
    height='80%',
)

# One circle per stop; the 20 most central stops are larger, green and opaque
for node, attrs in G.nodes(data=True):
    lat, lon = attrs['lat'], attrs['lon']
    is_top = node in btwn_centrality_20
    marker = folium.CircleMarker(
        location=[lat, lon],
        popup='{}, {}'.format(attrs['name'], attrs['centrality']),
        radius=4 if is_top else 1,
        color='green' if is_top else '#3388ff',
        opacity=1 if is_top else 0.3,
        fill=True,
    )
    marker.add_to(m)

# Write a standalone HTML copy, then display the map inline
m.save("network_betweenness.html")
m

This shows the railway stations that are topologically central to the railway network, but we need to take passenger frequency into consideration to define a more relevant measure of importance.

In [109]:
# Read the passenger frequency spreadsheet and preview its first five rows
passenger_data = pd.read_excel(io='././data/peinaussteiger2018/peinaussteiger2018.xlsx')
passenger_data.head(n=5)

Unnamed: 0,Code,Bahnhof_Haltestelle,Kanton,Eigner,DTV_2018,DWV_2018,DNWV_2018,Bemerkungen,Remarques,Note,Remarks
0,AA,Aarau,AG,SBB,37900,44800,22700,Ohne AVA.,Sans AVA.,Senza AVA.,Without AVA.
1,AAT,Aathal,ZH,SBB,740,800,610,,,,
2,ABO,Aarburg-Oftringen,AG,SBB,2500,3000,1300,,,,
3,ACLA,Acla da Fontauna,GR,MGB,90,90,80,,,,
4,AD,Aadorf,TG,SBB,1700,2000,1000,Durchschnittswert 2018 durch Streckensperrung ...,Valeur moyenne 2018 à la baisse en raison de l...,Valore medio 2018 compromesso dallo sbarrament...,Average value for 2018 impacted by line closure.


According to the information accompanying the data on SBB's website, `DTV` is the average daily traffic, `DWV` is the average work-day traffic (a typical Monday through Friday), and `DNWV` is the average non-work-day traffic (weekends and holidays).

Note that we only have frequency data for some of the stations, not for all of them:

In [110]:
# Compare the number of stations with frequency data against all known stops
n_with_frequency = len(passenger_data)
n_total = len(stop_id_to_name)
print(f'We have passenger frequency data for {n_with_frequency} stations, while we have a total of {n_total} stations in the original dataset')

We have passenger frequency data for 906 stations, while we have a total of 1680 stations in the original dataset


In [111]:
# Rows whose non-work-day traffic is only given as the '<50' token
passenger_data.loc[passenger_data['DNWV_2018'].eq('<50')]

Unnamed: 0,Code,Bahnhof_Haltestelle,Kanton,Eigner,DTV_2018,DWV_2018,DNWV_2018,Bemerkungen,Remarques,Note,Remarks
18,ALV,Alvaneu,GR,RhB,<50,<50,<50,,,,
28,AT,Altmatt,SZ,SOB,<50,<50,<50,,,,
49,BELA,Bernina Lagalb,GR,RhB,<50,<50,<50,,,,
54,BESU,Bernina Suot,GR,RhB,<50,<50,<50,,,,
66,BIGG,Biberegg,SZ,SOB,<50,<50,<50,,,,
...,...,...,...,...,...,...,...,...,...,...,...
768,SURA,Surava,GR,RhB,<50,<50,<50,,,,
818,VAU,Vaumarcus,NE,CFF,<50,<50,<50,,,,
819,VD,Vauderens,FR,CFF,<50,<50,<50,,,,
844,WAVU,Waltensburg/Vuorz,GR,RhB,<50,<50,<50,,,,


Note that for many of the smaller stations we only have the token `<50` instead of an actual number. We replace it with the number 50 so that the column can be treated as numeric.

In [118]:
# Swap the '<50' placeholder for the string '50' in each of the traffic columns
for traffic_col in ('DTV_2018', 'DWV_2018', 'DNWV_2018'):
    is_placeholder = passenger_data[traffic_col] == '<50'
    passenger_data.loc[is_placeholder, traffic_col] = '50'

In [119]:
# Attach the three traffic columns to every railway stop by matching on the
# station name; the left join keeps stops that have no passenger data.
frequency_columns = passenger_data[['Bahnhof_Haltestelle', 'DTV_2018', 'DWV_2018', 'DNWV_2018']]
station_data_complete = railway_stops.merge(
    frequency_columns,
    how='left',
    left_on='stop_name',
    right_on='Bahnhof_Haltestelle',
).drop(columns='Bahnhof_Haltestelle')

# The traffic columns may still hold strings (e.g. '50'); make them numeric
for traffic_col in ('DTV_2018', 'DNWV_2018', 'DWV_2018'):
    station_data_complete[traffic_col] = pd.to_numeric(station_data_complete[traffic_col])

# Preview only the stops for which every column is populated
station_data_complete.dropna().head()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,cc,DTV_2018,DWV_2018,DNWV_2018
20,8500010,Basel SBB,47.547413,7.58956,CH,99800.0,111000.0,75000.0
21,8500016,Basel St. Johann,47.570303,7.572528,CH,80.0,90.0,50.0
22,8500020,Muttenz,47.533592,7.647897,CH,5300.0,6700.0,2300.0
23,8500021,Pratteln,47.522668,7.690819,CH,8600.0,10100.0,5100.0
24,8500022,Frenkendorf-Füllinsdorf,47.501468,7.719106,CH,3700.0,4300.0,2400.0


__TODO: Figure out how to handle the missing frequency data (mean of the frequencies in the geographical vicinity?)__

In [120]:
# Candidate fallback for stops without passenger data. It is NOT applied yet:
# how to treat missing frequencies is still an open TODO, so those stops keep NaN.
DEFAULT_FREQ = 50

# Build the stop_id -> average daily traffic lookup once, instead of filtering
# the whole DataFrame for every node. Keeping the first row per stop_id matches
# the previous `.iloc[0]` behaviour if a stop id appears more than once.
dtv_by_stop = (
    station_data_complete
    .drop_duplicates(subset='stop_id', keep='first')
    .set_index('stop_id')['DTV_2018']
)

for node in G.nodes:
    # Raises KeyError if a graph node has no row in station_data_complete.
    G.nodes[node]['passenger_frequency'] = dtv_by_stop.loc[node]

In [121]:
# Print the attribute dicts of the first six nodes as a sanity check
for i, node in zip(range(6), G.nodes):
    print(G.nodes[node])

{'name': 'Riehen', 'lat': 47.5831565652384, 'lon': 7.65201128794639, 'centrality': 0.0, 'passenger_frequency': nan}
{'name': 'Grenzach', 'lat': 47.5508989778822, 'lon': 7.65950323741699, 'centrality': 0.0005427851380909595, 'passenger_frequency': nan}
{'name': 'Herten (Baden)', 'lat': 47.5495044591957, 'lon': 7.73965991023014, 'centrality': 3.54941963439558e-07, 'passenger_frequency': nan}
{'name': 'Laufenburg (Baden) Ost', 'lat': 47.5661634992305, 'lon': 8.07368946552364, 'centrality': 0.0005724425110361318, 'passenger_frequency': nan}
{'name': 'Tiengen (Hochrhein)', 'lat': 47.6354707316418, 'lon': 8.27200154767347, 'centrality': 0.0016332384698953609, 'passenger_frequency': nan}
{'name': 'Erzingen (Baden)', 'lat': 47.6595389753013, 'lon': 8.43012300400619, 'centrality': 0.003181863144803885, 'passenger_frequency': nan}


In [123]:
# Ids of the 20 stops with the highest average daily traffic (NaN rows sort last)
stations_by_traffic = station_data_complete.sort_values(by='DTV_2018', ascending=False)
freq_top_20 = stations_by_traffic['stop_id'].head(20).to_numpy()

In [124]:
# Fresh base map for the passenger-frequency view
m = folium.Map(
    location=[46.771413, 8.471689],
    zoom_start=8,
    tiles='CartoDB Positron',
    height='80%',
)

# One circle per stop; the 20 busiest stops are larger, green and opaque
for node, attrs in G.nodes(data=True):
    lat, lon = attrs['lat'], attrs['lon']
    is_busiest = node in freq_top_20
    marker = folium.CircleMarker(
        location=(lat, lon),
        popup='{}, {}'.format(attrs['name'], attrs['passenger_frequency']),
        radius=4 if is_busiest else 1,
        color='green' if is_busiest else '#3388ff',
        opacity=1 if is_busiest else 0.3,
        fill=True,
    )
    marker.add_to(m)

# Write a standalone HTML copy, then display the map inline
m.save("network_passenger_freq.html")
m


count       870.000000
mean       3918.241379
std       18216.204935
min          50.000000
25%         240.000000
50%         650.000000
75%        1900.000000
max      423600.000000
Name: DTV_2018, dtype: float64