# Aggregating the Mobility Data

Reading in the mobility data from the OD-matrix of all Danish national train stations provided by DSB, and aggregating it by municipality.

In [108]:
import pandas as pd
import networkx as nx
import pickle

We have manually scraped data on which municipality each of the 298 train stations is located in, and stored this data in a CSV file. Here I load this data into a dict in the format ``<station: municipality>``.

In [76]:
# Build the <station: municipality> lookup from the semicolon-separated CSV:
# field 0 holds the station name, field 2 the municipality.
station_to_muni = {}

with open('data/station2municipality.csv', 'r', encoding='utf-8') as csv_file:
    next(csv_file, None)  # discard the header row
    for raw_line in csv_file:
        # Non-breaking spaces are removed before the line is split into fields.
        fields = raw_line.replace('\xa0', '').split(';')
        station_name = fields[0].strip()
        municipality = fields[2].strip()
        # Normalise the English spelling to the Danish municipality name.
        station_to_muni[station_name] = (
            'København' if municipality == 'Copenhagen' else municipality
        )

print('#Stations: ', len(station_to_muni))
print('#Municipalities: ', len(set(station_to_muni.values())))

#Stations:  289
#Municipalities:  81


## Loading the OD-matrix into a Pandas Dataframe

Loading the OD-matrix into a Pandas DataFrame and renaming the columns and rows based on the mapping we just made.

In [95]:
# Load the station-level OD matrix: second sheet of the workbook (sheet_name is
# zero-based), with the first column used as the row index.
OD = (
    pd.read_excel('data/OD.xlsx', sheet_name=1, index_col=0)
    .round()
    .fillna(0)      # missing cells become 0 so the frame can be cast to int
    .astype(int)
    .drop(index='Total', columns='Total')  # remove the 'Total' row and column
)

# Relabel every station with its municipality on BOTH axes. The axes are named
# explicitly: passing a positional mapper together with `columns=` raises
# "Cannot specify both 'mapper' and any of 'index' or 'columns'" in pandas >= 1.0.
# Stations that are not keys of station_to_muni keep their original label.
OD = OD.rename(index=station_to_muni, columns=station_to_muni)

OD

Unnamed: 0,Albertslund,Skanderborg,Allerød,Mariagerfjord,Herning,Hvidovre,Gladsaxe,Ballerup,Thisted,Gentofte,...,København,København.1,Aalborg,Aalborg.1,Aalborg.2,København.2,Hvidovre.1,Aarhus,Faaborg-Midtfyn,Assens
Albertslund,0,0,1714,0,0,857,1600,726,0,1671,...,4070,37759,0,0,0,5105,177,484,0,0
Skanderborg,0,0,0,0,0,0,1,1,0,0,...,0,3,15,0,0,0,0,11183,0,0
Allerød,1714,0,0,0,0,1106,36,672,2,2955,...,1676,47645,241,0,4,2039,923,830,0,0
Mariagerfjord,0,0,0,0,2,0,0,0,0,0,...,1,12,19035,819,0,0,0,5241,0,0
Herning,0,0,0,2,0,0,0,0,15,0,...,11,26,38,0,0,0,0,2279,0,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
København,5105,0,2039,0,0,1494,3012,3781,0,1974,...,0,3578,0,0,0,0,736,0,0,0
Hvidovre,177,0,923,0,0,14015,990,291,0,360,...,890,18014,0,0,0,736,0,195,0,0
Aarhus,484,11183,830,5241,2279,186,362,0,597,507,...,1932,6561,213120,10678,2625,0,195,0,609,686
Faaborg-Midtfyn,0,0,0,0,0,0,0,0,0,0,...,9,28,120,2,0,0,0,609,0,16


## Aggregating by municipalities

Here I am grouping the counts by the new mapped column and row names denoting the municipalities.

In [96]:
# After the relabelling, several rows/columns share the same municipality label.
# Sum them together, first along the rows ...
OD = OD.groupby(level=0).sum()

# ... then along the columns. `groupby(..., axis=1)` is deprecated in pandas 2.1,
# so the frame is transposed, grouped on its (former column) labels and
# transposed back, which yields the same result.
OD = OD.T.groupby(level=0).sum().T

OD

Unnamed: 0,Aabenraa,Aalborg,Aarhus,Albertslund,Allerød,Assens,Ballerup,Brøndby,Brønderslev,DE,...,Sønderborg,Thisted,Tårnby,Tønder,Vallensbæk,Varde,Vejen,Vejle,Viborg,Vordingborg
Aabenraa,22710,1934,12247,0,0,178,0,0,229,7689,...,18286,352,2948,0,0,316,1309,3291,276,376
Aalborg,1934,218232,240261,0,245,251,2,0,140151,12556,...,1809,36,7733,145,110,212,562,13470,11571,341
Aarhus,12247,240261,78886,484,830,1342,285,434,8134,55121,...,7873,6855,73444,1272,101,7019,9900,262647,143659,1735
Albertslund,0,0,484,0,1714,0,4584,36561,0,0,...,0,11,14930,8,367,45,0,173,35,889
Allerød,0,245,830,1714,0,0,2076,2057,0,0,...,0,621,3977,13,727,11,0,376,48,384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Varde,316,212,7019,45,11,34,23,2,13,1214,...,179,413,373,890,2,136658,1476,697,315,34
Vejen,1309,562,9900,0,0,214,0,0,43,319,...,492,111,1712,701,0,1476,48802,6758,109,133
Vejle,3291,13470,262647,173,376,1179,109,47,897,29431,...,1501,2142,15048,391,0,697,6758,330420,1434,444
Viborg,276,11571,143659,35,48,41,52,25,50,6761,...,206,763,1850,157,1,315,109,1434,274964,118


## Removing irrelevant data
Removing the data for traffic flow going to and from Sweden (`SE`) and Germany (`DE`).

In [97]:
# Strip the 'SE' and 'DE' labels from the matrix: for each label, first the
# column (axis=1) and then the row (axis=0) is dropped in place.
for foreign_label in ('SE', 'DE'):
    for axis in (1, 0):
        OD.drop(foreign_label, inplace=True, axis=axis)

In [99]:
# Sanity check: (number of rows, number of columns) of the aggregated matrix.
OD.shape

(79, 79)

# Making the network for later
Storing the adjacency matrix directly as a nx graph that I pickle for use later.

In [103]:
# Interpret the municipality-level OD matrix as a weighted adjacency matrix;
# each non-zero cell becomes an edge whose 'weight' is the cell value.
# NOTE(review): the default result is an undirected Graph, which keeps a single
# weight per pair - presumably OD is symmetric; confirm.
G = nx.from_pandas_adjacency(OD)

# nx.info() was removed in NetworkX 3.0, so the same summary is printed by hand.
n_nodes, n_edges = G.number_of_nodes(), G.number_of_edges()
print('Name:', G.name)
print('Type:', type(G).__name__)
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)
if n_nodes:
    # In an undirected graph every edge (self-loops included) adds 2 to the
    # degree sum, hence average degree = 2 * edges / nodes.
    print('Average degree: %8.4f' % (2 * n_edges / n_nodes))

Name: 
Type: Graph
Number of nodes: 79
Number of edges: 2748
Average degree:  69.5696


Checking that the edge weights were loaded in correctly

In [106]:
# Peek at the first five (source, target, attribute-dict) edge triples.
list(G.edges.data())[:5]

[('Aabenraa', 'Aabenraa', {'weight': 22710}),
 ('Aabenraa', 'Aalborg', {'weight': 1934}),
 ('Aabenraa', 'Aarhus', {'weight': 12247}),
 ('Aabenraa', 'Assens', {'weight': 178}),
 ('Aabenraa', 'Brønderslev', {'weight': 229})]

### Importing node attributes
Building a dictionary with the format `<municipality(str) : population_size(int)>` for adding the population size as a node attribute, and inspecting the `pop` dictionary.

In [116]:
# Build <municipality (str): population size (int)> from the semicolon-separated
# CSV: field 1 holds the municipality name, field 4 the population figure.
pop = {}

# The encoding is given explicitly (matching how the station CSV is read) so the
# Danish municipality names do not depend on the platform's default encoding.
# NOTE(review): assumes this file is UTF-8 like the station CSV - confirm.
with open('data/Municipality_test_pos.csv', encoding='utf-8') as f:
    next(f, None)  # skip the header row
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. at the end of the file
        data = line.split(';')
        # Keep only decimal digits so any formatting characters in the figure
        # (presumably thousands separators) cannot break int().
        digits = ''.join(c for c in data[4] if c.isdecimal())
        # Strip the name so it matches the stripped municipality names used as
        # node labels in the network.
        pop[data[1].strip()] = int(digits)

print(pop['København'])
print(len(pop))

632340
99


Removing Christiansø, which has special status, but is not a true municipality. Also pickling the dictionary for later use.

In [117]:
# Drop the 'Christiansø' entry. dict.pop() with a default (instead of `del`)
# keeps this cell re-runnable: a second execution is a no-op, not a KeyError.
pop.pop('Christiansø', None)
print(len(pop))

# Write through a context manager so the file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open('data/population.pickle', 'wb') as f:
    pickle.dump(pop, f)

98


### Checking which municipalities are missing from the OD data

In [119]:
# Municipalities that have a population entry but no node in the network.
population_munis = set(pop)
network_munis = set(G.nodes)
missing = population_munis - network_munis

print('# Missing: ', len(missing), '\n')
print(missing)

# Missing:  19 

{'Nordfyns', 'Dragør', 'Ærø', 'Odder', 'Læsø', 'Vesthimmerlands', 'Gribskov', 'Jammerbugt', 'Morsø', 'Langeland', 'Fanø', 'Billund', 'Stevns', 'Samsø', 'Halsnæs', 'Syddjurs', 'Bornholm', 'Norddjurs', 'Lemvig'}


### Adding missing nodes and all node attributes to the network

In [120]:
# Insert the municipalities that are absent from the OD data as isolated nodes.
for absent_muni in missing:
    G.add_node(absent_muni)

# Attach each municipality's population to its node under 'Population size'.
nx.set_node_attributes(G, values=pop, name='Population size')

# Spot check: show the attributes stored on one node.
G.nodes['Nordfyns']

{'Population size': 29665}

In [122]:
# nx.info() was removed in NetworkX 3.0, so the same summary is printed by hand.
n_nodes, n_edges = G.number_of_nodes(), G.number_of_edges()
print('Name:', G.name)
print('Type:', type(G).__name__)
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)
if n_nodes:
    # In an undirected graph every edge (self-loops included) adds 2 to the
    # degree sum, hence average degree = 2 * edges / nodes.
    print('Average degree: %8.4f' % (2 * n_edges / n_nodes))

Name: 
Type: Graph
Number of nodes: 98
Number of edges: 2748
Average degree:  56.0816


### Pickling the network

In [123]:
# nx.write_gpickle() was removed in NetworkX 3.0. It was a thin wrapper around
# pickle.dump, so the standard library is used directly; the resulting file
# can be read back with pickle.load (only load pickles from trusted sources).
with open("networks/OD.gpickle", 'wb') as f:
    pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)