In [1]:
import pandas as pd
import numpy as np

# Data

In [2]:
data = pd.read_csv('data/anzahl-sbb-bahnhofbenutzer.csv') #contains origin 
data_pop = pd.read_excel('data/data_pop.xlsx') #population for each city
data_red = pd.read_excel('data/data_pop_modif.xlsx') # population and origin. it does not contain zurich and geneve

In [3]:
data_split = data['Bahnhof_Gare_Stazione;Unité;Jahr;Anzahl Bahnhofbenutzer'].str.split(';', expand=True)
data_split.columns = ['city', 'Unité', 'Jahr', 'origin']

## Data Manipulation

In [4]:
data_2024 = data_split[(data_split['Unité']== 'DP/jour ouvré') & (data_split['Jahr']=='2024')]
data_2024 = data_2024.drop(['Unité', 'Jahr'], axis=1)

In [5]:
data_2024.loc[:, 'origin']  = pd.to_numeric(data_2024['origin'], errors='coerce')

In [6]:
to_remove = ['Uster', 'Zürich Stadelhofen']
data_2024 = data_2024[~data_2024['city'].isin(to_remove)]
data_2024 = data_2024.reset_index(drop=True)

We have to manage the different station in the city of Zürich and Geneve. To do that, first we aggregate the origin for this station in only one. 

In [7]:
df = data_2024

cities_to_merge_zh = ["Zürich HB",'Zürich Enge','Zürich Hardbrücke', 'Zürich Altstetten' ]
cities_to_merge_ge = ['Genève', 'Genève-Eaux-Vives']

mask_zh = df["city"].isin(cities_to_merge_zh)
mask_ge = df["city"].isin(cities_to_merge_ge)

merged_origin_zh = df.loc[mask_zh, "origin"].sum()
merged_origin_ge = df.loc[mask_ge, "origin"].sum()


new_row_zh = pd.DataFrame({"city": ["Zürich HB"], "origin": [merged_origin_zh]})
new_row_ge = pd.DataFrame({"city": ["Genève"], "origin": [merged_origin_ge]})


df_filtered = df[~mask_zh]
df_filtered = df_filtered[~mask_ge]
df_filtered_zh = pd.concat([df_filtered, new_row_zh], ignore_index=True)
df_final = pd.concat([df_filtered_zh, new_row_ge], ignore_index=True)
df_final

  df_filtered = df_filtered[~mask_ge]


Unnamed: 0,city,origin
0,Neuchâtel,28900
1,Thun,40400
2,Bellinzona,16100
3,Lugano,34100
4,St. Gallen,77400
5,Winterthur,134700
6,Zug,58900
7,Bern,298900
8,Luzern,145400
9,Baden,55400


# Estimate the destination

As we have only the origin we try to estiamte the destinatio in order to utilise a gravity model and define a origin-destiantion matrix. 

To estimate the destination we consider the population and the origin of each cities and we compute a weight. The weight reppresent the power of the city with respect to the others, the weight are between 0 and 1. If the weigth w --> 1 means that the cities attract more people.

In [8]:
total_origin = df_final['origin'].sum()
population_total = data_pop['habitant'].sum()


menage the station in geneve

In [9]:
geneve_stations = ['Genève', 'Genève-Aéroport']  
df_ge = df_final.loc[df_final['city'].isin(geneve_stations)].copy() 

pop_ge = data_pop.loc[data_pop['City'] == 'Genève'].copy()
total_origin_ge = df_ge['origin'].sum()

df_ge['weight'] = df_ge['origin'] / total_origin_ge
population_weight_ge = (pop_ge['habitant'] + total_origin_ge) / (total_origin + population_total)
df_ge

Unnamed: 0,city,origin,weight
15,Genève-Aéroport,43000,0.18647
20,Genève,187600,0.81353


In [10]:
df_ge['weight_population'] = df_ge['weight']*population_weight_ge[1]
df_ge['destination'] = df_ge['weight_population']*total_origin
df_ge


Unnamed: 0,city,origin,weight,weight_population,destination
15,Genève-Aéroport,43000,0.18647,0.019556,45734.763171
20,Genève,187600,0.81353,0.085321,199531.199324


menage the stations of zurich

In [11]:
zurich_station = ['Zürich Oerlikon', 'Zürich HB']
df_zh = df_final[df_final['city'].isin(zurich_station)].copy()

pop_zh = data_pop[data_pop['City'] == 'Zürich'].copy()
total_origin_zh = df_zh['origin'].sum()

df_zh['weight'] = df_zh['origin']/total_origin_zh
population_weight_zh = (pop_zh['habitant']+total_origin_zh) / (total_origin + population_total)
df_zh

Unnamed: 0,city,origin,weight
11,Zürich Oerlikon,113800,0.171437
19,Zürich HB,550000,0.828563


In [12]:
df_zh['weight_population'] = df_zh['weight']*population_weight_zh[0]
df_zh['destination'] = df_zh['weight_population']*total_origin

df_zh

Unnamed: 0,city,origin,weight,weight_population,destination
11,Zürich Oerlikon,113800,0.171437,0.045174,105643.939168
19,Zürich HB,550000,0.828563,0.218328,510581.428315


Menage the other city

In [13]:
data_red['weight'] = (data_red['population']+data_red['origin'])/(population_total + total_origin)

data_red['destination'] = data_red['weight']*total_origin
data_red.sort_values(by='destination')

Unnamed: 0,city,population,origin,weight,destination
11,Bellinzona,46544,16100,0.015123,35366.082669
8,Fribourg/Freiburg,37653,31500,0.016694,39040.781476
10,Neuchâtel,44597,28900,0.017743,41493.215278
15,Chur,38129,40300,0.018933,44277.608352
0,Thun,43670,40400,0.020295,47462.272045
7,Baden,32566,55400,0.021236,49661.784497
12,Zug,31345,58900,0.021786,50948.408953
6,Lugano,62464,34100,0.023311,54515.842009
14,Olten,30678,69600,0.024208,56612.605163
16,Aarau,22290,78600,0.024356,56958.113793


Concatenate and clean the Origin Destination

In [14]:
data_red = data_red.drop(['population', 'weight'], axis=1)
df_ge = df_ge.drop(['weight', 'weight_population'], axis=1)
df_zh = df_zh.drop(['weight','weight_population'], axis=1)

production = pd.concat([df_ge, df_zh], axis=0, ignore_index=True)  # Ignore original index and create a new one

In [15]:
od_df = pd.concat([production, data_red], axis=0, ignore_index=True)  # Ignore original index and create a new one


Sanity Check

In [16]:
origin = od_df['origin'].sum()
destination = od_df['destination'].sum()
print(origin, destination)

2338600 2338600.0000000005


In [17]:
od_df[['origin', 'destination']] = np.round(od_df[['origin', 'destination']]).astype(int)

Save the dataframe

In [18]:
od_df.to_csv('prod_att.csv', index=False)                # To CSV

In [19]:
prod_att = pd.read_csv('prod_att.csv')
prod_att


Unnamed: 0,city,origin,destination
0,Genève-Aéroport,43000,45734
1,Genève,187600,199531
2,Zürich Oerlikon,113800,105643
3,Zürich HB,550000,510581
4,Thun,40400,47462
5,Basel SBB,140900,177525
6,Lausanne,127900,152045
7,Winterthur,134700,142045
8,St. Gallen,77400,87128
9,Luzern,145400,129418
