## Preprocessing

### Imports

In [1]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from IPython.core.display_functions import display
from biogeme.expressions import Beta, Variable
from biogeme.models import loglogit, boxcox
from biogeme.segmentation import DiscreteSegmentationTuple, segmented_beta

### Load Data and Filter

Remove all those who shouldn't have access to cars but still chose it

In [None]:
data = pd.read_csv("lpmc01.dat", sep = '\t')
#
# Step 1: Identify whether each household has a driving license
data['household_has_license'] = data.groupby('household_id')['driving_license'].transform(lambda x: x.max())

# Step 2: Define a filter for the availability conditions not being met
car_availability_unmet = (data['car_ownership'] == 0) & (data['driving_license'] == 0) & (data['household_has_license'] == 0)

# Step 3: Identify rows where travel_mode is 4 (car) but the availability conditions are not met
car_without_availability = data[(data['travel_mode'] == 4) & car_availability_unmet]

# Step 4: Remove those rows from the original DataFrame
data_filtered = data[~data.index.isin(car_without_availability.index)]

database = db.Database('london', data_filtered)

## Variable Definition

In [3]:
trip_id = Variable('trip_id')
household_id = Variable('household_id')
person_n = Variable('person_n')
trip_n = Variable('trip_n')
travel_mode = Variable('travel_mode')
purpose = Variable('purpose')
fueltype = Variable('fueltype')
faretype = Variable('faretype')
bus_scale = Variable('bus_scale')
survey_year = Variable('survey_year')
travel_year = Variable('travel_year')
travel_month = Variable('travel_month')
travel_date = Variable('travel_date')
day_of_week = Variable('day_of_week')
start_time = Variable('start_time')
age = Variable('age')
female = Variable('female')
driving_license = Variable('driving_license')
car_ownership = Variable('car_ownership')
distance = Variable('distance')
dur_walking = Variable('dur_walking')
dur_cycling = Variable('dur_cycling')
dur_pt_access = Variable('dur_pt_access') # Predicted total access and egress time for public transport route in hours
dur_pt_rail = Variable('dur_pt_rail')
dur_pt_bus = Variable('dur_pt_bus')
dur_pt_int = Variable('dur_pt_int') # Time taken (hrs) at each interchange point
pt_interchanges = Variable('pt_interchanges')   # Number of interchange points in public transport route
dur_driving = Variable('dur_driving')
cost_transit = Variable('cost_transit')
cost_driving_fuel = Variable('cost_driving_fuel')   # Estimated fuel cost of driving route in GBP
cost_driving_ccharge = Variable('cost_driving_ccharge')  # Estimated congestion charge cost of driving route in GBP
driving_traffic_percent = Variable('driving_traffic_percent')

household_has_license = Variable('household_has_license')


In [4]:
# Define driving cost
cost_driving = cost_driving_ccharge + cost_driving_fuel

# Define time taken by each mode of transport
time_pt = dur_pt_access + dur_pt_rail + dur_pt_bus + dur_pt_int 

time_cycling = dur_cycling
time_walking = dur_walking
time_driving = dur_driving

## Model 0

Only generic ASCs and parameters

In [5]:
# Define ASCs 
asc_car = Beta(name='asc_car', value=0, lowerbound=None, upperbound=None, status=0)
asc_walk = Beta(name='asc_walk', value=0, lowerbound=None, upperbound=None, status=0)
asc_pt = Beta(name='asc_pt', value=0, lowerbound=None, upperbound=None, status=0)

# Define generic parameters

beta_cost = Beta(name='beta_cost', value=0, lowerbound=None, upperbound=None, status=0)
beta_time = Beta(name='beta_time', value=0, lowerbound=None, upperbound=None, status=0)


In [6]:
# Define utility functions

v_car = asc_car + beta_cost * cost_driving + beta_time * time_driving

v_walk = asc_walk + beta_time * time_walking

v_pt = asc_pt + beta_cost * cost_transit + beta_time * time_pt

v_cycling = beta_time * time_cycling

In [7]:
# Availability

V = {1: v_walk, 2: v_cycling, 3: v_pt, 4: v_car}

CAR_AV = (car_ownership != 0) | (driving_license != 0) | (household_has_license != 0)
WALK_AV = (car_ownership >= 0)
PT_AV = (car_ownership >= 0)
CYCLING_AV = (car_ownership >= 0)

av = {1: WALK_AV, 2: CYCLING_AV , 3: PT_AV, 4: CAR_AV}

In [8]:
logprob_m0 = loglogit(V, av, travel_mode)

model_0 = bio.BIOGEME(database, logprob_m0)
model_0.modelName = 'model_0'

null_log_likelihood_m0 = model_0.calculate_null_loglikelihood(av)

results_m0 = model_0.estimate()

print(results_m0.print_general_statistics())

Number of estimated parameters:	5
Sample size:	4941
Excluded observations:	0
Null log likelihood:	-6610.329
Init log likelihood:	-4007.051
Final log likelihood:	-4007.051
Likelihood ratio test for the null model:	5206.557
Rho-square for the null model:	0.394
Rho-square-bar for the null model:	0.393
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00125
Akaike Information Criterion:	8024.101
Bayesian Information Criterion:	8056.628
Final gradient norm:	2.4196E-03
Nbr of threads:	16



In [9]:
pd_results_m0 = results_m0.get_estimated_parameters()
display(pd_results_m0)


Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_car,2.904088,0.091323,31.800297,0.0
asc_pt,3.355414,0.094523,35.498546,0.0
asc_walk,3.945274,0.112873,34.953192,0.0
beta_cost,-0.206632,0.015116,-13.669511,0.0
beta_time,-5.974888,0.227355,-26.280001,0.0


## Model 1: Time alternative specific

In [10]:
# Define ASCs 
asc_car = Beta(name='asc_car', value=0, lowerbound=None, upperbound=None, status=0)
asc_walk = Beta(name='asc_walk', value=0, lowerbound=None, upperbound=None, status=0)
asc_pt = Beta(name='asc_pt', value=0, lowerbound=None, upperbound=None, status=0)

# Define generic parameters

beta_time_car = Beta(name='beta_time_car', value=0, lowerbound=None, upperbound=None, status=0)
beta_time_pt = Beta(name='beta_time_pt', value=0, lowerbound=None, upperbound=None, status=0)
beta_time_walk = Beta(name='beta_time_walk', value=0, lowerbound=None, upperbound=None, status=0)
beta_time_cycling = Beta(name='beta_time_cycling', value=0, lowerbound=None, upperbound=None, status=0)

# Define alternative specific parameters

beta_cost = Beta(name='beta_cost', value=0, lowerbound=None, upperbound=None, status=0)

# Define utility functions

v_car = asc_car + beta_cost * cost_driving + beta_time_car * time_driving

v_walk = asc_walk + beta_time_walk * time_walking

v_pt = asc_pt + beta_cost * cost_transit + beta_time_pt * time_pt

v_cycling = beta_time_cycling * time_cycling

V = {1: v_walk, 2: v_cycling, 3: v_pt, 4: v_car}

CAR_AV = (car_ownership != 0) | (driving_license != 0) | (household_has_license != 0)
WALK_AV = (car_ownership >= 0)
PT_AV = (car_ownership >= 0)
CYCLING_AV = (car_ownership >= 0)

av = {1: WALK_AV, 2: CYCLING_AV , 3: PT_AV, 4: CAR_AV}

logprob_m1 = loglogit(V, av, travel_mode)

model_1 = bio.BIOGEME(database, logprob_m1)
model_1.modelName = 'model_1'

null_log_likelihood_m1 = model_1.calculate_null_loglikelihood(av)

results_m1 = model_1.estimate()
print(results_m1.print_general_statistics())


Number of estimated parameters:	8
Sample size:	4941
Excluded observations:	0
Null log likelihood:	-6610.329
Init log likelihood:	-3663.213
Final log likelihood:	-3663.213
Likelihood ratio test for the null model:	5894.233
Rho-square for the null model:	0.446
Rho-square-bar for the null model:	0.445
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00218
Akaike Information Criterion:	7342.425
Bayesian Information Criterion:	7394.468
Final gradient norm:	6.3734E-03
Nbr of threads:	16



In [11]:
pd_results_m1 = results_m1.get_estimated_parameters()
display(pd_results_m1)


Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_car,2.873129,0.149264,19.248647,0.0
asc_pt,1.983868,0.152605,13.000034,0.0
asc_walk,4.639939,0.204647,22.672896,0.0
beta_cost,-0.191306,0.018074,-10.584411,0.0
beta_time_car,-6.689821,0.392282,-17.05359,0.0
beta_time_cycling,-6.543603,0.512912,-12.757758,0.0
beta_time_pt,-3.462775,0.264295,-13.10194,0.0
beta_time_walk,-9.155925,0.469862,-19.486417,0.0


### Comparison between Model 0 and Model 1

In [12]:
loglikehood_m0 = results_m0.data.logLike
num_params_m0 = results_m0.data.nparam

loglikehood_m1 = results_m1.data.logLike
num_params_m1 = results_m1.data.nparam

# Calculate the LR statistic
LR = 2 * (loglikehood_m1 - loglikehood_m0)

# Degrees of freedom
df = num_params_m1 - num_params_m0

from scipy.stats import chi2

# Critical value at 0.05 significance level
critical_value = chi2.ppf(0.95, df)

print("Likelihood Ratio:", LR)
print("Degrees of Freedom:", df)
print("Critical Chi-Square Value (0.05 significance):", critical_value)

if LR > critical_value:
    print("Model 1 is significantly better than Model 0.")
else:
    print("No significant improvement in Model 1 over Model 0.")



Likelihood Ratio: 687.6759050711189
Degrees of Freedom: 3
Critical Chi-Square Value (0.05 significance): 7.814727903251179
Model 1 is significantly better than Model 0.


We find that Model 1 is better than Model 0 based on the likelihood test

## Model 2: Socio-economic interactions (Sex)

In [13]:
# Time segmentation
sex_segmentation = DiscreteSegmentationTuple(
    variable=female, mapping={0: 'other', 1: 'female'}
)

### Model 2A: Sex interaction with Time

In [14]:
# Define ASCs 
asc_car = Beta(name='asc_car', value=0, lowerbound=None, upperbound=None, status=0)
asc_walk = Beta(name='asc_walk', value=0, lowerbound=None, upperbound=None, status=0)
asc_pt = Beta(name='asc_pt', value=0, lowerbound=None, upperbound=None, status=0)

# Define generic parameters

beta_time = Beta(name='beta_time', value=0, lowerbound=None, upperbound=None, status=0)

# Define alternative specific parameters

beta_time_car = Beta(name='beta_time_car', value=0, lowerbound=None, upperbound=None, status=0)
beta_time_pt = Beta(name='beta_time_pt', value=0, lowerbound=None, upperbound=None, status=0)
beta_time_walk = Beta(name='beta_time_walk', value=0, lowerbound=None, upperbound=None, status=0)
beta_time_cycling = Beta(name='beta_time_cycling', value=0, lowerbound=None, upperbound=None, status=0)

segmented_B_time_car = segmented_beta(beta_time_car, [sex_segmentation])
segmented_B_time_walk = segmented_beta(beta_time_walk, [sex_segmentation])
segmented_B_time_cycling = segmented_beta(beta_time_cycling, [sex_segmentation])


# Define alternative specific parameters

beta_cost = Beta(name='beta_cost', value=0, lowerbound=None, upperbound=None, status=0)

# Define utility functions

v_car = asc_car + beta_cost * cost_driving + segmented_B_time_car * time_driving

v_walk = asc_walk + segmented_B_time_walk * time_walking

v_pt = asc_pt + beta_cost * cost_transit + beta_time_pt * time_pt

v_cycling = segmented_B_time_cycling * time_cycling

V = {1: v_walk, 2: v_cycling, 3: v_pt, 4: v_car}

CAR_AV = (car_ownership != 0) | (driving_license != 0) | (household_has_license != 0)
WALK_AV = (car_ownership >= 0)
PT_AV = (car_ownership >= 0)
CYCLING_AV = (car_ownership >= 0)

av = {1: WALK_AV, 2: CYCLING_AV , 3: PT_AV, 4: CAR_AV}

In [15]:
logprob_m2a = loglogit(V, av, travel_mode)

model_2a = bio.BIOGEME(database, logprob_m2a)
model_2a.modelName = 'model_2a'

null_log_likelihood_m2a = model_2a.calculate_null_loglikelihood(av)

results_m2a = model_2a.estimate()
print(results_m2a.print_general_statistics())


Number of estimated parameters:	11
Sample size:	4941
Excluded observations:	0
Null log likelihood:	-6610.329
Init log likelihood:	-3634.216
Final log likelihood:	-3634.216
Likelihood ratio test for the null model:	5952.226
Rho-square for the null model:	0.45
Rho-square-bar for the null model:	0.449
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00303
Akaike Information Criterion:	7290.432
Bayesian Information Criterion:	7361.991
Final gradient norm:	3.4371E-02
Nbr of threads:	16



In [16]:
pd_results_m2a = results_m2a.get_estimated_parameters()
display(pd_results_m2a)

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_car,2.687622,0.169233,15.881194,0.0
asc_pt,1.7814,0.172947,10.30027,0.0
asc_walk,4.459167,0.217109,20.538866,0.0
beta_cost,-0.19048,0.018069,-10.541906,0.0
beta_time_car,-6.516277,0.399661,-16.304513,0.0
beta_time_car_female,-0.951823,0.2539,-3.748807,0.000178
beta_time_cycling,-5.944642,0.46363,-12.821958,0.0
beta_time_cycling_female,-3.895369,0.952143,-4.091159,4.3e-05
beta_time_pt,-3.600259,0.272785,-13.198168,0.0
beta_time_walk,-8.927448,0.486331,-18.356747,0.0


In [17]:
loglikehood_m2a = results_m2a.data.logLike
num_params_m2a = results_m2a.data.nparam

# Calculate the LR statistic
LR = 2 * (loglikehood_m2a - loglikehood_m1)

# Degrees of freedom
df = num_params_m2a - num_params_m1

from scipy.stats import chi2

# Critical value at 0.05 significance level
critical_value = chi2.ppf(0.95, df)

print("Likelihood Ratio:", LR)
print("Degrees of Freedom:", df)
print("Critical Chi-Square Value (0.05 significance):", critical_value)

if LR > critical_value:
    print("Model 2a is significantly better than Model 1.")
else:
    print("No significant improvement in Model 2a over Model 1.")



Likelihood Ratio: 57.99322919856331
Degrees of Freedom: 3
Critical Chi-Square Value (0.05 significance): 7.814727903251179
Model 2a is significantly better than Model 1.


### Model 2B: Sex interaction with all ASCs

In [18]:
# Define ASCs 
asc_car = Beta(name='asc_car', value=0, lowerbound=None, upperbound=None, status=0)
asc_walk = Beta(name='asc_walk', value=0, lowerbound=None, upperbound=None, status=0)
asc_pt = Beta(name='asc_pt', value=0, lowerbound=None, upperbound=None, status=0)

beta_time = Beta(name='beta_time', value=0, lowerbound=None, upperbound=None, status=0)

# Define alternative specific parameters

beta_time_car = Beta(name='beta_time_car', value=0, lowerbound=None, upperbound=None, status=0)
beta_time_pt = Beta(name='beta_time_pt', value=0, lowerbound=None, upperbound=None, status=0)
beta_time_walk = Beta(name='beta_time_walk', value=0, lowerbound=None, upperbound=None, status=0)
beta_time_cycling = Beta(name='beta_time_cycling', value=0, lowerbound=None, upperbound=None, status=0)

# Define alternative specific parameters

beta_cost = Beta(name='beta_cost', value=0, lowerbound=None, upperbound=None, status=0)

ASC_cycling_female = Beta(name='asc_cycling_female', value=0, lowerbound=None, upperbound=None, status=0)


# Define utility functions

v_car = asc_car + beta_cost * cost_driving + beta_time_car * time_driving

v_walk = asc_walk + beta_time_walk * time_walking

v_pt = asc_pt + beta_cost * cost_transit + beta_time_pt * time_pt

v_cycling = ASC_cycling_female * (female == 1) + beta_time_cycling * time_cycling


# Define utility functions

V = {1: v_walk, 2: v_cycling, 3: v_pt, 4: v_car}

CAR_AV = (car_ownership != 0) | (driving_license != 0) | (household_has_license != 0)
WALK_AV = (car_ownership >= 0)
PT_AV = (car_ownership >= 0)
CYCLING_AV = (car_ownership >= 0)

av = {1: WALK_AV, 2: CYCLING_AV , 3: PT_AV, 4: CAR_AV}

In [19]:
logprob_m2b = loglogit(V, av, travel_mode)

model_2b = bio.BIOGEME(database, logprob_m2b)
model_2b.modelName = 'model_2b'

null_log_likelihood_m2b = model_2b.calculate_null_loglikelihood(av)

results_m2b = model_2b.estimate()
print(results_m2b.print_general_statistics())


Number of estimated parameters:	9
Sample size:	4941
Excluded observations:	0
Null log likelihood:	-6610.329
Init log likelihood:	-3631.723
Final log likelihood:	-3631.723
Likelihood ratio test for the null model:	5957.213
Rho-square for the null model:	0.451
Rho-square-bar for the null model:	0.449
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00248
Akaike Information Criterion:	7281.445
Bayesian Information Criterion:	7339.993
Final gradient norm:	1.0853E-01
Nbr of threads:	16



In [20]:
pd_results_m2b = results_m2b.get_estimated_parameters()
display(pd_results_m2b)

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_car,2.27444,0.161346,14.09663,0.0
asc_cycling_female,-1.351199,0.185511,-7.283682,3.248513e-13
asc_pt,1.385694,0.164266,8.435656,0.0
asc_walk,4.043289,0.21257,19.021022,0.0
beta_cost,-0.190871,0.018124,-10.531523,0.0
beta_time_car,-6.713912,0.393906,-17.044466,0.0
beta_time_cycling,-6.753277,0.511095,-13.213347,0.0
beta_time_pt,-3.481654,0.265757,-13.100891,0.0
beta_time_walk,-9.170343,0.471157,-19.463437,0.0


### Compare Model 2b with Model 1

In [21]:
loglikehood_m2b = results_m2b.data.logLike
num_params_m2b = results_m2b.data.nparam

# Calculate the LR statistic
LR = 2 * (loglikehood_m2b - loglikehood_m1)

# Degrees of freedom
df = num_params_m2b - num_params_m1

from scipy.stats import chi2

# Critical value at 0.05 significance level
critical_value = chi2.ppf(0.95, df)

print("Likelihood Ratio:", LR)
print("Degrees of Freedom:", df)
print("Critical Chi-Square Value (0.05 significance):", critical_value)

if LR > critical_value:
    print("Model 2b is significantly better than Model 1.")
else:
    print("No significant improvement in Model 2b over Model 1.")



Likelihood Ratio: 62.98021317241182
Degrees of Freedom: 1
Critical Chi-Square Value (0.05 significance): 3.841458820694124
Model 2b is significantly better than Model 1.


## Model 3: Non-linear specification

In [22]:
# Define ASCs 
asc_car = Beta(name='asc_car', value=0, lowerbound=None, upperbound=None, status=0)
asc_walk = Beta(name='asc_walk', value=0, lowerbound=None, upperbound=None, status=0)
asc_pt = Beta(name='asc_pt', value=0, lowerbound=None, upperbound=None, status=0)

# Define generic parameters

beta_time = Beta(name='beta_time', value=0, lowerbound=None, upperbound=None, status=0)
LAMBDA = Beta('LAMBDA', 0, None, None, 0)

# Define alternative specific parameters

beta_cost_car = Beta(name='beta_cost_car', value=0, lowerbound=None, upperbound=None, status=0)
beta_cost_pt = Beta(name='beta_cost_pt', value=0, lowerbound=None, upperbound=None, status=0)

# Segmented Walk ASC

ASC_cycling_female = Beta(name='asc_cycling_female', value=0, lowerbound=None, upperbound=None, status=0)

# Define utility functions

v_car = asc_car + beta_cost_car * cost_driving + beta_time * boxcox(time_driving,LAMBDA)
v_walk = asc_walk + beta_time * boxcox(time_walking,LAMBDA)
v_pt = asc_pt + beta_cost_pt * cost_transit + beta_time * boxcox(time_pt,LAMBDA)

v_cycling = ASC_cycling_female * (female == 1) + beta_time * boxcox(time_cycling,LAMBDA)

V = {1: v_walk, 2: v_cycling, 3: v_pt, 4: v_car}

CAR_AV = (car_ownership != 0) | (driving_license != 0) | (household_has_license != 0)
WALK_AV = (car_ownership >= 0)
PT_AV = (car_ownership >= 0)
CYCLING_AV = (car_ownership >= 0)

av = {1: WALK_AV, 2: CYCLING_AV , 3: PT_AV, 4: CAR_AV}

It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)
It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)
It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)
It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)


In [23]:
logprob_m3 = loglogit(V, av, travel_mode)

model_3 = bio.BIOGEME(database, logprob_m3)
model_3.modelName = 'model_3'

null_log_likelihood_m3 = model_3.calculate_null_loglikelihood(av)

results_m3 = model_3.estimate()
print(results_m3.print_general_statistics())


Number of estimated parameters:	8
Sample size:	4941
Excluded observations:	0
Null log likelihood:	-6610.329
Init log likelihood:	-3822.292
Final log likelihood:	-3822.292
Likelihood ratio test for the null model:	5576.074
Rho-square for the null model:	0.422
Rho-square-bar for the null model:	0.421
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00209
Akaike Information Criterion:	7660.584
Bayesian Information Criterion:	7712.626
Final gradient norm:	1.0293E-01
Nbr of threads:	16



In [24]:
pd_results_m3 = results_m3.get_estimated_parameters()
display(pd_results_m3)

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
LAMBDA,0.480279,0.023742,20.228786,0.0
asc_car,2.137815,0.104259,20.504826,0.0
asc_cycling_female,-1.358128,0.187009,-7.262372,3.803624e-13
asc_pt,3.029652,0.123595,24.512744,0.0
asc_walk,4.407632,0.14712,29.959525,0.0
beta_cost_car,-0.196301,0.021652,-9.066256,0.0
beta_cost_pt,-0.038949,0.029644,-1.313875,0.1888884
beta_time,-5.384872,0.165841,-32.470084,0.0


### Compare Model 3 with Model 2   

#### Model C

In [25]:
# Define ASCs 
asc_car = Beta(name='asc_car', value=0, lowerbound=None, upperbound=None, status=0)
asc_walk = Beta(name='asc_walk', value=0, lowerbound=None, upperbound=None, status=0)
asc_pt = Beta(name='asc_pt', value=0, lowerbound=None, upperbound=None, status=0)

# Define generic parameters

beta_time = Beta(name='beta_time', value=0, lowerbound=None, upperbound=None, status=0)
beta_time_lambda = Beta(name='beta_time_lambda', value=0, lowerbound=None, upperbound=None, status=0)
LAMBDA = Beta('LAMBDA', 0, None, None, 0)

# Define alternative specific parameters

beta_cost_car = Beta(name='beta_cost_car', value=0, lowerbound=None, upperbound=None, status=0)
beta_cost_pt = Beta(name='beta_cost_pt', value=0, lowerbound=None, upperbound=None, status=0)

# Segmented Walk ASC

ASC_cycling_female = Beta(name='asc_cycling_female', value=0, lowerbound=None, upperbound=None, status=0)

# Define utility functions

v_car = asc_car + beta_cost_car * cost_driving + beta_time_lambda * boxcox(time_driving,LAMBDA) + beta_time * time_driving
v_walk = asc_walk + beta_time_lambda * boxcox(time_walking,LAMBDA) + beta_time * time_walking
v_pt = asc_pt + beta_cost_pt * cost_transit + beta_time_lambda * boxcox(time_pt,LAMBDA) + beta_time * time_pt

v_cycling = ASC_cycling_female * (female == 1) + beta_time_lambda * boxcox(time_cycling,LAMBDA) + beta_time * time_cycling

V = {1: v_walk, 2: v_cycling, 3: v_pt, 4: v_car}

CAR_AV = (car_ownership != 0) | (driving_license != 0) | (household_has_license != 0)
WALK_AV = (car_ownership >= 0)
PT_AV = (car_ownership >= 0)
CYCLING_AV = (car_ownership >= 0)

av = {1: WALK_AV, 2: CYCLING_AV , 3: PT_AV, 4: CAR_AV}

It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)
It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)
It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)
It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)


In [26]:
logprob_mC = loglogit(V, av, travel_mode)

model_C = bio.BIOGEME(database, logprob_mC)
model_C.modelName = 'model_C'

null_log_likelihood_mC = model_C.calculate_null_loglikelihood(av)

results_mC = model_C.estimate()
print(results_mC.print_general_statistics())


Number of estimated parameters:	9
Sample size:	4941
Excluded observations:	0
Null log likelihood:	-6610.329
Init log likelihood:	-3820.207
Final log likelihood:	-3820.207
Likelihood ratio test for the null model:	5580.245
Rho-square for the null model:	0.422
Rho-square-bar for the null model:	0.421
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00236
Akaike Information Criterion:	7658.413
Bayesian Information Criterion:	7716.961
Final gradient norm:	9.8315E-03
Nbr of threads:	16



#### Test C vs 2b

In [27]:
loglikehood_mC = results_mC.data.logLike
num_params_mC = results_mC.data.nparam

# Calculate the LR statistic
LR = 2 * (loglikehood_mC - loglikehood_m2b)

# Degrees of freedom
df = num_params_mC - num_params_m2b

from scipy.stats import chi2

# Critical value at 0.05 significance level
critical_value = chi2.ppf(0.99, df)

print("Likelihood Ratio:", LR)
print("Degrees of Freedom:", df)
print("Critical Chi-Square Value (0.05 significance):", critical_value)

if LR > critical_value:
    print("Model C is significantly better than Model 2b.")
else:
    print("No significant improvement in Model C over Model 2b.")



Likelihood Ratio: -376.96821637620906
Degrees of Freedom: 0
Critical Chi-Square Value (0.05 significance): nan
No significant improvement in Model C over Model 2b.


#### Test C vs 3

In [28]:
loglikehood_m3 = results_m3.data.logLike
num_params_m3 = results_m3.data.nparam

# Calculate the LR statistic
LR = 2 * (loglikehood_mC - loglikehood_m3)

# Degrees of freedom
df = num_params_mC - num_params_m3

from scipy.stats import chi2

# Critical value at 0.01 significance level
critical_value = chi2.ppf(0.99, df)

print("Likelihood Ratio:", LR)
print("Degrees of Freedom:", df)
print("Critical Chi-Square Value (0.01 significance):", critical_value)

if LR > critical_value:
    print("Model C is significantly better than Model 3.")
else:
    print("No significant improvement in Model C over Model 3.")



Likelihood Ratio: 4.17050782645947
Degrees of Freedom: 1
Critical Chi-Square Value (0.01 significance): 6.6348966010212145
No significant improvement in Model C over Model 3.


## Model 4: Nested or Cross-Nested

In [29]:
from biogeme.models import loglogit, lognested
from biogeme.nests import OneNestForNestedLogit, NestsForNestedLogit

# Define ASCs 
asc_car = Beta(name='asc_car', value=0, lowerbound=None, upperbound=None, status=0)
asc_walk = Beta(name='asc_walk', value=0, lowerbound=None, upperbound=None, status=0)
asc_pt = Beta(name='asc_pt', value=0, lowerbound=None, upperbound=None, status=0)

# Define generic parameters

beta_time = Beta(name='beta_time', value=0, lowerbound=None, upperbound=None, status=0)
LAMBDA = Beta('LAMBDA', 0, None, None, 0)

# Define alternative specific parameters

beta_cost_car = Beta(name='beta_cost_car', value=0, lowerbound=None, upperbound=None, status=0)
beta_cost_pt = Beta(name='beta_cost_pt', value=0, lowerbound=None, upperbound=None, status=0)

# Segmented Walk ASC

ASC_cycling_female = Beta(name='asc_cycling_female', value=0, lowerbound=None, upperbound=None, status=0)

# Define utility functions

v_car = asc_car + beta_cost_car * cost_driving + beta_time * boxcox(time_driving,LAMBDA)
v_walk = asc_walk + beta_time * boxcox(time_walking,LAMBDA)
v_pt = asc_pt + beta_cost_pt * cost_transit + beta_time * boxcox(time_pt,LAMBDA)

v_cycling = ASC_cycling_female * (female == 1) + beta_time * boxcox(time_cycling,LAMBDA)

V = {1: v_walk, 2: v_cycling, 3: v_pt, 4: v_car}

CAR_AV = (car_ownership != 0) | (driving_license != 0) | (household_has_license != 0)
WALK_AV = (car_ownership >= 0)
PT_AV = (car_ownership >= 0)
CYCLING_AV = (car_ownership >= 0)

av = {1: WALK_AV, 2: CYCLING_AV , 3: PT_AV, 4: CAR_AV}

It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)
It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)
It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)
It is advised to set the bounds on parameter LAMBDA. A value of -10 and 10 should be appropriate: Beta("LAMBDA", 0, -10, 10, 0)


In [30]:
mu_slow = Beta('mu_slow', 1, 0, None, 0)
mu_fast = Beta('mu_fast', 1, 0, None, 0)
nest_slow = OneNestForNestedLogit(nest_param=mu_slow, list_of_alternatives=[1, 2], name='slow modes')
nest_fast = OneNestForNestedLogit(nest_param=mu_fast, list_of_alternatives=[3, 4], name='faster modes')
nests = NestsForNestedLogit(choice_set=list(V), tuple_of_nests=(nest_slow, nest_fast))

logprob_m4 = lognested(V, av, nests, travel_mode)


# logprob_m3 = loglogit(V, av, travel_mode)

model_4 = bio.BIOGEME(database, logprob_m4)
model_4.modelName = 'model_4'

null_log_likelihood_m4 = model_4.calculate_null_loglikelihood(av)

results_m4 = model_4.estimate()
print(results_m4.print_general_statistics())


The following expression may potentially be ambiguous: [((car_ownership >= `0.0`) != `0.0`)] if it contains the chaining of two comparisons expressions. Keep in mind that, for Biogeme (like for Pandas), the expression (a <= x <= b) is not equivalent to (a <= x) and (x <= b).
The following expression may potentially be ambiguous: [((car_ownership >= `0.0`) != `0.0`)] if it contains the chaining of two comparisons expressions. Keep in mind that, for Biogeme (like for Pandas), the expression (a <= x <= b) is not equivalent to (a <= x) and (x <= b).
The following expression may potentially be ambiguous: [((car_ownership >= `0.0`) != `0.0`)] if it contains the chaining of two comparisons expressions. Keep in mind that, for Biogeme (like for Pandas), the expression (a <= x <= b) is not equivalent to (a <= x) and (x <= b).
The following expression may potentially be ambiguous: [((car_ownership >= `0.0`) != `0.0`)] if it contains the chaining of two comparisons expressions. Keep in mind that, 

Number of estimated parameters:	10
Sample size:	4941
Excluded observations:	0
Null log likelihood:	-6610.329
Init log likelihood:	-3850.096
Final log likelihood:	-3807.078
Likelihood ratio test for the null model:	5606.502
Rho-square for the null model:	0.424
Rho-square-bar for the null model:	0.423
Likelihood ratio test for the init. model:	86.03575
Rho-square for the init. model:	0.0112
Rho-square-bar for the init. model:	0.00858
Akaike Information Criterion:	7634.156
Bayesian Information Criterion:	7699.209
Final gradient norm:	9.6963E-03
Nbr of threads:	16



In [31]:
pd_results_m4 = results_m4.get_estimated_parameters()
display(pd_results_m4)

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
LAMBDA,0.509546,0.026465,19.253256,0.0
asc_car,2.219163,0.166074,13.362484,0.0
asc_cycling_female,-1.508085,0.234891,-6.420371,1.359421e-10
asc_pt,3.192454,0.191311,16.687228,0.0
asc_walk,4.920358,0.23301,21.116522,0.0
beta_cost_car,-0.261843,0.030241,-8.658483,0.0
beta_cost_pt,-0.070811,0.036538,-1.93801,0.05262199
beta_time,-6.336484,0.285158,-22.220949,0.0
mu_fast,0.765266,0.036323,21.068352,0.0
mu_slow,0.904641,0.082134,11.014186,0.0


## Market Shares

In [32]:
populations = {
    'female_45_less': 2841376,
    'female_45_or_more': 1519948,
    'male_45_less': 2929408,
    'male_45_or_more': 1379198,
}

total_pop = sum(populations.values())

filters = {
    'male_45_or_more': (data_filtered.age >= 45) & (data_filtered.female == 0),
    'male_45_less': (data_filtered.age < 45) & (data_filtered.female == 0),
    'female_45_or_more': (data_filtered.age >= 45) & (data_filtered.female == 1),
    'female_45_less': (data_filtered.age < 45) & (data_filtered.female == 1),
}

sample_segments = {
    segment_name: segment_rows.sum() for segment_name, segment_rows in filters.items()
}
print(sample_segments)

total_sample = sum(sample_segments.values())
print(f'Sample size: {total_sample}')

weights = {
    segment_name: populations[segment_name] * total_sample / (segment_size * total_pop)
    for segment_name, segment_size in sample_segments.items()
}
print(weights)


{'male_45_or_more': np.int64(891), 'male_45_less': np.int64(1431), 'female_45_or_more': np.int64(960), 'female_45_less': np.int64(1659)}
Sample size: 4941
{'male_45_or_more': np.float64(0.8821616573815265), 'male_45_less': np.float64(1.1666470637431319), 'female_45_or_more': np.float64(0.9023120558643496), 'female_45_less': np.float64(0.9760713932651373)}


In [33]:
data_filtered.head()

Unnamed: 0,trip_id,household_id,person_n,trip_n,travel_mode,purpose,fueltype,faretype,bus_scale,survey_year,...,dur_pt_rail,dur_pt_bus,dur_pt_int,pt_interchanges,dur_driving,cost_transit,cost_driving_fuel,cost_driving_ccharge,driving_traffic_percent,household_has_license
0,12,1,1,0,4,3,1,5,0.0,1,...,0.0,0.123611,0.0,0,0.141389,0.0,0.51,0.0,0.090373,1
1,17,3,1,1,3,1,6,1,1.0,1,...,0.0,0.208056,0.091667,1,0.115556,3.0,0.33,0.0,0.033654,0
2,51,12,1,1,4,5,2,1,1.0,1,...,0.0,0.5475,0.133333,1,0.355556,3.0,1.12,0.0,0.302344,1
3,67,13,1,6,4,3,1,5,0.0,1,...,0.0,0.391667,0.0,0,0.206944,0.0,0.67,0.0,0.159732,1
4,74,14,0,3,4,3,1,5,0.0,1,...,0.0,0.033889,0.0,0,0.067778,0.0,0.2,0.0,0.151639,1


In [36]:
from biogeme.biogeme import BIOGEME
from biogeme.expressions import Beta, Variable, log, exp

for segment_name, segment_rows in filters.items():
    data_filtered.loc[segment_rows, 'weight'] = weights[segment_name]

prob_pt = exp(v_pt*mu_fast)/(exp(v_car*mu_fast)+exp(v_cycling*mu_slow)+exp(v_pt*mu_fast)+exp(v_walk*mu_slow))
prob_walk = exp(v_walk*mu_slow)/(exp(v_car*mu_fast)+exp(v_cycling*mu_slow)+exp(v_pt*mu_fast)+exp(v_walk*mu_slow))
prob_car = exp(v_car*mu_fast)/(exp(v_car*mu_fast)+exp(v_cycling*mu_slow)+exp(v_pt*mu_fast)+exp(v_walk*mu_slow))
prob_cycle = exp(v_cycling*mu_slow)/(exp(v_car*mu_fast)+exp(v_cycling*mu_slow)+exp(v_pt*mu_fast)+exp(v_walk*mu_slow))


weight = Variable('weight')
simulate = {
    'weight': weight,
    'Prob. pt': prob_pt,
    'Prob. car': prob_car,
    'Prob. walk': prob_walk,
    'Prob. cycle': prob_cycle,
}

database = db.Database('london', data_filtered)


biosim = BIOGEME(database, simulate)
simulated_values = biosim.simulate(results_m4.get_beta_values())
display(simulated_values)

simulated_values['Weighted pt'] = (
    simulated_values['weight'] * simulated_values['Prob. pt']
)
simulated_values['Weighted car'] = (
    simulated_values['weight'] * simulated_values['Prob. car']
)

simulated_values['Weighted walk'] = (
    simulated_values['weight'] * simulated_values['Prob. walk']
)
simulated_values['Weighted cycle'] = (
    simulated_values['weight'] * simulated_values['Prob. cycle']
)




Unnamed: 0,weight,Prob. pt,Prob. car,Prob. walk,Prob. cycle
0,0.882162,0.210918,0.616896,0.071523,0.100664
1,0.976071,0.049853,0.510274,0.386098,0.053775
2,0.902312,0.130707,0.844706,0.005985,0.018602
3,0.882162,0.190447,0.519632,0.101131,0.188790
4,0.902312,0.076415,0.148078,0.748913,0.026594
...,...,...,...,...,...
4995,0.882162,0.322336,0.487037,0.078179,0.112448
4996,1.166647,0.275874,0.333355,0.159229,0.231542
4997,0.976071,0.125353,0.163993,0.667412,0.043242
4998,0.976071,0.144930,0.834884,0.009698,0.010488


In [37]:
market_share_pt = simulated_values['Weighted pt'].mean()
print(f'Market share for pt: {100*market_share_pt:.1f}%')

market_share_car = simulated_values['Weighted car'].mean()
print(f'Market share for car: {100*market_share_car:.1f}%')

market_share_walk = simulated_values['Weighted walk'].mean()
print(f'Market share for walk: {100*market_share_walk:.1f}%')

market_share_cycle = simulated_values['Weighted cycle'].mean()
print(f'Market share for cycling: {100*market_share_cycle:.1f}%')




Market share for pt: 24.6%
Market share for car: 37.3%
Market share for walk: 31.2%
Market share for cycling: 6.8%


In [None]:
model_4.bootstrap_samples = 100
results_bootstrapping = model_4.estimate(run_bootstrap=True)

betas = model_4.free_beta_names
b = results_bootstrapping.get_betas_for_sensitivity_analysis(betas)
left, right = biosim.confidence_intervals(b, 0.9)

display(left)

display(right)

100%|██████████| 100/100 [02:36<00:00,  1.57s/it]


Unnamed: 0,weight,Prob. pt,Prob. car,Prob. walk,Prob. cycle
0,0.882162,0.161538,0.475925,0.031995,0.033980
1,0.976071,0.023310,0.232914,0.172376,0.022400
2,0.902312,0.116288,0.819315,0.004193,0.009712
3,0.882162,0.124037,0.330780,0.052877,0.068962
4,0.902312,0.018384,0.033538,0.381047,0.015671
...,...,...,...,...,...
4995,0.882162,0.234228,0.357832,0.035067,0.037718
4996,1.166647,0.153901,0.181763,0.086874,0.090940
4997,0.976071,0.039092,0.050735,0.352749,0.024878
4998,0.976071,0.134330,0.811370,0.006377,0.005064


Unnamed: 0,weight,Prob. pt,Prob. car,Prob. walk,Prob. cycle
0,0.882162,0.245135,0.692290,0.126766,0.236978
1,0.976071,0.073057,0.735115,0.645884,0.087946
2,0.902312,0.142840,0.864564,0.008714,0.031473
3,0.882162,0.241940,0.636637,0.154584,0.381824
4,0.902312,0.199208,0.394890,0.921598,0.035240
...,...,...,...,...,...
4995,0.882162,0.368969,0.557471,0.139595,0.262230
4996,1.166647,0.365230,0.451350,0.226537,0.419095
4997,0.976071,0.261038,0.357668,0.859050,0.058912
4998,0.976071,0.157102,0.852068,0.015035,0.018328


KeyError: 'Prob. rail'

In [39]:
# Calculate weighted probabilities
left['Weighted pt'] = left['weight'] * left['Prob. pt']
left['Weighted car'] = left['weight'] * left['Prob. car']
left['Weighted walk'] = left['weight'] * left['Prob. walk']
left['Weighted cycle'] = left['weight'] * left['Prob. cycle']

right['Weighted pt'] = right['weight'] * right['Prob. pt']
right['Weighted car'] = right['weight'] * right['Prob. car']
right['Weighted walk'] = right['weight'] * right['Prob. walk']
right['Weighted cycle'] = right['weight'] * right['Prob. cycle']

# Calculate mean market shares
market_share_pt = simulated_values['Weighted pt'].mean()
market_share_car = simulated_values['Weighted car'].mean()
market_share_walk = simulated_values['Weighted walk'].mean()
market_share_cycle = simulated_values['Weighted cycle'].mean()

# Calculate confidence intervals
left_market_share_pt = left['Weighted pt'].mean()
right_market_share_pt = right['Weighted pt'].mean()

left_market_share_car = left['Weighted car'].mean()
right_market_share_car = right['Weighted car'].mean()

left_market_share_walk = left['Weighted walk'].mean()
right_market_share_walk = right['Weighted walk'].mean()

left_market_share_cycle = left['Weighted cycle'].mean()
right_market_share_cycle = right['Weighted cycle'].mean()

# Print market shares and confidence intervals
print(f"Market share for pt: {100 * market_share_pt:.1f}% "
      f"CI: [{100 * left_market_share_pt:.1f}%-{100 * right_market_share_pt:.1f}%]")

print(f"Market share for car: {100 * market_share_car:.1f}% "
      f"CI: [{100 * left_market_share_car:.1f}%-{100 * right_market_share_car:.1f}%]")

print(f"Market share for walk: {100 * market_share_walk:.1f}% "
      f"CI: [{100 * left_market_share_walk:.1f}%-{100 * right_market_share_walk:.1f}%]")

print(f"Market share for cycling: {100 * market_share_cycle:.1f}% "
      f"CI: [{100 * left_market_share_cycle:.1f}%-{100 * right_market_share_cycle:.1f}%]")


Market share for pt: 24.6% CI: [19.2%-31.7%]
Market share for car: 37.3% CI: [27.2%-49.1%]
Market share for walk: 31.2% CI: [16.7%-41.9%]
Market share for cycling: 6.8% CI: [3.3%-10.9%]


In [41]:
# Example mapping
labels = {1: 'walk', 2: 'cycling', 3: 'pt', 4: 'car'}

# Map the travel_mode column to the labels
data_filtered['mode_label'] = data_filtered['travel_mode'].map(labels)

# Calculate market shares
market_shares = (
    data_filtered['mode_label']
    .value_counts(normalize=True)  # Get proportions
    .sort_index()  # Ensure consistent order
    * 100  # Convert to percentage
)

# Print market shares
for mode, share in market_shares.items():
    print(f"Market share for {mode}: {share:.1f}%")


Market share for car: 43.3%
Market share for cycling: 3.3%
Market share for pt: 35.7%
Market share for walk: 17.6%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['mode_label'] = data_filtered['travel_mode'].map(labels)
