In [None]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme.expressions import Beta, Variable, exp
from biogeme.models import loglogit
from biogeme.tools import likelihood_ratio_test
from biogeme.results import compile_estimation_results
from biogeme.models import loglogit,  boxcox
from biogeme.models.piecewise import piecewise_formula
from biogeme.models import lognested
from biogeme.nests import OneNestForNestedLogit, NestsForNestedLogit
from biogeme.biogeme import BIOGEME



import numpy as np


In [None]:
# Read the tab-separated sample and append the three derived columns in one step:
#   age_scaled   - age standardised to zero mean / unit standard deviation
#   cost_driving - congestion charge plus fuel cost
#   dur_pt       - access + rail + interchange + bus durations
df = pd.read_csv("lpmc01.dat", sep='\t').assign(
    age_scaled=lambda trips: (trips['age'] - trips['age'].mean()) / trips['age'].std(),
    cost_driving=lambda trips: trips['cost_driving_ccharge'] + trips['cost_driving_fuel'],
    dur_pt=lambda trips: (
        trips['dur_pt_access'] + trips['dur_pt_rail'] + trips['dur_pt_int'] + trips['dur_pt_bus']
    ),
)

# Biogeme database wrapping the augmented frame
database1 = db.Database('lpmc01', df)


# Define the given variables: one Biogeme Variable handle per data-frame column,
# each bound to a Python name identical to the column name.

# Derived columns (added to df when the data is loaded)
dur_pt = Variable('dur_pt')
cost_driving = Variable('cost_driving')
age_scaled = Variable('age_scaled')

# Columns read from the data file
trip_id = Variable('trip_id')
household_id = Variable('household_id')
person_n = Variable('person_n')
trip_n = Variable('trip_n')
travel_mode = Variable('travel_mode')
purpose = Variable('purpose')
fueltype = Variable('fueltype')
faretype = Variable('faretype')
bus_scale = Variable('bus_scale')
survey_year = Variable('survey_year')
travel_year = Variable('travel_year')
travel_month = Variable('travel_month')
travel_date = Variable('travel_date')
day_of_week = Variable('day_of_week')
start_time = Variable('start_time')
age = Variable('age')
female = Variable('female')
driving_license = Variable('driving_license')
car_ownership = Variable('car_ownership')
distance = Variable('distance')
dur_walking = Variable('dur_walking')
dur_cycling = Variable('dur_cycling')
dur_pt_access = Variable('dur_pt_access') # Predicted total access and egress time for public transport route in hours
dur_pt_rail = Variable('dur_pt_rail')
dur_pt_bus = Variable('dur_pt_bus')
dur_pt_int = Variable('dur_pt_int') # Time taken (hrs) at each interchange point
pt_interchanges = Variable('pt_interchanges')   # Number of interchange points in public transport route
dur_driving = Variable('dur_driving')
cost_transit = Variable('cost_transit')
cost_driving_fuel = Variable('cost_driving_fuel')   # Estimated fuel cost of driving route in GBP
cost_driving_ccharge = Variable('cost_driving_ccharge')  # Estimated congestion charge cost of driving route in GBP
driving_traffic_percent = Variable('driving_traffic_percent')



# Define transport availability.
# Public transport, walking and cycling are assumed to be always available; driving is
# tied to the household owning at least one car. A driving licence is deliberately not
# required, because the data contains people without a licence who chose driving
# (e.g. row 28).
# NOTE(review): none of the av_* names below is referenced by the model cells visible in
# this notebook (they use an all-ones availability dictionary, or None for the nested
# model) - confirm whether they were meant to be used.
av_drive =  (car_ownership > 0)
av_pt =1
av_walk = 1
av_cycle = 1

# Sanity check: the derived columns must be visible through the first database object.
variable_names = ['dur_pt', 'cost_driving', 'age_scaled']
for variable_name in variable_names:
    if variable_name in database1.data.columns:
        print(f"'{variable_name}' exists in the database.")
    else:
        print(f"'{variable_name}' does NOT exist in the database.")



# Define pt_cost (not needed)
# Original paper, page 31: "Public transport fares are determined for single trips using Oystercard/contactless payment."
# Therefore, cost_transit should already consider faretype and bus_scale

# Database object used by all estimation cells below.
database = db.Database('lpmc01', df)

# Repeat the check on `database` itself. The previous version looped over `database1`
# a second time, so the object actually used for estimation was never verified.
variable_names = ['dur_pt', 'cost_driving', 'age_scaled']
for variable_name in variable_names:
    if variable_name in database.data.columns:
        print(f"'{variable_name}' exists in the database.")
    else:
        print(f"'{variable_name}' does NOT exist in the database.")

'dur_pt' exists in the database.
'cost_driving' exists in the database.
'age_scaled' exists in the database.
'dur_pt' exists in the database.
'cost_driving' exists in the database.
'age_scaled' exists in the database.


In [3]:
# Define driving cost as an expression over its two components.
# This rebinds the Python name `cost_driving`, which previously held Variable('cost_driving').
cost_driving = cost_driving_ccharge + cost_driving_fuel

# Define time taken by public transport as the sum of its four components.
# This likewise rebinds `dur_pt`, which previously held Variable('dur_pt').
dur_pt = dur_pt_access + dur_pt_int + dur_pt_bus + dur_pt_rail  # Public transport (external) time

# Model 0

In [4]:
# Registry of estimation results, keyed by model name, filled in by the model cells below
all_results = dict()

In [5]:
# Heading and column index on separate lines, emitted by a single print call
print("Columns in the database:", database.data.columns, sep="\n")

Columns in the database:
Index(['trip_id', 'household_id', 'person_n', 'trip_n', 'travel_mode',
       'purpose', 'fueltype', 'faretype', 'bus_scale', 'survey_year',
       'travel_year', 'travel_month', 'travel_date', 'day_of_week',
       'start_time', 'age', 'female', 'driving_license', 'car_ownership',
       'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access',
       'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'pt_interchanges',
       'dur_driving', 'cost_transit', 'cost_driving_fuel',
       'cost_driving_ccharge', 'driving_traffic_percent', 'age_scaled',
       'cost_driving', 'dur_pt'],
      dtype='object')


In [6]:
# Model 0: logit model with one generic cost coefficient and one generic travel-time
# coefficient shared by all alternatives.

# Define ASCs (walking gets none, so it acts as the reference alternative)
ASC_CYCLE = Beta('ASC_CYCLE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

# Define generic parameters for cost and travel time
B_COST = Beta('B_COST', 0, None, None, 0)
B_TIME = Beta('B_TIME', 0, None, None, 0)

# Define utility functions for each alternative
# (only public transport and driving carry a cost term)
V_WALK =  B_TIME * dur_walking
V_CYCLE = ASC_CYCLE + B_TIME * dur_cycling
V_PT = ASC_PT + B_COST * cost_transit + B_TIME * dur_pt
V_DRIVE = ASC_DRIVE + B_COST * cost_driving + B_TIME * dur_driving

# Assume every mode of transport is available to every observation
availability_walk = 1  
availability_cycle = 1  
availability_pt = 1     
availability_drive = 1

# Availability per alternative id; reused by the later logit models
availability = {
    1: availability_walk,   # Walking
    2: availability_cycle,  # Cycling
    3: availability_pt,     # Public Transport
    4: availability_drive   # Driving
}

# Associate utility functions with the mode choice (keys match the travel_mode codes)
V = {
    1: V_WALK,    # Walking
    2: V_CYCLE,   # Cycling
    3: V_PT,      # Public Transport
    4: V_DRIVE    # Driving
}

# Specify the model using a log-logit function
model_0 = loglogit(V, availability, travel_mode)

# Create Biogeme object
biogeme = bio.BIOGEME(database, model_0)
biogeme.modelName = "Model_0"

# Estimate parameters
results = biogeme.estimate()

# Display estimation results
print("Estimation results for Model 0:")
print(results.get_estimated_parameters())

# Optional: count the rows where the household owns no car but driving was chosen.
# The count is computed but not shown; the print below is commented out.
driving_without_car = df[(df['car_ownership'] == 0) & (df['travel_mode'] == 4)]
num_rows_driving_without_car = driving_without_car.shape[0]
#print("Number of rows where car ownership is 0 but travel mode is driving:", num_rows_driving_without_car)

# Keep the results for the comparison tables
all_results['Model_0'] = results

File biogeme.toml has been created


Estimation results for Model 0:
              Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE -3.778175      0.103171   -36.620564           0.0
ASC_DRIVE -1.288003      0.079431   -16.215324           0.0
ASC_PT    -0.525660      0.054777    -9.596426           0.0
B_COST    -0.190833      0.014635   -13.039920           0.0
B_TIME    -5.598770      0.203407   -27.524922           0.0


In [7]:
# Model 1: same as Model 0, but the generic B_TIME is replaced by one travel-time
# coefficient per alternative. The ASCs and B_COST come from the Model 0 cell.

# Define alternative-specific parameters for travel time
B_TIME_WALK = Beta('B_TIME_WALK', 0, None, None, 0)
B_TIME_CYCLE = Beta('B_TIME_CYCLE', 0, None, None, 0)
B_TIME_PT = Beta('B_TIME_PT', 0, None, None, 0)
B_TIME_DRIVE = Beta('B_TIME_DRIVE', 0, None, None, 0)

# Update utility functions with alternative-specific time parameters
V_WALK = B_TIME_WALK * dur_walking
V_CYCLE = ASC_CYCLE + B_TIME_CYCLE * dur_cycling
V_PT = ASC_PT + B_COST * cost_transit + B_TIME_PT * dur_pt
V_DRIVE = ASC_DRIVE + B_COST * cost_driving + B_TIME_DRIVE * dur_driving

# Redefine the model (the availability dictionary is the all-ones one from Model 0)
model_1 = loglogit({1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}, availability, travel_mode)

# Create Biogeme object for Model 1
biogeme_model_1 = bio.BIOGEME(database, model_1)
biogeme_model_1.modelName = "Model_1"

# Estimate parameters for Model 1 and keep them for the comparison tables
results_model_1 = biogeme_model_1.estimate()
all_results['Model_1'] = results_model_1

# Display estimation results
print("Estimation results for Model 1:")
print(results_model_1.get_estimated_parameters())

Estimation results for Model 1:
                 Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE    -4.602378      0.197139   -23.345859           0.0
ASC_DRIVE    -2.115318      0.144923   -14.596200           0.0
ASC_PT       -2.599549      0.146553   -17.737963           0.0
B_COST       -0.180812      0.017547   -10.304242           0.0
B_TIME_CYCLE -6.462945      0.483849   -13.357356           0.0
B_TIME_DRIVE -6.623754      0.379926   -17.434327           0.0
B_TIME_PT    -3.494668      0.244093   -14.316930           0.0
B_TIME_WALK  -9.065278      0.456118   -19.874846           0.0


In [8]:
# Likelihood ratio test of Model 0 (generic time coefficient) against Model 1
# (alternative-specific time coefficients).

# Retrieve log-likelihoods and number of parameters
model_0_loglike = results.data.logLike       # Log-likelihood for Model 0
model_0_numParam = len(results.get_beta_values())   # Number of parameters in Model 0

model_1_loglike = results_model_1.data.logLike       # Log-likelihood for Model 1
model_1_numParam = len(results_model_1.get_beta_values())   # Number of parameters in Model 1

# Perform the likelihood ratio test
alpha = 0.05
lr_test_result = likelihood_ratio_test(
    [model_0_loglike, model_0_numParam],
    [model_1_loglike, model_1_numParam],
    alpha # Significance level (0.05, as set just above)
)

# Display the results (accessing values by index)
print("Likelihood Ratio Test Results:")
print(f" {lr_test_result[0]}")           # Element 0 is the verdict message, not the statistic


Likelihood Ratio Test Results:
 H0 can be rejected at level 5.0%


Model_pref = model_1

# Model 2

In [9]:
# Define interaction terms between ASCs and age_scaled
ASC_CYCLE_AGE = ASC_CYCLE + Beta('ASC_CYCLE_AGE', 0, None, None, 0) * age_scaled
ASC_PT_AGE = ASC_PT + Beta('ASC_PT_AGE', 0, None, None, 0) * age_scaled
ASC_DRIVE_AGE = ASC_DRIVE + Beta('ASC_DRIVE_AGE', 0, None, None, 0) * age_scaled

# Utility functions with interaction terms
V_WALK = B_TIME_WALK * dur_walking
V_CYCLE = ASC_CYCLE_AGE + B_TIME_CYCLE * dur_cycling
V_PT = ASC_PT_AGE + B_COST * cost_transit + B_TIME_PT * dur_pt
V_DRIVE = ASC_DRIVE_AGE + B_COST * cost_driving + B_TIME_DRIVE * dur_driving

# Specify the model
model_2_spec1 = loglogit({1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}, availability, travel_mode)

# Estimate Model 2 - Specification 1
biogeme_spec1 = bio.BIOGEME(database, model_2_spec1)
biogeme_spec1.modelName = "Model_2_spec1"
results_spec1 = biogeme_spec1.estimate()
all_results['Model_2_spec1'] = results_spec1

In [10]:
# Define interaction terms between travel time parameters and age_scaled
ASC_CYCLE = Beta('ASC_CYCLE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

B_TIME_WALK_AGE = Beta('B_TIME_WALK_AGE', 0, None, None, 0) * age_scaled
B_TIME_CYCLE_AGE = Beta('B_TIME_CYCLE_AGE', 0, None, None, 0) * age_scaled
B_TIME_PT_AGE = Beta('B_TIME_PT_AGE', 0, None, None, 0) * age_scaled
B_TIME_DRIVE_AGE = Beta('B_TIME_DRIVE_AGE', 0, None, None, 0) * age_scaled

# Updated utility functions with age interaction for travel time
V_WALK = (B_TIME_WALK + B_TIME_WALK_AGE) * dur_walking
V_CYCLE = ASC_CYCLE + (B_TIME_CYCLE + B_TIME_CYCLE_AGE) * dur_cycling
V_PT = ASC_PT + B_COST * cost_transit + (B_TIME_PT + B_TIME_PT_AGE) * dur_pt
V_DRIVE = ASC_DRIVE + B_COST * cost_driving + (B_TIME_DRIVE + B_TIME_DRIVE_AGE) * dur_driving

# V_WALK = B_TIME_WALK * dur_walking
# V_CYCLE = ASC_CYCLE + B_TIME_CYCLE * dur_cycling
# V_PT = ASC_PT + B_COST * cost_transit + B_TIME_PT * dur_pt
# V_DRIVE = ASC_DRIVE + B_COST * cost_driving + B_TIME_DRIVE * dur_driving



# Specify the model
model_2_spec2 = loglogit({1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}, availability, travel_mode)

# Estimate Model 2 - Specification 2
biogeme_spec2 = bio.BIOGEME(database, model_2_spec2)
biogeme_spec2.modelName = "Model_2_spec2_"
results_spec2_ = biogeme_spec2.estimate()
all_results['Model_2_spec2_'] = results_spec2_

In [11]:
# Print the parameter table of each Model 2 specification under its own heading
spec_reports = (
    ("Estimation results for Model 2 - Specification 1 (Interaction with ASCs):", results_spec1),
    ("\nEstimation results for Model 2 - Specification 2 (Interaction with Travel Time):", results_spec2_),
)
for heading, spec_results in spec_reports:
    print(heading)
    print(spec_results.get_estimated_parameters())

Estimation results for Model 2 - Specification 1 (Interaction with ASCs):
                  Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE     -4.611273      0.199434   -23.121772  0.000000e+00
ASC_CYCLE_AGE  0.095194      0.077756     1.224269  2.208507e-01
ASC_DRIVE     -2.142161      0.147718   -14.501651  0.000000e+00
ASC_DRIVE_AGE  0.296461      0.045537     6.510324  7.498890e-11
ASC_PT        -2.610521      0.148440   -17.586417  0.000000e+00
ASC_PT_AGE     0.189014      0.049282     3.835347  1.253870e-04
B_COST        -0.182477      0.017631   -10.349664  0.000000e+00
B_TIME_CYCLE  -6.478674      0.485238   -13.351548  0.000000e+00
B_TIME_DRIVE  -6.554638      0.378412   -17.321419  0.000000e+00
B_TIME_PT     -3.478082      0.242090   -14.366916  0.000000e+00
B_TIME_WALK   -9.118911      0.464488   -19.632169  0.000000e+00

Estimation results for Model 2 - Specification 2 (Interaction with Travel Time):
                     Value  Rob. Std err  Rob. t-test  Rob. p-va

In [12]:
# Retrieve log-likelihoods and number of parameters for Model_pref and Model 2 specifications
model_1_loglike = results.data.logLike      
model_1_numParam = len(results.get_beta_values())  

spec1_loglike = results_spec1.data.logLike       
spec1_numParam = len(results_spec1.get_beta_values())   

spec2_loglike = results_spec2_.data.logLike       
spec2_numParam = len(results_spec2_.get_beta_values())   

# Perform the likelihood ratio test for Model 1 vs Model 2 - Specification 1
alpha = 0.00001
lr_test_spec1 = likelihood_ratio_test(
    [model_1_loglike, model_1_numParam],
    [spec1_loglike, spec1_numParam],
    alpha  # Significance level of alpha
)

# Display results for Model 0 vs Model 2 - Specification 1
print("Likelihood Ratio Test Results for Model 0 vs Model 2 - Specification 1:")
print(f" {lr_test_spec1[0]}")         

# Perform the likelihood ratio test for Model 0 vs Model 2 - Specification 2
lr_test_spec2 = likelihood_ratio_test(
    [model_0_loglike, model_0_numParam],
    [spec2_loglike, spec2_numParam],
    alpha 
)

# Display results for Model 0 vs Model 2 - Specification 2
print("\nLikelihood Ratio Test Results for Model 0 vs Model 2 - Specification 2:")
print(f": {lr_test_spec2[0]}")          




Likelihood Ratio Test Results for Model 0 vs Model 2 - Specification 1:
 H0 can be rejected at level 0.0%

Likelihood Ratio Test Results for Model 0 vs Model 2 - Specification 2:
: H0 can be rejected at level 0.0%


In [13]:
# Show the two parameter counts, one per line
print(model_1_numParam, spec2_numParam, sep="\n")

5
12


In [14]:

# Side-by-side summary of every model stored in all_results so far.
# The second element returned by compile_estimation_results is discarded.
comparison_table, _ = compile_estimation_results(all_results)
display(comparison_table)

Unnamed: 0,Model_0,Model_1,Model_2_spec1,Model_2_spec2_
Number of estimated parameters,5,8,11,12
Sample size,5000,5000,5000,5000
Final log likelihood,-4581.895651,-4223.985956,-4202.066697,-4193.749471
Akaike Information Criterion,9173.791303,8463.971913,8426.133395,8411.498941
Bayesian Information Criterion,9206.377269,8516.109458,8497.82252,8489.70526
ASC_CYCLE (t-test),-3.78 (-36.6),-4.6 (-23.3),-4.61 (-23.1),-4.62 (-23)
ASC_DRIVE (t-test),-1.29 (-16.2),-2.12 (-14.6),-2.14 (-14.5),-2.16 (-14.4)
ASC_PT (t-test),-0.526 (-9.6),-2.6 (-17.7),-2.61 (-17.6),-2.61 (-17.4)
B_COST (t-test),-0.191 (-13),-0.181 (-10.3),-0.182 (-10.3),-0.183 (-10.3)
B_TIME (t-test),-5.6 (-27.5),,,


model_pref = model 2 (spec 1)

# Model 3
(Box_Cox)

In [15]:
# Report whether the derived public-transport duration column is present in the database
variable_name = 'dur_pt'
column_report = (
    f"'{variable_name}' exists in the database."
    if variable_name in database.data.columns
    else f"'{variable_name}' does NOT exist in the database."
)
print(column_report)

'dur_pt' exists in the database.


In [16]:
# Box-Cox Transformation for costs
lambda_cost = Beta('lambda_cost', 1, -10, 10, 0)
boxcox_cost_transit = boxcox(cost_transit, lambda_cost)
boxcox_cost_driving = boxcox(cost_driving, lambda_cost)

# Define interaction terms between ASCs and age_scaled
ASC_CYCLE_AGE = ASC_CYCLE + Beta('ASC_CYCLE_AGE', 0, None, None, 0) * age_scaled
ASC_PT_AGE = ASC_PT + Beta('ASC_PT_AGE', 0, None, None, 0) * age_scaled
ASC_DRIVE_AGE = ASC_DRIVE + Beta('ASC_DRIVE_AGE', 0, None, None, 0) * age_scaled

# Utility functions with Box-Cox transformation
V_WALK = B_TIME_WALK * dur_walking
V_CYCLE = ASC_CYCLE_AGE + B_TIME_CYCLE * dur_cycling
V_PT = ASC_PT_AGE + B_COST * boxcox_cost_transit + B_TIME_PT * dur_pt
V_DRIVE = ASC_DRIVE_AGE + B_COST * boxcox_cost_driving + B_TIME_DRIVE * dur_driving

# Specify the model with Box-Cox transformation
model_3_boxcox = loglogit({1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}, availability, travel_mode)

# Create Biogeme object
biogeme_model_3 = bio.BIOGEME(database, model_3_boxcox)
biogeme_model_3.modelName = "Model_3_BoxCox"

# Estimate Model 3
results_model_3 = biogeme_model_3.estimate()

# Log-likelihood and number of parameters for Model 3
model_3_boxcox_loglike = results_model_3.data.logLike
model_3_boxcox_numParam = results_model_3.get_estimated_parameters().shape[0]

# Store results for comparison
all_results['Model_3_BoxCox'] = results_model_3

# Display results
print("Estimation results for Model 3 (Box-Cox Transformation):")
print(results_model_3.get_estimated_parameters())

Estimation results for Model 3 (Box-Cox Transformation):
                  Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE     -4.735363      0.214838   -22.041596  0.000000e+00
ASC_CYCLE_AGE  0.092801      0.077886     1.191499  2.334578e-01
ASC_DRIVE     -2.853241      0.207380   -13.758483  0.000000e+00
ASC_DRIVE_AGE  0.293056      0.045106     6.497072  8.189804e-11
ASC_PT        -2.784971      0.158433   -17.578248  0.000000e+00
ASC_PT_AGE     0.194194      0.050281     3.862141  1.123979e-04
B_COST        -0.460217      0.052159    -8.823419  0.000000e+00
B_TIME_CYCLE  -6.980520      0.514151   -13.576780  0.000000e+00
B_TIME_DRIVE  -6.213109      0.377631   -16.452844  0.000000e+00
B_TIME_PT     -3.762030      0.247652   -15.190786  0.000000e+00
B_TIME_WALK   -9.540950      0.506010   -18.855242  0.000000e+00
lambda_cost    0.323809      0.081127     3.991383  6.568920e-05


In [17]:
# Retrieve log-likelihoods and number of parameters for Model_pref and Model 2 specifications

# Likelihood ratio test of Model 2 - Specification 1 (restricted) against Model 3
# (Box-Cox, one extra parameter).

spec1_loglike = results_spec1.data.logLike
spec1_numParam = len(results_spec1.get_beta_values())

model_3_boxcox_loglike = results_model_3.data.logLike
model_3_boxcox_numParam = len(results_model_3.get_beta_values())

alpha = 0.01

print("Likelihood Ratio Test Results for Model 3 vs Model 2 - Specification 1:")
if model_3_boxcox_loglike > spec1_loglike:
    # Fix: Model 3's own parameter count is passed here. The previous version passed
    # model_0_numParam, so the test ran with the wrong degrees of freedom.
    lr_test_spec1 = likelihood_ratio_test(
        [spec1_loglike, spec1_numParam],
        [model_3_boxcox_loglike, model_3_boxcox_numParam],
        alpha  # Significance level of alpha
    )
    print(f" {lr_test_spec1[0]}")
else:
    # The model with the extra parameter did not reach a higher log likelihood, so there
    # is no improvement to test and the restricted model is kept.
    # NOTE(review): likelihood_ratio_test is expected to raise in this situation (larger
    # model, lower log likelihood) - confirm against the installed Biogeme version.
    lr_test_spec1 = None
    print(
        f" H0 cannot be rejected at level {100 * alpha:.1f}% "
        "(Model 3 log likelihood is not higher than Model 2 - Specification 1)"
    )
      


Likelihood Ratio Test Results for Model 3 vs Model 2 - Specification 1:
 H0 cannot be rejected at level 1.0%


In [18]:
# Refresh the side-by-side summary now that Model 3 has been added to all_results.
# The second element returned by compile_estimation_results is discarded.
comparison_table, _ = compile_estimation_results(all_results)
display(comparison_table)



Unnamed: 0,Model_0,Model_1,Model_2_spec1,Model_2_spec2_,Model_3_BoxCox
Number of estimated parameters,5,8,11,12,12
Sample size,5000,5000,5000,5000,5000
Final log likelihood,-4581.895651,-4223.985956,-4202.066697,-4193.749471,-4203.35605
Akaike Information Criterion,9173.791303,8463.971913,8426.133395,8411.498941,8430.712099
Bayesian Information Criterion,9206.377269,8516.109458,8497.82252,8489.70526,8508.918418
ASC_CYCLE (t-test),-3.78 (-36.6),-4.6 (-23.3),-4.61 (-23.1),-4.62 (-23),-4.74 (-22)
ASC_DRIVE (t-test),-1.29 (-16.2),-2.12 (-14.6),-2.14 (-14.5),-2.16 (-14.4),-2.85 (-13.8)
ASC_PT (t-test),-0.526 (-9.6),-2.6 (-17.7),-2.61 (-17.6),-2.61 (-17.4),-2.78 (-17.6)
B_COST (t-test),-0.191 (-13),-0.181 (-10.3),-0.182 (-10.3),-0.183 (-10.3),-0.46 (-8.82)
B_TIME (t-test),-5.6 (-27.5),,,,


modelpref = model2, spec_1

In [19]:
# List the columns available in the database before specifying Model 4
print(database.data.columns)

Index(['trip_id', 'household_id', 'person_n', 'trip_n', 'travel_mode',
       'purpose', 'fueltype', 'faretype', 'bus_scale', 'survey_year',
       'travel_year', 'travel_month', 'travel_date', 'day_of_week',
       'start_time', 'age', 'female', 'driving_license', 'car_ownership',
       'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access',
       'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'pt_interchanges',
       'dur_driving', 'cost_transit', 'cost_driving_fuel',
       'cost_driving_ccharge', 'driving_traffic_percent', 'age_scaled',
       'cost_driving', 'dur_pt'],
      dtype='object')


# Model 4

In [20]:
# Define ASC interaction terms with age_scaled
ASC_CYCLE_AGE = ASC_CYCLE + Beta('ASC_CYCLE_AGE', 0, None, None, 0) * age_scaled
ASC_PT_AGE = ASC_PT + Beta('ASC_PT_AGE', 0, None, None, 0) * age_scaled
ASC_DRIVE_AGE = ASC_DRIVE + Beta('ASC_DRIVE_AGE', 0, None, None, 0) * age_scaled

mu_a = Beta('mu_a', 1, 0, None, 0)
mu_b = Beta('mu_b', 1, 0, None, 0)
nest_a = OneNestForNestedLogit(nest_param=mu_a, list_of_alternatives=[1, 2], name='slow modes')
nest_b = OneNestForNestedLogit(nest_param=mu_b, list_of_alternatives=[3, 4], name='faster modes')
nests = NestsForNestedLogit(choice_set=list(V), tuple_of_nests=(nest_a, nest_b))


# Define utility functions
V_WALK = B_TIME_WALK * dur_walking
V_CYCLE = ASC_CYCLE_AGE + B_TIME_CYCLE * dur_cycling
V_PT = ASC_PT_AGE + B_COST * cost_transit + B_TIME_PT * dur_pt
V_DRIVE = ASC_DRIVE_AGE + B_COST * cost_driving + B_TIME_DRIVE * dur_driving
# Define utility dictionary
V = {1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}


# Lognested model
logprob_m4 = lognested(V, None, nests, travel_mode)

# Create Biogeme object
biogeme_model_4 = bio.BIOGEME(database, logprob_m4)
biogeme_model_4.modelName = "Model_4_Nested"

# Estimate Model 4
results_model_4 = biogeme_model_4.estimate()

# Log-likelihood and number of parameters for Model 4
model_4_loglike = results_model_4.data.logLike
model_4_numParam = results_model_4.get_estimated_parameters().shape[0]

# Store results for comparison
all_results['Model_4'] = results_model_4

# Display results
print("Estimation results for Model 4 (Nested Logit):")
print(results_model_4.get_estimated_parameters())

Estimation results for Model 4 (Nested Logit):
                   Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE      -6.387396      0.301940   -21.154501  0.000000e+00
ASC_CYCLE_AGE   0.151708      0.100424     1.510685  1.308686e-01
ASC_DRIVE      -2.589653      0.172910   -14.976873  0.000000e+00
ASC_DRIVE_AGE   0.334220      0.052333     6.386400  1.698359e-10
ASC_PT         -3.253821      0.222463   -14.626342  0.000000e+00
ASC_PT_AGE      0.178994      0.059520     3.007283  2.635939e-03
B_COST         -0.251061      0.035243    -7.123709  1.050715e-12
B_TIME_CYCLE   -6.106219      0.791322    -7.716481  1.199041e-14
B_TIME_DRIVE   -9.268319      1.012542    -9.153513  0.000000e+00
B_TIME_PT      -4.906518      0.560796    -8.749195  0.000000e+00
B_TIME_WALK   -11.216415      0.610830   -18.362574  0.000000e+00
mu_a            0.591399      0.046559    12.702035  0.000000e+00
mu_b            0.700449      0.083943     8.344328  0.000000e+00


In [21]:
# Retrieve log-likelihoods and number of parameters for Model_pref and Model 2 specifications

# Likelihood ratio test of Model 2 - Specification 1 against Model 4 (nested logit).

# Log-likelihood and parameter count of Model 2 - Specification 1
spec1_loglike = results_spec1.data.logLike       
spec1_numParam = len(results_spec1.get_beta_values())   

# Log-likelihood and parameter count of Model 4
model_4_loglike = results_model_4.data.logLike       
model_4_numParam = len(results_model_4.get_beta_values())   

# Perform the likelihood ratio test for Model 2 - Specification 1 vs Model 4
alpha = 0.01
lr_test_spec1 = likelihood_ratio_test(
    [spec1_loglike, spec1_numParam],
    [model_4_loglike, model_4_numParam],
    alpha  # Significance level of alpha
)

# Display the verdict message (element 0 of the returned result)
print("Likelihood Ratio Test Results for Model 4 vs Model 2 - Specification 1:")
print(f" {lr_test_spec1[0]}")      

Likelihood Ratio Test Results for Model 4 vs Model 2 - Specification 1:
 H0 can be rejected at level 1.0%


In [22]:
# Refresh the side-by-side summary now that Model 4 has been added to all_results.
# The second element returned by compile_estimation_results is discarded.
comparison_table, _ = compile_estimation_results(all_results)
display(comparison_table)

Unnamed: 0,Model_0,Model_1,Model_2_spec1,Model_2_spec2_,Model_3_BoxCox,Model_4
Number of estimated parameters,5,8,11,12,12,13
Sample size,5000,5000,5000,5000,5000,5000
Final log likelihood,-4581.895651,-4223.985956,-4202.066697,-4193.749471,-4203.35605,-4180.831857
Akaike Information Criterion,9173.791303,8463.971913,8426.133395,8411.498941,8430.712099,8387.663715
Bayesian Information Criterion,9206.377269,8516.109458,8497.82252,8489.70526,8508.918418,8472.387226
ASC_CYCLE (t-test),-3.78 (-36.6),-4.6 (-23.3),-4.61 (-23.1),-4.62 (-23),-4.74 (-22),-6.39 (-21.2)
ASC_DRIVE (t-test),-1.29 (-16.2),-2.12 (-14.6),-2.14 (-14.5),-2.16 (-14.4),-2.85 (-13.8),-2.59 (-15)
ASC_PT (t-test),-0.526 (-9.6),-2.6 (-17.7),-2.61 (-17.6),-2.61 (-17.4),-2.78 (-17.6),-3.25 (-14.6)
B_COST (t-test),-0.191 (-13),-0.181 (-10.3),-0.182 (-10.3),-0.183 (-10.3),-0.46 (-8.82),-0.251 (-7.12)
B_TIME (t-test),-5.6 (-27.5),,,,,


model_pref = model 4

# Market Shares

The weight $w_g$ associated with segment $g$ is defined as
$$
w_g = \frac{N_g}{N}\frac{S}{S_g}.
$$

In [23]:
# Population count of each gender/age segment, used to build the segment weights
populations = dict(
    male_44_less=2926408,
    male_45_more=1379198,
    female_44_less=2841379,
    female_45_more=1519948,
)

# Total population across the four segments
total_pop = sum(segment_population for segment_population in populations.values())
total_pop

8666933

In [24]:
# Row masks for the two gender and two age groups
is_male = df.female == 0
is_female = df.female == 1
is_45_or_older = df.age >= 45
is_under_45 = df.age < 45

# One boolean row mask per population segment
filters = {
    'male_45_more': is_45_or_older & is_male,
    'male_44_less': is_under_45 & is_male,
    'female_45_more': is_45_or_older & is_female,
    'female_44_less': is_under_45 & is_female,
}


In [25]:
# Number of sampled rows falling into each segment
sample_segments = {}
for segment_name, segment_rows in filters.items():
    sample_segments[segment_name] = segment_rows.sum()
print(sample_segments)

total_sample = sum(sample_segments.values())
print(f'Sample size: {total_sample}')

# Segment weight: (N_g * S) / (S_g * N)
weights = {}
for segment_name, segment_size in sample_segments.items():
    population_times_sample = populations[segment_name] * total_sample
    weights[segment_name] = population_times_sample / (segment_size * total_pop)
print(weights)

{'male_45_more': np.int64(896), 'male_44_less': np.int64(1442), 'female_45_more': np.int64(984), 'female_44_less': np.int64(1678)}
Sample size: 5000
{'male_45_more': np.float64(0.8880208732101985), 'male_44_less': np.float64(1.1707769945566922), 'female_45_more': np.float64(0.8911241160085213), 'female_44_less': np.float64(0.9768812522649147)}


In [26]:
# Total number of sampled rows across all segments
total_sample = sum(segment_size for segment_size in sample_segments.values())
total_sample

np.int64(5000)

In [27]:
from biogeme import models

# Attach the segment weight to every row of df (new 'weight' column, filled per segment mask)
for segment_name, segment_rows in filters.items():
    df.loc[segment_rows, 'weight'] = weights[segment_name]


# Nested-logit choice probability of each alternative, using the current V and nests
# (None is passed in the availability position)
prob_walk = models.nested(V, None, nests, 1)
prob_cycle = models.nested(V, None, nests, 2)
prob_pt = models.nested(V, None, nests, 3)
prob_car = models.nested(V, None, nests, 4)


# Quantities to simulate for every row: the weight and the four probabilities
weight = Variable('weight')
simulate = {
    'weight': weight,
    'Prob. pt': prob_pt,
    'Prob. car': prob_car,
    'Prob. walk': prob_walk,
    'Prob. cycle': prob_cycle,
}

# Rebind `database` to a new Database built from df, which now carries the 'weight' column
database = db.Database('london', df)


In [28]:

# Simulate the weight and choice probabilities for every row at the Model 4 estimates
biosim = BIOGEME(database, simulate)
estimated_betas = results_model_4.get_beta_values()
simulated_values = biosim.simulate(estimated_betas)
display(simulated_values)

# Weighted probability of each mode: weight * probability, added column by column
weighted_from_probability = {
    'Weighted pt': 'Prob. pt',
    'Weighted car': 'Prob. car',
    'Weighted walk': 'Prob. walk',
    'Weighted cycle': 'Prob. cycle',
}
for weighted_column, probability_column in weighted_from_probability.items():
    simulated_values[weighted_column] = (
        simulated_values['weight'] * simulated_values[probability_column]
    )


Unnamed: 0,weight,Prob. pt,Prob. car,Prob. walk,Prob. cycle
0,0.888021,0.298542,0.687333,3.604272e-03,0.010521
1,0.976881,0.139342,0.688180,1.235766e-01,0.048901
2,0.891124,0.227023,0.757259,2.620116e-05,0.015692
3,0.888021,0.344668,0.629474,5.599319e-03,0.020259
4,0.891124,0.177917,0.366109,4.171279e-01,0.038846
...,...,...,...,...,...
4995,0.888021,0.341519,0.635340,6.972370e-03,0.016169
4996,1.170777,0.434144,0.504854,2.004364e-02,0.040959
4997,0.976881,0.236936,0.385238,3.179143e-01,0.059912
4998,0.976881,0.278908,0.712039,1.112778e-04,0.008942


In [29]:

# Predicted market share of each mode: the sample mean of its weighted probability
market_share_pt = simulated_values['Weighted pt'].mean()
print(f'Market share for pt: {100*market_share_pt:.1f}%')

market_share_car = simulated_values['Weighted car'].mean()
print(f'Market share for car: {100*market_share_car:.1f}%')

market_share_walk = simulated_values['Weighted walk'].mean()
print(f'Market share for walk: {100*market_share_walk:.1f}%')

market_share_cycle = simulated_values['Weighted cycle'].mean()
print(f'Market share for cycling: {100*market_share_cycle:.1f}%')

# Re-estimate Model 4 with 100 bootstrap samples.
# The recorded run took a little over two minutes, and no random seed is set in this cell.
biogeme_model_4.bootstrap_samples = 100
results_bootstrapping = biogeme_model_4.estimate(run_bootstrap=True)

# Parameter draws for the free parameters of Model 4, then per-row lower and upper
# bounds for every simulated quantity.
# NOTE(review): 0.9 is presumably the confidence level of the interval - confirm
# against the Biogeme documentation.
betas = biogeme_model_4.free_beta_names
b = results_bootstrapping.get_betas_for_sensitivity_analysis(betas)
left, right = biosim.confidence_intervals(b, 0.9)

display(left)

display(right)

Market share for pt: 35.6%
Market share for car: 43.6%
Market share for walk: 17.4%
Market share for cycling: 3.4%


100%|██████████| 100/100 [02:14<00:00,  1.35s/it]


Unnamed: 0,weight,Prob. pt,Prob. car,Prob. walk,Prob. cycle
0,0.888021,0.274004,0.654385,2.290399e-03,0.007269
1,0.976881,0.126274,0.667361,1.022670e-01,0.043211
2,0.891124,0.209888,0.733470,6.586277e-06,0.011702
3,0.888021,0.324864,0.604352,3.648340e-03,0.015534
4,0.891124,0.163146,0.341518,3.941663e-01,0.030852
...,...,...,...,...,...
4995,0.888021,0.325559,0.614559,4.597159e-03,0.012529
4996,1.170777,0.413710,0.476655,1.451689e-02,0.034995
4997,0.976881,0.221336,0.366872,2.909413e-01,0.053029
4998,0.976881,0.263505,0.691385,4.286230e-05,0.006626


Unnamed: 0,weight,Prob. pt,Prob. car,Prob. walk,Prob. cycle
0,0.888021,0.331709,0.711357,5.017980e-03,0.015188
1,0.976881,0.153577,0.711856,1.412557e-01,0.055810
2,0.891124,0.249718,0.777291,5.284811e-05,0.019436
3,0.888021,0.366950,0.645559,7.658032e-03,0.026230
4,0.891124,0.193531,0.383982,4.444125e-01,0.045774
...,...,...,...,...,...
4995,0.888021,0.361786,0.651744,9.262216e-03,0.020406
4996,1.170777,0.456728,0.521791,2.688852e-02,0.052492
4997,0.976881,0.256393,0.401265,3.421027e-01,0.067261
4998,0.976881,0.298268,0.728224,1.851042e-04,0.011729


In [30]:
# Weighted probabilities for the lower (left) and upper (right) bound tables
for bounds in (left, right):
    bounds['Weighted pt'] = bounds['weight'] * bounds['Prob. pt']
    bounds['Weighted car'] = bounds['weight'] * bounds['Prob. car']
    bounds['Weighted walk'] = bounds['weight'] * bounds['Prob. walk']
    bounds['Weighted cycle'] = bounds['weight'] * bounds['Prob. cycle']

# Point estimates: mean weighted probability per mode
market_share_pt = simulated_values['Weighted pt'].mean()
market_share_car = simulated_values['Weighted car'].mean()
market_share_walk = simulated_values['Weighted walk'].mean()
market_share_cycle = simulated_values['Weighted cycle'].mean()

# Lower ends of the intervals: mean of the weighted lower bounds
left_market_share_pt = left['Weighted pt'].mean()
left_market_share_car = left['Weighted car'].mean()
left_market_share_walk = left['Weighted walk'].mean()
left_market_share_cycle = left['Weighted cycle'].mean()

# Upper ends of the intervals: mean of the weighted upper bounds
right_market_share_pt = right['Weighted pt'].mean()
right_market_share_car = right['Weighted car'].mean()
right_market_share_walk = right['Weighted walk'].mean()
right_market_share_cycle = right['Weighted cycle'].mean()

# Report each share together with its interval
print(
    f"Market share for pt: {100 * market_share_pt:.1f}% "
    f"CI: [{100 * left_market_share_pt:.1f}%-{100 * right_market_share_pt:.1f}%]"
)
print(
    f"Market share for car: {100 * market_share_car:.1f}% "
    f"CI: [{100 * left_market_share_car:.1f}%-{100 * right_market_share_car:.1f}%]"
)
print(
    f"Market share for walk: {100 * market_share_walk:.1f}% "
    f"CI: [{100 * left_market_share_walk:.1f}%-{100 * right_market_share_walk:.1f}%]"
)
print(
    f"Market share for cycling: {100 * market_share_cycle:.1f}% "
    f"CI: [{100 * left_market_share_cycle:.1f}%-{100 * right_market_share_cycle:.1f}%]"
)



Market share for pt: 35.6% CI: [33.6%-37.9%]
Market share for car: 43.6% CI: [41.2%-45.8%]
Market share for walk: 17.4% CI: [16.2%-18.6%]
Market share for cycling: 3.4% CI: [2.8%-4.2%]


In [31]:
# Human-readable label for each travel_mode code
labels = {1: 'walk', 2: 'cycling', 3: 'pt', 4: 'car'}

# Label every trip with its chosen mode
df['mode_label'] = df['travel_mode'].map(labels)

# Observed (unweighted) share of each mode, ordered by label and expressed in percent
mode_fractions = df['mode_label'].value_counts(normalize=True)
mode_fractions_sorted = mode_fractions.sort_index()
market_shares = mode_fractions_sorted * 100

# Print market shares
for mode, share in market_shares.items():
    print(f"Market share for {mode}: {share:.1f}%")

Market share for car: 44.0%
Market share for cycling: 3.3%
Market share for pt: 35.3%
Market share for walk: 17.4%


# Forecasting

Scenario 2: decrease rail cost by 20%

In [32]:
import pickle

# Load the results object stored in 'Model_1.pickle'.
# NOTE(review): presumably this file was written by the Model_1 estimation in this
# notebook - confirm.
# NOTE(security): pickle.load can execute arbitrary code while loading, so only open
# pickle files produced by this project.
# The `with` block closes the file even if loading fails.
with open('Model_1.pickle', 'rb') as pickle_file:
    pickled_model = pickle.load(pickle_file)




In [None]:
# Display the loaded results object (the trailing dot left over from tab completion was a syntax error)
pickled_model

<biogeme.results.RawResults at 0x259147dedd0>