In [None]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme.expressions import Beta, Variable, exp
from biogeme.models import loglogit
from biogeme.tools import likelihood_ratio_test
from biogeme.results import compile_estimation_results
from biogeme.models import loglogit,  boxcox
from biogeme.models.piecewise import piecewise_formula
from biogeme.models import lognested
from biogeme.nests import OneNestForNestedLogit, NestsForNestedLogit
from biogeme.biogeme import BIOGEME



import numpy as np


#from data_preparation import *

In [8]:
# Load the data
df = pd.read_csv("lpmc01.dat", sep = '\t')
df['age_scaled'] = (df['age'] - df['age'].mean()) / df['age'].std()
df['cost_driving'] = df['cost_driving_ccharge'] + df['cost_driving_fuel']
df['dur_pt'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_int'] + df['dur_pt_bus']

database1 = db.Database('lpmc01', df)


# Define the given veriables 
dur_pt = Variable('dur_pt')
cost_driving = Variable('cost_driving')
age_scaled = Variable('age_scaled')
trip_id = Variable('trip_id')
household_id = Variable('household_id')
person_n = Variable('person_n')
trip_n = Variable('trip_n')
travel_mode = Variable('travel_mode')
purpose = Variable('purpose')
fueltype = Variable('fueltype')
faretype = Variable('faretype')
bus_scale = Variable('bus_scale')
survey_year = Variable('survey_year')
travel_year = Variable('travel_year')
travel_month = Variable('travel_month')
travel_date = Variable('travel_date')
day_of_week = Variable('day_of_week')
start_time = Variable('start_time')
age = Variable('age')
female = Variable('female')
driving_license = Variable('driving_license')
car_ownership = Variable('car_ownership')
distance = Variable('distance')
dur_walking = Variable('dur_walking')
dur_cycling = Variable('dur_cycling')
dur_pt_access = Variable('dur_pt_access') # Predicted total access and egress time for public transport route in hours
dur_pt_rail = Variable('dur_pt_rail')
dur_pt_bus = Variable('dur_pt_bus')
dur_pt_int = Variable('dur_pt_int') # Time taken (hrs) at each interchange point
pt_interchanges = Variable('pt_interchanges')   # Number of interchange points in public transport route
dur_driving = Variable('dur_driving')
cost_transit = Variable('cost_transit')
cost_driving_fuel = Variable('cost_driving_fuel')   # Estimated fuel cost of driving route in GBP
cost_driving_ccharge = Variable('cost_driving_ccharge')  # Estimated congestion charge cost of driving route in GBP
driving_traffic_percent = Variable('driving_traffic_percent')


# Define new variables:
# # Define driving cost 
# df['cost_driving'] = df['cost_driving_ccharge'] + df['cost_driving_fuel']
# cost_driving = Variable(cost_driving_ccharge + cost_driving_fuel)

# # Define time taken by each mode of transport
# df['dur_pt'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_int'] + df['dur_pt_bus']
# dur_pt = Variable(dur_pt_access + dur_pt_rail + dur_pt_int + dur_pt_bus)


# Define transport availability
# Assume pt, walking, cycle always available, with car availability depending on number of cars per household. From the data, 
# people without driving licenses choose driving as their mode of transport (eg. row 28). 
av_drive =  (car_ownership > 0)
av_pt =1
av_walk = 1
av_cycle = 1

variable_names = ['dur_pt', 'cost_driving', 'age_scaled']  # Replace with your variable name
for variable_name in variable_names:
    if variable_name in database1.data.columns:
        print(f"'{variable_name}' exists in the database.")
    else:
        print(f"'{variable_name}' does NOT exist in the database.")



# Define pt_cost (not needed)
# Original paper, page 31: "Public transport fares are determined for single trips using Oystercard/contactless payment."
# Therefore, cost_transit should already consider faretype and bus_scale

database = db.Database('lpmc01', df)
variable_names = ['dur_pt', 'cost_driving', 'age_scaled']  # Replace with your variable name
for variable_name in variable_names:
    if variable_name in database1.data.columns:
        print(f"'{variable_name}' exists in the database.")
    else:
        print(f"'{variable_name}' does NOT exist in the database.")

'dur_pt' exists in the database.
'cost_driving' exists in the database.
'age_scaled' exists in the database.
'dur_pt' exists in the database.
'cost_driving' exists in the database.
'age_scaled' exists in the database.


In [9]:
# Define driving cost
cost_driving = cost_driving_ccharge + cost_driving_fuel

# Define time taken by each mode of transport
dur_pt = dur_pt_access + dur_pt_int + dur_pt_bus + dur_pt_rail  # Public transport (external) time 

# Model 0

In [11]:
all_results = {}

In [4]:
print("Columns in the database:")
print(database.data.columns)

Columns in the database:
Index(['trip_id', 'household_id', 'person_n', 'trip_n', 'travel_mode',
       'purpose', 'fueltype', 'faretype', 'bus_scale', 'survey_year',
       'travel_year', 'travel_month', 'travel_date', 'day_of_week',
       'start_time', 'age', 'female', 'driving_license', 'car_ownership',
       'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access',
       'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'pt_interchanges',
       'dur_driving', 'cost_transit', 'cost_driving_fuel',
       'cost_driving_ccharge', 'driving_traffic_percent', 'age_scaled',
       'cost_driving', 'dur_pt'],
      dtype='object')


In [12]:
# Load the data 

# Define ASCs
ASC_CYCLE = Beta('ASC_CYCLE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

# Define generic parameters for cost and travel time
B_COST = Beta('B_COST', 0, None, None, 0)
B_TIME = Beta('B_TIME', 0, None, None, 0)

# Define utility functions for each alternative
V_WALK =  B_TIME * dur_walking
V_CYCLE = ASC_CYCLE + B_TIME * dur_cycling
V_PT = ASC_PT + B_COST * cost_transit + B_TIME * dur_pt
V_DRIVE = ASC_DRIVE + B_COST * cost_driving + B_TIME * dur_driving

# Assume every mode of transport is available
availability_walk = 1  
availability_cycle = 1  
availability_pt = 1     
availability_drive = 1

availability = {
    1: availability_walk,   # Walking
    2: availability_cycle,  # Cycling
    3: availability_pt,     # Public Transport
    4: availability_drive   # Driving
}

# Associate utility functions with the mode choice
V = {
    1: V_WALK,    # Walking
    2: V_CYCLE,   # Cycling
    3: V_PT,      # Public Transport
    4: V_DRIVE    # Driving
}

# Specify the model using a log-logit function
model_0 = loglogit(V, availability, travel_mode)

# Create Biogeme object
biogeme = bio.BIOGEME(database, model_0)
biogeme.modelName = "Model_0"

# Estimate parameters
results = biogeme.estimate()

# Display estimation results
print("Estimation results for Model 0:")
print(results.get_estimated_parameters())

# Optional: To calculate number of rows with car ownership 0 but driving mode chosen
driving_without_car = df[(df['car_ownership'] == 0) & (df['travel_mode'] == 4)]
num_rows_driving_without_car = driving_without_car.shape[0]
#print("Number of rows where car ownership is 0 but travel mode is driving:", num_rows_driving_without_car)
all_results['Model_0'] = results

Estimation results for Model 0:
              Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE -3.778175      0.103171   -36.620564           0.0
ASC_DRIVE -1.288003      0.079431   -16.215324           0.0
ASC_PT    -0.525660      0.054777    -9.596426           0.0
B_COST    -0.190833      0.014635   -13.039920           0.0
B_TIME    -5.598770      0.203407   -27.524922           0.0


In [13]:
# Define alternative-specific parameters for travel time
B_TIME_WALK = Beta('B_TIME_WALK', 0, None, None, 0)
B_TIME_CYCLE = Beta('B_TIME_CYCLE', 0, None, None, 0)
B_TIME_PT = Beta('B_TIME_PT', 0, None, None, 0)
B_TIME_DRIVE = Beta('B_TIME_DRIVE', 0, None, None, 0)

# Update utility functions with alternative-specific time parameters
V_WALK = B_TIME_WALK * dur_walking
V_CYCLE = ASC_CYCLE + B_TIME_CYCLE * dur_cycling
V_PT = ASC_PT + B_COST * cost_transit + B_TIME_PT * dur_pt
V_DRIVE = ASC_DRIVE + B_COST * cost_driving + B_TIME_DRIVE * dur_driving

# Redefine the model
model_1 = loglogit({1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}, availability, travel_mode)

# Create Biogeme object for Model 1
biogeme_model_1 = bio.BIOGEME(database, model_1)
biogeme_model_1.modelName = "Model_1"

# Estimate parameters for Model 1
results_model_1 = biogeme_model_1.estimate()
all_results['Model_1'] = results_model_1

# Display estimation results
print("Estimation results for Model 1:")
print(results_model_1.get_estimated_parameters())

Estimation results for Model 1:
                 Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE    -4.602378      0.197139   -23.345860           0.0
ASC_DRIVE    -2.115318      0.144923   -14.596200           0.0
ASC_PT       -2.599549      0.146553   -17.737963           0.0
B_COST       -0.180812      0.017547   -10.304242           0.0
B_TIME_CYCLE -6.462945      0.483849   -13.357357           0.0
B_TIME_DRIVE -6.623754      0.379926   -17.434326           0.0
B_TIME_PT    -3.494668      0.244093   -14.316930           0.0
B_TIME_WALK  -9.065278      0.456118   -19.874846           0.0


In [14]:
# Retrieve log-likelihoods and number of parameters
model_0_loglike = results.data.logLike       # Log-likelihood for Model 0
model_0_numParam = len(results.get_beta_values())   # Number of parameters in Model 0

model_1_loglike = results_model_1.data.logLike       # Log-likelihood for Model 1
model_1_numParam = len(results_model_1.get_beta_values())   # Number of parameters in Model 1

# Perform the likelihood ratio test
alpha = 0.05
lr_test_result = likelihood_ratio_test(
    [model_0_loglike, model_0_numParam],
    [model_1_loglike, model_1_numParam],
    alpha # Significance level of 0.01
)

# Display the results (accessing values by index)
print("Likelihood Ratio Test Results:")
print(f" {lr_test_result[0]}")           # LR Statistic


Likelihood Ratio Test Results:
 H0 can be rejected at level 5.0%


Model_pref = model_1

# Model 2

In [15]:
# Define interaction terms between ASCs and age_scaled
ASC_CYCLE_AGE = ASC_CYCLE + Beta('ASC_CYCLE_AGE', 0, None, None, 0) * age_scaled
ASC_PT_AGE = ASC_PT + Beta('ASC_PT_AGE', 0, None, None, 0) * age_scaled
ASC_DRIVE_AGE = ASC_DRIVE + Beta('ASC_DRIVE_AGE', 0, None, None, 0) * age_scaled

# Utility functions with interaction terms
V_WALK = B_TIME_WALK * dur_walking
V_CYCLE = ASC_CYCLE_AGE + B_TIME_CYCLE * dur_cycling
V_PT = ASC_PT_AGE + B_COST * cost_transit + B_TIME_PT * dur_pt
V_DRIVE = ASC_DRIVE_AGE + B_COST * cost_driving + B_TIME_DRIVE * dur_driving

# Specify the model
model_2_spec1 = loglogit({1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}, availability, travel_mode)

# Estimate Model 2 - Specification 1
biogeme_spec1 = bio.BIOGEME(database, model_2_spec1)
biogeme_spec1.modelName = "Model_2_spec1"
results_spec1 = biogeme_spec1.estimate()
all_results['Model_2_spec1'] = results_spec1

In [16]:
# Define interaction terms between travel time parameters and age_scaled
ASC_CYCLE = Beta('ASC_CYCLE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

B_TIME_WALK_AGE = Beta('B_TIME_WALK_AGE', 0, None, None, 0) * age_scaled
B_TIME_CYCLE_AGE = Beta('B_TIME_CYCLE_AGE', 0, None, None, 0) * age_scaled
B_TIME_PT_AGE = Beta('B_TIME_PT_AGE', 0, None, None, 0) * age_scaled
B_TIME_DRIVE_AGE = Beta('B_TIME_DRIVE_AGE', 0, None, None, 0) * age_scaled

# Updated utility functions with age interaction for travel time
V_WALK = (B_TIME_WALK + B_TIME_WALK_AGE) * dur_walking
V_CYCLE = ASC_CYCLE + (B_TIME_CYCLE + B_TIME_CYCLE_AGE) * dur_cycling
V_PT = ASC_PT + B_COST * cost_transit + (B_TIME_PT + B_TIME_PT_AGE) * dur_pt
V_DRIVE = ASC_DRIVE + B_COST * cost_driving + (B_TIME_DRIVE + B_TIME_DRIVE_AGE) * dur_driving

# V_WALK = B_TIME_WALK * dur_walking
# V_CYCLE = ASC_CYCLE + B_TIME_CYCLE * dur_cycling
# V_PT = ASC_PT + B_COST * cost_transit + B_TIME_PT * dur_pt
# V_DRIVE = ASC_DRIVE + B_COST * cost_driving + B_TIME_DRIVE * dur_driving



# Specify the model
model_2_spec2 = loglogit({1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}, availability, travel_mode)

# Estimate Model 2 - Specification 2
biogeme_spec2 = bio.BIOGEME(database, model_2_spec2)
biogeme_spec2.modelName = "Model_2_spec2_"
results_spec2_ = biogeme_spec2.estimate()
all_results['Model_2_spec2_'] = results_spec2_

In [17]:
print("Estimation results for Model 2 - Specification 1 (Interaction with ASCs):")
print(results_spec1.get_estimated_parameters())

# Display estimation results for Model 2 - Specification 2
print("\nEstimation results for Model 2 - Specification 2 (Interaction with Travel Time):")
print(results_spec2_.get_estimated_parameters())

Estimation results for Model 2 - Specification 1 (Interaction with ASCs):
                  Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE     -4.611336      0.199462   -23.118840  0.000000e+00
ASC_CYCLE_AGE  0.095215      0.077758     1.224505  2.207617e-01
ASC_DRIVE     -2.142379      0.147746   -14.500429  0.000000e+00
ASC_DRIVE_AGE  0.296494      0.045540     6.510669  7.481704e-11
ASC_PT        -2.610719      0.148465   -17.584769  0.000000e+00
ASC_PT_AGE     0.189040      0.049284     3.835696  1.252090e-04
B_COST        -0.182406      0.017625   -10.349493  0.000000e+00
B_TIME_CYCLE  -6.479459      0.485375   -13.349399  0.000000e+00
B_TIME_DRIVE  -6.554869      0.378414   -17.321956  0.000000e+00
B_TIME_PT     -3.478239      0.242092   -14.367451  0.000000e+00
B_TIME_WALK   -9.119824      0.464604   -19.629222  0.000000e+00

Estimation results for Model 2 - Specification 2 (Interaction with Travel Time):
                     Value  Rob. Std err  Rob. t-test  Rob. p-va

In [18]:
# Retrieve log-likelihoods and number of parameters for Model_pref and Model 2 specifications
model_1_loglike = results.data.logLike      
model_1_numParam = len(results.get_beta_values())  

spec1_loglike = results_spec1.data.logLike       
spec1_numParam = len(results_spec1.get_beta_values())   

spec2_loglike = results_spec2_.data.logLike       
spec2_numParam = len(results_spec2_.get_beta_values())   

# Perform the likelihood ratio test for Model 1 vs Model 2 - Specification 1
alpha = 0.00001
lr_test_spec1 = likelihood_ratio_test(
    [model_1_loglike, model_1_numParam],
    [spec1_loglike, spec1_numParam],
    alpha  # Significance level of alpha
)

# Display results for Model 0 vs Model 2 - Specification 1
print("Likelihood Ratio Test Results for Model 0 vs Model 2 - Specification 1:")
print(f" {lr_test_spec1[0]}")         

# Perform the likelihood ratio test for Model 0 vs Model 2 - Specification 2
lr_test_spec2 = likelihood_ratio_test(
    [model_0_loglike, model_0_numParam],
    [spec2_loglike, spec2_numParam],
    alpha 
)

# Display results for Model 0 vs Model 2 - Specification 2
print("\nLikelihood Ratio Test Results for Model 0 vs Model 2 - Specification 2:")
print(f": {lr_test_spec2[0]}")          




Likelihood Ratio Test Results for Model 0 vs Model 2 - Specification 1:
 H0 can be rejected at level 0.0%

Likelihood Ratio Test Results for Model 0 vs Model 2 - Specification 2:
: H0 can be rejected at level 0.0%


In [19]:
print(model_1_numParam)
print(spec2_numParam)

5
12


In [20]:

comparison_table, _ = compile_estimation_results(all_results)
display(comparison_table)

Unnamed: 0,Model_0,Model_1,Model_2_spec1,Model_2_spec2_
Number of estimated parameters,5,8,11,12
Sample size,5000,5000,5000,5000
Final log likelihood,-4581.895651,-4223.985956,-4202.066677,-4193.749471
Akaike Information Criterion,9173.791303,8463.971913,8426.133354,8411.498941
Bayesian Information Criterion,9206.377269,8516.109458,8497.822479,8489.70526
ASC_CYCLE (t-test),-3.78 (-36.6),-4.6 (-23.3),-4.61 (-23.1),-4.62 (-23)
ASC_DRIVE (t-test),-1.29 (-16.2),-2.12 (-14.6),-2.14 (-14.5),-2.16 (-14.4)
ASC_PT (t-test),-0.526 (-9.6),-2.6 (-17.7),-2.61 (-17.6),-2.61 (-17.4)
B_COST (t-test),-0.191 (-13),-0.181 (-10.3),-0.182 (-10.3),-0.183 (-10.3)
B_TIME (t-test),-5.6 (-27.5),,,


model_pref = model 2 (spec 1)

# Model 3
(Box_Cox)

In [21]:
variable_name = 'dur_pt'  # Replace with your variable name
if variable_name in database.data.columns:
    print(f"'{variable_name}' exists in the database.")
else:
    print(f"'{variable_name}' does NOT exist in the database.")

'dur_pt' exists in the database.


In [32]:
# Box-Cox Transformation for costs
lambda_cost = Beta('lambda_cost', 1, -10, 10, 0)
boxcox_cost_transit = boxcox(cost_transit, lambda_cost)
boxcox_cost_driving = boxcox(cost_driving, lambda_cost)

# Define interaction terms between ASCs and age_scaled
ASC_CYCLE_AGE = ASC_CYCLE + Beta('ASC_CYCLE_AGE', 0, None, None, 0) * age_scaled
ASC_PT_AGE = ASC_PT + Beta('ASC_PT_AGE', 0, None, None, 0) * age_scaled
ASC_DRIVE_AGE = ASC_DRIVE + Beta('ASC_DRIVE_AGE', 0, None, None, 0) * age_scaled

# Utility functions with Box-Cox transformation
V_WALK = B_TIME_WALK * dur_walking
V_CYCLE = ASC_CYCLE_AGE + B_TIME_CYCLE * dur_cycling
V_PT = ASC_PT_AGE + B_COST * boxcox_cost_transit + B_TIME_PT * dur_pt
V_DRIVE = ASC_DRIVE_AGE + B_COST * boxcox_cost_driving + B_TIME_DRIVE * dur_driving

# Specify the model with Box-Cox transformation
model_3_boxcox = loglogit({1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}, availability, travel_mode)

# Create Biogeme object
biogeme_model_3 = bio.BIOGEME(database, model_3_boxcox)
biogeme_model_3.modelName = "Model_3_BoxCox"

# Estimate Model 3
results_model_3 = biogeme_model_3.estimate()

# Log-likelihood and number of parameters for Model 3
model_3_boxcox_loglike = results_model_3.data.logLike
model_3_boxcox_numParam = results_model_3.get_estimated_parameters().shape[0]

# Store results for comparison
all_results['Model_3_BoxCox'] = results_model_3

# Display results
print("Estimation results for Model 3 (Box-Cox Transformation):")
print(results_model_3.get_estimated_parameters())

Estimation results for Model 3 (Box-Cox Transformation):
                  Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE     -4.735370      0.214838   -22.041553  0.000000e+00
ASC_CYCLE_AGE  0.092800      0.077886     1.191491  2.334608e-01
ASC_DRIVE     -2.853291      0.207382   -13.758641  0.000000e+00
ASC_DRIVE_AGE  0.293056      0.045106     6.497073  8.189760e-11
ASC_PT        -2.784977      0.158434   -17.578200  0.000000e+00
ASC_PT_AGE     0.194193      0.050281     3.862127  1.124042e-04
B_COST        -0.460248      0.052159    -8.823889  0.000000e+00
B_TIME_CYCLE  -6.980558      0.514153   -13.576817  0.000000e+00
B_TIME_DRIVE  -6.213037      0.377630   -16.452731  0.000000e+00
B_TIME_PT     -3.762030      0.247651   -15.190825  0.000000e+00
B_TIME_WALK   -9.540978      0.506012   -18.855238  0.000000e+00
lambda_cost    0.323788      0.081119     3.991540  6.564567e-05


In [42]:
# Retrieve log-likelihoods and number of parameters for Model_pref and Model 2 specifications

spec1_loglike = results_spec1.data.logLike       
spec1_numParam = len(results_spec1.get_beta_values())   

model_3_boxcox_loglike = results_model_3.data.logLike       
model_3_boxcox_numParam = len(results_model_3.get_beta_values())   

# Perform the likelihood ratio test for Model 1 vs Model 2 - Specification 1
alpha = 0.01
lr_test_spec1 = likelihood_ratio_test(
    [spec1_loglike, spec1_numParam],
    [model_3_boxcox_loglike, model_0_numParam],
    alpha  # Significance level of alpha
)

# Display results for Model 0 vs Model 2 - Specification 1
print("Likelihood Ratio Test Results for Model 3 vs Model 2 - Specification 1:")
print(f" {lr_test_spec1[0]}")         
      


Likelihood Ratio Test Results for Model 3 vs Model 2 - Specification 1:
 H0 cannot be rejected at level 1.0%


In [39]:
comparison_table, _ = compile_estimation_results(all_results)
display(comparison_table)



Unnamed: 0,Model_0,Model_1,Model_2_spec1,Model_2_spec2_,Model_3,Model_3_BoxCox
Number of estimated parameters,5,8,11,12,10,12
Sample size,5000,5000,5000,5000,5000,5000
Final log likelihood,-4581.895651,-4223.985956,-4202.066677,-4193.749471,-4544.159911,-4203.35605
Akaike Information Criterion,9173.791303,8463.971913,8426.133354,8411.498941,9108.319821,8430.7121
Bayesian Information Criterion,9206.377269,8516.109458,8497.822479,8489.70526,9173.491753,8508.918418
ASC_CYCLE (t-test),-3.78 (-36.6),-4.6 (-23.3),-4.61 (-23.1),-4.62 (-23),-1.94 (-36.6),-4.74 (-22)
ASC_DRIVE (t-test),-1.29 (-16.2),-2.12 (-14.6),-2.14 (-14.5),-2.16 (-14.4),-0.706 (-17),-2.85 (-13.8)
ASC_PT (t-test),-0.526 (-9.6),-2.6 (-17.7),-2.61 (-17.6),-2.61 (-17.4),-0.286 (-10.2),-2.78 (-17.6)
B_COST (t-test),-0.191 (-13),-0.181 (-10.3),-0.182 (-10.3),-0.183 (-10.3),-0.188 (-12.8),-0.46 (-8.82)
B_TIME (t-test),-5.6 (-27.5),,,,,


modelpref = model2, spec_1

In [26]:
print(database.data.columns)

Index(['trip_id', 'household_id', 'person_n', 'trip_n', 'travel_mode',
       'purpose', 'fueltype', 'faretype', 'bus_scale', 'survey_year',
       'travel_year', 'travel_month', 'travel_date', 'day_of_week',
       'start_time', 'age', 'female', 'driving_license', 'car_ownership',
       'distance', 'dur_walking', 'dur_cycling', 'dur_pt_access',
       'dur_pt_rail', 'dur_pt_bus', 'dur_pt_int', 'pt_interchanges',
       'dur_driving', 'cost_transit', 'cost_driving_fuel',
       'cost_driving_ccharge', 'driving_traffic_percent', 'age_scaled',
       'cost_driving', 'dur_pt'],
      dtype='object')


# Model 4

In [47]:
# Define ASC interaction terms with age_scaled
ASC_CYCLE_AGE = ASC_CYCLE + Beta('ASC_CYCLE_AGE', 0, None, None, 0) * age_scaled
ASC_PT_AGE = ASC_PT + Beta('ASC_PT_AGE', 0, None, None, 0) * age_scaled
ASC_DRIVE_AGE = ASC_DRIVE + Beta('ASC_DRIVE_AGE', 0, None, None, 0) * age_scaled

mu_a = Beta('mu_a', 1, 0, None, 0)
mu_b = Beta('mu_b', 1, 0, None, 0)
nest_a = OneNestForNestedLogit(nest_param=mu_a, list_of_alternatives=[1, 2], name='slow modes')
nest_b = OneNestForNestedLogit(nest_param=mu_b, list_of_alternatives=[3, 4], name='faster modes')
nests = NestsForNestedLogit(choice_set=list(V), tuple_of_nests=(nest_a, nest_b))


# Define utility functions
V_WALK = B_TIME_WALK * dur_walking
V_CYCLE = ASC_CYCLE_AGE + B_TIME_CYCLE * dur_cycling
V_PT = ASC_PT_AGE + B_COST * cost_transit + B_TIME_PT * dur_pt
V_DRIVE = ASC_DRIVE_AGE + B_COST * cost_driving + B_TIME_DRIVE * dur_driving
# Define utility dictionary
V = {1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}


# Lognested model
logprob_m4 = lognested(V, None, nests, travel_mode)

# Create Biogeme object
biogeme_model_4 = bio.BIOGEME(database, logprob_m4)
biogeme_model_4.modelName = "Model_4_Nested"

# Estimate Model 4
results_model_4 = biogeme_model_4.estimate()

# Log-likelihood and number of parameters for Model 4
model_4_loglike = results_model_4.data.logLike
model_4_numParam = results_model_4.get_estimated_parameters().shape[0]

# Store results for comparison
all_results['Model_4'] = results_model_4

# Display results
print("Estimation results for Model 4 (Nested Logit):")
print(results_model_4.get_estimated_parameters())

Estimation results for Model 4 (Nested Logit):
                   Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE      -6.387729      0.301944   -21.155358  0.000000e+00
ASC_CYCLE_AGE   0.151718      0.100427     1.510724  1.308589e-01
ASC_DRIVE      -2.589648      0.172907   -14.977148  0.000000e+00
ASC_DRIVE_AGE   0.334217      0.052332     6.386429  1.698039e-10
ASC_PT         -3.253760      0.222455   -14.626579  0.000000e+00
ASC_PT_AGE      0.179009      0.059518     3.007622  2.633002e-03
B_COST         -0.251035      0.035241    -7.123310  1.053602e-12
B_TIME_CYCLE   -6.104817      0.791182    -7.716074  1.199041e-14
B_TIME_DRIVE   -9.267261      1.012461    -9.153200  0.000000e+00
B_TIME_PT      -4.905927      0.560740    -8.749029  0.000000e+00
B_TIME_WALK   -11.216285      0.610809   -18.362989  0.000000e+00
mu_a            0.591358      0.046556    12.701940  0.000000e+00
mu_b            0.700524      0.083956     8.343941  0.000000e+00


In [49]:
# Retrieve log-likelihoods and number of parameters for Model_pref and Model 2 specifications

spec1_loglike = results_spec1.data.logLike       
spec1_numParam = len(results_spec1.get_beta_values())   

model_4_loglike = results_model_4.data.logLike       
model_4_numParam = len(results_model_4.get_beta_values())   

# Perform the likelihood ratio test for Model 1 vs Model 2 - Specification 1
alpha = 0.01
lr_test_spec1 = likelihood_ratio_test(
    [spec1_loglike, spec1_numParam],
    [model_4_loglike, model_4_numParam],
    alpha  # Significance level of alpha
)

# Display results for Model 0 vs Model 2 - Specification 1
print("Likelihood Ratio Test Results for Model 4 vs Model 2 - Specification 1:")
print(f" {lr_test_spec1[0]}")      

Likelihood Ratio Test Results for Model 4 vs Model 2 - Specification 1:
 H0 can be rejected at level 1.0%


In [50]:
comparison_table, _ = compile_estimation_results(all_results)
display(comparison_table)

Unnamed: 0,Model_0,Model_1,Model_2_spec1,Model_2_spec2_,Model_3,Model_3_BoxCox,Model_4
Number of estimated parameters,5,8,11,12,10,12,13
Sample size,5000,5000,5000,5000,5000,5000,5000
Final log likelihood,-4581.895651,-4223.985956,-4202.066677,-4193.749471,-4544.159911,-4203.35605,-4180.831879
Akaike Information Criterion,9173.791303,8463.971913,8426.133354,8411.498941,9108.319821,8430.7121,8387.663758
Bayesian Information Criterion,9206.377269,8516.109458,8497.822479,8489.70526,9173.491753,8508.918418,8472.387269
ASC_CYCLE (t-test),-3.78 (-36.6),-4.6 (-23.3),-4.61 (-23.1),-4.62 (-23),-1.94 (-36.6),-4.74 (-22),-6.39 (-21.2)
ASC_DRIVE (t-test),-1.29 (-16.2),-2.12 (-14.6),-2.14 (-14.5),-2.16 (-14.4),-0.706 (-17),-2.85 (-13.8),-2.59 (-15)
ASC_PT (t-test),-0.526 (-9.6),-2.6 (-17.7),-2.61 (-17.6),-2.61 (-17.4),-0.286 (-10.2),-2.78 (-17.6),-3.25 (-14.6)
B_COST (t-test),-0.191 (-13),-0.181 (-10.3),-0.182 (-10.3),-0.183 (-10.3),-0.188 (-12.8),-0.46 (-8.82),-0.251 (-7.12)
B_TIME (t-test),-5.6 (-27.5),,,,,,


model_pref = model 4

# Market Shares

In [54]:
df = pd.read_csv("lpmc01.dat", sep='\t')  # Adjust file path as necessary

# Define the strata based on age and gender
df['stratum'] = pd.cut(
    df['age'],
    bins=[0, 45, 120],  # Age bins: less than 45, and 45 or older
    labels=["<45", ">=45"]
).astype(str) + "_" + df['female'].map({1: "Female", 0: "Male"})

# Count the number of samples in each stratum
strata_sample_sizes = df['stratum'].value_counts()

# Display the results
print("Sample sizes for each stratum:")
print(strata_sample_sizes)

Sample sizes for each stratum:
stratum
<45_Female     1732
<45_Male       1493
>=45_Female     930
>=45_Male       845
Name: count, dtype: int64


The weight $w_g$ associated with segment $g$ is defined as
$$
w_g = \frac{N_g}{N}\frac{S}{S_g}.
$$

In [55]:
strata_population_sizes = {
    "<45_Female": 2841376,
    ">=45_Female": 1519948,
    "<45_Male": 2926408,
    ">=45_Male": 1379198,
}

# Total sample size (N)
total_sample_size = df.shape[0]

# Total population size (S)
total_population_size = sum(strata_population_sizes.values())

# Calculate weights (w_g) for each stratum
weights = {}
for stratum in strata_sample_sizes.index:
    N_g = strata_sample_sizes[stratum]
    S_g = strata_population_sizes[stratum]
    weights[stratum] = (N_g / total_sample_size) * (total_population_size / S_g)

# Display weights
print("Weights for each stratum:")
for stratum, weight in weights.items():
    print(f"{stratum}: {weight:.4f}")

Weights for each stratum:
<45_Female: 1.0566
<45_Male: 0.8843
>=45_Female: 1.0606
>=45_Male: 1.0620


In [None]:
# Step 1: Define weights based on strata
df['weight'] = df['stratum'].map(weights)  # Map pre-computed weights to individuals

# Step 2: Nested Logit Probabilities
# Compute inclusive values (IV) for each nest
inclusive_slow = mu_a * (exp(V[1]) + exp(V[2]))
inclusive_fast = mu_b * (exp(V[3]) + exp(V[4]))

# Nest probabilities
prob_slow_modes = exp(inclusive_slow) / (exp(inclusive_slow) + exp(inclusive_fast))
prob_fast_modes = exp(inclusive_fast) / (exp(inclusive_slow) + exp(inclusive_fast))

# Within-nest probabilities
prob_walk_given_slow = exp(V[1]) / (exp(V[1]) + exp(V[2]))
prob_cycle_given_slow = exp(V[2]) / (exp(V[1]) + exp(V[2]))
prob_pt_given_fast = exp(V[3]) / (exp(V[3]) + exp(V[4]))
prob_drive_given_fast = exp(V[4]) / (exp(V[3]) + exp(V[4]))

# Overall probabilities for each mode
prob_walk = prob_slow_modes * prob_walk_given_slow
prob_cycle = prob_slow_modes * prob_cycle_given_slow
prob_pt = prob_fast_modes * prob_pt_given_fast
prob_drive = prob_fast_modes * prob_drive_given_fast

# Step 3: Define simulation dictionary
simulate = {
    'weight': Variable('weight'),
    'Prob. walking': prob_walk,
    'Prob. cycling': prob_cycle,
    'Prob. public_transport': prob_pt,
    'Prob. driving': prob_drive,
}

# Perform the simulation
biosim = BIOGEME(database, simulate)
simulated_values = biosim.simulate(results_model_4.get_beta_values())

# Step 4: Calculate market shares
for mode in ['walking', 'cycling', 'public_transport', 'driving']:
    simulated_values[f'Weighted {mode}'] = (
        simulated_values['weight'] * simulated_values[f'Prob. {mode}']
    )

# Aggregate market shares
market_shares = {
    mode: simulated_values[f'Weighted {mode}'].mean()
    for mode in ['walking', 'cycling', 'public_transport', 'driving']
}

# Display market shares
print("Predicted Market Shares (Model 4):")
for mode, share in market_shares.items():
    print(f"{mode.capitalize()}: {share:.2%}")

# Step 5: Bootstrap Confidence Intervals
biosim.bootstrap_samples = 100  # Number of bootstrap samples
results_bootstrap = biosim.estimate(run_bootstrap=True)

# Extract beta values for sensitivity analysis
betas = results_bootstrap.get_betas_for_sensitivity_analysis(biogeme.free_beta_names)

# Confidence intervals
ci_left, ci_right = biosim.confidence_intervals(betas, 0.9)  # 90% confidence level

# Add confidence intervals for each mode
for mode in ['walking', 'cycling', 'public_transport', 'driving']:
    ci_left[f'Weighted {mode}'] = (
        ci_left['weight'] * ci_left[f'Prob. {mode}']
    )
    ci_right[f'Weighted {mode}'] = (
        ci_right['weight'] * ci_right[f'Prob. {mode}']
    )

# Display market shares with confidence intervals
print("\nMarket Shares with Confidence Intervals (90%):")
for mode in ['walking', 'cycling', 'public_transport', 'driving']:
    lower = ci_left[f'Weighted {mode}'].mean()
    upper = ci_right[f'Weighted {mode}'].mean()
    print(
        f"{mode.capitalize()}: {market_shares[mode]:.2%} "
        f"(90% CI: [{lower:.2%}, {upper:.2%}])"
    )