# Imports

In [1]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme.expressions import Beta, Variable, exp
from biogeme.models import loglogit
from biogeme.tools import likelihood_ratio_test
from biogeme.results import compile_estimation_results
from biogeme.models import loglogit,  boxcox
from biogeme.models.piecewise import piecewise_formula
from biogeme.models import lognested
from biogeme.nests import OneNestForNestedLogit, NestsForNestedLogit
from biogeme.biogeme import BIOGEME

import pickle

import numpy as np
import os

from scipy.stats import chi2


# Data & Variables

In [2]:
# Define the relative path to the data folder
file_path = os.path.join(os.pardir, 'lpmc01.dat')

#file_path = os.path.join(data_folder, 'lpmc01.dat')

df = pd.read_csv(file_path, sep = '\t')
df['age_normalized'] = (df['age'] - df['age'].mean()) / df['age'].std()
df['age_scaled'] = df['age'] / df['age'].max()
df['cost_driving'] = df['cost_driving_ccharge'] + df['cost_driving_fuel']
df['dur_pt'] = df['dur_pt_access'] + df['dur_pt_rail'] + df['dur_pt_int'] + df['dur_pt_bus']

database1 = db.Database('lpmc01', df)


# Define the given veriables 
dur_pt = Variable('dur_pt')
cost_driving = Variable('cost_driving')
age_scaled = Variable('age_scaled')
trip_id = Variable('trip_id')
household_id = Variable('household_id')
person_n = Variable('person_n')
trip_n = Variable('trip_n')
travel_mode = Variable('travel_mode')
purpose = Variable('purpose')
fueltype = Variable('fueltype')
faretype = Variable('faretype')
bus_scale = Variable('bus_scale')
survey_year = Variable('survey_year')
travel_year = Variable('travel_year')
travel_month = Variable('travel_month')
travel_date = Variable('travel_date')
day_of_week = Variable('day_of_week')
start_time = Variable('start_time')
age = Variable('age')
female = Variable('female')
driving_license = Variable('driving_license')
car_ownership = Variable('car_ownership')
distance = Variable('distance')
dur_walking = Variable('dur_walking')
dur_cycling = Variable('dur_cycling')
dur_pt_access = Variable('dur_pt_access') # Predicted total access and egress time for public transport route in hours
dur_pt_rail = Variable('dur_pt_rail')
dur_pt_bus = Variable('dur_pt_bus')
dur_pt_int = Variable('dur_pt_int') # Time taken (hrs) at each interchange point
pt_interchanges = Variable('pt_interchanges')   # Number of interchange points in public transport route
dur_driving = Variable('dur_driving')
cost_transit = Variable('cost_transit')
cost_driving_fuel = Variable('cost_driving_fuel')   # Estimated fuel cost of driving route in GBP
cost_driving_ccharge = Variable('cost_driving_ccharge')  # Estimated congestion charge cost of driving route in GBP
driving_traffic_percent = Variable('driving_traffic_percent')


variable_names = ['dur_pt', 'cost_driving', 'age_scaled']  # Replace with your variable name
for variable_name in variable_names:
    if variable_name in database1.data.columns:
        print(f"'{variable_name}' exists in the database.")
    else:
        print(f"'{variable_name}' does NOT exist in the database.")



# Define pt_cost (not needed)
# Original paper, page 31: "Public transport fares are determined for single trips using Oystercard/contactless payment."
# Therefore, cost_transit should already consider faretype and bus_scale

database = db.Database('lpmc01', df)
variable_names = ['dur_pt', 'cost_driving', 'age_scaled']  # Replace with your variable name
for variable_name in variable_names:
    if variable_name in database1.data.columns:
        print(f"'{variable_name}' exists in the database.")
    else:
        print(f"'{variable_name}' does NOT exist in the database.")

# Define driving cost
cost_driving = cost_driving_ccharge + cost_driving_fuel

# Define time taken by each mode of transport
dur_pt = dur_pt_access + dur_pt_int + dur_pt_bus + dur_pt_rail  # Public transport (external) time 

'dur_pt' exists in the database.
'cost_driving' exists in the database.
'age_scaled' exists in the database.
'dur_pt' exists in the database.
'cost_driving' exists in the database.
'age_scaled' exists in the database.


In [3]:
all_results = {}

# Model A

## Definition

In [4]:
# Assume every mode of transport is available
availability_walk = 1  
availability_cycle = 1  
availability_pt = 1     
availability_drive = 1

availability = {
    1: availability_walk,   # Walking
    2: availability_cycle,  # Cycling
    3: availability_pt,     # Public Transport
    4: availability_drive   # Driving
}

In [5]:
ASC_CYCLE = Beta('ASC_CYCLE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

# Define generic parameters for cost and travel time
B_COST = Beta('B_COST', 0, None, None, 0)

# Define interaction terms between ASCs and age_scaled
ASC_CYCLE_AGE = ASC_CYCLE + Beta('ASC_CYCLE_AGE', 0, None, None, 0) * age_scaled
ASC_PT_AGE = ASC_PT + Beta('ASC_PT_AGE', 0, None, None, 0) * age_scaled
ASC_DRIVE_AGE =  Beta('ASC_DRIVE_AGE', 0, None, None, 0) 
ASC_WALK_AGE =  Beta('ASC_WALK_AGE', 0, None, None, 0)

B_TIME_WALK = Beta('B_TIME_WALK', 0, None, None, 0)
B_TIME_CYCLE = Beta('B_TIME_CYCLE', 0, None, None, 0)
B_TIME_PT = Beta('B_TIME_PT', 0, None, None, 0)
B_TIME_DRIVE = Beta('B_TIME_DRIVE', 0, None, None, 0)


# Utility functions with interaction terms
V_WALK = ASC_WALK_AGE * age_scaled + B_TIME_WALK * dur_walking
V_CYCLE = ASC_CYCLE + B_TIME_CYCLE * dur_cycling
V_PT = ASC_PT + B_COST * cost_transit + B_TIME_PT * dur_pt
V_DRIVE = (ASC_DRIVE + ASC_DRIVE_AGE * age_scaled) + B_COST * cost_driving + B_TIME_DRIVE * dur_driving

# Specify the model
model_2a = loglogit({1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}, availability, travel_mode)

## Results

In [6]:
# Estimate Model 2 - Specification 1
biogeme_spec1 = bio.BIOGEME(database, model_2a)
biogeme_spec1.modelName = "Model_2A"
results_m2a = biogeme_spec1.estimate()

all_results['Model_2A'] = results_m2a

# Display estimation results
print("Estimation results for Model 2A:")
print(results_m2a.get_estimated_parameters())

Estimation results for Model 2A:
                  Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE     -4.985520      0.232790   -21.416422      0.000000
ASC_DRIVE     -2.759898      0.186986   -14.759950      0.000000
ASC_DRIVE_AGE  0.577240      0.172923     3.338141      0.000843
ASC_PT        -2.981905      0.185293   -16.092911      0.000000
ASC_WALK_AGE  -0.875893      0.236444    -3.704445      0.000212
B_COST        -0.182600      0.017640   -10.351245      0.000000
B_TIME_CYCLE  -6.452877      0.482611   -13.370758      0.000000
B_TIME_DRIVE  -6.555481      0.378319   -17.327911      0.000000
B_TIME_PT     -3.480724      0.242122   -14.375939      0.000000
B_TIME_WALK   -9.119941      0.464617   -19.628961      0.000000


In [7]:
print(results_m2a.print_general_statistics())

Number of estimated parameters:	10
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-6931.472
Final log likelihood:	-4202.68
Likelihood ratio test for the init. model:	5457.583
Rho-square for the init. model:	0.394
Rho-square-bar for the init. model:	0.392
Akaike Information Criterion:	8425.361
Bayesian Information Criterion:	8490.533
Final gradient norm:	3.7185E-02
Nbr of threads:	16



## Testing Against Model 1

In [8]:
folder_path = os.path.join(os.pardir, 'Model_1')
file_path = os.path.join(folder_path, 'Model_1.pickle')

# open a file, where you stored the pickled data
file = open(file_path, 'rb')

# dump information to that file
results_m1 = pickle.load(file)

# close the file
file.close()

In [9]:
loglikehood_m2a = results_m2a.data.logLike
num_params_m2a = results_m2a.data.nparam

loglikehood_m1 = results_m1.logLike
num_params_m1 = results_m1.nparam

# Calculate the LR statistic
LR = 2 * (loglikehood_m2a - loglikehood_m1)

# Degrees of freedom
df = num_params_m2a - num_params_m1

# Critical value at 0.05 significance level
critical_value = chi2.ppf(0.95, df)

print("Likelihood Ratio:", LR)
print("Degrees of Freedom:", df)
print("Critical Chi-Square Value (0.05 significance):", critical_value)

if LR > critical_value:
    print("Model 2a is significantly better than Model 1.")
else:
    print("No significant improvement in Model 2a over Model 1.")



Likelihood Ratio: 42.611232658311565
Degrees of Freedom: 2
Critical Chi-Square Value (0.05 significance): 5.991464547107979
Model 2a is significantly better than Model 1.


# Model B

## Definition

In [10]:
# Define alternative-specific parameters for travel time
B_TIME_WALK = Beta('B_TIME_WALK', 0, None, None, 0)
B_TIME_CYCLE = Beta('B_TIME_CYCLE', 0, None, None, 0)
B_TIME_PT = Beta('B_TIME_PT', 0, None, None, 0)
B_TIME_DRIVE = Beta('B_TIME_DRIVE', 0, None, None, 0)

ASC_CYCLE = Beta('ASC_CYCLE', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVE = Beta('ASC_DRIVE', 0, None, None, 0)

# Define generic parameters for cost and travel time
B_COST = Beta('B_COST', 0, None, None, 0)

B_TIME_WALK_AGE = Beta('B_TIME_WALK_AGE', 0, None, None, 0)
B_TIME_CYCLE_AGE = Beta('B_TIME_CYCLE_AGE', 0, None, None, 0) 
B_TIME_PT_AGE = Beta('B_TIME_PT_AGE', 0, None, None, 0)
B_TIME_DRIVE_AGE = Beta('B_TIME_DRIVE_AGE', 0, None, None, 0)

# Updated utility functions with age interaction for travel time
V_WALK = (B_TIME_WALK + B_TIME_WALK_AGE * age_scaled) * dur_walking
V_CYCLE = ASC_CYCLE + (B_TIME_CYCLE + B_TIME_CYCLE_AGE* age_scaled) * dur_cycling
V_PT = ASC_PT + B_COST * cost_transit + (B_TIME_PT + B_TIME_PT_AGE* age_scaled) * dur_pt
V_DRIVE = ASC_DRIVE + B_COST * cost_driving + (B_TIME_DRIVE + B_TIME_DRIVE_AGE * age_scaled) * dur_driving

# Associate utility functions with the mode choice
V = {
    1: V_WALK,    # Walking
    2: V_CYCLE,   # Cycling
    3: V_PT,      # Public Transport
    4: V_DRIVE    # Driving
}

# Specify the model
model_2b = loglogit({1: V_WALK, 2: V_CYCLE, 3: V_PT, 4: V_DRIVE}, availability, travel_mode)

## Results

In [11]:
# Estimate Model 2 - Specification 1
biogeme_spec1 = bio.BIOGEME(database, model_2b)
biogeme_spec1.modelName = "Model_2B"
results_m2b = biogeme_spec1.estimate()

all_results['Model_2B'] = results_m2b

# Display estimation results
print("Estimation results for Model 2b:")
print(results_m2b.get_estimated_parameters())

Estimation results for Model 2b:
                     Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLE        -4.618726      0.200583   -23.026502  0.000000e+00
ASC_DRIVE        -2.156613      0.150105   -14.367345  0.000000e+00
ASC_PT           -2.612313      0.149861   -17.431628  0.000000e+00
B_COST           -0.182679      0.017772   -10.278822  0.000000e+00
B_TIME_CYCLE     -4.681728      0.865093    -5.411823  6.238650e-08
B_TIME_CYCLE_AGE -4.491985      1.941414    -2.313770  2.068033e-02
B_TIME_DRIVE     -5.823763      0.882407    -6.599858  4.115508e-11
B_TIME_DRIVE_AGE -1.946479      2.054133    -0.947591  3.433376e-01
B_TIME_PT        -2.274456      0.486557    -4.674590  2.945418e-06
B_TIME_PT_AGE    -3.056258      1.128731    -2.707694  6.775238e-03
B_TIME_WALK      -7.455368      0.504927   -14.765247  0.000000e+00
B_TIME_WALK_AGE  -4.134307      1.083000    -3.817458  1.348338e-04


In [12]:
print(results_m2b.print_general_statistics())

Number of estimated parameters:	12
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-6931.472
Final log likelihood:	-4193.755
Likelihood ratio test for the init. model:	5475.433
Rho-square for the init. model:	0.395
Rho-square-bar for the init. model:	0.393
Akaike Information Criterion:	8411.51
Bayesian Information Criterion:	8489.717
Final gradient norm:	7.5547E-02
Nbr of threads:	16



## Testing Against Model 1

In [13]:
loglikehood_m2b = results_m2b.data.logLike
num_params_m2b = results_m2b.data.nparam

# Calculate the LR statistic
LR = 2 * (loglikehood_m2b - loglikehood_m1)

# Degrees of freedom
df = num_params_m2a - num_params_m1

# Critical value at 0.05 significance level
critical_value = chi2.ppf(0.95, df)

print("Likelihood Ratio:", LR)
print("Degrees of Freedom:", df)
print("Critical Chi-Square Value (0.05 significance):", critical_value)

if LR > critical_value:
    print("Model 2b is significantly better than Model 1.")
else:
    print("No significant improvement in Model 2b over Model 1.")



Likelihood Ratio: 60.46160699763641
Degrees of Freedom: 2
Critical Chi-Square Value (0.05 significance): 5.991464547107979
Model 2b is significantly better than Model 1.


# Comparing Model A and Model B

In [14]:
comparison_table, _ = compile_estimation_results(all_results)
display(comparison_table)

Unnamed: 0,Model_2A,Model_2B
Number of estimated parameters,10,12
Sample size,5000,5000
Final log likelihood,-4202.680341,-4193.755154
Akaike Information Criterion,8425.360682,8411.510308
Bayesian Information Criterion,8490.532614,8489.716626
ASC_CYCLE (t-test),-4.99 (-21.4),-4.62 (-23)
ASC_DRIVE (t-test),-2.76 (-14.8),-2.16 (-14.4)
ASC_DRIVE_AGE (t-test),0.577 (3.34),
ASC_PT (t-test),-2.98 (-16.1),-2.61 (-17.4)
ASC_WALK_AGE (t-test),-0.876 (-3.7),
