# Imports and Preprocessing

Install Biogeme using pip.

In [None]:
pip install biogeme

In [None]:
# Data handling
import pandas as pd

# Biogeme: version info, data container, estimation driver,
# model library and expression building blocks
import biogeme.version as ver
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme import models
from biogeme.expressions import Beta, Variable

# Report which Biogeme version is installed
print("Biogeme version:", ver.get_version())

**Important: all data must be numeric.**

In [None]:
# Read the mode-choice data from the comma-separated file
# (adjust the path if the CSV is stored somewhere else)
df = pd.read_csv("ModeChoiceData_RP.csv")

# Keep an untouched copy of the data as loaded
df_backup = df.copy()

In [None]:
# Count the missing entries in every column
print(df.isna().sum())



In [None]:
# Missing values can be handled in one of two ways.
# Option A (disabled): drop the affected column
##   df = df.drop(columns=["SP_task"])
# Option B (used here): replace every missing entry with -1
df = df.fillna(value=-1)
print(df.isna().sum())

# Further preprocessing: add a row identifier equal to the index plus one
# (1, 2, 3, ... for the default 0-based index)
df["RowID"] = df.index + 1
df

In [None]:
# Wrap the DataFrame in a Biogeme database named "ModeChoice";
# the models below are estimated on this object.
database = db.Database("ModeChoice", df)
# Display the data held by the database.
# NOTE(review): `_df` is an underscore-prefixed (private) attribute — confirm
# whether the installed Biogeme version offers a public accessor instead.
database._df

In [None]:
# Helper for the next cell: print one ready-to-paste definition per column,
# in the form
#     <column> = Variable("<column>")
#
# 1. Run this cell to generate a Variable definition for every column.
# 2. Copy the printed lines and paste them into the cell that defines
#    the variables.
for column_name in df.columns:
    print(f'{column_name} = Variable("{column_name}")')


In [None]:
# Biogeme variables, one per column of the dataset, grouped by role.

# Identifier and survey-related columns
ID = Variable("ID")
RP = Variable("RP")
SP = Variable("SP")
RP_journey = Variable("RP_journey")

# Availability indicators (used in the `av` dictionaries of the models)
av_car = Variable("av_car")
av_rail = Variable("av_rail")

# Time and cost attributes of each mode
time_car = Variable("time_car")
cost_car = Variable("cost_car")
time_rail = Variable("time_rail")
cost_rail = Variable("cost_rail")

# Additional rail attributes
access_rail = Variable("access_rail")
service_rail = Variable("service_rail")

# Trip and traveller characteristics
business = Variable("business")
female = Variable("female")
income = Variable("income")

# Observed choice (the dependent variable of the models)
choice = Variable("choice")

# Binary Logit Models

#### Specification Baseline (Model A from Lecture 3)

In [None]:
# Parameters: Beta(name, starting value, lower bound, upper bound, status).
# A status of 1 holds the parameter at its starting value (it is not
# estimated); a status of 0 lets Biogeme estimate it. Fixing asc_car at 0
# makes car the reference alternative for the constants.
asc_car = Beta("asc_car", 0, None, None, 1)
asc_rail = Beta("asc_rail", 0, None, None, 0)
b_cost = Beta("b_cost", 0, None, None, 0)

# Utilities: a constant plus a generic (shared) cost coefficient
V_car = (
    asc_car
    + b_cost * cost_car
)
V_rail = (
    asc_rail
    + b_cost * cost_rail
)

# Alternative codes in the dataset: 1 = car, 2 = rail.
# Both dictionaries are keyed by these codes.
av = {1: av_car, 2: av_rail}  # availability of each alternative
V = {1: V_car, 2: V_rail}     # utility of each alternative

# Choice model: loglogit(utilities, availabilities, chosen alternative)
logprob = models.loglogit(V, av, choice)

# Set up the estimation; the model name labels the reported results
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = "Spec_Base"

# Null log likelihood, computed before the estimation is run
the_biogeme.calculate_null_loglikelihood(av)

# Estimate, then show the summary statistics and the parameter table
results = the_biogeme.estimate()
print(results.short_summary())
results.get_estimated_parameters()

#### Specification 1 (Model B from Lecture 3)

In [None]:
# Parameters: Beta(name, starting value, lower bound, upper bound, status).
# status = 1 keeps the parameter fixed at its starting value (not estimated),
# status = 0 means it is estimated. asc_car is fixed at 0, so car is the
# reference alternative.
asc_car = Beta("asc_car", 0, None, None, 1)
asc_rail = Beta("asc_rail", 0, None, None, 0)
b_cost = Beta("b_cost", 0, None, None, 0)
b_tt = Beta("b_tt", 0, None, None, 0)

# Utilities: generic cost and generic travel-time coefficients
V_car = (
    asc_car
    + b_cost * cost_car
    + b_tt * time_car
)
V_rail = (
    asc_rail
    + b_cost * cost_rail
    + b_tt * time_rail
)

# Alternative codes in the dataset: 1 = car, 2 = rail
av = {1: av_car, 2: av_rail}  # availability per alternative code
V = {1: V_car, 2: V_rail}     # utility per alternative code

# Choice model: loglogit(utilities, availabilities, chosen alternative)
logprob = models.loglogit(V, av, choice)

# Estimation object, named so the results can be told apart
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = "Spec_1"

# Null log likelihood, computed before the estimation is run
the_biogeme.calculate_null_loglikelihood(av)

# Run the estimation and report the results
results = the_biogeme.estimate()
print(results.short_summary())
results.get_estimated_parameters()

#### Specification 2 (Alternative-Specific)

In [None]:
# Parameters: Beta(name, starting value, lower bound, upper bound, status).
# status = 1 keeps the parameter fixed at its starting value (not estimated),
# status = 0 means it is estimated. asc_car is fixed at 0, so car is the
# reference alternative.
asc_car = Beta("asc_car", 0, None, None, 1)
asc_rail = Beta("asc_rail", 0, None, None, 0)
b_cost = Beta("b_cost", 0, None, None, 0)
b_tt_car = Beta("b_tt_car", 0, None, None, 0)
b_tt_rail = Beta("b_tt_rail", 0, None, None, 0)

# Utilities: generic cost coefficient, alternative-specific time coefficients
V_car = (
    asc_car
    + b_cost * cost_car
    + b_tt_car * time_car
)
V_rail = (
    asc_rail
    + b_cost * cost_rail
    + b_tt_rail * time_rail
)

# Alternative codes in the dataset: 1 = car, 2 = rail
av = {1: av_car, 2: av_rail}  # availability per alternative code
V = {1: V_car, 2: V_rail}     # utility per alternative code

# Choice model: loglogit(utilities, availabilities, chosen alternative)
logprob = models.loglogit(V, av, choice)

# Estimation object, named so the results can be told apart
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = "Spec_2"

# Null log likelihood, computed before the estimation is run
the_biogeme.calculate_null_loglikelihood(av)

# Run the estimation and report the results
results = the_biogeme.estimate()
print(results.short_summary())
results.get_estimated_parameters()

#### Specification 3

In [None]:
# Parameters: Beta(name, starting value, lower bound, upper bound, status).
# status = 1 keeps the parameter fixed at its starting value (not estimated),
# status = 0 means it is estimated. asc_car is fixed at 0, so car is the
# reference alternative.
asc_car = Beta("asc_car", 0, None, None, 1)
asc_rail = Beta("asc_rail", 0, None, None, 0)
b_cost = Beta("b_cost", 0, None, None, 0)
b_tt_car = Beta("b_tt_car", 0, None, None, 0)
b_tt_rail = Beta("b_tt_rail", 0, None, None, 0)
b_female_rail = Beta("b_female_rail", 0, None, None, 0)

# Utilities: as in Specification 2, plus a `female` term in the rail utility
V_car = (
    asc_car
    + b_cost * cost_car
    + b_tt_car * time_car
)
V_rail = (
    asc_rail
    + b_cost * cost_rail
    + b_tt_rail * time_rail
    + b_female_rail * female
)

# Alternative codes in the dataset: 1 = car, 2 = rail
av = {1: av_car, 2: av_rail}  # availability per alternative code
V = {1: V_car, 2: V_rail}     # utility per alternative code

# Choice model: loglogit(utilities, availabilities, chosen alternative)
logprob = models.loglogit(V, av, choice)

# Estimation object, named so the results can be told apart
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = "Spec_3"

# Null log likelihood, computed before the estimation is run
the_biogeme.calculate_null_loglikelihood(av)

# Run the estimation and report the results
results = the_biogeme.estimate()
print(results.short_summary())
results.get_estimated_parameters()

#### Specification 4

The dataset has no income-level column yet, so we first need to add one.

In [None]:
# Classify income into three levels, stored as integers:
#   1: income in (0, 35]    2: income in (35, 55]    3: income above 55
# The top bin is open-ended instead of ending at the sample maximum, so the
# bin edges stay strictly increasing even when no income exceeds 55.
income_bins = [0, 35, 55, float("inf")]
level_codes = pd.cut(df["income"], bins=income_bins, labels=[1, 2, 3])

# Incomes of 0 or below (including the -1 placeholder used for missing
# values) and missing incomes fall outside every bin. Stop with a clear
# message rather than an opaque NaN-to-integer conversion error.
if level_codes.isna().any():
    raise ValueError(
        f"{int(level_codes.isna().sum())} row(s) have an income that is missing "
        "or not greater than 0 and cannot be assigned an income level."
    )

# Store the level as a plain integer column (all data must be numeric)
df["income_level"] = level_codes.astype(int)
df



In [None]:
# Rebuild the Biogeme database from the DataFrame so that it contains the
# newly added income_level column.
database = db.Database("ModeChoice", df)
# Display the data held by the database.
# NOTE(review): `_df` is an underscore-prefixed (private) attribute — confirm
# whether the installed Biogeme version offers a public accessor instead.
database._df

In [None]:
# Define the Biogeme variable for the new income_level column.
# Hint: when several columns have been added, re-run the loop that prints the
# Variable definitions for all columns, then copy its output and paste the
# new definitions here.
income_level = Variable("income_level")

In [None]:
# Parameters: Beta(name, starting value, lower bound, upper bound, status).
# status = 1 keeps the parameter fixed at its starting value (not estimated),
# status = 0 means it is estimated. asc_car is fixed at 0, so car is the
# reference alternative.
asc_car = Beta("asc_car", 0, None, None, 1)
asc_rail = Beta("asc_rail", 0, None, None, 0)
b_cost = Beta("b_cost", 0, None, None, 0)

# Income-level effects on rail; level 1 has no parameter and is the base level
b_MI_rail = Beta("b_mid_rail", 0, None, None, 0)   # income_level == 2
b_HI_rail = Beta("b_high_rail", 0, None, None, 0)  # income_level == 3

# Utilities: each comparison with income_level acts as a 0/1 indicator
V_car = (
    asc_car
    + b_cost * cost_car
)
V_rail = (
    asc_rail
    + b_cost * cost_rail
    + b_MI_rail * (income_level == 2)
    + b_HI_rail * (income_level == 3)
)

# Alternative codes in the dataset: 1 = car, 2 = rail
av = {1: av_car, 2: av_rail}  # availability per alternative code
V = {1: V_car, 2: V_rail}     # utility per alternative code

# Choice model: loglogit(utilities, availabilities, chosen alternative)
logprob = models.loglogit(V, av, choice)

# Estimation object, named so the results can be told apart
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = "Spec_4"

# Null log likelihood, computed before the estimation is run
the_biogeme.calculate_null_loglikelihood(av)

# Run the estimation and report the results
results = the_biogeme.estimate()
print(results.short_summary())
results.get_estimated_parameters()

#### Specification 5

In [None]:
# Parameters: Beta(name, starting value, lower bound, upper bound, status).
# status = 1 keeps the parameter fixed at its starting value (not estimated),
# status = 0 means it is estimated. asc_car is fixed at 0, so car is the
# reference alternative.
asc_car = Beta("asc_car", 0, None, None, 1)
asc_rail = Beta("asc_rail", 0, None, None, 0)
b_cost = Beta("b_cost", 0, None, None, 0)
b_tt_car = Beta("b_tt_car", 0, None, None, 0)
b_tt_rail = Beta("b_tt_rail", 0, None, None, 0)
b_female_rail = Beta("b_female_rail", 0, None, None, 0)
b_business_rail = Beta("b_business_rail", 0, None, None, 0)
busi_inc_rail = Beta("busi_inc_rail", 0, None, None, 0)

# Utilities: the rail utility adds `female`, `business` and the
# income-by-business interaction to the cost and time terms
V_car = (
    asc_car
    + b_cost * cost_car
    + b_tt_car * time_car
)
V_rail = (
    asc_rail
    + b_cost * cost_rail
    + b_tt_rail * time_rail
    + b_female_rail * female
    + b_business_rail * business
    + busi_inc_rail * income * business
)

# Alternative codes in the dataset: 1 = car, 2 = rail
av = {1: av_car, 2: av_rail}  # availability per alternative code
V = {1: V_car, 2: V_rail}     # utility per alternative code

# Choice model: loglogit(utilities, availabilities, chosen alternative)
logprob = models.loglogit(V, av, choice)

# Estimation object, named so the results can be told apart
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = "Spec_5"

# Null log likelihood, computed before the estimation is run
the_biogeme.calculate_null_loglikelihood(av)

# Run the estimation and report the results
results = the_biogeme.estimate()
print(results.short_summary())
results.get_estimated_parameters()

#### Specification 6

In [None]:
# Parameters: Beta(name, starting value, lower bound, upper bound, status).
# status = 1 keeps the parameter fixed at its starting value (not estimated),
# status = 0 means it is estimated. asc_car is fixed at 0, so car is the
# reference alternative.
asc_car = Beta("asc_car", 0, None, None, 1)
asc_rail = Beta("asc_rail", 0, None, None, 0)
b_cost = Beta("b_cost", 0, None, None, 0)
income_rail = Beta("income_rail", 0, None, None, 0)  # income term in the rail utility
b_cost_inc = Beta("b_cost_inc", 0, None, None, 0)    # coefficient of cost divided by income

# Utilities: cost enters directly and divided by income; income also enters
# the rail utility on its own.
# NOTE(review): cost / income is not meaningful where income is 0 or the -1
# placeholder used for missing values — confirm income is strictly positive.
V_car = (
    asc_car
    + b_cost * cost_car
    + b_cost_inc * cost_car / income
)
V_rail = (
    asc_rail
    + b_cost * cost_rail
    + income_rail * income
    + b_cost_inc * cost_rail / income
)

# Alternative codes in the dataset: 1 = car, 2 = rail
av = {1: av_car, 2: av_rail}  # availability per alternative code
V = {1: V_car, 2: V_rail}     # utility per alternative code

# Choice model: loglogit(utilities, availabilities, chosen alternative)
logprob = models.loglogit(V, av, choice)

# Estimation object, named so the results can be told apart
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = "Spec_6"

# Null log likelihood, computed before the estimation is run
the_biogeme.calculate_null_loglikelihood(av)

# Run the estimation and report the results
results = the_biogeme.estimate()
print(results.short_summary())
results.get_estimated_parameters()

#### Log-Transformation

In [None]:
import numpy as np

# Add the natural logarithm of each cost attribute as a new column.
# NOTE(review): np.log gives -inf for 0 and NaN for negative inputs (such as
# the -1 placeholder used for missing values) — confirm all costs are positive.
df["log_cost_car"] = np.log(df["cost_car"])
df["log_cost_rail"] = np.log(df["cost_rail"])

# Rebuild the Biogeme database so it contains the new columns
database = db.Database("ModeChoice", df)
# Not the last statement of the cell, so the frame is not displayed here
database._df

# Biogeme variables for the log-transformed costs
log_cost_car = Variable("log_cost_car")
log_cost_rail = Variable("log_cost_rail")

In [None]:
# Parameters: Beta(name, starting value, lower bound, upper bound, status).
# status = 1 keeps the parameter fixed at its starting value (not estimated),
# status = 0 means it is estimated. asc_car is fixed at 0, so car is the
# reference alternative.
asc_car = Beta("asc_car", 0, None, None, 1)
asc_rail = Beta("asc_rail", 0, None, None, 0)
b_cost = Beta("b_cost", 0, None, None, 0)

# Utilities: the generic cost coefficient multiplies the log of the cost
V_car = (
    asc_car
    + b_cost * log_cost_car
)
V_rail = (
    asc_rail
    + b_cost * log_cost_rail
)

# Alternative codes in the dataset: 1 = car, 2 = rail
av = {1: av_car, 2: av_rail}  # availability per alternative code
V = {1: V_car, 2: V_rail}     # utility per alternative code

# Choice model: loglogit(utilities, availabilities, chosen alternative)
logprob = models.loglogit(V, av, choice)

# Estimation object, named so the results can be told apart
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = "Spec_Log"

# Null log likelihood, computed before the estimation is run
the_biogeme.calculate_null_loglikelihood(av)

# Run the estimation and report the results
results = the_biogeme.estimate()
print(results.short_summary())
results.get_estimated_parameters()

#### Power Expansion Series

In [38]:
# Parameters: Beta(name, starting value, lower bound, upper bound, status).
# status = 1 keeps the parameter fixed at its starting value (not estimated),
# status = 0 means it is estimated. asc_car is fixed at 0, so car is the
# reference alternative.
asc_car = Beta("asc_car", 0, None, None, 1)
asc_rail = Beta("asc_rail", 0, None, None, 0)
b_cost = Beta("b_cost", 0, None, None, 0)
b_cost_squared = Beta("b_cost_squared", 0, None, None, 0)
b_cost_cubed = Beta("b_cost_cubed", 0, None, None, 0)

# Utilities: cubic polynomial in cost, with the powers written out as
# repeated products and the three coefficients shared by both modes
V_car = (
    asc_car
    + b_cost * cost_car
    + b_cost_squared * cost_car * cost_car
    + b_cost_cubed * cost_car * cost_car * cost_car
)
V_rail = (
    asc_rail
    + b_cost * cost_rail
    + b_cost_squared * cost_rail * cost_rail
    + b_cost_cubed * cost_rail * cost_rail * cost_rail
)

# Alternative codes in the dataset: 1 = car, 2 = rail
av = {1: av_car, 2: av_rail}  # availability per alternative code
V = {1: V_car, 2: V_rail}     # utility per alternative code

# Choice model: loglogit(utilities, availabilities, chosen alternative)
logprob = models.loglogit(V, av, choice)

# Estimation object, named so the results can be told apart
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = "Spec_Pwr"

# Null log likelihood, computed before the estimation is run
the_biogeme.calculate_null_loglikelihood(av)

# Run the estimation and report the results
results = the_biogeme.estimate()
print(results.short_summary())
results.get_estimated_parameters()

Results for model Spec_Pwr
Nbr of parameters:		4
Sample size:			512
Excluded data:			0
Null log likelihood:		-354.8914
Final log likelihood:		-344.5117
Likelihood ratio test (null):		20.75928
Rho square (null):			0.0292
Rho bar square (null):			0.018
Akaike Information Criterion:	697.0234
Bayesian Information Criterion:	713.9767



  results.get_estimated_parameters()


Unnamed: 0,Name,Value,Robust std err.,Robust t-stat.,Robust p-value
0,b_cost,-0.049148,0.274441,-0.179086,0.85787
1,b_cost_squared,0.000649,0.005485,0.118248,0.905871
2,b_cost_cubed,-5e-06,3.5e-05,-0.137582,0.890571
3,asc_rail,0.126668,0.123643,1.024462,0.305617


#### Piecewise Linearization

In [None]:
# Piecewise-linear cost terms with breakpoints at 40, 50 and 60.
# For each mode the cost is split into four pieces:
#   TC1: the cost itself, capped at 40
#   TC2: the part between 40 and 50 (a value from 0 to 10)
#   TC3: the part between 50 and 60 (a value from 0 to 10)
#   TC4: the part above 60
# Series.clip computes each piece for the whole column at once, replacing the
# row-by-row lambdas; looping over the modes removes the duplicated lines.
for mode_name in ("car", "rail"):
    mode_cost = df[f"cost_{mode_name}"]
    df[f"cost_{mode_name}_TC1"] = mode_cost.clip(upper=40)
    df[f"cost_{mode_name}_TC2"] = (mode_cost - 40).clip(lower=0, upper=10)
    df[f"cost_{mode_name}_TC3"] = (mode_cost - 50).clip(lower=0, upper=10)
    df[f"cost_{mode_name}_TC4"] = (mode_cost - 60).clip(lower=0)

# Rebuild the Biogeme database so it contains the new columns
database = db.Database("ModeChoice", df)

# Biogeme variables for the piecewise cost columns
cost_car_TC1 = Variable("cost_car_TC1")
cost_car_TC2 = Variable("cost_car_TC2")
cost_car_TC3 = Variable("cost_car_TC3")
cost_car_TC4 = Variable("cost_car_TC4")
cost_rail_TC1 = Variable("cost_rail_TC1")
cost_rail_TC2 = Variable("cost_rail_TC2")
cost_rail_TC3 = Variable("cost_rail_TC3")
cost_rail_TC4 = Variable("cost_rail_TC4")

In [37]:
# Parameters: Beta(name, starting value, lower bound, upper bound, status).
# status = 1 keeps the parameter fixed at its starting value (not estimated),
# status = 0 means it is estimated. asc_car is fixed at 0, so car is the
# reference alternative.
asc_car = Beta("asc_car", 0, None, None, 1)
asc_rail = Beta("asc_rail", 0, None, None, 0)
# One cost coefficient per cost piece, shared by both modes
b_cost_TC1 = Beta("b_cost_TC1", 0, None, None, 0)
b_cost_TC2 = Beta("b_cost_TC2", 0, None, None, 0)
b_cost_TC3 = Beta("b_cost_TC3", 0, None, None, 0)
b_cost_TC4 = Beta("b_cost_TC4", 0, None, None, 0)

# Utilities: each cost piece is multiplied by the coefficient of its piece
V_car = (
    asc_car
    + b_cost_TC1 * cost_car_TC1
    + b_cost_TC2 * cost_car_TC2
    + b_cost_TC3 * cost_car_TC3
    + b_cost_TC4 * cost_car_TC4
)
V_rail = (
    asc_rail
    + b_cost_TC1 * cost_rail_TC1
    + b_cost_TC2 * cost_rail_TC2
    + b_cost_TC3 * cost_rail_TC3
    + b_cost_TC4 * cost_rail_TC4
)

# Alternative codes in the dataset: 1 = car, 2 = rail
av = {1: av_car, 2: av_rail}  # availability per alternative code
V = {1: V_car, 2: V_rail}     # utility per alternative code

# Choice model: loglogit(utilities, availabilities, chosen alternative)
logprob = models.loglogit(V, av, choice)

# Estimation object, named so the results can be told apart
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = "Spec_Piecewise"

# Null log likelihood, computed before the estimation is run
the_biogeme.calculate_null_loglikelihood(av)

# Run the estimation and report the results
results = the_biogeme.estimate()
print(results.short_summary())
results.get_estimated_parameters()

Results for model Spec_Piecewise
Nbr of parameters:		5
Sample size:			512
Excluded data:			0
Null log likelihood:		-354.8914
Final log likelihood:		-344.1149
Likelihood ratio test (null):		21.55281
Rho square (null):			0.0304
Rho bar square (null):			0.0163
Akaike Information Criterion:	698.2299
Bayesian Information Criterion:	719.4215



  results.get_estimated_parameters()


Unnamed: 0,Name,Value,Robust std err.,Robust t-stat.,Robust p-value
0,b_cost_TC1,-0.000305,0.027794,-0.010982,0.991238
1,b_cost_TC2,-0.044811,0.026891,-1.666406,0.095633
2,b_cost_TC3,-0.005466,0.036301,-0.150564,0.88032
3,b_cost_TC4,-0.032338,0.025828,-1.252058,0.210549
4,asc_rail,0.126367,0.132846,0.951225,0.34149


#### Improved Model