# BIOGEME
1. Installation and library imports
2. Cleaning the data and defining variables
3. Binary Logit model

# 1. Installation and library imports

In [19]:
pip install biogeme

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
# Report the installed Biogeme version so the recorded results can be tied to it.
# get_version() replaces the deprecated camelCase getVersion(), which emits a
# deprecation warning when called.
import biogeme.version as ver
print("Biogeme version:", ver.get_version())

#Import necessary libraries
import pandas as pd

import biogeme.biogeme as bio
import biogeme.database as db
from biogeme import models
from biogeme.expressions import Beta, Variable
from biogeme.results_processing import get_pandas_estimated_parameters

Biogeme version: 3.3.0


  print("Biogeme version:", ver.getVersion())


# 2. Cleaning the data and defining variables

**Important: all data must be numeric.**

In [21]:
# Read the mode-choice data into a DataFrame.
# The file is comma-separated, which is read_csv's default separator.
# Adjust the path if the CSV is not in the working directory.
df = pd.read_csv("ModeChoiceData_RP.csv")

In [22]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

Unnamed: 0        0
ID                0
RP                0
SP                0
RP_journey        0
SP_task         512
av_car            0
av_rail           0
time_car          0
cost_car          0
time_rail         0
cost_rail         0
business          0
access_rail       0
service_rail      0
female            0
income            0
choice            0
dtype: int64


In [23]:
# Two ways to get rid of the missing values:
#   (a) drop the affected column:
##      df = df.drop(columns=["SP_task"])
#   (b) replace every missing entry with -1 (the option used here)
df = df.fillna(value=-1)

# Confirm that no missing values remain
remaining_missing = df.isna().sum()
print(remaining_missing)

# Add a 1-based row identifier derived from the frame's index
df = df.assign(RowID=df.index + 1)
df

Unnamed: 0      0
ID              0
RP              0
SP              0
RP_journey      0
SP_task         0
av_car          0
av_rail         0
time_car        0
cost_car        0
time_rail       0
cost_rail       0
business        0
access_rail     0
service_rail    0
female          0
income          0
choice          0
dtype: int64


Unnamed: 0.1,Unnamed: 0,ID,RP,SP,RP_journey,SP_task,av_car,av_rail,time_car,cost_car,time_rail,cost_rail,business,access_rail,service_rail,female,income,choice,RowID
0,129,9,1,0,1,-1.0,1,1,250,45,155,65,1,15,0,0,52.314,1,1
1,145,10,1,0,1,-1.0,1,1,345,45,130,35,0,5,0,0,66.143,1,2
2,146,10,1,0,2,-1.0,1,1,275,50,120,45,0,15,0,0,66.143,1,3
3,161,11,1,0,1,-1.0,1,1,300,30,155,75,0,25,0,1,26.718,1,4
4,178,12,1,0,2,-1.0,1,1,300,35,140,45,0,20,0,0,47.709,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,7938,497,1,0,2,-1.0,1,1,275,30,155,65,0,25,0,0,65.335,1,508
508,7954,498,1,0,2,-1.0,1,1,345,35,170,35,0,15,0,1,37.162,2,509
509,7969,499,1,0,1,-1.0,1,1,390,30,130,75,1,15,0,1,21.154,1,510
510,7985,500,1,0,1,-1.0,1,1,390,35,155,55,1,15,0,0,19.910,1,511


In [24]:
# Wrap the cleaned DataFrame in a Biogeme Database named "ModeChoice";
# the model cells below pass this object to bio.BIOGEME.
database = db.Database("ModeChoice", df)
# Display the frame held inside the Database. In the recorded output the
# values appear as floats (e.g. 129.0 instead of 129).
# NOTE(review): `_df` is a private attribute — prefer a public accessor if
# the installed Biogeme version offers one; confirm against its documentation.
database._df

Unnamed: 0.1,Unnamed: 0,ID,RP,SP,RP_journey,SP_task,av_car,av_rail,time_car,cost_car,time_rail,cost_rail,business,access_rail,service_rail,female,income,choice,RowID
0,129.0,9.0,1.0,0.0,1.0,-1.0,1.0,1.0,250.0,45.0,155.0,65.0,1.0,15.0,0.0,0.0,52.314,1.0,1.0
1,145.0,10.0,1.0,0.0,1.0,-1.0,1.0,1.0,345.0,45.0,130.0,35.0,0.0,5.0,0.0,0.0,66.143,1.0,2.0
2,146.0,10.0,1.0,0.0,2.0,-1.0,1.0,1.0,275.0,50.0,120.0,45.0,0.0,15.0,0.0,0.0,66.143,1.0,3.0
3,161.0,11.0,1.0,0.0,1.0,-1.0,1.0,1.0,300.0,30.0,155.0,75.0,0.0,25.0,0.0,1.0,26.718,1.0,4.0
4,178.0,12.0,1.0,0.0,2.0,-1.0,1.0,1.0,300.0,35.0,140.0,45.0,0.0,20.0,0.0,0.0,47.709,1.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,7938.0,497.0,1.0,0.0,2.0,-1.0,1.0,1.0,275.0,30.0,155.0,65.0,0.0,25.0,0.0,0.0,65.335,1.0,508.0
508,7954.0,498.0,1.0,0.0,2.0,-1.0,1.0,1.0,345.0,35.0,170.0,35.0,0.0,15.0,0.0,1.0,37.162,2.0,509.0
509,7969.0,499.0,1.0,0.0,1.0,-1.0,1.0,1.0,390.0,30.0,130.0,75.0,1.0,15.0,0.0,1.0,21.154,1.0,510.0
510,7985.0,500.0,1.0,0.0,1.0,-1.0,1.0,1.0,390.0,35.0,155.0,55.0,1.0,15.0,0.0,0.0,19.910,1.0,511.0


In [25]:
"""
Use the loop below to get all of columns names.
for col in df.columns:
    print(f'{col} = Variable("{col}")')

1. The code will generate the Variable for each column.    
2. Copy the output and paste it below to define the variables.
"""
for col in df.columns:
    print(f'{col} = Variable("{col}")')


Unnamed: 0 = Variable("Unnamed: 0")
ID = Variable("ID")
RP = Variable("RP")
SP = Variable("SP")
RP_journey = Variable("RP_journey")
SP_task = Variable("SP_task")
av_car = Variable("av_car")
av_rail = Variable("av_rail")
time_car = Variable("time_car")
cost_car = Variable("cost_car")
time_rail = Variable("time_rail")
cost_rail = Variable("cost_rail")
business = Variable("business")
access_rail = Variable("access_rail")
service_rail = Variable("service_rail")
female = Variable("female")
income = Variable("income")
choice = Variable("choice")
RowID = Variable("RowID")


In [26]:
# Define one Biogeme Variable per DataFrame column used in the models.
# Each Python name matches the column name passed to Variable(), so the
# utility functions below can refer to the columns directly.
# Not every column is declared here: "Unnamed: 0", "SP_task" and "RowID"
# are left out.
ID = Variable("ID")
RP = Variable("RP")
SP = Variable("SP")
RP_journey = Variable("RP_journey")
# Availability indicators, used as the availability mapping in the models
av_car = Variable("av_car")
av_rail = Variable("av_rail")
# Attributes of the alternatives
time_car = Variable("time_car")
cost_car = Variable("cost_car")
time_rail = Variable("time_rail")
cost_rail = Variable("cost_rail")
business = Variable("business")
access_rail = Variable("access_rail")
service_rail = Variable("service_rail")
female = Variable("female")
income = Variable("income")
# Chosen alternative (1: Car, 2: Rail, as noted in the model cells)
choice = Variable("choice")

# 3. Binary Logit Model


#### Model A

In [27]:
# Model A: alternative-specific constants plus a generic cost coefficient.

# Parameters to be estimated
## Beta(name, initial value, lower bound, upper bound, status)
## status = 0: the parameter is estimated
## status = 1: the parameter is held fixed at its initial value
## Fixing asc_car at 0 makes car the reference alternative.
asc_car = Beta('asc_car', 0, None, None, 1)  # fixed -> reference alternative
asc_rail = Beta('asc_rail', 0, None, None, 0)
b_cost = Beta('b_cost', 0, None, None, 0)

# Utility functions
V_car = asc_car + b_cost * cost_car
V_rail = asc_rail + b_cost * cost_rail

# Coding of the alternatives in the dataset:
#   1: Car
#   2: Rail

# Utility of each alternative, keyed by the alternative's code
V = {1: V_car, 2: V_rail}

# Availability of each alternative, keyed by the same codes
av = {1: av_car, 2: av_rail}

# Define the choice model
# loglogit(utilities mapping, availability mapping, choice variable)
logprob = models.loglogit(V, av, choice)

# Set up the estimation problem
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = 'Model_A'  # Set the model name

# Calculate the null log likelihood (reported in the summary below)
the_biogeme.calculate_null_loglikelihood(av)

# Estimate the model and keep the results
Model_A = the_biogeme.estimate()

# Print the summary, then display the parameter table.
# get_pandas_estimated_parameters replaces the deprecated
# Model_A.get_estimated_parameters() method.
print(Model_A.short_summary())
get_pandas_estimated_parameters(estimation_results=Model_A)

Results for model Model_A
Nbr of parameters:		2
Sample size:			512
Excluded data:			0
Null log likelihood:		-354.8914
Final log likelihood:		-344.5628
Likelihood ratio test (null):		20.65708
Rho square (null):			0.0291
Rho bar square (null):			0.0235
Akaike Information Criterion:	693.1256
Bayesian Information Criterion:	701.6023



  Model_A.get_estimated_parameters()


Unnamed: 0,Name,Value,Robust std err.,Robust t-stat.,Robust p-value
0,b_cost,-0.022415,0.005687,-3.941476,8.1e-05
1,asc_rail,0.126531,0.119387,1.059833,0.289221


#### Model B

In [28]:
# Model B: Model A plus a generic travel-time coefficient.

# Parameters to be estimated
## Beta(name, initial value, lower bound, upper bound, status)
## status = 0: the parameter is estimated
## status = 1: the parameter is held fixed at its initial value
## Fixing asc_car at 0 makes car the reference alternative.
asc_car = Beta('asc_car', 0, None, None, 1)  # fixed -> reference alternative
asc_rail = Beta('asc_rail', 0, None, None, 0)
b_cost = Beta('b_cost', 0, None, None, 0)
b_tt = Beta('b_tt', 0, None, None, 0)

# Utility functions
V_car = asc_car + b_cost * cost_car + b_tt * time_car
V_rail = asc_rail + b_cost * cost_rail + b_tt * time_rail

# Coding of the alternatives in the dataset:
#   1: Car
#   2: Rail

# Utility of each alternative, keyed by the alternative's code
V = {1: V_car, 2: V_rail}

# Availability of each alternative, keyed by the same codes
av = {1: av_car, 2: av_rail}

# Define the choice model
# loglogit(utilities mapping, availability mapping, choice variable)
logprob = models.loglogit(V, av, choice)

# Set up the estimation problem
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = 'Model_B'  # Set the model name

# Calculate the null log likelihood (reported in the summary below)
the_biogeme.calculate_null_loglikelihood(av)

# Estimate the model and keep the results
Model_B = the_biogeme.estimate()

# Print the summary, then display the parameter table.
# get_pandas_estimated_parameters replaces the deprecated
# Model_B.get_estimated_parameters() method.
print(Model_B.short_summary())
get_pandas_estimated_parameters(estimation_results=Model_B)

Results for model Model_B
Nbr of parameters:		3
Sample size:			512
Excluded data:			0
Null log likelihood:		-354.8914
Final log likelihood:		-342.2266
Likelihood ratio test (null):		25.32959
Rho square (null):			0.0357
Rho bar square (null):			0.0272
Akaike Information Criterion:	690.4531
Bayesian Information Criterion:	703.1681



  Model_B.get_estimated_parameters()


Unnamed: 0,Name,Value,Robust std err.,Robust t-stat.,Robust p-value
0,b_cost,-0.026745,0.006053,-4.418222,1e-05
1,b_tt,-0.003782,0.001746,-2.165887,0.03032
2,asc_rail,-0.439832,0.286333,-1.536087,0.124517


#### Model C

In [29]:
# Model C: Model B plus a business-trip effect on the rail utility.

# Parameters to be estimated
## Beta(name, initial value, lower bound, upper bound, status)
## status = 0: the parameter is estimated
## status = 1: the parameter is held fixed at its initial value
## Fixing asc_car at 0 makes car the reference alternative.
asc_car = Beta('asc_car', 0, None, None, 1)  # fixed -> reference alternative
asc_rail = Beta('asc_rail', 0, None, None, 0)
b_cost = Beta('b_cost', 0, None, None, 0)
b_tt = Beta('b_tt', 0, None, None, 0)
b_business_rail = Beta('b_business_rail', 0, None, None, 0)

# Utility functions
# The business term enters the rail utility only, so b_business_rail is
# measured relative to car.
V_car = asc_car + b_cost * cost_car + b_tt * time_car
V_rail = asc_rail + b_cost * cost_rail + b_tt * time_rail + b_business_rail * business

# Coding of the alternatives in the dataset:
#   1: Car
#   2: Rail

# Utility of each alternative, keyed by the alternative's code
V = {1: V_car, 2: V_rail}

# Availability of each alternative, keyed by the same codes
av = {1: av_car, 2: av_rail}

# Define the choice model
# loglogit(utilities mapping, availability mapping, choice variable)
logprob = models.loglogit(V, av, choice)

# Set up the estimation problem
the_biogeme = bio.BIOGEME(database, logprob)
the_biogeme.model_name = 'Model_C'  # Set the model name

# Calculate the null log likelihood (reported in the summary below)
the_biogeme.calculate_null_loglikelihood(av)

# Estimate the model and keep the results
Model_C = the_biogeme.estimate()

# Print the summary, then display the parameter table.
# get_pandas_estimated_parameters replaces the deprecated
# Model_C.get_estimated_parameters() method.
print(Model_C.short_summary())
get_pandas_estimated_parameters(estimation_results=Model_C)

Results for model Model_C
Nbr of parameters:		4
Sample size:			512
Excluded data:			0
Null log likelihood:		-354.8914
Final log likelihood:		-329.0222
Likelihood ratio test (null):		51.73827
Rho square (null):			0.0729
Rho bar square (null):			0.0616
Akaike Information Criterion:	666.0444
Bayesian Information Criterion:	682.9977



  Model_C.get_estimated_parameters()


Unnamed: 0,Name,Value,Robust std err.,Robust t-stat.,Robust p-value
0,b_cost,-0.027234,0.00623,-4.371523,1.233826e-05
1,b_tt,-0.003786,0.001797,-2.107531,0.03507155
2,asc_rail,-0.755264,0.301166,-2.507802,0.01214847
3,b_business_rail,1.015091,0.203645,4.984612,6.208613e-07
