In [13]:
# Load modules
import pandas as pd
import numpy as np

from biogeme.expressions import Beta, Variable
from biogeme.models import loglogit

import biogeme.biogeme as bio
import biogeme.database as db

In [14]:
# Model specification: a three-alternative multinomial logit (MNL) whose
# utilities are linear in travel time and cost, with coefficients shared
# across alternatives.

# Column holding the chosen alternative
CHOICE = Variable('CHOICE')

# Travel time (TT) and cost of each alternative: train, SM and car
TRAIN_TT, TRAIN_COST = Variable('TRAIN_TT'), Variable('TRAIN_COST')
SM_TT, SM_COST = Variable('SM_TT'), Variable('SM_COST')
CAR_TT, CAR_COST = Variable('CAR_TT'), Variable('CAR_COST')

# Generic coefficients. Beta arguments are: name, starting value, lower bound,
# upper bound, status; status 0 leaves the parameter free to be estimated.
B_TT = Beta('B_TT',0,None,None,0)
B_COST = Beta('B_COST',0,None,None,0)

# Systematic utilities: 1 = train, 2 = SM, 3 = car
V1 = TRAIN_TT * B_TT + TRAIN_COST * B_COST
V2 = SM_TT * B_TT + SM_COST * B_COST
V3 = CAR_TT * B_TT + CAR_COST * B_COST

V = {
    1: V1,
    2: V2,
    3: V3,
}

# Every alternative is always available
av = {alt: 1 for alt in V}

# Log-probability of the chosen alternative under the MNL model
logprob = loglogit(V,av,CHOICE)

In [15]:
# Dataset 1: read the train/test splits
data_train = pd.read_csv('data/data_rum_1_train.csv')
data_test = pd.read_csv('data/data_rum_1_test.csv')

# J is used downstream as the number of alternatives in the null log-likelihood;
# K is presumably the number of attributes per alternative (not used in the
# visible cells) -- TODO confirm
J, K = 3, 2

# Attribute columns, ordered cost/time within train, SM, car; the marginal
# utility arrays built later follow this same order
Xvars = [
    'TRAIN_COST','TRAIN_TT',
    'SM_COST','SM_TT',
    'CAR_COST','CAR_TT',
]
X_test = data_test[Xvars].to_numpy()

# Wrap both splits in Biogeme databases
database = db.Database('pseudosynthetic_1',data_train)
database_test = db.Database('pseudosynthetic_1_test',data_test)

# Biogeme object estimated on the training split
model = bio.BIOGEME(database,logprob)

# Keep the run silent: no HTML report, no pickle, no saved iterations
model.generateHtml = model.generatePickle = model.saveIterations = False
model.modelName = None

# Maximum likelihood estimation
results = model.estimate()

In [16]:
# Fit metrics for dataset 1, evaluated at the estimated coefficients

# Same model formula bound to the test database
model_test = bio.BIOGEME(database_test,logprob)

# simulate() returns the per-observation value of the formula (here the
# log-probability); summing gives the sample log-likelihood
ll_train = model.simulate(theBetaValues=results.getBetaValues()).to_numpy().sum()
ll_test = model_test.simulate(theBetaValues=results.getBetaValues()).to_numpy().sum()
ll_full = ll_train + ll_test

# Rho-squared against the equal-shares null model (probability 1/J for each alternative)
ll_null_test = len(X_test)*np.log(1/J)
r2_test = 1 - ll_test/ll_null_test

# Collect the metrics, save them and display them
metrics = pd.Series(
    {
        'Log-lik (full)': ll_full,
        'Log-lik (train)': ll_train,
        'Log-lik (test)': ll_test,
        'Rho-sq (test)': r2_test,
    },
    name='Value',
)
metrics.to_csv('results/mnl_synth_1_metrics.csv')
metrics

Log-lik (full)    -5807.074220
Log-lik (train)   -4620.881086
Log-lik (test)    -1186.193134
Rho-sq (test)         0.402810
Name: Value, dtype: float64

In [17]:
# Marginal utilities (MU) for dataset 1: estimated vs. 'true' values
pars = results.getEstimatedParameters()

# The utilities are linear with generic coefficients, so every alternative
# shares the same estimated cost and time marginal utility
mu_train_cost = mu_sm_cost = mu_car_cost = pars['Value']['B_COST']
mu_train_tt = mu_sm_tt = mu_car_tt = pars['Value']['B_TT']

# One row, six columns, ordered like Xvars
mu_array = np.c_[mu_train_cost,mu_train_tt,mu_sm_cost,mu_sm_tt,mu_car_cost,mu_car_tt]

# 'True' MU: beta[0] is applied to cost and beta[1] to travel time
beta = [-2,-3]

true_mu_train_cost = true_mu_sm_cost = true_mu_car_cost = beta[0]
true_mu_train_tt = true_mu_sm_tt = true_mu_car_tt = beta[1]

true_mu_array = np.c_[true_mu_train_cost,true_mu_train_tt,true_mu_sm_cost,true_mu_sm_tt,true_mu_car_cost,true_mu_car_tt]

# Summary statistics of the estimates (column-wise)
mu_mean   = np.mean(mu_array,axis=0)
mu_std    = np.std(mu_array,axis=0)
mu_median = np.median(mu_array,axis=0)

# Error of the estimates with respect to the 'true' values
mu_error = mu_array-true_mu_array
mu_mean_bias = np.mean(mu_error,axis=0)
mu_rmse = np.sqrt(np.mean(mu_error**2,axis=0))

# Summary statistics of the 'true' values
true_mu_mean   = np.mean(true_mu_array,axis=0)
true_mu_median = np.median(true_mu_array,axis=0)

In [18]:
# Marginal utility summary for dataset 1: one row per attribute, saved and displayed
df_mu = pd.DataFrame(
    {
        'Mean': mu_mean,
        'Mean true': true_mu_mean,
        'Median': mu_median,
        'Median true': true_mu_median,
        'Mean bias': mu_mean_bias,
        'RMSE': mu_rmse,
    },
    index=Xvars,
)
df_mu.to_csv('results/mnl_synth_1_mu.csv')
df_mu

Unnamed: 0,Mean,Mean true,Median,Median true,Mean bias,RMSE
TRAIN_COST,-2.005254,-2.0,-2.005254,-2.0,-0.005254,0.005254
TRAIN_TT,-3.048169,-3.0,-3.048169,-3.0,-0.048169,0.048169
SM_COST,-2.005254,-2.0,-2.005254,-2.0,-0.005254,0.005254
SM_TT,-3.048169,-3.0,-3.048169,-3.0,-0.048169,0.048169
CAR_COST,-2.005254,-2.0,-2.005254,-2.0,-0.005254,0.005254
CAR_TT,-3.048169,-3.0,-3.048169,-3.0,-0.048169,0.048169


In [19]:
# Value of travel time (VTT): ratio of the time MU to the cost MU, per alternative

# Estimated VTT
vtt_train = mu_train_tt/mu_train_cost
vtt_sm = mu_sm_tt/mu_sm_cost
vtt_car = mu_car_tt/mu_car_cost

vtt_array = np.c_[vtt_train,vtt_sm,vtt_car]

# 'True' VTT, from the 'true' marginal utilities
true_vtt_train = true_mu_train_tt/true_mu_train_cost
true_vtt_sm = true_mu_sm_tt/true_mu_sm_cost
true_vtt_car = true_mu_car_tt/true_mu_car_cost

true_vtt_array = np.c_[true_vtt_train,true_vtt_sm,true_vtt_car]

# Summary statistics of the estimates (column-wise)
vtt_mean   = np.mean(vtt_array,axis=0)
vtt_std    = np.std(vtt_array,axis=0)
vtt_median = np.median(vtt_array,axis=0)

# Error of the estimates with respect to the 'true' values
vtt_error = vtt_array-true_vtt_array
vtt_mean_bias = np.mean(vtt_error,axis=0)
vtt_rmse = np.sqrt(np.mean(vtt_error**2,axis=0))

# Summary statistics of the 'true' values
true_vtt_mean   = np.mean(true_vtt_array,axis=0)
true_vtt_median = np.median(true_vtt_array,axis=0)

In [20]:
# VTT summary for dataset 1: one row per alternative, saved and displayed
vtt_names = ['TRAIN', 'SM', 'CAR']
df_vtt = pd.DataFrame(
    {
        'Mean': vtt_mean,
        'Mean true': true_vtt_mean,
        'Median': vtt_median,
        'Median true': true_vtt_median,
        'Mean bias': vtt_mean_bias,
        'RMSE': vtt_rmse,
    },
    index=vtt_names,
)
df_vtt.to_csv('results/mnl_synth_1_vtt.csv')
df_vtt

Unnamed: 0,Mean,Mean true,Median,Median true,Mean bias,RMSE
TRAIN,1.520092,1.5,1.520092,1.5,0.020092,0.020092
SM,1.520092,1.5,1.520092,1.5,0.020092,0.020092
CAR,1.520092,1.5,1.520092,1.5,0.020092,0.020092


In [21]:
# Dataset 4: read the train/test splits
data_train = pd.read_csv('data/data_rum_4_train.csv')
data_test = pd.read_csv('data/data_rum_4_test.csv')

# J is used downstream as the number of alternatives in the null log-likelihood;
# K is presumably the number of attributes per alternative (not used in the
# visible cells) -- TODO confirm
J, K = 3, 2

# Xvars (the attribute column list) is reused from the dataset 1 section
X_test = data_test[Xvars].to_numpy()

# Wrap both splits in Biogeme databases
database = db.Database('pseudosynthetic_4',data_train)
database_test = db.Database('pseudosynthetic_4_test',data_test)

# Biogeme object estimated on the training split (same MNL specification)
model = bio.BIOGEME(database,logprob)

# Keep the run silent: no HTML report, no pickle, no saved iterations
model.generateHtml = model.generatePickle = model.saveIterations = False
model.modelName = None

# Maximum likelihood estimation
results = model.estimate()

In [22]:
# Fit metrics for dataset 4, evaluated at the estimated coefficients

# Same model formula bound to the test database
model_test = bio.BIOGEME(database_test,logprob)

# simulate() returns the per-observation value of the formula (here the
# log-probability); summing gives the sample log-likelihood
ll_train = model.simulate(theBetaValues=results.getBetaValues()).to_numpy().sum()
ll_test = model_test.simulate(theBetaValues=results.getBetaValues()).to_numpy().sum()
ll_full = ll_train + ll_test

# Rho-squared against the equal-shares null model (probability 1/J for each alternative)
ll_null_test = len(X_test)*np.log(1/J)
r2_test = 1 - ll_test/ll_null_test

# Collect the metrics, save them and display them
metrics = pd.Series(
    {
        'Log-lik (full)': ll_full,
        'Log-lik (train)': ll_train,
        'Log-lik (test)': ll_test,
        'Rho-sq (test)': r2_test,
    },
    name='Value',
)
metrics.to_csv('results/mnl_synth_4_metrics.csv')
metrics

Log-lik (full)    -4961.577871
Log-lik (train)   -3952.830442
Log-lik (test)    -1008.747429
Rho-sq (test)         0.492145
Name: Value, dtype: float64

In [24]:
# Marginal utilities (MU) for dataset 4: estimated vs. 'true' values
pars = results.getEstimatedParameters()

# The estimated MNL is linear with generic coefficients, so every alternative
# shares the same estimated cost and time marginal utility
mu_train_cost = mu_sm_cost = mu_car_cost = pars['Value']['B_COST']
mu_train_tt = mu_sm_tt = mu_car_tt = pars['Value']['B_TT']

# One row, six columns, ordered like Xvars
mu_array = np.c_[mu_train_cost,mu_train_tt,mu_sm_cost,mu_sm_tt,mu_car_cost,mu_car_tt]

# Sample size and problem dimensions
N, J, K = len(data_test), 3, 2

# Test-sample attributes; columns follow Xvars
X = data_test[Xvars].to_numpy()

# 'True' MU varies by observation: beta / (attribute + 0.1), with beta[0] applied
# to cost columns and beta[1] to travel-time columns.
# NOTE(review): presumably the derivative of a beta*log(x + 0.1) utility used to
# generate the data -- confirm against the data generator.
beta = [-3.,-5.]
X_shift = X+0.1

true_mu_train_cost = beta[0]/X_shift[:,0]
true_mu_train_tt   = beta[1]/X_shift[:,1]
true_mu_sm_cost    = beta[0]/X_shift[:,2]
true_mu_sm_tt      = beta[1]/X_shift[:,3]
true_mu_car_cost   = beta[0]/X_shift[:,4]
true_mu_car_tt     = beta[1]/X_shift[:,5]

# One row per test observation, six columns
true_mu_array = np.c_[true_mu_train_cost,true_mu_train_tt,true_mu_sm_cost,true_mu_sm_tt,true_mu_car_cost,true_mu_car_tt]

# Summary statistics of the estimates (column-wise)
mu_mean   = np.mean(mu_array,axis=0)
mu_std    = np.std(mu_array,axis=0)
mu_median = np.median(mu_array,axis=0)

# Error with respect to the 'true' values; the single row of estimates is
# broadcast against every observation's 'true' values
mu_error = mu_array-true_mu_array
mu_mean_bias = np.mean(mu_error,axis=0)
mu_rmse = np.sqrt(np.mean(mu_error**2,axis=0))

# Summary statistics of the 'true' values across observations
true_mu_mean   = np.mean(true_mu_array,axis=0)
true_mu_median = np.median(true_mu_array,axis=0)

In [25]:
# Marginal utility summary for dataset 4: one row per attribute, saved and displayed
df_mu = pd.DataFrame(
    {
        'Mean': mu_mean,
        'Mean true': true_mu_mean,
        'Median': mu_median,
        'Median true': true_mu_median,
        'Mean bias': mu_mean_bias,
        'RMSE': mu_rmse,
    },
    index=Xvars,
)
df_mu.to_csv('results/mnl_synth_4_mu.csv')
df_mu

Unnamed: 0,Mean,Mean true,Median,Median true,Mean bias,RMSE
TRAIN_COST,-2.242227,-5.692981,-2.242227,-3.26087,3.450754,7.953034
TRAIN_TT,-3.777047,-3.235772,-3.777047,-2.808989,-0.541275,1.588509
SM_COST,-2.242227,-5.156657,-2.242227,-2.777778,2.91443,7.76414
SM_TT,-3.777047,-6.032699,-3.777047,-5.434783,2.255652,3.563527
CAR_COST,-2.242227,-3.6201,-2.242227,-3.191489,1.377872,2.346561
CAR_TT,-3.777047,-3.901054,-3.777047,-3.333333,0.124006,1.886008


In [26]:
# Value of travel time (VTT): ratio of the time MU to the cost MU, per alternative

# Estimated VTT (a single value per alternative)
vtt_train = mu_train_tt/mu_train_cost
vtt_sm = mu_sm_tt/mu_sm_cost
vtt_car = mu_car_tt/mu_car_cost

vtt_array = np.c_[vtt_train,vtt_sm,vtt_car]

# 'True' VTT, from the observation-specific 'true' marginal utilities
true_vtt_train = true_mu_train_tt/true_mu_train_cost
true_vtt_sm = true_mu_sm_tt/true_mu_sm_cost
true_vtt_car = true_mu_car_tt/true_mu_car_cost

true_vtt_array = np.c_[true_vtt_train,true_vtt_sm,true_vtt_car]

# Summary statistics of the estimates (column-wise)
vtt_mean   = np.mean(vtt_array,axis=0)
vtt_std    = np.std(vtt_array,axis=0)
vtt_median = np.median(vtt_array,axis=0)

# Error with respect to the 'true' values; the single row of estimates is
# broadcast against every observation's 'true' values
vtt_error = vtt_array-true_vtt_array
vtt_mean_bias = np.mean(vtt_error,axis=0)
vtt_rmse = np.sqrt(np.mean(vtt_error**2,axis=0))

# Summary statistics of the 'true' values across observations
true_vtt_mean   = np.mean(true_vtt_array,axis=0)
true_vtt_median = np.median(true_vtt_array,axis=0)

In [27]:
# VTT summary for dataset 4: one row per alternative, saved and displayed
vtt_names = ['TRAIN', 'SM', 'CAR']
df_vtt = pd.DataFrame(
    {
        'Mean': vtt_mean,
        'Mean true': true_vtt_mean,
        'Median': vtt_median,
        'Median true': true_vtt_median,
        'Mean bias': vtt_mean_bias,
        'RMSE': vtt_rmse,
    },
    index=vtt_names,
)
df_vtt.to_csv('results/mnl_synth_4_vtt.csv')
df_vtt

Unnamed: 0,Mean,Mean true,Median,Median true,Mean bias,RMSE
TRAIN,1.684507,1.005046,1.684507,0.849465,0.679461,1.167797
SM,1.684507,2.276807,1.684507,1.839679,-0.5923,2.164404
CAR,1.684507,1.153914,1.684507,1.074074,0.530593,0.686745
