In [None]:
# Michel Bierlaire
# Thu Oct 25 13:49:48 2018

import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme.expressions import Beta, DefineVariable, log

df = pd.read_csv("swissmetro.dat",sep='\t')
database = db.Database("swissmetro",df)
pd.options.display.float_format = '{:.3g}'.format
globals().update(database.variables)

exclude = ((  PURPOSE   !=  1  ) * (  PURPOSE   !=  3  ) + (  CHOICE   ==  0  ) + (  AGE == 6  ) + (  INCOME == 4  ))>0
database.remove(exclude)


#Parameters to be estimated
# Arguments:
#   1  Name for report. Typically, the same as the variable
#   2  Starting value
#   3  Lower bound
#   4  Upper bound
#   5  0: estimate the parameter, 1: keep it fixed
ASC_CAR	 = Beta('ASC_CAR',0,None,None,0)
ASC_SBB	 = Beta('ASC_SBB',0,None,None,1)
ASC_SM	 = Beta('ASC_SM',0,None,None,0)
B_CAR_COST	 = Beta('B_CAR_COST',0,None,None,0)
B_HE	 = Beta('B_HE',0,None,None,0)
B_SM_COST	 = Beta('B_SM_COST',0,None,None,0)
B_TIME	 = Beta('B_TIME',0,None,None,0)
B_TRAIN_COST	 = Beta('B_TRAIN_COST',0,None,None,0)
B_SENIOR	 = Beta('B_SENIOR',0,None,None,0)
B_GA	 = Beta('B_GA',0,None,None,0)

# Define here arithmetic expressions for name that are not directly 
# available from the data

SENIOR  = DefineVariable('SENIOR', AGE   ==  5 ,database)
CAR_AV_SP  = DefineVariable('CAR_AV_SP', CAR_AV    *  (  SP   !=  0  ),database)
SM_COST  = DefineVariable('SM_COST', SM_CO   * (  GA   ==  0  ),database)
TRAIN_AV_SP  = DefineVariable('TRAIN_AV_SP', TRAIN_AV    *  (  SP   !=  0  ),database)
TRAIN_COST  = DefineVariable('TRAIN_COST', TRAIN_CO   * (  GA   ==  0  ),database)

TRAIN_TT_SCALED = DefineVariable('TRAIN_TT_SCALED',\
                                 TRAIN_TT / 100.0,database)
TRAIN_COST_SCALED = DefineVariable('TRAIN_COST_SCALED',\
                                   TRAIN_COST / 100,database)
SM_TT_SCALED = DefineVariable('SM_TT_SCALED', SM_TT / 100.0,database)
SM_COST_SCALED = DefineVariable('SM_COST_SCALED', SM_COST / 100,database)
CAR_TT_SCALED = DefineVariable('CAR_TT_SCALED', CAR_TT / 100,database)
CAR_CO_SCALED = DefineVariable('CAR_CO_SCALED', CAR_CO / 100,database)
TRAIN_HE_SCALED = DefineVariable('TRAIN_HE_SCALED', TRAIN_HE / 100,database)
SM_HE_SCALED = DefineVariable('SM_HE_SCALED', SM_HE / 100,database)

#Utilities
Car_SP = ASC_CAR + B_TIME * CAR_TT_SCALED + B_CAR_COST * CAR_CO_SCALED + B_SENIOR * SENIOR
SBB_SP = ASC_SBB + B_TIME * TRAIN_TT_SCALED + B_TRAIN_COST * TRAIN_COST_SCALED + B_HE * TRAIN_HE_SCALED + B_GA * GA
SM_SP = ASC_SM + B_TIME * SM_TT_SCALED + B_SM_COST * SM_COST_SCALED + B_HE * SM_HE_SCALED + B_GA * GA + B_SENIOR * SENIOR

V = {3: Car_SP,1: SBB_SP,2: SM_SP}
av = {3: CAR_AV_SP,1: TRAIN_AV_SP,2: SM_AV}

In [3]:
# Market Segmentation - Male and Female
# Duplicate the database
database_males = db.Database("airline_males",pd.DataFrame.copy(database.data))
database_females = db.Database("airline_females",pd.DataFrame.copy(database.data))
# Remove observations
database_males.remove(MALE   ==  0)
database_females.remove(MALE   ==  1)
print(f"Total number of observations: {database.getNumberOfObservations()}")
print(f"Females                     : {database_females.getNumberOfObservations()}")
print(f"Males                       : {database_males.getNumberOfObservations()}")

logprob = models.loglogit(V,av,CHOICE)

biogeme_full  = bio.BIOGEME(database,logprob)
biogeme_full.modelName = "SpecTest_SM_fullSample"
results_full = biogeme_full.estimate()
ll_full = results_full.data.logLike

biogeme_females  = bio.BIOGEME(database_females,logprob)
biogeme_females.modelName = "SpecTest_SM_females"
results_females = biogeme_females.estimate()
ll_females = results_females.data.logLike

biogeme_males  = bio.BIOGEME(database_males,logprob)
biogeme_males.modelName = "SpecTest_SM_males"
results_males = biogeme_males.estimate()
ll_males = results_males.data.logLike

print(f"LL full:    {ll_full:.3f}  Parameters: {results_full.data.nparam}")
print(f"LL females: {ll_females:.3f}  Parameters: {results_females.data.nparam}")
print(f"LL males:   {ll_males:.3f}  Parameters: {results_males.data.nparam}")
unrestricted = ll_females+ll_males
print(f"Sum LL :    {unrestricted:.3f}")
lr = -2 * (ll_full - unrestricted)
print(f"likelihood ratio: {lr:.3f}")
print("Output files:")
print(f"{results_full.data.htmlFileName}")
print(f"{results_females.data.htmlFileName}")
print(f"{results_males.data.htmlFileName}")

Total number of observations: 6192
Females                     : 1152
Males                       : 5040
LL full:    -4444.392  Parameters: 9
LL females: -867.078  Parameters: 9
LL males:   -3460.648  Parameters: 9
Sum LL :    -4327.726
likelihood ratio: 233.332
Output files:
SpecTest_SM_fullSample.html
SpecTest_SM_females.html
SpecTest_SM_males.html


In [4]:
# Market Segmentation - High, Medium and Low Income
# Duplicate the database
database_highinc = db.Database("swiss_highinc",pd.DataFrame.copy(database.data))
database_medinc = db.Database("swiss_medinc",pd.DataFrame.copy(database.data))
database_lowinc = db.Database("swiss_lowinc",pd.DataFrame.copy(database.data))
# Remove observations
database_highinc.remove(INCOME   !=  3)
database_medinc.remove(INCOME   !=  2)
database_lowinc.remove((INCOME   !=  0) * (INCOME   !=  1))

print(f"Total number of observations: {database.getNumberOfObservations()}")
print(f"High Income                 : {database_highinc.getNumberOfObservations()}")
print(f"Med Income                  : {database_medinc.getNumberOfObservations()}")
print(f"Low Income                  : {database_lowinc.getNumberOfObservations()}")

logprob = models.loglogit(V,av,CHOICE)

biogeme_full_inc  = bio.BIOGEME(database,logprob)
biogeme_full_inc.modelName = "SpecTest_SM_fullSample_income"
results_full_inc = biogeme_full.estimate()
ll_full_inc = results_full_inc.data.logLike

biogeme_highinc  = bio.BIOGEME(database_highinc,logprob)
biogeme_highinc.modelName = "SpecTest_SM_highinc"
results_highinc = biogeme_highinc.estimate()
ll_highinc = results_highinc.data.logLike

biogeme_medinc  = bio.BIOGEME(database_medinc,logprob)
biogeme_medinc.modelName = "SpecTest_SM_medinc"
results_medinc = biogeme_medinc.estimate()
ll_medinc = results_medinc.data.logLike

biogeme_lowinc  = bio.BIOGEME(database_lowinc,logprob)
biogeme_lowinc.modelName = "SpecTest_SM_lowinc"
results_lowinc = biogeme_lowinc.estimate()
ll_lowinc = results_lowinc.data.logLike

print(f"LL full income:    {ll_full_inc:.3f}  Parameters: {results_full_inc.data.nparam}")
print(f"LL high income: {ll_highinc:.3f}  Parameters: {results_highinc.data.nparam}")
print(f"LL med income: {ll_medinc:.3f}  Parameters: {results_medinc.data.nparam}")
print(f"LL low income: {ll_lowinc:.3f}  Parameters: {results_lowinc.data.nparam}")

unrestricted = ll_highinc+ll_medinc+ll_lowinc
print(f"Sum LL :    {unrestricted:.3f}")
lr = -2 * (ll_full_inc - unrestricted)
print(f"likelihood ratio: {lr:.3f}")
print("Output files:")
print(f"{results_full_inc.data.htmlFileName}")
print(f"{results_highinc.data.htmlFileName}")
print(f"{results_medinc.data.htmlFileName}")
print(f"{results_lowinc.data.htmlFileName}")

Total number of observations: 6192
High Income                 : 2907
Med Income                  : 2124
Low Income                  : 1161
LL full income:    -4444.392  Parameters: 9
LL high income: -1878.352  Parameters: 9
LL med income: -1583.673  Parameters: 9
LL low income: -863.462  Parameters: 9
Sum LL :    -4325.487
likelihood ratio: 237.809
Output files:
SpecTest_SM_fullSample~00.html
SpecTest_SM_highinc.html
SpecTest_SM_medinc.html
SpecTest_SM_lowinc.html


In [30]:
full_inc_pandasResults = results_full_inc.getEstimatedParameters()
high_inc_pandasResults = results_highinc.getEstimatedParameters()
med_inc_pandasResults = results_medinc.getEstimatedParameters()
low_inc_pandasResults = results_lowinc.getEstimatedParameters()
print(f'-----Parameter Statistics------')
print(full_inc_pandasResults)
print(high_inc_pandasResults)
print(med_inc_pandasResults)
print(low_inc_pandasResults)

-----Parameter Statistics------
               Value  Std err  t-test  p-value  Rob. Std err  Rob. t-test  \
ASC_CAR         -0.5     0.13   -3.86 0.000113         0.157        -3.19   
ASC_SM       -0.0799   0.0915  -0.873    0.383         0.118       -0.679   
B_CAR_COST        -1   0.0951   -10.5        0         0.123         -8.1   
B_GA           0.898    0.221    4.06 4.98e-05         0.224         4.01   
B_HE          -0.574    0.115      -5 5.72e-07         0.115        -4.98   
B_SENIOR       -1.69    0.128   -13.2        0         0.115        -14.7   
B_SM_COST      -1.07   0.0575   -18.6        0        0.0782        -13.7   
B_TIME         -1.14   0.0644   -17.6        0         0.132         -8.6   
B_TRAIN_COST   -2.82    0.129   -21.9        0           0.2        -14.1   

              Rob. p-value  
ASC_CAR            0.00141  
ASC_SM               0.497  
B_CAR_COST        4.44e-16  
B_GA              6.13e-05  
B_HE              6.33e-07  
B_SENIOR               

In [23]:
print(full_inc_pandasResults.iloc[:,0])

ASC_CAR           -0.5
ASC_SM         -0.0799
B_CAR_COST          -1
B_GA             0.898
B_HE            -0.574
B_SENIOR         -1.69
B_SM_COST        -1.07
B_TIME           -1.14
B_TRAIN_COST     -2.82
Name: Value, dtype: float64


In [47]:
from functools import reduce
data_frames = [full_inc_pandasResults[['Value', 'Rob. t-test']], high_inc_pandasResults[['Value', 'Rob. t-test']], \
               med_inc_pandasResults[['Value', 'Rob. t-test']], low_inc_pandasResults[['Value', 'Rob. t-test']]]
df_merged = reduce(lambda  left,right: pd.merge(left,right,left_index=True, right_index=True,
                                            how='outer'), data_frames)
# df_merged = reduce(lambda left,right: print(right[0],right[1]), enumerate(data_frames))
df_merged.columns = ['Value Full', 'Rob. t-test Full', 'Value High', 'Rob. t-test High', \
                     'Value Medium', 'Rob. t-test Medium', 'Value Low', 'Rob. t-test Low']
df_merged

  df_merged = reduce(lambda  left,right: pd.merge(left,right,left_index=True, right_index=True,


Unnamed: 0,Value Full,Rob. t-test Full,Value High,Rob. t-test High,Value Medium,Rob. t-test Medium,Value Low,Rob. t-test Low
ASC_CAR,-0.5,-3.19,-0.512,-2.22,-0.209,-0.706,-0.511,-1.09
ASC_SM,-0.0799,-0.679,-0.131,-0.708,0.551,2.44,-0.204,-0.619
B_CAR_COST,-1.0,-8.1,-0.672,-5.03,-1.25,-5.37,-1.88,-3.18
B_GA,0.898,4.01,2.05,4.27,0.549,1.41,-0.343,-0.885
B_HE,-0.574,-4.98,-0.625,-2.77,-0.832,-4.01,-0.451,-2.4
B_SENIOR,-1.69,-14.7,-0.545,-1.34,-2.29,-9.61,-1.49,-8.67
B_SM_COST,-1.07,-13.7,-1.01,-11.5,-1.23,-6.86,-1.64,-5.28
B_TIME,-1.14,-8.6,-1.77,-16.1,-0.507,-3.25,-0.709,-1.28
B_TRAIN_COST,-2.82,-14.1,-2.75,-9.32,-2.6,-5.6,-3.2,-7.81
