# <font color=red>To Do:</font>
<p>
<ol>
    <li> <strike>Try using smaller dataset and use the new specification</strike> </li>
    <li> <strike>Try using starting values that are similar to the mnl values (try truncated or adding jitter).</strike> </li>
    <li> <strike>Try rescaling variables so they are all on the same order of magnitude</strike> </li>
    <li>  <strike>Try specifying the alternatives better for shared ride 2 and 3+ (for instance taking into account couples and families).</strike>
        <ol>
            <li> <strike> Include a variable for whether or not a person lives in a household with family members (1 - non_related_flag).</strike> </li>
            <li> <strike>Include a variable for the number of kids in a household under 18 or whether or no there are kids under 18.</strike> </li>
            <li> <strike>Include a variable indicating whether the person is married or living with their partner.</strike> </li>
            <li> <strike>Perhaps include cross-bay tour? That may be sketchy since that variable was created specifically based on whether one's origins and destinations were near BART.</strike> </li>
        </ol>
    </li>
    <li> Look at Tierra's dissertation to see how she got the travel cost parameters to be negative. Did she do something I am not? </li>
    <li> Try "making" PyPolychord with an explicit flag setting MPI=0. If that works, see if it estimates the clog-log model any faster than PyMultiNest. </li>
</ol>
</p>
    
<font color=darkgreen size=4>Note: What finally worked was specifying the "cost" variable for driving as dollars-per-mile instead of simply as dollars.</font>

In [1]:
from collections import OrderedDict
from pprint import pprint
import math
import os

import scipy.stats
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt

import general_choice_model as cm

%matplotlib inline



# Get the Mode Choice Data

In [2]:
# Create a variable for the path to the long format data for the multinomial choice model
long_form_path = "spring_2016_all_bay_area_long_format_plus_cross_bay_col.csv"
bike_data = pd.read_csv(long_form_path)

# If in previous work we accidentally saved the index with the dataframe
# remove the old index from the data
if "Unnamed: 0" in bike_data.columns:
    del bike_data["Unnamed: 0"]
    
print "The columns of bike_data are:"
bike_data.columns

The columns of bike_data are:


Index([u'household_id', u'person_id', u'tour_id', u'observation_id',
       u'mode_id', u'choice', u'tour_origin_taz', u'primary_dest_taz',
       u'total_travel_time', u'total_travel_cost', u'total_travel_distance',
       u'age', u'household_size', u'household_income',
       u'household_income_values', u'transit_subsidy',
       u'transit_subsidy_amount', u'num_cars', u'num_licensed_drivers',
       u'cross_bay', u'oakland_and_berkeley', u'survey_id', u'gender',
       u'non_relative_flag', u'num_pre_school', u'num_school_aged', u'married',
       u'parent', u'income_category_1', u'income_category_2',
       u'income_category_3', u'income_category_4', u'income_category_5',
       u'income_category_6', u'income_category_7', u'income_category_8',
       u'income_category_9', u'income_category_10', u'income_unknown',
       u'ln_drive_cost', u'ln_drive_cost_sq', u'total_travel_time_10x',
       u'total_travel_time_tenth', u'high_income', u'medium_income',
       u'low_income', u'high_i

In [3]:
# Look at the mode shares in the data set
alt_id_to_mode_name = {1: "Drive Alone",
                       2: "Shared Ride 2",
                       3: "Shared Ride 3+",
                       4: "Walk-Transit-Walk",
                       5: "Drive-Transit-Walk",
                       6: "Walk-Transit-Drive",
                       7: "Walk",
                       8: "Bike"}

mode_counts = bike_data.loc[bike_data.choice==1,
                            "mode_id"].value_counts().loc[range(1, 9)]

mode_shares = mode_counts / bike_data.observation_id.max()
mode_shares.index = [alt_id_to_mode_name[x] for x in mode_shares.index.values]
mode_shares.name = "Mode Shares"
mode_shares

Drive Alone           0.428322
Shared Ride 2         0.158841
Shared Ride 3+        0.139860
Walk-Transit-Walk     0.103397
Drive-Transit-Walk    0.015485
Walk-Transit-Drive    0.013237
Walk                  0.094406
Bike                  0.046454
Name: Mode Shares, dtype: float64

# Add variables that may be useful

In [4]:
# Initialize a container to map the income categories to price range strings
income_cat_to_value_range = dict(zip(range(1, 11),
                                     ["0-10k", "10-25k",
                                      "25-35k", "35-50k",
                                      "50-75k", "75-100k",
                                      "100-150k", "150-200k",
                                      "200-250k", "250k+"]))

# Add the income info as dummy variables to the dataset
for i in range(1, 11):
    bike_data["income_category_{}".format(i)] = (bike_data["household_income"] == i).astype(int)
    
# Take special account of the income categories we don't know
bike_data["income_unknown"] = bike_data["household_income"].isin([98, 99]).astype(int)

In [5]:
# Add the natural log of travel costs to the dataset
bike_data["ln_drive_cost"] = 0
bike_data.loc[bike_data.mode_id.isin([1, 2, 3]),
              "ln_drive_cost"] = np.log(bike_data.loc[bike_data.mode_id.isin([1, 2, 3]),
                                                      "total_travel_cost"] )
bike_data["ln_drive_cost_sq"] = np.square(bike_data["ln_drive_cost"])

In [6]:
# Scale travel time by 10 or by 0.1. I don't remember which 
# will increase the value of the estimate
bike_data["total_travel_time_10x"] = 10 * bike_data["total_travel_time"]
bike_data["total_travel_time_tenth"] = 0.1 * bike_data["total_travel_time"]

In [7]:
# Create categories for low, medium, and high income
# Also create categories for unknown income

# Low = 35k or less a year
# Medium = 35k to 100k
# High = 100k and up
bike_data["high_income"] = bike_data.household_income.isin(range(7, 11)).astype(int)
bike_data["medium_income"] = bike_data.household_income.isin(range(4, 7)).astype(int)
bike_data["low_income"] = bike_data.household_income.isin(range(1, 4)).astype(int)
bike_data["income_unknown"] = bike_data.household_income.isin([98, 99]).astype(int)

# Create interactions of cost and income categories
bike_data["high_income_cost"] = bike_data["total_travel_cost"] * bike_data["high_income"]
bike_data["medium_income_cost"] = bike_data["total_travel_cost"] * bike_data["medium_income"]
bike_data["low_income_cost"] = bike_data["total_travel_cost"] * bike_data["low_income"]
bike_data["unknown_income_cost"] = bike_data["total_travel_cost"] * bike_data["income_unknown"]

bike_data["high_income_ln_cost"] = bike_data["ln_drive_cost"] * bike_data["high_income"]
bike_data["medium_income_ln_cost"] = bike_data["ln_drive_cost"] * bike_data["medium_income"]
bike_data["low_income_ln_cost"] = bike_data["ln_drive_cost"] * bike_data["low_income"]
bike_data["unknown_income_ln_cost"] = bike_data["ln_drive_cost"] * bike_data["income_unknown"]

In [8]:
# Hypothesis 4: Still omitted variables bias
# The idea is that people who own cars will, in general,
# want to drive those cars. Moreover, I think the number
# of  cars per licensed drivers is a proxy for how much
# an individual likes to drive. The greater one's income,
# the greater the likelihood of owning many cars

# Create a cars per licensed drivers column
bike_data["cars_per_licensed_drivers"] = 0
bike_data.loc[bike_data.num_licensed_drivers > 0,
              "cars_per_licensed_drivers"] = bike_data.num_cars / bike_data.num_licensed_drivers.astype(float)

In [9]:
# Add a variable representing the number of people under 18 years old in the household
bike_data["num_kids"] = bike_data["num_pre_school"] + bike_data["num_school_aged"]

# Add a variable for whether or not the individual's 
# household has one or more members in it who are related to each other
bike_data["family_in_household"] = 1 - bike_data["non_relative_flag"]

# Add a variable for whether a person is a married woman
bike_data["married_woman"] = (bike_data["gender"] == 2).astype(int) * bike_data["married"]

# Add a variable representing cost divided by distance
bike_data["cost_per_distance"] = 0
bike_data.loc[bike_data.mode_id.isin([1, 2, 3]),
              "cost_per_distance"] = (bike_data.loc[bike_data.mode_id.isin([1, 2, 3]),
                                                "total_travel_cost"] /
                                      bike_data.loc[bike_data.mode_id.isin([1, 2, 3]),
                                                    "total_travel_distance"])

# Specify and estimate various choice models.

In [10]:
# Create my specification and variable names for the basic MNL model
# NOTE: - Keys should be variables within the long format dataframe.
#         The sole exception to this is the "intercept" key.
#       - For the specification dictionary, the values should be lists
#         or lists of lists. Within a list, or within the inner-most
#         list should be the alternative ID's of the alternative whose
#         utility specification the explanatory variable is entering.

mnl_specification = OrderedDict()
mnl_names = OrderedDict()

mnl_specification["intercept"] = range(2, 9)
mnl_names["intercept"] = ['ASC Shared Ride: 2',
                          'ASC Shared Ride: 3+',
                          'ASC Walk-Transit-Walk',
                          'ASC Drive-Transit-Walk',
                          'ASC Walk-Transit-Drive',
                          'ASC Walk',
                          'ASC Bike']

mnl_specification["total_travel_time"] = [[1, 2, 3], [4, 5, 6]]
mnl_names["total_travel_time"] = ['Travel Time, units:min (All Auto Modes)',
                                  'Travel Time, units:min (All Transit Modes)']

mnl_specification["total_travel_cost"] = [[4, 5, 6]]
mnl_names["total_travel_cost"] = ['Travel Cost, units:$ (All Transit Modes)']

mnl_specification["cost_per_distance"] = [1, 2, 3]
mnl_names["cost_per_distance"] = ["Travel Cost per Distance, units:$/mi (Drive Alone)",
                                  "Travel Cost per Distance, units:$/mi (SharedRide-2)",
                                  "Travel Cost per Distance, units:$/mi (SharedRide-3+)"]

mnl_specification["cars_per_licensed_drivers"] = [[1, 2, 3]]
mnl_names["cars_per_licensed_drivers"] = ["Autos per licensed drivers (All Auto Modes)"]

mnl_specification["total_travel_distance"] = [7, 8]
mnl_names["total_travel_distance"] = ['Travel Distance, units:mi (Walk)',
                                      'Travel Distance, units:mi (Bike)']

# mnl_specification["cross_bay"] = [[2, 3], [4, 5, 6]]
# mnl_names["cross_bay"] = ["Cross-Bay Tour (Shared Ride 2 & 3+)",
#                           "Cross-Bay Tour (All Transit Modes)"]
mnl_specification["cross_bay"] = [[2, 3]]
mnl_names["cross_bay"] = ["Cross-Bay Tour (Shared Ride 2 & 3+)"]

mnl_specification["household_size"] = [[2, 3]]
mnl_names["household_size"] = ['Household Size (Shared Ride 2 & 3+)']

mnl_specification["num_kids"] = [[2, 3]]
mnl_names["num_kids"] = ["Number of Kids in Household (Shared Ride 2 & 3+)"]

### Use this cell to keep track of the unused specification commands

In [11]:
# Original working specification of travel costs
# mnl_specification["total_travel_cost"] = [1, 2, 3, [4, 5, 6]]
# mnl_names["total_travel_cost"] = ['Travel Cost, units:$ (Drive Alone)',
#                                   'Travel Cost, units:$ (SharedRide-2)',
#                                   'Travel Cost, units:$ (SharedRide-3+)',
#                                   'Travel Cost, units:$ (All Transit Modes)']

# Add the natural log of travel costs to represent the log-normal distributed travel
# costs for the class conditional distribution of travel costs for drivers
# mnl_specification["ln_drive_cost"] = [1, 2, 3]
# mnl_names["ln_drive_cost"] = ["Ln(Travel Cost, units:$) (Drive Alone)",
#                               "Ln(Travel Cost, units:$) (SharedRide-2)",
#                               "Ln(Travel Cost, units:$) (SharedRide-3+)"]

# mnl_specification["ln_drive_cost_sq"] = [1, 2, 3]
# mnl_names["ln_drive_cost_sq"] = ["[Ln(Travel Cost, units:$)]^2 (Drive Alone)",
#                                  "[Ln(Travel Cost, units:$)]^2 (SharedRide-2)",
#                                  "[Ln(Travel Cost, units:$)]^2 (SharedRide-3+)"]

# Add main effects for income categories to the model
# for i in range(2, 11):
#     mnl_specification["income_category_{}".format(i)] = [[1, 2, 3]]
#     coef_string = "Household Income {}, units:$ (All Auto Modes)"
#     coef_string = coef_string.format(income_cat_to_value_range[i])
#     mnl_names["income_category_{}".format(i)] = [coef_string]
# # Don't forget the people with unknown income
# mnl_specification["income_unknown"] = [[1, 2, 3]]
# mnl_names["income_unknown"] = ["Household Income Unknown (All Auto Modes)"]

# Add interactions with course income categories
# mnl_specification["high_income_cost"] = [[1, 2, 3]]
# mnl_names["high_income_cost"] = ['Travel Cost, units:$ (All Auto Modes, Income > \$75k)']

# mnl_specification["medium_income_cost"] = [[1, 2, 3]]
# mnl_names["medium_income_cost"] = ['Travel Cost, units:$ (All Auto Modes, \$35k < Income < \$75k)']

# mnl_specification["low_income_cost"] = [[1, 2, 3]]
# mnl_names["low_income_cost"] = ['Travel Cost, units:$ (All Auto Modes, Income < \$35k)']

# mnl_specification["unknown_income_cost"] = [[1, 2, 3]]
# mnl_names["unknown_income_cost"] = ['Travel Cost, units:$ (All Auto Modes, Income unknown)']

# Add interactions with ln cost
# mnl_specification["high_income_ln_cost"] = [[1, 2, 3]]
# mnl_names["high_income_ln_cost"] = ['Ln(Travel Cost, units:$) (All Auto Modes, Income > \$75k)']

# mnl_specification["medium_income_ln_cost"] = [[1, 2, 3]]
# mnl_names["medium_income_ln_cost"] = ['Ln(Travel Cost, units:$) (All Auto Modes, \$35k < Income < \$75k)']

# mnl_specification["low_income_cost"] = [[1, 2, 3]]
# mnl_names["low_income_cost"] = ['Travel Cost, units:$ (All Auto Modes, Income < \$35k)']

# mnl_specification["unknown_income_ln_cost"] = [[1, 2, 3]]
# mnl_names["unknown_income_ln_cost"] = ['Ln(Travel Cost, units:$) (All Auto Modes, Income unknown)']

# Add variables for whether the person lives with a spouse/partner
# mnl_specification["married"] = [[2, 3]]
# mnl_names["married"] = ["Lives with Partner/Spouse (Shared Ride 2 & 3+)"]

# Interact gender with whether the person lives with a spouse/partner
# mnl_specification["married_woman"] = [[2, 3]]
# mnl_names["married_woman"] = ["Female Living with Partner/Spouse (Shared Ride 2 & 3+)"]

Note that in order to get correct signs on ALL of the travel cost coefficients for driving, I had to
<ol>
    <li> Disaggregate the travel cost coefficients from being generic across the three driving modes to being alternative specific. This changed my significant and positive travel cost coefficient into negative coefficients for the shared ride alternatives and still positive for the drive alone alternative. The drive alone and shared ride 2 coefficients were insignificant though.</li>
    <li> Add a variable for the number of automobiles per household divided by the number of licensed drivers in the household. This made all travel cost coefficients negative, though the drive alone coefficient is insignificant.</li>
</ol>

Despite the specification above getting me correct signs, my preferred specification would probably be using a log-normal transformation on the travel cost for each of the driving modes (i.e. including $\ln (\textrm{cost})$ and $\left[ \ln(\textrm{cost}) \right]^2$ with alternative specific coefficients in each of the driving modes' utilities).

## Estimate a Multinomial Logit Model

In [12]:
# Estimate the basic MNL model, using the hessian and newton-conjugate gradient
mnl_model = cm.create_choice_model(data=bike_data,
                                   alt_id_col="mode_id",
                                   obs_id_col="observation_id",
                                   choice_col="choice",
                                   specification=mnl_specification,
                                   model_type="MNL",
                                   names=mnl_names)

num_vars = len(reduce(lambda x, y: x + y, mnl_names.values()))
# Note newton-cg used to ensure convergence to a point where gradient 
# is essentially zero for all dimensions. 
mnl_model.fit_mle(np.zeros(num_vars),
                  method="BFGS")

# Look at the estimation results
mnl_model.get_statsmodels_summary()

Log-likelihood at zero: -7,599.7019
Initial Log-likelihood: -7,599.7019




Estimation Time: 0.25 seconds.
Final log-likelihood: -5,073.4276


0,1,2,3
Dep. Variable:,choice,No. Observations:,4004.0
Model:,Multinomial Logit Model,Df Residuals:,3985.0
Method:,MLE,Df Model:,19.0
Date:,"Fri, 21 Jul 2017",Pseudo R-squ.:,0.332
Time:,14:39:16,Pseudo R-bar-squ.:,0.33
AIC:,10184.855,Log-Likelihood:,-5073.428
BIC:,10304.461,LL-Null:,-7599.702

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
ASC Shared Ride: 2,-1.0097,0.486,-2.079,0.038,-1.962 -0.058
ASC Shared Ride: 3+,3.4619,1.064,3.254,0.001,1.377 5.547
ASC Walk-Transit-Walk,-0.3921,0.288,-1.360,0.174,-0.957 0.173
ASC Drive-Transit-Walk,-2.6220,0.303,-8.660,0.000,-3.215 -2.029
ASC Walk-Transit-Drive,-2.9773,0.306,-9.725,0.000,-3.577 -2.377
ASC Walk,1.5541,0.305,5.101,0.000,0.957 2.151
ASC Bike,-1.1059,0.305,-3.628,0.000,-1.703 -0.508
"Travel Time, units:min (All Auto Modes)",-0.0760,0.006,-13.728,0.000,-0.087 -0.065
"Travel Time, units:min (All Transit Modes)",-0.0274,0.002,-12.768,0.000,-0.032 -0.023


In [13]:
# Look at the gradient of the estimated parameters
mnl_model.gradient

ASC Shared Ride: 2                                     -8.859282e-07
ASC Shared Ride: 3+                                     1.276466e-07
ASC Walk-Transit-Walk                                   9.080250e-08
ASC Drive-Transit-Walk                                  1.145707e-08
ASC Walk-Transit-Drive                                 -6.369727e-07
ASC Walk                                                3.714483e-07
ASC Bike                                                6.687875e-07
Travel Time, units:min (All Auto Modes)                -9.226280e-06
Travel Time, units:min (All Transit Modes)             -2.664724e-05
Travel Cost, units:$ (All Transit Modes)               -2.664269e-06
Travel Cost per Distance, units:$/mi (Drive Alone)      1.059869e-08
Travel Cost per Distance, units:$/mi (SharedRide-2)    -1.125364e-07
Travel Cost per Distance, units:$/mi (SharedRide-3+)    9.414940e-09
Autos per licensed drivers (All Auto Modes)            -9.467068e-07
Travel Distance, units:mi (Walk)  

## Estimate a Multinomial Clog-log model

In [14]:
# Create the various specification and name dictionaries 
# for the clog-log model
clog_specification = OrderedDict()
clog_names = OrderedDict()

for col in mnl_specification:
    if col != "intercept":
        clog_specification[col] = mnl_specification[col]
        clog_names[col] = mnl_names[col]

# Get the list of intercept names for the clog-log model        
clog_intercept_names = mnl_names["intercept"]

In [15]:
# Estimate the Clog-log model based on the MNL model
clog_model = cm.create_choice_model(data=bike_data,
                                    alt_id_col="mode_id",
                                    obs_id_col="observation_id",
                                    choice_col="choice",
                                    specification=clog_specification,
                                    model_type="Cloglog",
                                    intercept_ref_pos=0,
                                    names=clog_names,
                                    intercept_names=clog_intercept_names)
 
clog_model.fit_mle(None,
                   init_intercepts=mnl_model.params.values[:7],
                   init_coefs=mnl_model.params.values[7:],
                   method='powell')

# Look at the parameter estimates
clog_model.get_statsmodels_summary()

Log-likelihood at zero: -7,599.7019
Initial Log-likelihood: -6,333.2964


  transformations = np.log(exp_exp_v - 1)
  **kwargs)
  derivs = 1.0 / (denom_part_1 * exp_neg_v)
  exp_exp_v = np.exp(exp_v)


Estimation Time: 58.51 seconds.
Final log-likelihood: -5,116.1742


0,1,2,3
Dep. Variable:,choice,No. Observations:,4004.0
Model:,Multinomial Clog-log Model,Df Residuals:,3985.0
Method:,MLE,Df Model:,19.0
Date:,"Fri, 21 Jul 2017",Pseudo R-squ.:,0.327
Time:,14:41:08,Pseudo R-bar-squ.:,0.324
AIC:,10270.348,Log-Likelihood:,-5116.174
BIC:,10389.954,LL-Null:,-7599.702

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
ASC Shared Ride: 2,1.0331,0.551,1.874,0.061,-0.047 2.113
ASC Shared Ride: 3+,5.9355,1.125,5.275,0.000,3.730 8.141
ASC Walk-Transit-Walk,-1.7237,0.304,-5.662,0.000,-2.320 -1.127
ASC Drive-Transit-Walk,-3.9833,0.319,-12.470,0.000,-4.609 -3.357
ASC Walk-Transit-Drive,-4.3262,0.323,-13.382,0.000,-4.960 -3.693
ASC Walk,-0.0892,0.338,-0.264,0.792,-0.751 0.573
ASC Bike,-2.8691,0.337,-8.520,0.000,-3.529 -2.209
"Travel Time, units:min (All Auto Modes)",-0.0773,0.006,-13.756,0.000,-0.088 -0.066
"Travel Time, units:min (All Transit Modes)",-0.0263,0.002,-12.214,0.000,-0.031 -0.022


In [16]:
# Look at the gradient of the estimated parameters
clog_model.gradient

ASC Shared Ride: 2                                      -0.906185
ASC Shared Ride: 3+                                      0.403990
ASC Walk-Transit-Walk                                   -0.598174
ASC Drive-Transit-Walk                                   0.060425
ASC Walk-Transit-Drive                                   0.000778
ASC Walk                                                 0.018416
ASC Bike                                                -0.267434
Travel Time, units:min (All Auto Modes)                  5.410969
Travel Time, units:min (All Transit Modes)             -51.956767
Travel Cost, units:$ (All Transit Modes)                -2.766699
Travel Cost per Distance, units:$/mi (Drive Alone)       0.162307
Travel Cost per Distance, units:$/mi (SharedRide-2)     -0.012860
Travel Cost per Distance, units:$/mi (SharedRide-3+)     0.001429
Autos per licensed drivers (All Auto Modes)              0.584494
Travel Distance, units:mi (Walk)                         0.026058
Travel Dis

The clog-log model is far from a local optimum. However, none of the other solvers get me closer to a local maximum of the log-likelihood functiom.

For future reference, these results change <strong>slightly</strong> and non-significantly when one changes the method used to guard against underflow and overflow.

## Estimate a Multinomial Asymmetric Logit Model

In [17]:
# Give names to the shape parameters of the asymmetric logit model
asym_shape_names = ["shape_" + x for x in
                   ["Shared Ride: 2", "Shared Ride: 3+",
                   "Walk-Transit-Walk", "Drive-Transit-Walk",
                   "Walk-Transit-Drive", "Walk", "Bicycle"]]

# Note the index of the alternative whose shape parameter is constrained.
asym_ref = 0

In [18]:
asym_model = cm.create_choice_model(data=bike_data,
                                    alt_id_col="mode_id",
                                    obs_id_col="observation_id",
                                    choice_col="choice",
                                    specification=mnl_specification,
                                    model_type="Asym",
                                    shape_ref_pos=asym_ref,
                                    names=mnl_names,
                                    shape_names=asym_shape_names)

# Note that the division by log(8) is to account for the fact that
# when each shape parameter is 1/8, the value of the estimated coefficients
# are equal to the mnl estimates, divided by log(8)
asym_model.fit_mle(None,
                   init_shapes=np.zeros(7),
                   init_coefs=mnl_model.params.values / math.log(8),
                   method="BFGS")

# Look at the parameter estimates
asym_model.get_statsmodels_summary()

Log-likelihood at zero: -7,599.7019
Initial Log-likelihood: -5,073.4276
Estimation Time: 1.28 seconds.
Final log-likelihood: -4,941.0134


0,1,2,3
Dep. Variable:,choice,No. Observations:,4004.0
Model:,Multinomial Asymmetric Logit Model,Df Residuals:,3978.0
Method:,MLE,Df Model:,26.0
Date:,"Fri, 21 Jul 2017",Pseudo R-squ.:,0.35
Time:,14:41:15,Pseudo R-bar-squ.:,0.346
AIC:,9934.027,Log-Likelihood:,-4941.013
BIC:,10097.698,LL-Null:,-7599.702

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
shape_Shared Ride: 2,2.0077,0.546,3.679,0.000,0.938 3.077
shape_Shared Ride: 3+,2.8046,0.552,5.076,0.000,1.722 3.887
shape_Walk-Transit-Walk,-1.3426,7.134,-0.188,0.851,-15.325 12.639
shape_Drive-Transit-Walk,-3.5829,3.110,-1.152,0.249,-9.678 2.512
shape_Walk-Transit-Drive,-3.9531,2.462,-1.606,0.108,-8.778 0.872
shape_Walk,-0.9607,0.364,-2.638,0.008,-1.675 -0.247
shape_Bicycle,-1.6316,0.367,-4.441,0.000,-2.352 -0.912
ASC Shared Ride: 2,-1.2413,0.315,-3.944,0.000,-1.858 -0.624
ASC Shared Ride: 3+,-0.7234,0.419,-1.725,0.084,-1.545 0.098


In [19]:
# Check the gradient of the asymmetric logit model
asym_model.gradient

shape_Shared Ride: 2                                   -0.001157
shape_Shared Ride: 3+                                  -0.005698
shape_Walk-Transit-Walk                                 0.009139
shape_Drive-Transit-Walk                               -0.003537
shape_Walk-Transit-Drive                               -0.001077
shape_Walk                                              0.004360
shape_Bicycle                                           0.001726
ASC Shared Ride: 2                                      0.019494
ASC Shared Ride: 3+                                    -0.067352
ASC Walk-Transit-Walk                                   0.019544
ASC Drive-Transit-Walk                                  0.079492
ASC Walk-Transit-Drive                                  0.002303
ASC Walk                                                0.121689
ASC Bike                                               -0.197913
Travel Time, units:min (All Auto Modes)                -0.344429
Travel Time, units:min (A

Again, this model is not at a local optimum.

## Estimate a Scobit Model

In [20]:
# Create the names of the shape parameters that are needed for the scobit model
scobit_shape_names = ["shape_" + x for x in
                      ["Drive Alone", "Shared Ride: 2",
                       "Shared Ride: 3+", "Walk-Transit-Walk",
                       "Drive-Transit-Walk", "Walk-Transit-Drive", 
                       "Walk", "Bicycle"]]

scobit_intercept_ref = 0

In [21]:
scobit_model = cm.create_choice_model(data=bike_data,
                                      alt_id_col="mode_id",
                                      obs_id_col="observation_id",
                                      choice_col="choice",
                                      specification=clog_specification,
                                      model_type="Scobit",
                                      intercept_ref_pos=scobit_intercept_ref,
                                      names=clog_names,
                                      intercept_names=clog_intercept_names,
                                      shape_names=scobit_shape_names)

scobit_model.fit_mle(None,
                     init_shapes=np.zeros(8),
                     init_intercepts=mnl_model.params.values[:7],
                     init_coefs=mnl_model.params.values[7:],
                     method="BFGS",
                     maxiter=1200)

# Look at the parameter estimates
scobit_model.get_statsmodels_summary()

Log-likelihood at zero: -7,599.7019
Initial Log-likelihood: -5,073.4276


  powered_term = np.power(1 + exp_neg_v, long_natural_shapes)
  term_2 = np.log(powered_term - 1)
  powered_term = np.power(1 + exp_neg_v, long_curve_shapes)
  powered_term / (powered_term - 1)) * long_curve_shapes
  powered_term / (powered_term - 1)) * long_curve_shapes
  powered_term = np.power(1 + exp_neg_v, long_curve_shapes)
  small_powered_term = np.power(1 + exp_neg_v, long_curve_shapes - 1)
  (powered_term - 1))
  (powered_term - 1))
  too_small_idx = transformed_utilities < min_exponent_val
  too_large_idx = transformed_utilities > max_exponent_val
  shape_too_big_idx = np.where((np.abs(systematic_utilities) <= 10) &
  small_powered_term /
  powered_term / (powered_term - 1)) * long_curve_shapes


Estimation Time: 3.57 seconds.
Final log-likelihood: -4,902.7909


0,1,2,3
Dep. Variable:,choice,No. Observations:,4004.0
Model:,Multinomial Scobit Model,Df Residuals:,3977.0
Method:,MLE,Df Model:,27.0
Date:,"Fri, 21 Jul 2017",Pseudo R-squ.:,0.355
Time:,14:43:13,Pseudo R-bar-squ.:,0.351
AIC:,9859.582,Log-Likelihood:,-4902.791
BIC:,10029.548,LL-Null:,-7599.702

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
shape_Drive Alone,0.5028,0.372,1.351,0.177,-0.227 1.232
shape_Shared Ride: 2,0.8035,0.375,2.142,0.032,0.068 1.539
shape_Shared Ride: 3+,0.9875,0.342,2.891,0.004,0.318 1.657
shape_Walk-Transit-Walk,2.9168,1.571,1.856,0.063,-0.163 5.996
shape_Drive-Transit-Walk,2.5653,1.572,1.632,0.103,-0.515 5.646
shape_Walk-Transit-Drive,2.4344,1.571,1.549,0.121,-0.645 5.514
shape_Walk,-0.8113,0.677,-1.198,0.231,-2.139 0.516
shape_Bicycle,-0.6618,0.509,-1.300,0.194,-1.660 0.336
ASC Shared Ride: 2,-0.2797,0.380,-0.735,0.462,-1.025 0.466


In [22]:
# Look at the gradient of the estimated parameters
scobit_model.gradient

shape_Drive Alone                                      -2.607805e-06
shape_Shared Ride: 2                                   -6.209087e-07
shape_Shared Ride: 3+                                   9.744885e-08
shape_Walk-Transit-Walk                                 7.362141e-06
shape_Drive-Transit-Walk                               -1.094433e-06
shape_Walk-Transit-Drive                               -7.469307e-08
shape_Walk                                              7.388080e-07
shape_Bicycle                                          -4.337768e-08
ASC Shared Ride: 2                                      1.636304e-07
ASC Shared Ride: 3+                                    -2.698426e-08
ASC Walk-Transit-Walk                                  -4.219510e-07
ASC Drive-Transit-Walk                                  6.264987e-08
ASC Walk-Transit-Drive                                  7.681464e-09
ASC Walk                                               -2.237817e-07
ASC Bike                          

Woot woot, the scobit model is not far from a local optima! (The gradient is approximately zero).

## Estimate a Multinomial Uneven Logit Model

In [23]:
uneven_model = cm.create_choice_model(data=bike_data,
                                       alt_id_col="mode_id",
                                       obs_id_col="observation_id",
                                       choice_col="choice",
                                       specification=clog_specification,
                                       model_type="Uneven",
                                       intercept_ref_pos=scobit_intercept_ref,
                                       names=clog_names,
                                       shape_names=scobit_shape_names,
                                       intercept_names=clog_intercept_names)

# Note that there are sign restrictions on the parameters so we 
# use the TNC optimization method and pass an argument that specifies
# these bounds for a constrained optimization.
uneven_model.fit_mle(None,
                     init_shapes=np.zeros(8),
                     init_intercepts=mnl_model.params.values[:7],
                     init_coefs=mnl_model.params.values[7:],
                     method="BFGS",
                     maxiter=1200)

# Look at the parameter estimates
uneven_model.get_statsmodels_summary()

Log-likelihood at zero: -7,599.7019
Initial Log-likelihood: -5,073.4276


  systematic_utilities)
  exp_shape_utilities = np.exp(long_shapes * systematic_utilities)
  exp_shape_utilities = np.exp(long_shapes * systematic_utilities)


Estimation Time: 13.05 seconds.
Final log-likelihood: -4,868.3533


0,1,2,3
Dep. Variable:,choice,No. Observations:,4004.0
Model:,Multinomial Uneven Logit Model,Df Residuals:,3977.0
Method:,MLE,Df Model:,27.0
Date:,"Fri, 21 Jul 2017",Pseudo R-squ.:,0.359
Time:,14:45:16,Pseudo R-bar-squ.:,0.356
AIC:,9790.707,Log-Likelihood:,-4868.353
BIC:,9960.673,LL-Null:,-7599.702

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
shape_Drive Alone,10.1565,7463.004,0.001,0.999,-1.46e+04 1.46e+04
shape_Shared Ride: 2,10.4403,7462.961,0.001,0.999,-1.46e+04 1.46e+04
shape_Shared Ride: 3+,10.6308,7462.953,0.001,0.999,-1.46e+04 1.46e+04
shape_Walk-Transit-Walk,-2.5517,9932.206,-0.000,1.000,-1.95e+04 1.95e+04
shape_Drive-Transit-Walk,-2.9018,9932.201,-0.000,1.000,-1.95e+04 1.95e+04
shape_Walk-Transit-Drive,-3.0172,9932.201,-0.000,1.000,-1.95e+04 1.95e+04
shape_Walk,0.1455,1.781,0.082,0.935,-3.345 3.636
shape_Bicycle,0.2788,1.298,0.215,0.830,-2.265 2.823
ASC Shared Ride: 2,-0.8064,0.261,-3.088,0.002,-1.318 -0.295


In [24]:
# Look at the gradient of the estimated parameters
uneven_model.gradient

shape_Drive Alone                                          0.014320
shape_Shared Ride: 2                                      -0.008025
shape_Shared Ride: 3+                                      0.025629
shape_Walk-Transit-Walk                                   -0.010737
shape_Drive-Transit-Walk                                  -0.002779
shape_Walk-Transit-Drive                                  -0.002034
shape_Walk                                                -0.001720
shape_Bicycle                                             -0.001141
ASC Shared Ride: 2                                         0.005546
ASC Shared Ride: 3+                                       -0.007126
ASC Walk-Transit-Walk                                      0.002482
ASC Drive-Transit-Walk                                     0.000736
ASC Walk-Transit-Drive                                     0.000591
ASC Walk                                                   0.000756
ASC Bike                                        

In [25]:
uneven_model.estimation_message

'Maximum number of iterations has been exceeded.'

The gradient for the uneven logit is not terribly close to zero, but the optimization function reached the maximum number of function evaluations. However, increasing the maximum number of evaluations does not seem to help. 

# Compare models

In [26]:
# Create a list of the models estimated in this notebook
current_models = [mnl_model, clog_model, asym_model, scobit_model, uneven_model]

In [27]:
# Look at the value of time for each model
for model in current_models:
    print model.model_type
    value_time_da = 30 *60 * (model.params["Travel Time, units:min (All Auto Modes)"] /
                          model.params["Travel Cost per Distance, units:$/mi (Drive Alone)"])
    value_time_s2 = 30 *60 * (model.params["Travel Time, units:min (All Auto Modes)"] /
                          model.params["Travel Cost per Distance, units:$/mi (SharedRide-2)"])
    value_time_s3 = 30 *60 * (model.params["Travel Time, units:min (All Auto Modes)"] /
                          model.params["Travel Cost per Distance, units:$/mi (SharedRide-3+)"])
    value_time_transit = 60 * (model.params["Travel Time, units:min (All Transit Modes)"] /
                               model.params["Travel Cost, units:$ (All Transit Modes)"])
    
    for mode, vt in zip(["Drive Alone", "SharedRide-2", "SharedRide-3+", "Transit"],
                        [value_time_da, value_time_s2, value_time_s3, value_time_transit]):
        conditional_msg = "" if vt > 0 else " <--(clearly wrong)"
#         if mode != "Transit":
#             conditional_msg = "-mi" + conditional_msg
        print "Value of Time for 30 mi {} is {:,.2f} $/hr{}".format(mode, vt, conditional_msg)
    print "="*35

Multinomial Logit Model
Value of Time for 30 mi Drive Alone is 27.01 $/hr
Value of Time for 30 mi SharedRide-2 is 6.73 $/hr
Value of Time for 30 mi SharedRide-3+ is 1.50 $/hr
Value of Time for 30 mi Transit is 12.92 $/hr
Multinomial Clog-log Model
Value of Time for 30 mi Drive Alone is 12.84 $/hr
Value of Time for 30 mi SharedRide-2 is 2.89 $/hr
Value of Time for 30 mi SharedRide-3+ is 1.02 $/hr
Value of Time for 30 mi Transit is 7.66 $/hr
Multinomial Asymmetric Logit Model
Value of Time for 30 mi Drive Alone is 31.00 $/hr
Value of Time for 30 mi SharedRide-2 is 9.72 $/hr
Value of Time for 30 mi SharedRide-3+ is 4.62 $/hr
Value of Time for 30 mi Transit is 12.09 $/hr
Multinomial Scobit Model
Value of Time for 30 mi Drive Alone is 17.58 $/hr
Value of Time for 30 mi SharedRide-2 is 6.92 $/hr
Value of Time for 30 mi SharedRide-3+ is 2.54 $/hr
Value of Time for 30 mi Transit is 10.87 $/hr
Multinomial Uneven Logit Model
Value of Time for 30 mi Drive Alone is 21.18 $/hr
Value of Time for 30 

In [28]:
# Compare the fit of all the models
fit_compare_df = pd.concat([model.fit_summary for model in current_models], axis=1)
fit_compare_df.columns = [model.model_type for model in current_models]
fit_compare_df

Unnamed: 0,Multinomial Logit Model,Multinomial Clog-log Model,Multinomial Asymmetric Logit Model,Multinomial Scobit Model,Multinomial Uneven Logit Model
Number of Parameters,19,19,26,27,27
Number of Observations,4004,4004,4004,4004,4004
Null Log-Likelihood,-7599.7,-7599.7,-7599.7,-7599.7,-7599.7
Fitted Log-Likelihood,-5073.43,-5116.17,-4941.01,-4902.79,-4868.35
Rho-Squared,0.332418,0.326793,0.349841,0.354871,0.359402
Rho-Bar-Squared,0.329917,0.324293,0.34642,0.351318,0.355849
Estimation Message,Desired error not necessarily achieved due to ...,Optimization terminated successfully.,Desired error not necessarily achieved due to ...,Desired error not necessarily achieved due to ...,Maximum number of iterations has been exceeded.


In [29]:
msg = "The geometric mean probability of correct prediction for the {} is {:.3%}"
unique_mode_ids = np.sort(bike_data.mode_id.unique())
for model in current_models:
    # Get the geometric mean probability of a correct in-sample probability forecast.
    geo_mean = np.exp(model.log_likelihood / model.nobs)
    
    # Figure out the geometric mean probability of correct predictions, by mode
    mode_results = OrderedDict()
    for mode in unique_mode_ids:
        rel_fitted_probs = model.long_fitted_probs[np.where((bike_data["mode_id"] == mode) &
                                                   (bike_data["choice"] == 1))]
        mode_results[mode] = round(np.exp(np.log(rel_fitted_probs).mean()), 4)
    print msg.format(model.model_type, geo_mean)
    print "The log-likelihood is {:.3f}".format(model.log_likelihood)
    print "Geometric means by mode:"
    pprint(mode_results.items())
    print "\n"

The geometric mean probability of correct prediction for the Multinomial Logit Model is 28.165%
The log-likelihood is -5073.428
Geometric means by mode:
[(1.0, 0.5314),
 (2.0, 0.1555),
 (3.0, 0.1984),
 (4.0, 0.2507),
 (5.0, 0.0506),
 (6.0, 0.0355),
 (7.0, 0.2526),
 (8.0, 0.0913)]


The geometric mean probability of correct prediction for the Multinomial Clog-log Model is 27.866%
The log-likelihood is -5116.174
Geometric means by mode:
[(1.0, 0.5284),
 (2.0, 0.1525),
 (3.0, 0.191),
 (4.0, 0.2462),
 (5.0, 0.0499),
 (6.0, 0.0366),
 (7.0, 0.2574),
 (8.0, 0.0917)]


The geometric mean probability of correct prediction for the Multinomial Asymmetric Logit Model is 29.112%
The log-likelihood is -4941.013
Geometric means by mode:
[(1.0, 0.5437),
 (2.0, 0.1636),
 (3.0, 0.2201),
 (4.0, 0.2506),
 (5.0, 0.0528),
 (6.0, 0.0358),
 (7.0, 0.253),
 (8.0, 0.091)]


The geometric mean probability of correct prediction for the Multinomial Scobit Model is 29.391%
The log-likelihood is -4902.791
Geometric m

In [30]:
msg = "The geometric mean probability of correct prediction for the old {} is {:.3%}"
old_models = {}
for model in ["logit", "asym_logit", "uneven_logit", "scobit", "clog_log"]:
    if model == "asym_logit":
        obj_path = "mle_model_results/multinomial_{}_model_v2_obj.pkl".format(model)
    else:
        obj_path = "mle_model_results/multinomial_{}_model_obj.pkl".format(model)
    old_models[model] = pd.read_pickle(obj_path)
    
    nobs = old_models[model].fitted_probs.shape[0]
    geo_mean = np.exp(old_models[model].log_likelihood / nobs)
    print msg.format(old_models[model].model_type, geo_mean)
    print "\n"

ImportError: No module named integrated_mnl

# Save the models, data and specifications

In [None]:
results_folder = "mle_model_results/cordon_toll_models/model_2"
for model in current_models:
    model_path = os.path.join(results_folder,
                              model.model_type
                                   .replace(" ", "_")
                                   .lower() + "_object.pkl")
    model.to_pickle(model_path)

In [None]:
# Save the bike data
bike_data.to_csv(long_form_path, index=False)

# Save the specification and parameter names
pd.to_pickle(mnl_specification, "cordon_toll_mnl_specification.pkl")
pd.to_pickle(mnl_names, "cordon_toll_mnl_specification_names.pkl")

# To Do:
<p>
    <ol>
        <li> Try again to see if the MultiNest (and possibly PolyChord) estimation will work. </li>
    </ol>
</p>