# Importing all necessary packages

In [51]:
import numpy as np
import pandas as pd
import scipy.stats
from fitter import Fitter
from collections import defaultdict, OrderedDict
import pylogit as pl
from functools import reduce
import seaborn as sns
import random
from scipy import sparse
import copy
import sys
from tqdm import tqdm, tqdm_notebook

# Distribution Fitting Class Definition

In [17]:
class FitDistribution(object):
    """Fit and simulate data to known distributions.

    Input:
    ------
    - data: array-like or dataframe.
    - dists: list.
        This parameter contains a list of distributions to be explored.
        When None, every available distribution on scipy is explored.
    - bins: int.
        Numbers of bins to be used for the cumulative histogram. This has
        an impact on the quality of the fit.
    - timeout: int.
        Maximum time for a given distribution. If timeout is reached,
        the distribution is skipped.
        """
    def __init__(self, data, dists=None, timeout=30, verbose=False, bins=100):
        self.data = data
        # self.var_types = var_types
        self.dists = dists
        self.timeout = timeout
        self.verbose = verbose
        self.bins = bins
        self.ArrayDistDict = defaultdict()
        self.params_dict = defaultdict(dict)

    def FindArrayDist(self, cat_var):
        """Function to extract the best distribution for a specified array.
        Uses the fit method from the Fitter module in the fitter library
        Inputs:
        -------
        - cat_var: boolean
            Boolean to signify whether the variable to be simulated
            is discrete/categorical or continuous.

        Outputs:
        -------
        By default, the function returns a dictionary with best distribution
        name and parameters associated with it. If a number of distributions
        was specified, the function returns a pandas DataFrame with
        the N best distributions, along with a plot showing all of them."""
        self.ArrayDistDict = dict()
        if cat_var is True:
            self.ArrayDistDict['distribution'] = 'categorical'
            np_array_range = np.arange(self.data.max()+1)
            array_bincount = np.bincount(self.data)
            probs = array_bincount / len(self.data)

            self.ArrayDistDict['parameters'] = [np_array_range,
                                                probs]
        else:
            fitter_object = Fitter(data=self.data,
                                   distributions=self.dists,
                                   timeout=self.timeout)
            fitter_object.fit()
            BestDict = fitter_object.get_best()
            self.ArrayDistDict['distribution'] = list(BestDict.items())[0][0]
            self.ArrayDistDict['parameters'] = list(BestDict.items())[0][1]
        return self.ArrayDistDict

    def SimArray(self, size=100):
        """Function to simulate data for an array based on the best fitted
        distribution.
        Input:
        -----
        - size : int
                size of the array to be simulated.
        Outputs:
        -------
        Simulated array based on the best fit distribution."""
        if self.ArrayDistDict['distribution'] == 'categorical':
            value = self.ArrayDistDict['parameters'][0]
            freq = self.ArrayDistDict['parameters'][1]
            Sim_Array = np.random.choice(a=value,
                                         p=freq,
                                         size=size)
        else:
            dist = getattr(scipy.stats, self.ArrayDistDict['distribution'])
            Sim_Array = dist.rvs(*self.ArrayDistDict['parameters'], size=size)
        return Sim_Array

    def FindDfDist(self, var_types):
        """Function to extract the best distribution from a specified
        dataframe. Uses the function find_dist, which in turn uses the
        fit method from the Fitter module in the fitter library
        Inputs:
        -------
        - var_types: dictionary
            Dictionary with keys as column names for dataset variables,
            the value of each key is a string showing whether the
            variable is discrete/cat or continuous.

        Outputs:
        -------
        *FOR NOW*, the function returns a dictionary showing the best
        distribution name for each array in the dataframe and parameters
        associated with it.
        """

        for column in list(self.data.columns):

            if var_types[column] == 'categorical':
                if len(self.data[column].unique()) == 1:
                    self.params_dict[column]['distribution'] = 'constant'
                    self.params_dict[column]['parameters'] = \
                        self.data[column].unique()
                else:
                    self.params_dict[column]['distribution'] = 'categorical'
                    np_array_range = np.arange(self.data[column].max()+1)
                    array_bincount = np.bincount(self.data[column])
                    probs = array_bincount / len(self.data[column])
                    self.params_dict[column]['parameters'] = [np_array_range,
                                                              probs]
            else:
                if len(self.data[column].unique()) == 1:
                    self.params_dict[column]['distribution'] = 'constant'
                    self.params_dict[column]['parameters'] = \
                        self.data[column].unique()
                else:
                    fitter_object = Fitter(data=self.data[column],
                                           distributions=self.dists,
                                           timeout=self.timeout)
                    fitter_object.fit()
                    BestDict = fitter_object.get_best()
                    self.params_dict[column]['distribution'] = \
                        list(BestDict.items())[0][0]
                    self.params_dict[column]['parameters'] = \
                        list(BestDict.items())[0][1]
        return self.params_dict

    def SimDf(self, size=1000):
        """Funtion to simulate data of size N based on specified
        distribution/parameters found by the fitter package.
        Inputs:
        -------
        data: dataframe from which columns are to be taken
        dist_params: the distribution parameters from find_dist_df
        Outputs:
        -------
        DataFrame object with simulated data based on specified distributions
        """
        Sim_Df = pd.DataFrame(columns=list(self.params_dict.keys()))
        Sim_Df = Sim_Df.fillna(0)
        for column in list(self.params_dict.keys()):
            if self.params_dict[column]['distribution'] == 'categorical':
                value = self.params_dict[column]['parameters'][0]
                freq = self.params_dict[column]['parameters'][1]
                data_sim = np.random.choice(a=value,
                                            p=freq,
                                            size=size)
                Sim_Df[column] = data_sim
            elif self.params_dict[column]['distribution'] == 'constant':
                data_sim = self.params_dict[column]['parameters'][0]
                Sim_Df[column] = data_sim
            else:
                dist = getattr(scipy.stats,
                               self.params_dict[column]['distribution'])
                data_sim = dist.rvs(*self.params_dict[column]['parameters'],
                                    size=size)
                Sim_Df[column] = data_sim
        return Sim_Df

## Find Distribution Based on Long Data

In [90]:


def FindLongDataDist(data_long,
                     alt_id_col,
                     obs_id_col,
                     alt_spec,
                     alt_name_dic,
                     ind_spec,
                     trip_spec,
                     var_types,
                     cont_dists=None):

    # Initialize the output parameters dictionary
    params_dict = defaultdict(dict)

    # Loop around individual specific variables
    for ind in ind_spec:
        # generate array of values for individual specific variable
        ind_var = pd.Series([(data_long.loc[data_long[obs_id_col] == x][ind].unique()[0]) for x in data_long[obs_id_col].unique()])
        # Get distribution if variable is categorical
        if var_types[ind] == 'categorical':
            # If only one category
            if len(ind_var.unique()) == 1:
                params_dict[ind]['distribution'] = 'constant'
                params_dict[ind]['parameters'] = ind_var.unique()
            # If more than one category
            else:
                params_dict[ind]['distribution'] = 'categorical'
                # Count frequency of values and store it as paramater of distribution
                np_array_range = np.arange(ind_var.max()+1)
                array_bincount = np.bincount(ind_var)
                probs = array_bincount / len(ind_var)
                params_dict[ind]['parameters'] = [np_array_range,
                                                  probs]
        else:
            # If not categorical but just one unique value
            if len(ind_var.unique()) == 1:
                params_dict[ind]['distribution'] = 'constant'
                params_dict[ind]['parameters'] = ind_var.unique()
            # If not categorical but not one unique value
            else:
                # Use the Fitter library to fit distributions
                # to the data
                fitter_object = Fitter(data=ind_var,
                                       distributions=cont_dists,
                                       timeout=30)
                fitter_object.fit()
                # Get the best distribution and store in dictionary
                BestDict = fitter_object.get_best()
                params_dict[ind]['distribution'] = list(BestDict.items())[0][0]
                params_dict[ind]['parameters'] = list(BestDict.items())[0][1]

    # Code for Alternative Specific Variables
    # Loop around the different available alternatives
    for alt in data_long[alt_id_col].unique():
        # Store data for specific alternative (mode)
        mode_data = data_long.loc[data_long['mode_id'] == alt]
        # Loop around the alternative specific variables in the input dictionary
        for var in alt_spec:
            # If data is categorical
            if var_types[var] == 'categorical':
                # If only one category
                if len(mode_data[var].unique()) == 1:
                    # Add name of alternative to variable and store distriburion & parameters
                    params_dict[var+'_'+alt_name_dic[alt]]['distribution'] = 'constant'
                    params_dict[var+'_'+alt_name_dic[alt]]['parameters'] = mode_data[var].unique()
                else:
                    # If more than one category, compute the frequency of values
                    # and store as parameters
                    # Add name of alternative to variable and store distriburion & parameters
                    params_dict[var+'_'+alt_name_dic[alt]]['distribution'] = 'categorical'
                    np_array_range = np.arange(mode_data[var].max()+1)
                    array_bincount = np.bincount(mode_data[var])
                    probs = array_bincount / len(mode_data[var])
                    params_dict[var+'_'+alt_name_dic[alt]]['parameters'] = [np_array_range,
                                                                            probs]
            else:
                # If data is not categorical but has one unique value
                if len(mode_data[var].unique()) == 1:
                    # Add name of alternative to variable and store distriburion & parameters
                    params_dict[var+'_'+alt_name_dic[alt]]['distribution'] = 'constant'
                    params_dict[var+'_'+alt_name_dic[alt]]['parameters'] = mode_data[var].unique()
                # If data is not categorical but has more than one unique value
                else:
                    # Use the Fitter library to fit distributions
                    # to the data
                    fitter_object = Fitter(data=mode_data[var],
                                           distributions=cont_dists,
                                           timeout=30)
                    fitter_object.fit()
                    # Get the best distribution and store in dictionary
                    BestDict = fitter_object.get_best()
                    # Add name of alternative to variable and store distriburion & parameters
                    params_dict[var+'_'+alt_name_dic[alt]]['distribution'] = list(BestDict.items())[0][0]
                    params_dict[var+'_'+alt_name_dic[alt]]['parameters'] = list(BestDict.items())[0][1]

    # Trip Specific Variable (maybe combine with individual specific variables)
    # Loop around trip (observation) specific variables
    for var in trip_spec:
        # generate array of values for trip specific variable
        trip_var = pd.Series([(data_long.loc[data_long[obs_id_col] == x][var].unique()[0]) for x in data_long[obs_id_col].unique()])
        # Get distribution if variable is categorical
        if var_types[var] == 'categorical':
            if len(trip_var.unique()) == 1:
            # If only one category
                params_dict[var]['distribution'] = 'constant'
                params_dict[var]['parameters'] = trip_var.unique()
            else:
            # If more than one category
                params_dict[var]['distribution'] = 'categorical'
            # Count frequency of values and store it as paramater of distribution
                np_array_range = np.arange(trip_var.max()+1)
                array_bincount = np.bincount(trip_var)
                probs = array_bincount / len(trip_var)
                params_dict[var]['parameters'] = [np_array_range,
                                                  probs]
        else:
            # If not categorical but just one unique value
            if len(trip_var.unique()) == 1:
                params_dict[var]['distribution'] = 'constant'
                params_dict[var]['parameters'] = trip_var.unique()
            # If not categorical but just one unique value
            else:
                # Use the Fitter library to fit distributions
                # to the data
                fitter_object = Fitter(data=trip_var,
                                       distributions=cont_dists,
                                       timeout=30)
                fitter_object.fit()
                # Get the best distribution and store in dictionary
                BestDict = fitter_object.get_best()
                params_dict[var]['distribution'] = list(BestDict.items())[0][0]
                params_dict[var]['parameters'] = list(BestDict.items())[0][1]

    return params_dict


def SimDf(params_dict, size=1000):
    """Funtion to simulate data of size N based on specified
    distribution/parameters found by the fitter package.
    Inputs:
    -------
    data: dataframe from which columns are to be taken
    dist_params: the distribution parameters from find_dist_df
    Outputs:
    -------
    DataFrame object with simulated data based on specified distributions
    """
    Sim_Df = pd.DataFrame(columns=list(params_dict.keys()))
    Sim_Df = Sim_Df.fillna(0)
    for column in list(params_dict.keys()):
        if params_dict[column]['distribution'] == 'categorical':
            data_sim = np.random.choice(a=params_dict[column]['parameters'][0],
                                        p=params_dict[column]['parameters'][1],
                                        size=size)
            Sim_Df[column] = data_sim
        elif params_dict[column]['distribution'] == 'constant':
            data_sim = params_dict[column]['parameters'][0]
            Sim_Df[column] = data_sim
        else:
            dist = getattr(scipy.stats, params_dict[column]['distribution'])
            data_sim = dist.rvs(*params_dict[column]['parameters'], size=size)
            Sim_Df[column] = data_sim
    return Sim_Df


def SimulateAvailability(data_long, sim_data, obs_id_col, alt_name_dict):

    series = pd.Series([])
    for i, obs in zip(np.arange(len(data_long[obs_id_col].unique())), data_long[obs_id_col].unique()):
        series[i] = data_long[data_long[obs_id_col] == obs].shape[0]

    av_size = sim_data.shape[0]
    alts_sim = np.random.choice(a=np.arange(series.max()+1),
                                p=np.bincount(series)/len(series),
                                size=av_size)

    N = len(alt_name_dict)

    av_sim = [np.array([1] * K + [0]*(N-K)) for K in alts_sim]

    for x in av_sim:
        np.random.shuffle(x)

    np.random.shuffle(av_sim)
    AV_columns = [alt_name_dict[i]+'_AV' for i in alt_name_dict.keys()]
    AV_Df = pd.DataFrame(av_sim, columns=AV_columns)
    choice = [random.choice(np.nonzero(a == 1)[0]) + 1 for a in np.array(AV_Df)]
    choice_df = pd.DataFrame(choice, columns=['sim_choice'])
    Sim_DF_AV = pd.concat([sim_data, AV_Df, choice_df], axis=1, sort=False)
    return Sim_DF_AV

# Functions for simulating choices - to be imported as library

In [91]:
def progress(*args, **kwargs):
    """
    Creates a tqdm progressbar iterable based on whether one is in ipython.
    In ipython it will return a `tqdm_notebook` iterable. Else, it returns a
    `tqdm` iterable. If there is an error with calling `tqdm_notebook`, such as
    errors from using new versions of tqdm with old versions of Juypter
    notebooks, a call to `tqdm` will be made instead.
    Parameters
    ----------
    args, kwargs: passed directly to `tqdm` and `tqdm_notebook`.
    """
    if _is_kernel():
        try:
            return tqdm_notebook(*args, **kwargs)
        except:
            return tqdm(*args, **kwargs)
    return tqdm(*args, **kwargs)

def _simulate_wide_binary_choices(predictions, rseed=None):
    """
    Take vectorized random draws over many bernoulli random variables with
    different probabilities of success. This function is faster than using a
    for-loop and repeated calls to `np.random.choice`.
    """
    # Initialize the simulated choices
    choice_vec = np.zeros(predictions.shape, dtype=int)

    # Set the random seed if desired
    if rseed is not None:
        np.random.seed(rseed)

    # Generate uniform random variates
    uniform_draws =\
        np.random.uniform(size=predictions.shape)

    # Determine which predictions led to 'successful' observations
    choice_vec[np.where(uniform_draws <= predictions)] = 1
    return choice_vec


def _simulate_choices_for_1obs(obs_id,
                               rows_per_obs,
                               predicted_probs):
    """
    Generates the chosen rows for each simulated choice situation for the given
    decision maker.
    Parameters
    ----------
    obs_id : positive int.
        The identification number of a given decision maker.
    rows_per_obs : dict.
        Keys should be integers, including `obs_id`. Values should be a list of
        ints, where each int is a row of `predicted_probs` that is associated
        with the decision maker identified by `obs_id`.
    predicted_probs : 2D ndarray of floats in (0.0, 1.0).
        Each row should correspond to a particular alternative for a particular
        observation. Each column should correspond to a sampled parameter
        vector. Finally, each element should denote the probability of that
        alternative being chosen by that decision maker, given their
        explanatory variables and the sampled model parameters.
    Returns
    -------
    chosen_rows : 1D ndarray of ints.
        Should have shape `(predicted_probs.shape[1],)`. There will be one
        value for each simulated choice situation, i.e. each column of
        `predicted_probs`. Each value will represent the row that correspondes
        to the chosen alternative for the corresponding choice situation.
    """
    # Get the rows belonging to this observation
    obs_rows = rows_per_obs[obs_id]

    # Get the current probabilities
    current_long_probs = predicted_probs[obs_rows, :]

    # Get the 'cdf' of each alternative
    current_cdf = np.cumsum(current_long_probs, axis=0)

    # Draw random uniform values for each probability vector
    uniform_draws = np.random.uniform(size=predicted_probs.shape[1])

    # Determine which alternative's 'bucket' the random value
    # might have fallen into.
    possible_alts =\
        (np.arange(1, obs_rows.size + 1)[:, None] *
         (current_cdf >= uniform_draws[None, :]))
    # Give a 'big' value to alternatives that are not chosen
    possible_alts[np.where(possible_alts == 0)] = obs_rows.size + 10
    # Figure out the exact rows/alternatives that were chosen
    chosen_rows = obs_rows[np.argmin(possible_alts, axis=0)]
    return chosen_rows


def simulate_choice_vector(predicted_probs,
                           observation_ids,
                           wide_binary=False,
                           rseed=None):
    """
    Simulates choice outcomes based on the predicted probabilities of each
    alternative for each observation.
    Parameters
    ----------
    predicted_probs : 2D ndarray of floats in (0.0, 1.0).
        Each row should correspond to a particular alternative for a particular
        observation. Each column should correspond to a sampled parameter
        vector. Finally, each element should denote the probability of that
        alternative being chosen by that decision maker, given their
        explanatory variables and the sampled model parameters.
    observation_ids : 1D ndarray of ints.
        Each element should represent an obervation id. Should have
        `observation_ids.shape[0] == predicted_probs.shape[0]`.
    wide_binary : bool, optional.
        Denotes whether `predicted_probs` are for a wide-format dataset of
        binary choices or not.
    rseed : positive int or None, optional.
        The random seed used to simulate the choices. Use when one wants to
        reproduce particular simulations. Default is None.
    Returns
    -------
    simulated_y : 2D ndarray of zeros and ones.
        Each row should correspond to a particular alternative for a particular
        observation. Each column should correspond to a sampled parameter
        vector. Finally, each element will be a one if that row's alternative
        was chosen by that row's decision-maker for that columns simulated
        parameter vector. Otherwise, the element will be a zero. When
        `wide_binary == True`, each element in `simulated_y` will indicate
        whether that row's observation had `y == 1` for that simulation or not.
    """
    # Make predicted_probs 2D
    if predicted_probs.ndim == 1:
        predicted_probs = predicted_probs[:, None]
    elif predicted_probs.ndim > 2:
        msg = 'predicted_probs should have 1 or 2 dimensions.'
        raise ValueError(msg)

    # Make the wide-format binary simulations if necessary
    if wide_binary:
        return _simulate_wide_binary_choices(predicted_probs, rseed=rseed)

    # Determine the unique values in observation_ids
    unique_idx = np.sort(np.unique(observation_ids, return_index=True)[1])
    unique_obs = observation_ids[unique_idx]

    # Determine the rows belonging to each observation
    rows_per_obs = {k: np.where(observation_ids == k)[0] for k in unique_obs}

    # Initialize an array of simulated choices
    choice_vec = np.zeros(predicted_probs.shape, dtype=int)

    # Create an index for the columns
    col_idx = np.arange(predicted_probs.shape[1])

    # Set the seed if desired
    if isinstance(rseed, int):
        np.random.seed(rseed)

    # Populate the array
    for obs_id in progress(unique_obs.tolist(), desc='Simulating Choices'):
        # Determine the exact rows/alternatives chosen in each situation
        chosen_rows =\
            _simulate_choices_for_1obs(obs_id, rows_per_obs, predicted_probs)

        # Store the simulated choice
        choice_vec[chosen_rows, col_idx] = 1

    return choice_vec

# Functions to calculate probabilities for each alternative **(Replaced by functions from TB)** - For Documentation Purposes

In [None]:
# def add_intercept_to_df(df_long, specification_dict):
#     """Function to add intercept to long format DataFrame
#     Parameters
#     ----------
#     df_long: DataFrame
#         Long format Pandas DataFrame to which to add 
#         intercept column.
#     specification_dict: dict
#         Specification Dictionary for the model
    
#     Returns
#     -------
#     In-place Pandas DataFrame with additional intercept column.
        
#     """
#     if ("intercept" in specification_dict
#             and "intercept" not in df_long.columns):
#         df_long["intercept"] = 1
#     return None


# def create_design_matrix(df_long, specification_dict,
#                          names_dict, alternative_id_col):

#     add_intercept_to_df(df_long, specification_dict)

#     columns = []
#     for col in specification_dict:
#         for group in specification_dict[col]:
#             if type(group) == list:
#                 columns.append(df_long[alternative_id_col].isin(group)
#                                * df_long[col])
#             else:
#                 columns.append((df_long[alternative_id_col] == group)
#                                * df_long[col])

#     design_matrix = np.stack(columns, axis=1)

#     var_names = []
#     for variable in names_dict:
#         for name in names_dict[variable]:
#             var_names.append(name)

#     return design_matrix, var_names


# def calculate_utilities(betas, design_matrix):

#     limit_max = 700
#     limit_min = -700

#     utility = design_matrix.dot(betas)
#     utility[utility > limit_max] = limit_max
#     utility[utility < limit_min] = limit_min

#     utilities = np.exp(utility)

#     return utilities


# def create_mapping_matrix(df_long, observation_id_col):
#     row_to_col_matrix = pd.get_dummies(df_long[observation_id_col]).values
# #     row_to_col_matrix = (df_long[observation_id_col].values[:,None] ==
# #                          np.sort(df_long[observation_id_col].unique())[None,:]).astype(int)
#     sparse_row_to_col_matrix = sparse.csr_matrix(row_to_col_matrix)

#     mapping_matrix = sparse_row_to_col_matrix.dot(sparse_row_to_col_matrix.T)

#     return mapping_matrix


# def calculate_probabilities(betas, design_matrix, mapping_matrix):

#     utilities = calculate_utilities(betas, design_matrix)
#     denominator = mapping_matrix.dot(utilities)
#     probabilities = utilities/denominator
#     probabilities[probabilities == 0] = 1e-300

#     return probabilities

# Function to simulate choices based on long data format - For Documentation Purposes

In [5]:


# def SimulateChoices(long_data, alt_id_col,
#                     obs_id_col, number_alts,
#                     spec_dic, names_dic, init_betas):
#     """
#     Function to simulate choices from a long data
#     format dataset.
    
#     Parameters
#     ----------
#     long_data : DataFrame
#         The DataFrame to be used, in long format.
#     alt_id_col: string
#         Name of the column containing the alternative
#         id numbers in the long format dataset.
#     obs_id_col: string
#         Name of the column containing the observation
#         id numbers in the long format dataset.
#     number_alts: int
#         Number of alternatives in the long format
#         dataset.
#     spec_dic: dictionary
#         Dictionary of the model specification.
#     names_dic: dictionary
#         Dictionary of the alternative names.
#     init_betas: list
#         List of the initial betas for the model
#         from which the choices should be simulated.
    
#     Returns
#     -------
#     DataFrame object with the simulated choices column
#     added as 'sim_choice'
#     """
#     # Declare the simulated choice column name
#     sim_choice_col = 'sim_choice'

#     # Make a copy of the data
#     data = copy.deepcopy(long_data)
#     # Functions to generate the design matrix, mapping matrix,
#     # and calculate the probabilities for each alternative
#     design_matrix, names = create_design_matrix(df_long=data,
#                                                 specification_dict=spec_dic,
#                                                 names_dict=names_dic,
#                                                 alternative_id_col=alt_id_col)
#     mapping_matrix = create_mapping_matrix(df_long=data,
#                                            observation_id_col=obs_id_col)
#     probabilities = calculate_probabilities(betas=initial_betas,
#                                             design_matrix=design_matrix,
#                                             mapping_matrix=mapping_matrix)
#     # Assign calculated probabilities to new dataframe column
#     data['probabilities'] = probabilities
#     # Initialize cumulative sum and simulated choice columns
#     data['cum_sum'] = 0
#     data['sim_choice'] = 0

#     # Loop around the observations and compute probabilities' cumulative
#     # sums for each alternative
#     for observation in data['observation_id'].unique():
#         probs_sum = data[long_data.observation_id == observation]['probabilities'].cumsum()
#         data.loc[data['observation_id'] == observation, 'cum_sum'] = probs_sum

#     # Generate list for observation ids to be used in simulating choices    
#     observation_id_list = list(data.observation_id.unique())
#     # Generate a "random utility" array of the same size as the number
#     # of observations in the dataset
#     u_random = np.random.uniform(size=len(data['observation_id'].unique()))

#     # Loop around the generate utilities and observations in the dataset
#     # to assign a choice to each
#     for u, obs in zip(u_random, observation_id_list):
#         # select data for observation number "obs"
#         data_sample = data[data['observation_id'] == obs]
#         # generate list of available modes for each observation
#         sorted_list = sorted(list(data_sample['mode_id'].unique()))
#         # initialize a dictionary from the available modes for 
#         # each observation
#         choices = dict.fromkeys(sorted_list, 0)
#         # Loop round the modes for each observation and assign 
#         # choice (0 vs. 1)
#         for alt in sorted_list:
#             choices[alt] = np.where(u <= data_sample[data_sample['mode_id']
#                                                      == alt]
#                                     ['cum_sum'], 1, 0).item()
#             # Once a choice is assigned, break out of loop
#             if choices[alt] == 1:
#                 break
#         # Map the choices for the observation to the long format dataframe       
#         data.loc[data.observation_id == obs, sim_choice_col] = \
#             data['mode_id'].map(choices)
#     return data

# Example using bike data 

## Data Ingestion and Exploration 

In [77]:
# Create a variable for the path to the long format data for
# the multinomial choice model
PATH = '/Users/mobouzaghrane/Documents/GitHub/tr_b_causal_2020/'\
        'data/raw/spring_2016_all_bay_area_long_format_plus_cross_bay_col.csv'

In [78]:
# Reading data from the specified PATH
bike_data_long = pd.read_csv(PATH)

# If in previous work we accidentally saved the index with the dataframe
# remove the old index from the data
if "Unnamed: 0" in bike_data_long.columns:
    del bike_data_long["Unnamed: 0"]

print("The columns of bike_data are:")
bike_data_long.columns

The columns of bike_data are:


Index(['household_id', 'person_id', 'tour_id', 'observation_id', 'mode_id',
       'choice', 'tour_origin_taz', 'primary_dest_taz', 'total_travel_time',
       'total_travel_cost', 'total_travel_distance', 'age', 'household_size',
       'household_income', 'household_income_values', 'transit_subsidy',
       'transit_subsidy_amount', 'num_cars', 'num_licensed_drivers',
       'cross_bay', 'oakland_and_berkeley', 'survey_id', 'gender',
       'non_relative_flag', 'num_pre_school', 'num_school_aged', 'married',
       'parent', 'income_category_1', 'income_category_2', 'income_category_3',
       'income_category_4', 'income_category_5', 'income_category_6',
       'income_category_7', 'income_category_8', 'income_category_9',
       'income_category_10', 'income_unknown', 'ln_drive_cost',
       'ln_drive_cost_sq', 'total_travel_time_10x', 'total_travel_time_tenth',
       'high_income', 'medium_income', 'low_income', 'high_income_cost',
       'medium_income_cost', 'low_income_cost', 

In [79]:
# Look at the mode shares in the data set
alt_id_to_mode_name = {1: "Drive Alone",
                       2: "Shared Ride 2",
                       3: "Shared Ride 3+",
                       4: "Walk-Transit-Walk",
                       5: "Drive-Transit-Walk",
                       6: "Walk-Transit-Drive",
                       7: "Walk",
                       8: "Bike"}

mode_counts = bike_data_long.loc[bike_data_long.choice == 1,
                                 "mode_id"].value_counts().loc[range(1, 9)]

mode_shares = mode_counts / bike_data_long.observation_id.max()
mode_shares.index = [alt_id_to_mode_name[x] for x in mode_shares.index.values]
mode_shares.name = "Mode Shares"
mode_shares

Drive Alone           0.428322
Shared Ride 2         0.158841
Shared Ride 3+        0.139860
Walk-Transit-Walk     0.103397
Drive-Transit-Walk    0.015485
Walk-Transit-Drive    0.013237
Walk                  0.094406
Bike                  0.046454
Name: Mode Shares, dtype: float64

## MNL Model Specification 

In [80]:
# Create my specification and variable names for the basic MNL model
# NOTE: - Keys should be variables within the long format dataframe.
#         The sole exception to this is the "intercept" key.
#       - For the specification dictionary, the values should be lists
#         or lists of lists. Within a list, or within the inner-most
#         list should be the alternative ID's of the alternative whose
#         utility specification the explanatory variable is entering.

mnl_specification = OrderedDict()
mnl_names = OrderedDict()

mnl_specification["intercept"] = [2, 3, 4, 5, 6, 7, 8]
mnl_names["intercept"] = ['ASC Shared Ride: 2',
                          'ASC Shared Ride: 3+',
                          'ASC Walk-Transit-Walk',
                          'ASC Drive-Transit-Walk',
                          'ASC Walk-Transit-Drive',
                          'ASC Walk',
                          'ASC Bike']

mnl_specification["total_travel_time"] = [[1, 2, 3], [4, 5, 6]]
mnl_names["total_travel_time"] = ['Travel Time, units:min (All Auto Modes)',
                                  'Travel Time, units:min (All Transit Modes)']

mnl_specification["total_travel_cost"] = [[4, 5, 6]]
mnl_names["total_travel_cost"] = ['Travel Cost, units:$ (All Transit Modes)']

mnl_specification["cost_per_distance"] = [1, 2, 3]
mnl_names["cost_per_distance"] = ["Travel Cost per Distance, units:$/mi (Drive Alone)",
                                  "Travel Cost per Distance, units:$/mi (SharedRide-2)",
                                  "Travel Cost per Distance, units:$/mi (SharedRide-3+)"]

mnl_specification["cars_per_licensed_drivers"] = [[1, 2, 3]]
mnl_names["cars_per_licensed_drivers"] = ["Autos per licensed drivers (All Auto Modes)"]

mnl_specification["total_travel_distance"] = [7, 8]
mnl_names["total_travel_distance"] = ['Travel Distance, units:mi (Walk)',
                                      'Travel Distance, units:mi (Bike)']

# mnl_specification["cross_bay"] = [[2, 3], [4, 5, 6]]
# mnl_names["cross_bay"] = ["Cross-Bay Tour (Shared Ride 2 & 3+)",
#                           "Cross-Bay Tour (All Transit Modes)"]
mnl_specification["cross_bay"] = [[2, 3]]
mnl_names["cross_bay"] = ["Cross-Bay Tour (Shared Ride 2 & 3+)"]

mnl_specification["household_size"] = [[2, 3]]
mnl_names["household_size"] = ['Household Size (Shared Ride 2 & 3+)']

mnl_specification["num_kids"] = [[2, 3]]
mnl_names["num_kids"] = ["Number of Kids in Household (Shared Ride 2 & 3+)"]

In [81]:
# Estimate the basic MNL model, using the hessian and newton-conjugate gradient
mnl_model = pl.create_choice_model(data=bike_data_long,
                                   alt_id_col="mode_id",
                                   obs_id_col="observation_id",
                                   choice_col="choice",
                                   specification=mnl_specification,
                                   model_type="MNL",
                                   names=mnl_names)

num_vars = len(reduce(lambda x, y: x + y, mnl_names.values()))
# Note newton-cg used to ensure convergence to a point where gradient
# is essentially zero for all dimensions.
mnl_model.fit_mle(np.zeros(num_vars),
                  method="BFGS")

# Look at the estimation results
mnl_model.get_statsmodels_summary()

Log-likelihood at zero: -7,599.7019
Initial Log-likelihood: -7,599.7019
Estimation Time for Point Estimation: 0.17 seconds.
Final log-likelihood: -5,073.4276




0,1,2,3
Dep. Variable:,choice,No. Observations:,4004.0
Model:,Multinomial Logit Model,Df Residuals:,3985.0
Method:,MLE,Df Model:,19.0
Date:,"Wed, 25 Mar 2020",Pseudo R-squ.:,0.332
Time:,22:42:35,Pseudo R-bar-squ.:,0.33
AIC:,10184.855,Log-Likelihood:,-5073.428
BIC:,10304.461,LL-Null:,-7599.702

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ASC Shared Ride: 2,-1.0097,0.486,-2.079,0.038,-1.962,-0.058
ASC Shared Ride: 3+,3.4619,1.064,3.254,0.001,1.377,5.547
ASC Walk-Transit-Walk,-0.3921,0.288,-1.360,0.174,-0.957,0.173
ASC Drive-Transit-Walk,-2.6220,0.303,-8.660,0.000,-3.215,-2.029
ASC Walk-Transit-Drive,-2.9773,0.306,-9.725,0.000,-3.577,-2.377
ASC Walk,1.5541,0.305,5.101,0.000,0.957,2.151
ASC Bike,-1.1059,0.305,-3.628,0.000,-1.703,-0.508
"Travel Time, units:min (All Auto Modes)",-0.0760,0.006,-13.728,0.000,-0.087,-0.065
"Travel Time, units:min (All Transit Modes)",-0.0274,0.002,-12.768,0.000,-0.032,-0.023


# Simulate Data - Based on Wide Data

## Convert Data from Long to Wide before simulation

In [82]:
# Create variable list for subset of long data,
# I will add the remaining values from the long format dataset
alt_id_col = "mode_id"

obs_id_col = "observation_id"

choice_col = "choice"

# Store of columns relevant to the data to be simulated
model_variables = list(mnl_model.specification.keys())
model_variables.remove('intercept')
model_variables.extend([alt_id_col, obs_id_col,
                        choice_col, 'num_cars', 'num_licensed_drivers'])
print('The model variables of interest are:')
model_variables

The model variables of interest are:


['total_travel_time',
 'total_travel_cost',
 'cost_per_distance',
 'cars_per_licensed_drivers',
 'total_travel_distance',
 'cross_bay',
 'household_size',
 'num_kids',
 'mode_id',
 'observation_id',
 'choice',
 'num_cars',
 'num_licensed_drivers']

In [83]:
# Create a copy of the data subset
subset_bike_data = bike_data_long[model_variables].copy()

In [84]:
# Create the needed variables for the conversion
ind_spec_vars = ['num_kids', 'household_size',
                 'cars_per_licensed_drivers', 'cross_bay',
                 'num_cars', 'num_licensed_drivers']

alt_spec_vars = ['total_travel_time', 'total_travel_distance']

subset_spec_vars = {'total_travel_cost': [1, 2, 3],
                    'cost_per_distance': [1, 2, 3]}

alternative_name_dict = {1: 'drive_alone',
                         2: 'shared_2',
                         3: 'shared_3p',
                         4: 'wtw',
                         5: 'dtw',
                         6: 'wtd',
                         7: 'walk',
                         8: 'bike'}

In [85]:
# Convert data from long to wide, I assigned a null value of 0
# because it will make it easier to simulate data when we have
# Unavailable variables
bike_data_wide = pl.convert_long_to_wide(long_data=subset_bike_data,
                                         ind_vars=ind_spec_vars,
                                         alt_specific_vars=alt_spec_vars,
                                         subset_specific_vars=subset_spec_vars,
                                         obs_id_col=obs_id_col,
                                         alt_id_col=alt_id_col,
                                         choice_col=choice_col,
                                         alt_name_dict=alternative_name_dict)
bike_data_wide.head()

Unnamed: 0,observation_id,choice,availability_drive_alone,availability_shared_2,availability_shared_3p,availability_wtw,availability_dtw,availability_wtd,availability_walk,availability_bike,...,total_travel_distance_dtw,total_travel_distance_wtd,total_travel_distance_walk,total_travel_distance_bike,total_travel_cost_drive_alone,total_travel_cost_shared_2,total_travel_cost_shared_3p,cost_per_distance_drive_alone,cost_per_distance_shared_2,cost_per_distance_shared_3p
0,1.0,2.0,1,1,1,1,1,1,1,1,...,0.0,0.0,29.11,29.11,5.714,3.2651,2.2856,0.184799,0.105598,0.07392
1,2.0,2.0,1,1,1,1,1,1,1,1,...,0.0,0.0,24.8,24.8,4.4519,2.5439,1.7807,0.184803,0.1056,0.073919
2,3.0,1.0,1,1,1,1,1,1,1,1,...,0.0,0.0,8.38,8.38,1.6817,0.9609,0.6726,0.184802,0.105593,0.073912
3,4.0,1.0,1,1,1,1,1,1,1,1,...,0.0,0.0,8.38,8.38,1.6817,0.9609,0.6726,0.184802,0.105593,0.073912
4,5.0,1.0,1,1,1,0,1,0,1,1,...,0.0,,40.64,40.64,5.9782,3.4162,2.3913,0.184798,0.105601,0.07392


Here, we need to decide how we will simulate data when we have unavailable values. TBD.

In [86]:
# Define the list of variables of interest from data_wide
columns_wide = ['num_kids', 'household_size', 'num_cars',
                'num_licensed_drivers', 'cross_bay',
                'total_travel_time_drive_alone', 'total_travel_time_shared_2',
                'total_travel_time_shared_3p', 'total_travel_time_wtw',
                'total_travel_time_dtw', 'total_travel_time_wtd',
                'total_travel_time_walk', 'total_travel_time_bike',
                'total_travel_distance_drive_alone',
                'total_travel_distance_shared_2',
                'total_travel_distance_shared_3p',
                'total_travel_distance_wtw',
                'total_travel_distance_dtw', 'total_travel_distance_wtd',
                'total_travel_distance_walk', 'total_travel_distance_bike',
                'total_travel_cost_drive_alone', 'total_travel_cost_shared_2',
                'total_travel_cost_shared_3p']

# Restrict data to desired columns
bike_data_wide = bike_data_wide[columns_wide]

In [87]:
# Determine the distributions to be used
distributions = ['normal', 'alpha', 'beta', 'gamma', 'expon', 'gumbel']

# Initial the FitDistribution object
bike_data_fitter = FitDistribution(data=bike_data_wide, dists=distributions)

In [88]:
# Define the nature of each variables whether
# discrete/categorical or continuous
variable_type = {'num_kids': 'categorical',
                 'household_size': 'categorical',
                 'num_cars': 'discrete',
                 'num_licensed_drivers': 'categorical',
                 'cross_bay': 'categorical',
                 'total_travel_time_drive_alone': 'continuous',
                 'total_travel_time_shared_2': 'continuous',
                 'total_travel_time_shared_3p': 'continuous',
                 'total_travel_time_wtw': 'continuous',
                 'total_travel_time_dtw': 'continuous',
                 'total_travel_time_wtd': 'continuous',
                 'total_travel_time_walk': 'continuous',
                 'total_travel_time_bike': 'continuous',
                 'total_travel_distance_drive_alone': 'continuous',
                 'total_travel_distance_shared_2': 'continuous',
                 'total_travel_distance_shared_3p': 'continuous',
                 'total_travel_distance_wtw': 'continuous',
                 'total_travel_distance_dtw': 'continuous',
                 'total_travel_distance_wtd': 'continuous',
                 'total_travel_distance_walk': 'continuous',
                 'total_travel_distance_bike': 'continuous',
                 'total_travel_cost_drive_alone': 'continuous',
                 'total_travel_cost_shared_2': 'continuous',
                 'total_travel_cost_shared_3p': 'continuous'}

In [89]:
# Simulate dataframe based on the estimated distributions
sim_bike_data = bike_data_fitter.SimDf(size=5000)

## Example Implementation 

## Declaring variables

In [92]:
observation_id_col = 'observation_id'

alternative_id_col = 'mode_id'

variable_type = {'num_kids': 'categorical',
                 'household_size': 'categorical',
                 'num_cars': 'categorical',
                 'num_licensed_drivers': 'categorical'}

individual_specific_variables = ['num_kids', 'household_size',
                                 'num_cars', 'num_licensed_drivers']

alternative_specific_variables = ['total_travel_time',
                                  'total_travel_distance',
                                  'total_travel_cost']

trip_specific_variables = ['cross_bay']

alternative_name_dict = {1: 'drive_alone',
                         2: 'shared_2',
                         3: 'shared_3p',
                         4: 'wtw',
                         5: 'dtw',
                         6: 'wtd',
                         7: 'walk',
                         8: 'bike'}

variable_type = {'num_kids': 'categorical',
                 'household_size': 'categorical',
                 'num_cars': 'categorical',
                 'num_licensed_drivers': 'categorical',
                 'cross_bay': 'categorical',
                 'total_travel_time': 'continuous',
                 'total_travel_distance': 'continuous',
                 'total_travel_cost': 'continuous'}

distributions = ['normal', 'alpha', 'beta', 'gamma', 'expon', 'gumbel']

## Implementation of Function

In [93]:
bike_data_params = FindLongDataDist(data_long=bike_data_long,
                                    alt_id_col=alternative_id_col,
                                    obs_id_col=observation_id_col,
                                    alt_spec=alternative_specific_variables,
                                    alt_name_dic=alternative_name_dict,
                                    ind_spec=individual_specific_variables,
                                    trip_spec=trip_specific_variables,
                                    var_types=variable_type,
                                    cont_dists=distributions)

SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=0.0007989652291582563)
Fitted beta distribution with error=0.00021837510493444555)


  Lhat = muhat - Shat*mu
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.


Fitted gamma distribution with error=0.00021215664843294874)
Fitted expon distribution with error=0.00042675380208476757)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=0.014522128648725777)
Fitted beta distribution with error=0.0011963782652540424)
Fitted gamma distribution with error=0.0011394543473683448)
Fitted expon distribution with error=0.002052652669300775)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=0.013034447497270776)
Fitted beta distribution with error=0.04104766585766829)
Fitted gamma distribution with error=0.03662007898297577)
Fitted expon distribution with error=0.07921887555564106)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=0.001056930694286857)
Fi

  improvement from the last five Jacobian evaluations.


Fitted beta distribution with error=8.724425966292245)
Fitted gamma distribution with error=8.724370722136563)
Fitted expon distribution with error=9.1690513221318)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=6.672556141363111e-05)
Fitted beta distribution with error=6.196777802352273e-05)
Fitted gamma distribution with error=6.08535018564024e-05)
Fitted expon distribution with error=0.0007188034183935458)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=8.324919501617495)
Fitted beta distribution with error=9.381940852860888)
Fitted gamma distribution with error=8.638629330672966)
Fitted expon distribution with error=9.271523141699383)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution 

In [95]:
sim_data = SimDf(params_dict=bike_data_params, size=2000)

In [96]:
sim_data.head()

Unnamed: 0,num_kids,household_size,num_cars,num_licensed_drivers,total_travel_time_drive_alone,total_travel_distance_drive_alone,total_travel_cost_drive_alone,total_travel_time_shared_2,total_travel_distance_shared_2,total_travel_cost_shared_2,...,total_travel_time_wtd,total_travel_distance_wtd,total_travel_cost_wtd,total_travel_time_walk,total_travel_distance_walk,total_travel_cost_walk,total_travel_time_bike,total_travel_distance_bike,total_travel_cost_bike,cross_bay
0,2,1.0,2.0,2.0,50.580973,12.456223,0.783291,16.512103,3.168624,33.289835,...,62.247643,0.0,4.690495,22.725299,64.603739,0.0,12.41196,13.184623,0.0,0
1,0,1.0,4.0,1.0,21.041464,23.02026,2.126832,48.092422,31.495162,0.486445,...,56.560907,0.0,9.65944,15.566029,2.022584,0.0,56.941207,3.929513,0.0,0
2,1,4.0,3.0,3.0,23.397878,13.297877,0.562465,66.009773,70.269217,0.023247,...,65.009785,0.0,7.529626,24.58219,5.811552,0.0,27.716994,1.089178,0.0,0
3,0,4.0,2.0,4.0,34.597713,16.418423,1.515789,52.66245,34.91901,0.243737,...,126.71404,0.0,12.402977,153.157199,1.92308,0.0,30.661741,1.196031,0.0,0
4,0,2.0,1.0,1.0,4.539613,12.380527,4.24649,89.950129,11.813895,0.444571,...,117.857128,0.0,3.382506,78.25933,68.34544,0.0,91.964961,2.253704,0.0,0


In [97]:
wide_sim_data = SimulateAvailability(bike_data_long,
                                     sim_data=sim_data,
                                     obs_id_col=observation_id_col,
                                     alt_name_dict=alternative_name_dict)

# Simulate Choices 

## Convert Simulated Data from Wide to Long 

In [99]:
ind_variables = ['num_kids', 'household_size',
                 'num_cars', 'num_licensed_drivers', 'cross_bay']


alt_varying_variables = {u'total_travel_time': dict([(1, 'total_travel_time_drive_alone'),
                                                     (2, 'total_travel_time_shared_2'),
                                                     (3, 'total_travel_time_shared_3p'),
                                                     (4, 'total_travel_time_wtw'),
                                                     (5, 'total_travel_time_dtw'),
                                                     (6, 'total_travel_time_wtd'),
                                                     (7, 'total_travel_time_walk'),
                                                     (8, 'total_travel_time_bike')]),
                         u'total_travel_cost': dict([(1, 'total_travel_cost_drive_alone'),
                                                     (2, 'total_travel_cost_shared_2'),
                                                     (3, 'total_travel_cost_shared_3p'),
                                                     (4, 'total_travel_cost_wtw'),
                                                     (5, 'total_travel_cost_dtw'),
                                                     (6, 'total_travel_cost_wtd'),
                                                     (7, 'total_travel_cost_walk'),
                                                     (8, 'total_travel_cost_bike')]),
                         u'total_travel_distance': dict([(1, 'total_travel_distance_drive_alone'),
                                                         (2, 'total_travel_distance_shared_2'),
                                                         (3, 'total_travel_distance_shared_3p'),
                                                         (4, 'total_travel_distance_wtw'),
                                                         (5, 'total_travel_distance_dtw'),
                                                         (6, 'total_travel_distance_wtd'),
                                                         (7, 'total_travel_distance_walk'),
                                                         (8, 'total_travel_distance_bike')]),
                            }


availability_variables = {1: 'drive_alone_AV',
                          2: 'shared_2_AV',
                          3: 'shared_3p_AV',
                          4: 'wtw_AV',
                          5: 'dtw_AV',
                          6: 'wtd_AV',
                          7: 'walk_AV',
                          8: 'bike_AV'}

##########
# Determine the columns for: alternative ids, the observation ids and the choice
##########
# The 'custom_alt_id' is the name of a column to be created in the long-format data
# It will identify the alternative associated with each row.
custom_alt_id = "mode_id"

# Create a custom id column that ignores the fact that this is a
# panel/repeated-observations dataset. Note the +1 ensures the id's start at one.
obs_id_column = "observation_id"
wide_sim_data[obs_id_column] = np.arange(wide_sim_data.shape[0],
                                         dtype=int) + 1


# Create an empty choice column
choice_column = "sim_choice"

## Convert to Long Format Data

In [100]:
long_sim_data = pl.convert_wide_to_long(wide_sim_data,
                                        ind_variables,
                                        alt_varying_variables,
                                        availability_variables,
                                        obs_id_column,
                                        choice_column,
                                        new_alt_id_name=custom_alt_id)

In [101]:
# Create a cars per licensed drivers column
long_sim_data["cars_per_licensed_drivers"] = 0
long_sim_data.loc[long_sim_data.num_licensed_drivers > 0,
                  "cars_per_licensed_drivers"] = long_sim_data.num_cars / long_sim_data.num_licensed_drivers.astype(float)

In [102]:
# Add a variable representing cost divided by distance
long_sim_data["cost_per_distance"] = 0
long_sim_data.loc[long_sim_data.mode_id.isin([1, 2, 3]),
                  "cost_per_distance"] = (long_sim_data.loc[long_sim_data.mode_id.isin([1, 2, 3]),
                                                            "total_travel_cost"] /
                                          long_sim_data.loc[long_sim_data.mode_id.isin([1, 2, 3]),
                                                            "total_travel_distance"])

In [103]:
posterior_probs = mnl_model.predict(long_sim_data)

long_sim_data['sim_choice'] = simulate_choice_vector(posterior_probs,
                               long_sim_data['observation_id'].values)

  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2000, style=ProgressStyle(descriptio…




In [104]:
long_sim_data

Unnamed: 0,observation_id,mode_id,sim_choice,num_kids,household_size,num_cars,num_licensed_drivers,cross_bay,total_travel_time,total_travel_cost,total_travel_distance,cars_per_licensed_drivers,cost_per_distance,intercept
0,1,1,0,2,1.0,2.0,2.0,0,50.580973,0.783291,12.456223,1.0,0.062884,1.0
1,1,2,0,2,1.0,2.0,2.0,0,16.512103,33.289835,3.168624,1.0,10.506086,1.0
2,1,3,1,2,1.0,2.0,2.0,0,11.720076,0.217120,12.212146,1.0,0.017779,1.0
3,1,4,0,2,1.0,2.0,2.0,0,85.651747,8.502555,0.000000,1.0,0.000000,1.0
4,1,5,0,2,1.0,2.0,2.0,0,33.792558,5.135150,0.000000,1.0,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13633,2000,4,0,0,2.0,2.0,2.0,0,97.031797,6.421595,0.000000,1.0,0.000000,1.0
13634,2000,5,0,0,2.0,2.0,2.0,0,85.442085,3.607945,0.000000,1.0,0.000000,1.0
13635,2000,6,0,0,2.0,2.0,2.0,0,73.264470,11.272720,0.000000,1.0,0.000000,1.0
13636,2000,7,1,0,2.0,2.0,2.0,0,209.828605,0.000000,1.375411,1.0,0.000000,1.0


In [105]:
# Estimate the basic MNL model, using the hessian and newton-conjugate gradient
mnl_model_sim = pl.create_choice_model(data=long_sim_data_choice,
                                       alt_id_col="mode_id",
                                       obs_id_col="observation_id",
                                       choice_col="sim_choice",
                                       specification=mnl_specification,
                                       model_type="MNL",
                                       names=mnl_names)

num_vars = len(reduce(lambda x, y: x + y, mnl_names.values()))
# Note newton-cg used to ensure convergence to a point where gradient
# is essentially zero for all dimensions.
mnl_model_sim.fit_mle(np.zeros(num_vars),
                      method="BFGS")

# Look at the estimation results
mnl_model_sim.get_statsmodels_summary()

Log-likelihood at zero: -3,788.9016
Initial Log-likelihood: -3,788.9016
Estimation Time for Point Estimation: 0.20 seconds.
Final log-likelihood: -1,351.9510




0,1,2,3
Dep. Variable:,sim_choice,No. Observations:,2000.0
Model:,Multinomial Logit Model,Df Residuals:,1981.0
Method:,MLE,Df Model:,19.0
Date:,"Wed, 25 Mar 2020",Pseudo R-squ.:,0.643
Time:,22:46:45,Pseudo R-bar-squ.:,0.638
AIC:,2741.902,Log-Likelihood:,-1351.951
BIC:,2848.319,LL-Null:,-3788.902

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ASC Shared Ride: 2,-0.8202,0.262,-3.130,0.002,-1.334,-0.307
ASC Shared Ride: 3+,3.8607,0.311,12.413,0.000,3.251,4.470
ASC Walk-Transit-Walk,-0.9018,0.309,-2.915,0.004,-1.508,-0.296
ASC Drive-Transit-Walk,-2.6874,0.311,-8.628,0.000,-3.298,-2.077
ASC Walk-Transit-Drive,-2.9876,0.324,-9.209,0.000,-3.623,-2.352
ASC Walk,1.4572,0.279,5.214,0.000,0.909,2.005
ASC Bike,-0.9829,0.223,-4.410,0.000,-1.420,-0.546
"Travel Time, units:min (All Auto Modes)",-0.0775,0.004,-19.649,0.000,-0.085,-0.070
"Travel Time, units:min (All Transit Modes)",-0.0236,0.002,-9.530,0.000,-0.028,-0.019


# Repeat simulation many times 

In [106]:
initial_betas= list(mnl_model.params.values)

observation_id_col = 'observation_id'

alternative_id_col = 'mode_id'

variable_type = {'num_kids': 'categorical',
                 'household_size': 'categorical',
                 'num_cars': 'categorical',
                 'num_licensed_drivers': 'categorical'}

individual_specific_variables = ['num_kids', 'household_size',
                                 'num_cars', 'num_licensed_drivers']

alternative_specific_variables = ['total_travel_time', 'total_travel_distance', 'total_travel_cost']

trip_specific_variables = ['cross_bay']

alternative_name_dict = {1: 'drive_alone',
                         2: 'shared_2',
                         3: 'shared_3p',
                         4: 'wtw',
                         5: 'dtw',
                         6: 'wtd',
                         7: 'walk',
                         8: 'bike'}

variable_type = {'num_kids': 'categorical',
                 'household_size': 'categorical',
                 'num_cars': 'categorical',
                 'num_licensed_drivers': 'categorical',
                 'cross_bay': 'categorical',
                 'total_travel_time': 'continuous',
                 'total_travel_distance': 'continuous',
                 'total_travel_cost': 'continuous'}

distributions = ['normal', 'alpha', 'beta', 'gamma', 'expon', 'gumbel']

choice_column = "sim_choice"

custom_alt_id = "mode_id"

alt_varying_variables = {u'total_travel_time': dict([(1, 'total_travel_time_drive_alone'),
                                                     (2, 'total_travel_time_shared_2'),
                                                     (3, 'total_travel_time_shared_3p'),
                                                     (4, 'total_travel_time_wtw'),
                                                     (5, 'total_travel_time_dtw'),
                                                     (6, 'total_travel_time_wtd'),
                                                     (7, 'total_travel_time_walk'),
                                                     (8, 'total_travel_time_bike')]),
                         u'total_travel_cost': dict([(1, 'total_travel_cost_drive_alone'),
                                                     (2, 'total_travel_cost_shared_2'),
                                                     (3, 'total_travel_cost_shared_3p'),
                                                     (4, 'total_travel_cost_wtw'),
                                                     (5, 'total_travel_cost_dtw'),
                                                     (6, 'total_travel_cost_wtd'),
                                                     (7, 'total_travel_cost_walk'),
                                                     (8, 'total_travel_cost_bike')]),
                         u'total_travel_distance': dict([(1, 'total_travel_distance_drive_alone'),
                                                         (2, 'total_travel_distance_shared_2'),
                                                         (3, 'total_travel_distance_shared_3p'),
                                                         (4, 'total_travel_distance_wtw'),
                                                         (5, 'total_travel_distance_dtw'),
                                                         (6, 'total_travel_distance_wtd'),
                                                         (7, 'total_travel_distance_walk'),
                                                         (8, 'total_travel_distance_bike')]),
                        }


availability_variables = {1: 'drive_alone_AV',
                          2: 'shared_2_AV',
                          3: 'shared_3p_AV',
                          4: 'wtw_AV',
                          5: 'dtw_AV',
                          6: 'wtd_AV',
                          7: 'walk_AV',
                          8: 'bike_AV'}

bike_data_params = FindLongDataDist(data_long=bike_data_long,
                                    alt_id_col=alternative_id_col,
                                    obs_id_col=observation_id_col,
                                    alt_spec=alternative_specific_variables,
                                    alt_name_dic=alternative_name_dict,
                                    ind_spec=individual_specific_variables,
                                    trip_spec=trip_specific_variables,
                                    var_types=variable_type,
                                    cont_dists=distributions)

SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=0.0007989652291582563)
Fitted beta distribution with error=0.00021837510493444555)


  Lhat = muhat - Shat*mu
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.


Fitted gamma distribution with error=0.00021215664843294874)
Fitted expon distribution with error=0.00042675380208476757)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=0.014522128648725777)
Fitted beta distribution with error=0.0011963782652540424)
Fitted gamma distribution with error=0.0011394543473683448)
Fitted expon distribution with error=0.002052652669300775)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=0.013034447497270776)
Fitted beta distribution with error=0.04104766585766829)
Fitted gamma distribution with error=0.03662007898297577)
Fitted expon distribution with error=0.07921887555564106)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=0.001056930694286857)
Fi

  improvement from the last five Jacobian evaluations.



Fitted gamma distribution with error=8.724370722136563)
Fitted expon distribution with error=9.1690513221318)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=6.672556141363111e-05)
Fitted beta distribution with error=6.196777802352273e-05)
Fitted gamma distribution with error=6.08535018564024e-05)
Fitted expon distribution with error=0.0007188034183935458)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=8.324919501617495)
Fitted beta distribution with error=9.381940852860888)
Fitted gamma distribution with error=8.638629330672966)
Fitted expon distribution with error=9.271523141699383)
SKIPPED gumbel distribution (taking more than 30 seconds)
SKIPPED normal distribution (taking more than 30 seconds)
Fitted alpha distribution with error=8.054952402098987e-05)
Fitted beta distribu

In [None]:
simulation_size = np.random.randint(low=2000, high=3000, size=100)
sim_number = np.arange(1,101)
models_dictionary = defaultdict(dict)

for size, number in zip(simulation_size, sim_number):
    print('Simulation number', number , 'is in process...')
    print('------------------------------------------')

    sim_data = SimDf(params_dict=bike_data_params,
                     size = size)

    wide_sim_data = SimulateAvailability(data_long=bike_data_long, 
                                         sim_data=sim_data, 
                                         obs_id_col=observation_id_col, 
                                         alt_name_dict=alternative_name_dict)

    wide_sim_data[obs_id_column] = np.arange(wide_sim_data.shape[0],
                                            dtype=int) + 1
    
    long_sim_data = pl.convert_wide_to_long(wide_data=wide_sim_data,
                                            ind_vars=ind_variables, 
                                            alt_specific_vars=alt_varying_variables, 
                                            availability_vars=availability_variables,
                                            obs_id_col=observation_id_col,
                                            choice_col=choice_column,
                                            new_alt_id_name=custom_alt_id)
    
    # Create a cars per licensed drivers column
    long_sim_data["cars_per_licensed_drivers"] = 0
    long_sim_data.loc[long_sim_data.num_licensed_drivers > 0,
                      "cars_per_licensed_drivers"] = long_sim_data.num_cars / long_sim_data.num_licensed_drivers.astype(float)

    # Add a variable representing cost divided by distance
    long_sim_data["cost_per_distance"] = 0
    long_sim_data.loc[long_sim_data.mode_id.isin([1, 2, 3]),
                      "cost_per_distance"] = (long_sim_data.loc[long_sim_data.mode_id.isin([1, 2, 3]),
                                                                "total_travel_cost"] /
                                              long_sim_data.loc[long_sim_data.mode_id.isin([1, 2, 3]),
                                                        "total_travel_distance"])
    

    posterior_probs = mnl_model.predict(long_sim_data)

    long_sim_data['sim_choice'] = simulate_choice_vector(posterior_probs,
                                                         long_sim_data['observation_id'].values)

    # Estimate the basic MNL model, using the hessian and newton-conjugate gradient
    mnl_model_sim = pl.create_choice_model(data=long_sim_data,
                                           alt_id_col=alternative_id_col,
                                           obs_id_col=observation_id_col,
                                           choice_col=choice_column,
                                           specification=mnl_specification,
                                           model_type="MNL",
                                           names=mnl_names)

    num_vars = len(reduce(lambda x, y: x + y, mnl_names.values()))
    # Note newton-cg used to ensure convergence to a point where gradient 
    # is essentially zero for all dimensions. 
    mnl_model_sim.fit_mle(np.zeros(num_vars),
                          method="BFGS")

    models_dictionary[number] = mnl_model_sim
    
    print('Simulation number', number , 'is complete!')
    print('==========================================')
    print('==========================================')

Simulation number 1 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2710, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,147.3262
Initial Log-likelihood: -5,147.3262




Estimation Time for Point Estimation: 0.37 seconds.
Final log-likelihood: -1,743.1685
Simulation number 1 is complete!
Simulation number 2 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2033, style=ProgressStyle(descriptio…


Log-likelihood at zero: -3,867.9032
Initial Log-likelihood: -3,867.9032




Estimation Time for Point Estimation: 0.17 seconds.
Final log-likelihood: -1,400.7280
Simulation number 2 is complete!
Simulation number 3 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2772, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,256.4634
Initial Log-likelihood: -5,256.4634




Estimation Time for Point Estimation: 0.25 seconds.
Final log-likelihood: -1,807.4437
Simulation number 3 is complete!
Simulation number 4 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2873, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,453.9001
Initial Log-likelihood: -5,453.9001




Estimation Time for Point Estimation: 0.23 seconds.
Final log-likelihood: -1,838.1734
Simulation number 4 is complete!
Simulation number 5 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2192, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,166.9665
Initial Log-likelihood: -4,166.9665




Estimation Time for Point Estimation: 0.23 seconds.
Final log-likelihood: -1,384.3633
Simulation number 5 is complete!
Simulation number 6 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2426, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,618.9451
Initial Log-likelihood: -4,618.9451
Estimation Time for Point Estimation: 0.19 seconds.
Final log-likelihood: -1,584.9268




Simulation number 6 is complete!
Simulation number 7 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2965, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,627.2490
Initial Log-likelihood: -5,627.2490




Estimation Time for Point Estimation: 0.28 seconds.
Final log-likelihood: -1,940.9550
Simulation number 7 is complete!
Simulation number 8 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2487, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,733.1784
Initial Log-likelihood: -4,733.1784
Estimation Time for Point Estimation: 0.18 seconds.
Final log-likelihood: -1,645.3285




Simulation number 8 is complete!
Simulation number 9 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2267, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,303.1587
Initial Log-likelihood: -4,303.1587
Estimation Time for Point Estimation: 0.15 seconds.
Final log-likelihood: -1,390.5563




Simulation number 9 is complete!
Simulation number 10 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2087, style=ProgressStyle(descriptio…


Log-likelihood at zero: -3,959.6865
Initial Log-likelihood: -3,959.6865




Estimation Time for Point Estimation: 0.20 seconds.
Final log-likelihood: -1,342.3558
Simulation number 10 is complete!
Simulation number 11 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2918, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,534.8993
Initial Log-likelihood: -5,534.8993




Estimation Time for Point Estimation: 0.46 seconds.
Final log-likelihood: -1,906.8363
Simulation number 11 is complete!
Simulation number 12 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2525, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,791.7362
Initial Log-likelihood: -4,791.7362




Estimation Time for Point Estimation: 0.21 seconds.
Final log-likelihood: -1,670.8533
Simulation number 12 is complete!
Simulation number 13 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2973, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,642.7037
Initial Log-likelihood: -5,642.7037




Estimation Time for Point Estimation: 0.26 seconds.
Final log-likelihood: -1,925.8018
Simulation number 13 is complete!
Simulation number 14 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2047, style=ProgressStyle(descriptio…


Log-likelihood at zero: -3,899.6089
Initial Log-likelihood: -3,899.6089
Estimation Time for Point Estimation: 0.18 seconds.
Final log-likelihood: -1,341.1531




Simulation number 14 is complete!
Simulation number 15 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2498, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,752.6297
Initial Log-likelihood: -4,752.6297




Estimation Time for Point Estimation: 0.30 seconds.
Final log-likelihood: -1,772.9247
Simulation number 15 is complete!
Simulation number 16 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2955, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,591.0413
Initial Log-likelihood: -5,591.0413




Estimation Time for Point Estimation: 0.27 seconds.
Final log-likelihood: -2,006.3148
Simulation number 16 is complete!
Simulation number 17 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2839, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,375.5629
Initial Log-likelihood: -5,375.5629




Estimation Time for Point Estimation: 0.32 seconds.
Final log-likelihood: -1,857.9856
Simulation number 17 is complete!
Simulation number 18 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2426, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,586.1981
Initial Log-likelihood: -4,586.1981




Estimation Time for Point Estimation: 0.23 seconds.
Final log-likelihood: -1,532.3530
Simulation number 18 is complete!
Simulation number 19 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2791, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,312.5864
Initial Log-likelihood: -5,312.5864




Estimation Time for Point Estimation: 0.28 seconds.
Final log-likelihood: -1,845.9512
Simulation number 19 is complete!
Simulation number 20 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2297, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,349.0753
Initial Log-likelihood: -4,349.0753




Estimation Time for Point Estimation: 0.22 seconds.
Final log-likelihood: -1,503.5592
Simulation number 20 is complete!
Simulation number 21 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2009, style=ProgressStyle(descriptio…


Log-likelihood at zero: -3,820.6627
Initial Log-likelihood: -3,820.6627




Estimation Time for Point Estimation: 0.29 seconds.
Final log-likelihood: -1,341.8840
Simulation number 21 is complete!
Simulation number 22 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2437, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,617.2144
Initial Log-likelihood: -4,617.2144




Estimation Time for Point Estimation: 0.34 seconds.
Final log-likelihood: -1,590.8420
Simulation number 22 is complete!
Simulation number 23 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2382, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,523.8316
Initial Log-likelihood: -4,523.8316




Estimation Time for Point Estimation: 0.27 seconds.
Final log-likelihood: -1,539.0010
Simulation number 23 is complete!
Simulation number 24 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2658, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,044.0014
Initial Log-likelihood: -5,044.0014




Estimation Time for Point Estimation: 0.25 seconds.
Final log-likelihood: -1,754.3460
Simulation number 24 is complete!
Simulation number 25 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2505, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,751.6658
Initial Log-likelihood: -4,751.6658




Estimation Time for Point Estimation: 0.29 seconds.
Final log-likelihood: -1,763.4062
Simulation number 25 is complete!
Simulation number 26 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2511, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,796.6535
Initial Log-likelihood: -4,796.6535




Estimation Time for Point Estimation: 0.24 seconds.
Final log-likelihood: -1,727.8543
Simulation number 26 is complete!
Simulation number 27 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2564, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,873.4405
Initial Log-likelihood: -4,873.4405




Estimation Time for Point Estimation: 0.21 seconds.
Final log-likelihood: -1,680.8399
Simulation number 27 is complete!
Simulation number 28 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2345, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,445.0021
Initial Log-likelihood: -4,445.0021




Estimation Time for Point Estimation: 0.22 seconds.
Final log-likelihood: -1,493.1342
Simulation number 28 is complete!
Simulation number 29 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2805, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,315.4348
Initial Log-likelihood: -5,315.4348




Estimation Time for Point Estimation: 0.30 seconds.
Final log-likelihood: -1,825.0665
Simulation number 29 is complete!
Simulation number 30 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2211, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,185.9175
Initial Log-likelihood: -4,185.9175




Estimation Time for Point Estimation: 0.23 seconds.
Final log-likelihood: -1,441.4455
Simulation number 30 is complete!
Simulation number 31 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2967, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,625.3966
Initial Log-likelihood: -5,625.3966




Estimation Time for Point Estimation: 0.27 seconds.
Final log-likelihood: -2,042.2981
Simulation number 31 is complete!
Simulation number 32 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2822, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,347.0119
Initial Log-likelihood: -5,347.0119




Estimation Time for Point Estimation: 0.21 seconds.
Final log-likelihood: -1,793.2455
Simulation number 32 is complete!
Simulation number 33 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2130, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,029.1707
Initial Log-likelihood: -4,029.1707




Estimation Time for Point Estimation: 0.21 seconds.
Final log-likelihood: -1,314.0927
Simulation number 33 is complete!
Simulation number 34 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2678, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,072.0508
Initial Log-likelihood: -5,072.0508




Estimation Time for Point Estimation: 0.29 seconds.
Final log-likelihood: -1,760.6208
Simulation number 34 is complete!
Simulation number 35 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2247, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,263.0812
Initial Log-likelihood: -4,263.0812




Estimation Time for Point Estimation: 0.24 seconds.
Final log-likelihood: -1,484.3269
Simulation number 35 is complete!
Simulation number 36 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2661, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,047.1404
Initial Log-likelihood: -5,047.1404




Estimation Time for Point Estimation: 0.28 seconds.
Final log-likelihood: -1,755.2967
Simulation number 36 is complete!
Simulation number 37 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2506, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,782.1729
Initial Log-likelihood: -4,782.1729
Estimation Time for Point Estimation: 0.19 seconds.
Final log-likelihood: -1,591.4563




Simulation number 37 is complete!
Simulation number 38 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2069, style=ProgressStyle(descriptio…


Log-likelihood at zero: -3,939.4888
Initial Log-likelihood: -3,939.4888




Estimation Time for Point Estimation: 0.17 seconds.
Final log-likelihood: -1,334.9374
Simulation number 38 is complete!
Simulation number 39 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2539, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,816.3343
Initial Log-likelihood: -4,816.3343
Estimation Time for Point Estimation: 0.17 seconds.
Final log-likelihood: -1,656.1486




Simulation number 39 is complete!
Simulation number 40 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2655, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,023.4833
Initial Log-likelihood: -5,023.4833




Estimation Time for Point Estimation: 0.24 seconds.
Final log-likelihood: -1,772.7473
Simulation number 40 is complete!
Simulation number 41 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2951, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,614.2958
Initial Log-likelihood: -5,614.2958




Estimation Time for Point Estimation: 0.35 seconds.
Final log-likelihood: -1,987.5971
Simulation number 41 is complete!
Simulation number 42 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2621, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,992.8515
Initial Log-likelihood: -4,992.8515




Estimation Time for Point Estimation: 0.23 seconds.
Final log-likelihood: -1,766.7704
Simulation number 42 is complete!
Simulation number 43 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2768, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,261.0593
Initial Log-likelihood: -5,261.0593




Estimation Time for Point Estimation: 0.25 seconds.
Final log-likelihood: -1,791.2177
Simulation number 43 is complete!
Simulation number 44 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2248, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,253.2494
Initial Log-likelihood: -4,253.2494




Estimation Time for Point Estimation: 0.24 seconds.
Final log-likelihood: -1,435.7171
Simulation number 44 is complete!
Simulation number 45 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2252, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,275.3751
Initial Log-likelihood: -4,275.3751




Estimation Time for Point Estimation: 0.27 seconds.
Final log-likelihood: -1,491.1490
Simulation number 45 is complete!
Simulation number 46 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2876, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,465.6361
Initial Log-likelihood: -5,465.6361




Estimation Time for Point Estimation: 0.24 seconds.
Final log-likelihood: -1,929.3666
Simulation number 46 is complete!
Simulation number 47 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2115, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,019.4863
Initial Log-likelihood: -4,019.4863




Estimation Time for Point Estimation: 0.22 seconds.
Final log-likelihood: -1,321.6467
Simulation number 47 is complete!
Simulation number 48 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2110, style=ProgressStyle(descriptio…


Log-likelihood at zero: -3,993.9344
Initial Log-likelihood: -3,993.9344
Estimation Time for Point Estimation: 0.18 seconds.
Final log-likelihood: -1,352.0484




Simulation number 48 is complete!
Simulation number 49 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2171, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,124.6044
Initial Log-likelihood: -4,124.6044




Estimation Time for Point Estimation: 0.19 seconds.
Final log-likelihood: -1,454.0386
Simulation number 49 is complete!
Simulation number 50 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2178, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,148.3544
Initial Log-likelihood: -4,148.3544




Estimation Time for Point Estimation: 0.28 seconds.
Final log-likelihood: -1,384.1516
Simulation number 50 is complete!
Simulation number 51 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2023, style=ProgressStyle(descriptio…


Log-likelihood at zero: -3,835.0273
Initial Log-likelihood: -3,835.0273




Estimation Time for Point Estimation: 0.17 seconds.
Final log-likelihood: -1,325.2119
Simulation number 51 is complete!
Simulation number 52 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2087, style=ProgressStyle(descriptio…


Log-likelihood at zero: -3,959.3734
Initial Log-likelihood: -3,959.3734




Estimation Time for Point Estimation: 0.27 seconds.
Final log-likelihood: -1,317.7396
Simulation number 52 is complete!
Simulation number 53 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2577, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,886.8539
Initial Log-likelihood: -4,886.8539




Estimation Time for Point Estimation: 0.33 seconds.
Final log-likelihood: -1,681.2443
Simulation number 53 is complete!
Simulation number 54 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2221, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,200.9283
Initial Log-likelihood: -4,200.9283
Estimation Time for Point Estimation: 0.20 seconds.
Final log-likelihood: -1,472.4717




Simulation number 54 is complete!
Simulation number 55 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2925, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,549.2824
Initial Log-likelihood: -5,549.2824
Estimation Time for Point Estimation: 0.19 seconds.
Final log-likelihood: -1,945.2344




Simulation number 55 is complete!
Simulation number 56 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2525, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,805.4089
Initial Log-likelihood: -4,805.4089




Estimation Time for Point Estimation: 0.24 seconds.
Final log-likelihood: -1,604.0133
Simulation number 56 is complete!
Simulation number 57 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2755, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,242.7716
Initial Log-likelihood: -5,242.7716




Estimation Time for Point Estimation: 0.24 seconds.
Final log-likelihood: -1,837.4333
Simulation number 57 is complete!
Simulation number 58 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2131, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,046.6925
Initial Log-likelihood: -4,046.6925
Estimation Time for Point Estimation: 0.19 seconds.
Final log-likelihood: -1,448.6804




Simulation number 58 is complete!
Simulation number 59 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2351, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,467.9853
Initial Log-likelihood: -4,467.9853




Estimation Time for Point Estimation: 0.20 seconds.
Final log-likelihood: -1,709.1938
Simulation number 59 is complete!
Simulation number 60 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2826, style=ProgressStyle(descriptio…


Log-likelihood at zero: -5,379.5652
Initial Log-likelihood: -5,379.5652




Estimation Time for Point Estimation: 0.27 seconds.
Final log-likelihood: -1,902.0491
Simulation number 60 is complete!
Simulation number 61 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2401, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,571.1851
Initial Log-likelihood: -4,571.1851
Estimation Time for Point Estimation: 0.20 seconds.
Final log-likelihood: -1,576.1165




Simulation number 61 is complete!
Simulation number 62 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2393, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,570.6563
Initial Log-likelihood: -4,570.6563




Estimation Time for Point Estimation: 0.23 seconds.
Final log-likelihood: -1,571.3873
Simulation number 62 is complete!
Simulation number 63 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2609, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,952.2075
Initial Log-likelihood: -4,952.2075




Estimation Time for Point Estimation: 0.29 seconds.
Final log-likelihood: -1,710.3341
Simulation number 63 is complete!
Simulation number 64 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2447, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,647.4409
Initial Log-likelihood: -4,647.4409
Estimation Time for Point Estimation: 0.19 seconds.
Final log-likelihood: -1,666.2782




Simulation number 64 is complete!
Simulation number 65 is in process...
------------------------------------------


  design_matrix = np.hstack((x[:, None] for x in independent_vars))


HBox(children=(IntProgress(value=0, description='Simulating Choices', max=2623, style=ProgressStyle(descriptio…


Log-likelihood at zero: -4,974.2243
Initial Log-likelihood: -4,974.2243




Estimation Time for Point Estimation: 0.24 seconds.
Final log-likelihood: -1,675.8438
Simulation number 65 is complete!
Simulation number 66 is in process...
------------------------------------------
