# Selection on Observables 

## Purpose 

# Set Notebook Parameters

In [None]:
# Relative path of the raw long-format CSV that the notebook reads
DATA_PATH = (
    "../../data/raw/"
    "spring_2016_all_bay_area_long_format_plus_cross_bay_col.csv"
)

In [None]:
# Causal graph definitions: every graph is a list of node labels plus a list
# of directed (parent, child) edges.

# Independent graph: each explanatory node points only at the utility node.
NODES_IND = [
    "Total Travel Distance",
    "Total Travel Time",
    "Total Travel Cost",
    "Number of Licensed Drivers",
    "Number of Autos",
    "Utility (Drive Alone)",
]

# The utility node is listed last, so every node before it is a parent of it.
EDGES_IND = [(parent, NODES_IND[-1]) for parent in NODES_IND[:-1]]

# Drive Alone graph: on top of the direct effects on utility, travel distance
# also drives travel time and travel cost.
NODES_DA = [
    "Total Travel Distance",
    "Total Travel Time",
    "Total Travel Cost",
    "Number of Autos",
    "Number of Licensed Drivers",
    "Utility (Drive Alone)",
]

EDGES_DA = [
    ("Total Travel Distance", "Total Travel Time"),
    ("Total Travel Distance", "Total Travel Cost"),
] + [(parent, NODES_DA[-1]) for parent in NODES_DA[:-1]]

# Shared-2 graph: distance drives time and cost; every explanatory node also
# points at the Shared Ride 2 utility.
NODES_SHARED_2 = [
    "Total Travel Time",
    "Total Travel Distance",
    "Total Travel Cost",
    "Cross Bay Trip",
    "Number of Autos",
    "Number of Licensed Drivers",
    "Household Size",
    "Number of Kids",
    "Utility (Shared Ride 2)",
]

EDGES_SHARED_2 = [
    ("Total Travel Distance", "Total Travel Time"),
    ("Total Travel Distance", "Total Travel Cost"),
] + [
    (parent, "Utility (Shared Ride 2)")
    for parent in (
        "Total Travel Distance",
        "Total Travel Time",
        "Number of Autos",
        "Number of Licensed Drivers",
        "Total Travel Cost",
        "Household Size",
        "Cross Bay Trip",
        "Number of Kids",
    )
]

# Shared-3+ graph: same structure as Shared-2, only the utility node differs.
NODES_SHARED_3P = NODES_SHARED_2[:-1] + ["Utility (Shared Ride 3+)"]

EDGES_SHARED_3P = [
    (source, NODES_SHARED_3P[-1] if target == NODES_SHARED_2[-1] else target)
    for source, target in EDGES_SHARED_2
]

# The three transit graphs share one shape: travel time drives travel cost and
# both point at the utility.  With the nodes listed in that order, the edge
# list is every (earlier node, later node) pair.

# Walk-Transit-Walk
NODES_WTW = ["Total Travel Time", "Total Travel Cost", "Utility (WTW)"]

EDGES_WTW = [(source, target)
             for position, source in enumerate(NODES_WTW)
             for target in NODES_WTW[position + 1:]]

# Drive-Transit-Walk
NODES_DTW = ["Total Travel Time", "Total Travel Cost", "Utility (DTW)"]

EDGES_DTW = [(source, target)
             for position, source in enumerate(NODES_DTW)
             for target in NODES_DTW[position + 1:]]

# Walk-Transit-Drive
NODES_WTD = ["Total Travel Time", "Total Travel Cost", "Utility (WTD)"]

EDGES_WTD = [(source, target)
             for position, source in enumerate(NODES_WTD)
             for target in NODES_WTD[position + 1:]]

# Walk graph: travel distance is the only parent of the utility, so the single
# edge is simply the node pair itself.
NODES_WALK = ["Total Travel Distance", "Utility (Walk)"]

EDGES_WALK = [tuple(NODES_WALK)]

# Bike graph: same two-node shape as the Walk graph.
NODES_BIKE = ["Total Travel Distance", "Utility (Bike)"]

EDGES_BIKE = [tuple(NODES_BIKE)]

In [None]:
# Name of the alternative id column in the long format data
ALT_ID_COL = 'mode_id'

# Individual specific variables
IND_SPEC_VARS = ['household_size', 'num_kids', 'num_cars', 'num_licensed_drivers']

# Alternative specific nodes without parents.
# Key: alternative number; value: list of variable names.
# The transit alternatives (4-6) start from travel time, every other
# alternative from travel distance.
ALT_SPEC_DICT = {
    alt_id: (['total_travel_time'] if alt_id in (4, 5, 6)
             else ['total_travel_distance'])
    for alt_id in range(1, 9)
}

# Trip specific variables
TRIP_SPEC_VARS = ['cross_bay']

# Key: alternative number; value: alternative name in snake case
ALT_NAME_DICT = dict(enumerate(
    ['drive_alone', 'shared_2', 'shared_3p', 'wtw',
     'dtw', 'wtd', 'walk', 'bike'],
    start=1))

# Key: alternative number; value: human-readable mode name
ALT_ID_TO_MODE_NAME = dict(zip(
    range(1, 9),
    ["Drive Alone", "Shared Ride 2", "Shared Ride 3+",
     "Walk-Transit-Walk", "Drive-Transit-Walk", "Walk-Transit-Drive",
     "Walk", "Bike"]))

# Variable type dictionary.
# Key: variable name as used in the lists/dictionaries of this notebook;
# value: 'categorical' or 'continuous'.
VARS_TYPE = {'num_kids': 'categorical',
             'household_size': 'categorical',
             'num_cars': 'categorical',
             'num_licensed_drivers': 'categorical',
             'cross_bay': 'categorical',
             'total_travel_time': 'continuous',
             'total_travel_distance': 'continuous',
             'total_travel_cost': 'continuous'}

# Distributions to be explored for continuous variables.
# Every entry must be the name of a scipy.stats distribution: scipy has no
# plain 'gumbel' (only 'gumbel_r' and 'gumbel_l'), so the right-skewed Gumbel
# is requested under its scipy name.
# NOTE(review): assumes the fitting helper resolves these names against
# scipy.stats (e.g. through fitter.Fitter) - confirm.
CONT_DISTS = ['norm', 'alpha', 'beta', 'gamma', 'expon', 'gumbel_r']

In [None]:
# Declare regression parameters
#
# For every alternative with a regression in its causal graph:
#   REGS_<ALT>      maps a running number to a pair of variable names
#   REGS_TYPE_<ALT> maps the same running number to the regression type
# NOTE(review): each pair looks like (explanatory variable, explained
# variable) - confirm against fit_alternative_regression.

# Drive Alone
REGS_DA = {1: ('total_travel_distance', 'total_travel_cost'),
           2: ('total_travel_distance', 'total_travel_time')}

REGS_TYPE_DA = {1: 'linear',
                2: 'linear'}


# Shared-2
REGS_SHARED_2 = {1: ('total_travel_distance', 'total_travel_cost'),
                 2: ('total_travel_distance', 'total_travel_time')}

REGS_TYPE_SHARED_2 = {1: 'linear',
                      2: 'linear'}


# Shared-3+
REGS_SHARED_3P = {1: ('total_travel_distance', 'total_travel_cost'),
                  2: ('total_travel_distance', 'total_travel_time')}

REGS_TYPE_SHARED_3P = {1: 'linear',
                       2: 'linear'}


# Walk-Transit-Walk
REGS_WTW = {1: ('total_travel_time', 'total_travel_cost')}

REGS_TYPE_WTW = {1: 'linear'}


# Drive-Transit-Walk
REGS_DTW = {1: ('total_travel_time', 'total_travel_cost')}

REGS_TYPE_DTW = {1: 'linear'}


# Walk-Transit-Drive
REGS_WTD = {1: ('total_travel_time', 'total_travel_cost')}

# This was previously a second assignment to REGS_TYPE_DTW, which left
# REGS_TYPE_WTD (used by the Walk-Transit-Drive regression cell) undefined.
REGS_TYPE_WTD = {1: 'linear'}

In [None]:
# Parameters for conversion from Wide to Long

# Variables handed to the conversion as not alternative specific
IND_VARIABLES = ['num_kids', 'household_size', 'num_cars',
                 'num_licensed_drivers', 'cross_bay']


# Dictionary of Alternative Specific Variables
# Outer key: variable name; inner dictionary: alternative number -> name of
# the wide-format column holding that variable for the alternative.
# TODO: verify whether all variables are needed
# for each alternative
ALT_VARYING_VARIABLES = {
    'total_travel_time': {
        1: 'total_travel_time_drive_alone',
        2: 'total_travel_time_shared_2',
        3: 'total_travel_time_shared_3p',
        4: 'total_travel_time_wtw',
        5: 'total_travel_time_dtw',
        6: 'total_travel_time_wtd',
    },
    'total_travel_cost': {
        1: 'total_travel_cost_drive_alone',
        2: 'total_travel_cost_shared_2',
        3: 'total_travel_cost_shared_3p',
        4: 'total_travel_cost_wtw',
        5: 'total_travel_cost_dtw',
        6: 'total_travel_cost_wtd',
    },
    'total_travel_distance': {
        1: 'total_travel_distance_drive_alone',
        2: 'total_travel_distance_shared_2',
        3: 'total_travel_distance_shared_3p',
        7: 'total_travel_distance_walk',
        8: 'total_travel_distance_bike',
    },
}


# Dictionary of alternative availability variables:
# alternative number -> '<alternative name>_AV'
AVAILABILITY_VARIABLES = {
    alt_id: alt_name + '_AV'
    for alt_id, alt_name in enumerate(
        ['drive_alone', 'shared_2', 'shared_3p', 'wtw',
         'dtw', 'wtd', 'walk', 'bike'],
        start=1)
}

##########
# Determine the columns for: alternative ids, the observation ids and the choice
##########
# CUSTOM_ALT_ID is the name of a column to be created in the long-format data.
# It will identify the alternative associated with each row.
CUSTOM_ALT_ID = "mode_id"

# Column holding the observation ids
OBS_ID_COL = "observation_id"

# Column holding the choice indicator
CHOICE_COL = "sim_choice"

In [None]:
# Create my specification and variable names for the basic MNL model
# NOTE: - Keys should be variables within the long format dataframe.
#         The sole exception to this is the "intercept" key.
#       - For the specification dictionary, the values should be lists
#         or lists of lists. Within a list, or within the inner-most
#         list should be the alternative ID's of the alternative whose
#         utility specification the explanatory variable is entering.
#       - MNL_NAMES holds one display name per estimated coefficient, i.e.
#         one name per top-level element of the matching specification value.

# Imported here because this parameter cell sits above the notebook's import
# cell; without it, running the notebook top to bottom raises a NameError.
from collections import OrderedDict

MNL_SPECIFICATION = OrderedDict()
MNL_NAMES = OrderedDict()

# Alternative specific constants; alternative 1 has none (reference level)
MNL_SPECIFICATION["intercept"] = [2, 3, 4, 5, 6, 7, 8]
MNL_NAMES["intercept"] = ['ASC Shared Ride: 2',
                          'ASC Shared Ride: 3+',
                          'ASC Walk-Transit-Walk',
                          'ASC Drive-Transit-Walk',
                          'ASC Walk-Transit-Drive',
                          'ASC Walk',
                          'ASC Bike']

# One travel time coefficient shared by alternatives 1-3, one by 4-6
MNL_SPECIFICATION["total_travel_time"] = [[1, 2, 3], [4, 5, 6]]
MNL_NAMES["total_travel_time"] = ['Travel Time, units:min (All Auto Modes)',
                                  'Travel Time, units:min (All Transit Modes)']

MNL_SPECIFICATION["total_travel_cost"] = [[4, 5, 6]]
MNL_NAMES["total_travel_cost"] = ['Travel Cost, units:$ (All Transit Modes)']

# Separate cost-per-distance coefficient for each of alternatives 1, 2 and 3
MNL_SPECIFICATION["cost_per_distance"] = [1, 2, 3]
MNL_NAMES["cost_per_distance"] = ["Travel Cost per Distance, units:$/mi (Drive Alone)",
                                  "Travel Cost per Distance, units:$/mi (SharedRide-2)",
                                  "Travel Cost per Distance, units:$/mi (SharedRide-3+)"]

MNL_SPECIFICATION["cars_per_licensed_drivers"] = [[1, 2, 3]]
MNL_NAMES["cars_per_licensed_drivers"] = ["Autos per licensed drivers (All Auto Modes)"]

MNL_SPECIFICATION["total_travel_distance"] = [7, 8]
MNL_NAMES["total_travel_distance"] = ['Travel Distance, units:mi (Walk)',
                                      'Travel Distance, units:mi (Bike)']

MNL_SPECIFICATION["cross_bay"] = [[2, 3]]
MNL_NAMES["cross_bay"] = ["Cross-Bay Tour (Shared Ride 2 & 3+)"]

MNL_SPECIFICATION["household_size"] = [[2, 3]]
MNL_NAMES["household_size"] = ['Household Size (Shared Ride 2 & 3+)']

MNL_SPECIFICATION["num_kids"] = [[2, 3]]
MNL_NAMES["num_kids"] = ["Number of Kids in Household (Shared Ride 2 & 3+)"]

# Import Needed Libraries 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import reduce
import seaborn as sns
import statsmodels.api as sm
import copy
from scipy import sparse
import scipy.stats
import random
from fitter import Fitter
import attr
from causalgraphicalmodels import CausalGraphicalModel, StructuralCausalModel
from collections import defaultdict, OrderedDict
import pylogit as pl
import viz
from array import array
import json
from scipy.stats import multivariate_normal

## Replace libraries with src .py file names containing the functions

# Load and Describe Data 

In [None]:
# Read the data from the path set in the notebook parameters
bike_data_long = pd.read_csv(DATA_PATH)

# Look at the mode shares in the data set

# Count how often each alternative is the chosen one.  reindex (rather than
# .loc) keeps the alternatives in id order and gives an alternative that is
# never chosen a count of zero instead of raising a KeyError.
mode_counts = (bike_data_long.loc[bike_data_long.choice == 1, ALT_ID_COL]
               .value_counts()
               .reindex(list(ALT_ID_TO_MODE_NAME), fill_value=0))

# Divide by the number of distinct observations; the largest observation id
# equals that number only when the ids run from 1 to N without gaps.
mode_shares = mode_counts / bike_data_long[OBS_ID_COL].nunique()
mode_shares.index = [ALT_ID_TO_MODE_NAME[x] for x in mode_shares.index.values]
mode_shares.name = "Mode Shares"
mode_shares

# Choice Model Estimation 

In [None]:
# Set up the basic MNL model on the observed data, using the specification
# and coefficient names from the notebook parameters
mnl_model = pl.create_choice_model(
    data=bike_data_long,
    alt_id_col=ALT_ID_COL,
    obs_id_col=OBS_ID_COL,
    choice_col="choice",
    specification=MNL_SPECIFICATION,
    model_type="MNL",
    names=MNL_NAMES,
)

# One coefficient is estimated per entry of MNL_NAMES
num_vars = sum(len(name_group) for name_group in MNL_NAMES.values())

# Maximum likelihood estimation with BFGS, started from an all-zero vector
mnl_model.fit_mle(np.zeros(num_vars), method="BFGS")

# Look at the estimation results
mnl_model.get_statsmodels_summary()

# Show Causal Graphs

## Independent Causal Graph 

In [None]:
# Graph in which every explanatory variable affects the utility independently
ind_graph = CausalGraphicalModel(nodes=NODES_IND, edges=EDGES_IND)
ind_graph.draw()

## Drive Alone

In [None]:
# Assumed causal graph for the Drive Alone utility
V_drive_alone = CausalGraphicalModel(nodes=NODES_DA, edges=EDGES_DA)
V_drive_alone.draw()

## Shared-2

In [None]:
# Assumed causal graph for the Shared Ride 2 utility
V_shared_2 = CausalGraphicalModel(nodes=NODES_SHARED_2, edges=EDGES_SHARED_2)
V_shared_2.draw()

## Shared-3+

In [None]:
# Assumed causal graph for the Shared Ride 3+ utility
V_shared_3p = CausalGraphicalModel(nodes=NODES_SHARED_3P, edges=EDGES_SHARED_3P)
V_shared_3p.draw()

## Walk-Transit-Walk

In [None]:
# Assumed causal graph for the Walk-Transit-Walk utility
V_wtw = CausalGraphicalModel(nodes=NODES_WTW, edges=EDGES_WTW)
V_wtw.draw()

## Drive-Transit-Walk

In [None]:
# Assumed causal graph for the Drive-Transit-Walk utility
V_dtw = CausalGraphicalModel(nodes=NODES_DTW, edges=EDGES_DTW)
V_dtw.draw()

##  Walk-Transit-Drive

In [None]:
# Assumed causal graph for the Walk-Transit-Drive utility
V_wtd = CausalGraphicalModel(nodes=NODES_WTD, edges=EDGES_WTD)
V_wtd.draw()

## Walk

In [None]:
# Assumed causal graph for the Walk utility
V_walk = CausalGraphicalModel(nodes=NODES_WALK, edges=EDGES_WALK)
V_walk.draw()

## Bike 

In [None]:
# Assumed causal graph for the Bike utility
V_bike = CausalGraphicalModel(nodes=NODES_BIKE, edges=EDGES_BIKE)
V_bike.draw()

# Selection on Observables Simulation

## Main Idea 

## Distribution Fitting 

In [None]:
# Distribution fitting for the observed long-format data, driven by the
# variable lists, type dictionary and candidate distributions set in the
# notebook parameters.  The result is later passed to sim_node_no_parent.
# NOTE(review): get_dist_node_no_parent is neither defined nor imported in
# the cells shown here - presumably it comes from the project's src modules;
# confirm it is in scope before running this cell.
# NOTE(review): presumably only the nodes without parents are fitted (see
# the ALT_SPEC_DICT comment) - confirm against the helper.
bike_data_params = get_dist_node_no_parent(bike_data_long,
                                           ALT_ID_COL,
                                           OBS_ID_COL,
                                           ALT_SPEC_DICT,
                                           ALT_NAME_DICT,
                                           IND_SPEC_VARS,
                                           TRIP_SPEC_VARS,
                                           VARS_TYPE,
                                           CONT_DISTS)

## Regression Fitting 

## Drive Alone 

In [None]:
# Rows of the Drive Alone alternative (mode_id 1), re-indexed from zero
drive_alone_df = (bike_data_long
                  .loc[bike_data_long['mode_id'] == 1]
                  .reset_index(drop=True))

# Fit the regressions declared for Drive Alone on those rows
fitted_reg_da = fit_alternative_regression(
    REGS_DA, REGS_TYPE_DA, drive_alone_df)

## Shared-2

In [None]:
# Rows of the Shared Ride 2 alternative (mode_id 2), re-indexed from zero
shared_2_df = (bike_data_long
               .loc[bike_data_long['mode_id'] == 2]
               .reset_index(drop=True))

# Fit the regressions declared for Shared Ride 2 on those rows
fitted_reg_shared_2 = fit_alternative_regression(
    REGS_SHARED_2, REGS_TYPE_SHARED_2, shared_2_df)

## Shared-3+

In [None]:
# Rows of the Shared Ride 3+ alternative (mode_id 3), re-indexed from zero
shared_3p_df = (bike_data_long
                .loc[bike_data_long['mode_id'] == 3]
                .reset_index(drop=True))

# Fit the regressions declared for Shared Ride 3+ on those rows
fitted_reg_shared_3p = fit_alternative_regression(
    REGS_SHARED_3P, REGS_TYPE_SHARED_3P, shared_3p_df)

## Walk-Transit-Walk 

In [None]:
# Rows of the Walk-Transit-Walk alternative (mode_id 4), re-indexed from zero
wtw_df = (bike_data_long
          .loc[bike_data_long['mode_id'] == 4]
          .reset_index(drop=True))

# Fit the regression declared for Walk-Transit-Walk on those rows
fitted_reg_wtw = fit_alternative_regression(
    REGS_WTW, REGS_TYPE_WTW, wtw_df)

## Drive-Transit-Walk 

In [None]:
# Rows of the Drive-Transit-Walk alternative (mode_id 5), re-indexed from zero
dtw_df = (bike_data_long
          .loc[bike_data_long['mode_id'] == 5]
          .reset_index(drop=True))

# Fit the regression declared for Drive-Transit-Walk on those rows
fitted_reg_dtw = fit_alternative_regression(
    REGS_DTW, REGS_TYPE_DTW, dtw_df)

## Walk-Transit-Drive

In [None]:
# Rows of the Walk-Transit-Drive alternative (mode_id 6), re-indexed from zero
wtd_df = (bike_data_long
          .loc[bike_data_long['mode_id'] == 6]
          .reset_index(drop=True))

# Fit the regression declared for Walk-Transit-Drive on those rows
fitted_reg_wtd = fit_alternative_regression(
    REGS_WTD, REGS_TYPE_WTD, wtd_df)

## Simulation

### Simulation Parameters

In [None]:
# Two simulation runs, each with a sample size drawn from [3000, 9000)
simulation_sizes = np.random.randint(low=3000, high=9000, size=2)

# Run labels 1, 2, ... - one per simulation size
sim_number = np.arange(1, len(simulation_sizes) + 1)

# Factor applied to travel distance in the perturbed scenarios
perturb = 0.8

# Containers filled by the simulation loop
models_dictionary = defaultdict(dict)
causal_effect_dictionary = dict()
simulation_data = dict()

# One row of average effects per simulation run
causal_effects = pd.DataFrame(
    columns=['naive_effect', 'true_effect', 'estimated_effect'])

In [None]:
# Regressions implied by the assumed causal graphs, in the order in which the
# simulation applies them.  Each entry is
# (alternative id, child variable, parent variable, fitted regressions);
# the fitted regressions are indexed by '<child>_on_<parent>'.
_CAUSAL_REGRESSIONS = [
    # Drive Alone
    (1, 'total_travel_time', 'total_travel_distance', fitted_reg_da),
    (1, 'total_travel_cost', 'total_travel_distance', fitted_reg_da),
    # Shared Ride 2
    (2, 'total_travel_time', 'total_travel_distance', fitted_reg_shared_2),
    (2, 'total_travel_cost', 'total_travel_distance', fitted_reg_shared_2),
    # Shared Ride 3+
    (3, 'total_travel_time', 'total_travel_distance', fitted_reg_shared_3p),
    (3, 'total_travel_cost', 'total_travel_distance', fitted_reg_shared_3p),
    # Walk-Transit-Walk
    (4, 'total_travel_cost', 'total_travel_time', fitted_reg_wtw),
    # Drive-Transit-Walk
    (5, 'total_travel_cost', 'total_travel_time', fitted_reg_dtw),
    # Walk-Transit-Drive
    (6, 'total_travel_cost', 'total_travel_time', fitted_reg_wtd),
]

for sim_size, number in zip(simulation_sizes, sim_number):
    print('Simulation number', number, 'is in progress...')
    print('Simulation size is', sim_size)
    print('------------------------------------------')
    print('Simulating data...')
    # Simulate the nodes that have no parents
    sim_bike_data_no_parent = sim_node_no_parent(bike_data_params,
                                                 size=sim_size)

    sim_bike_data_wide = copy.deepcopy(sim_bike_data_no_parent)

    # Simulate the child nodes from their parents based on the causal graphs.
    # Wide-format columns are named '<variable>_<alternative name>'.
    for alt_id, child, parent, fitted_regs in _CAUSAL_REGRESSIONS:
        alt_name = ALT_NAME_DICT[alt_id]
        sim_bike_data_wide[child + '_' + alt_name] = lin_reg_pred(
            sim_bike_data_wide[parent + '_' + alt_name],
            fitted_regs[child + '_on_' + parent],
            sim_size)

    # Simulate Availability
    print('Simulating Availability...')
    alt_av_matrix = simulate_availability(data_long=bike_data_long,
                                          obs_id_col=OBS_ID_COL,
                                          alt_name_dict=ALT_NAME_DICT,
                                          sim_size=sim_size)

    sim_bike_data_wide = sim_bike_data_wide.join(alt_av_matrix)

    # Placeholder choice column for the conversion to long format; it is
    # overwritten with choices simulated from the estimated model below
    sim_bike_data_wide[CHOICE_COL] = sim_fake_choice_col(alt_av_matrix)

    sim_bike_data_wide[OBS_ID_COL] = np.arange(sim_bike_data_wide.shape[0],
                                               dtype=int) + 1

    # Converting Data from Wide to Long
    print('Converting data from wide to long...')
    long_sim_data = pl.convert_wide_to_long(sim_bike_data_wide,
                                            IND_VARIABLES,
                                            ALT_VARYING_VARIABLES,
                                            AVAILABILITY_VARIABLES,
                                            OBS_ID_COL,
                                            CHOICE_COL,
                                            new_alt_id_name=CUSTOM_ALT_ID)

    # Create a cars per licensed drivers column; it stays 0 where there are
    # no licensed drivers.  Initialised as float because ratios are stored.
    long_sim_data["cars_per_licensed_drivers"] = 0.0
    has_drivers = long_sim_data.num_licensed_drivers > 0
    long_sim_data.loc[has_drivers, "cars_per_licensed_drivers"] = (
        long_sim_data.loc[has_drivers, "num_cars"] /
        long_sim_data.loc[has_drivers, "num_licensed_drivers"].astype(float))

    # Add a variable representing cost divided by distance (auto modes only)
    long_sim_data["cost_per_distance"] = 0.0
    is_auto = long_sim_data[ALT_ID_COL].isin([1, 2, 3])
    long_sim_data.loc[is_auto, "cost_per_distance"] = (
        long_sim_data.loc[is_auto, "total_travel_cost"] /
        long_sim_data.loc[is_auto, "total_travel_distance"])

    # Simulating Choices
    print('Simulating Choices...')
    # Calculate probabilities for each alternative
    # based on the model estimated on the observed data
    init_mnl_model_probs = mnl_model.predict(long_sim_data)

    # Simulate choice data
    long_sim_data[CHOICE_COL] = viz.simulate_choice_vector(init_mnl_model_probs,
                                                           long_sim_data[OBS_ID_COL].values)

    # Estimating Choice Models
    print('Estimating the choice model...')
    # Estimate the basic MNL model on the simulated data
    mnl_model_sim = pl.create_choice_model(data=long_sim_data,
                                           alt_id_col=ALT_ID_COL,
                                           obs_id_col=OBS_ID_COL,
                                           choice_col=CHOICE_COL,
                                           specification=MNL_SPECIFICATION,
                                           model_type="MNL",
                                           names=MNL_NAMES)

    num_vars = sum(len(name_group) for name_group in MNL_NAMES.values())
    # BFGS started from zeros; just_point=True returns the optimiser result
    # instead of storing a full set of estimation results on the model
    mnl_model_sim_params = mnl_model_sim.fit_mle(np.zeros(num_vars),
                                                 method="BFGS",
                                                 just_point=True)
    # Only the estimated coefficient vector is supplied to predict();
    # the other three slots of the parameter list are left as None
    mnl_model_sim_param_list = [mnl_model_sim_params['x'], None, None, None]
    models_dictionary[number] = mnl_model_sim

    print('Estimating Causal Effects...')

    # Estimating Causal Effects
    # Create copies of long format data
    long_sim_data_naive = copy.deepcopy(long_sim_data)
    long_sim_data_causal = copy.deepcopy(long_sim_data)

    # Initial Probabilities
    init_mnl_model_sim_probs = mnl_model_sim.predict(long_sim_data,
                                                     param_list=mnl_model_sim_param_list)

    long_sim_data['init_mnl_model_sim_probs'] = init_mnl_model_sim_probs
    long_sim_data['init_mnl_model_probs'] = init_mnl_model_probs

    # Naive Probabilities: scale distance only, leave its descendants as is
    long_sim_data_naive['total_travel_distance'] = perturb * long_sim_data_naive['total_travel_distance']

    naive_probabilities = mnl_model_sim.predict(long_sim_data_naive,
                                                param_list=mnl_model_sim_param_list)

    long_sim_data_naive['naive_probabilities'] = naive_probabilities

    # Estimated Probabilities: scale distance, then propagate the change to
    # its descendants through the assumed causal graphs
    long_sim_data_causal['total_travel_distance'] = perturb * long_sim_data['total_travel_distance']

    # NOTE(review): 'cost_per_distance' is not recomputed after distance
    # (and, here, cost) change, in either the naive or the causal data -
    # confirm whether that is intended.
    for alt_id, child, parent, fitted_regs in _CAUSAL_REGRESSIONS:
        alt_rows = long_sim_data_causal[ALT_ID_COL] == alt_id
        long_sim_data_causal.loc[alt_rows, child] = lin_reg_pred(
            long_sim_data_causal.loc[alt_rows, parent],
            fitted_regs[child + '_on_' + parent],
            int(alt_rows.sum()))

    # Compute Estimated Probabilities
    estimated_probabilities = mnl_model_sim.predict(long_sim_data_causal,
                                                    param_list=mnl_model_sim_param_list)
    long_sim_data_causal['estimated_probabilities'] = estimated_probabilities

    # True Probabilities: the data-generating model applied to the
    # causally perturbed data
    true_probabilities = mnl_model.predict(long_sim_data_causal)
    long_sim_data_causal['true_probabilities'] = true_probabilities

    simulation_data[number] = {}
    simulation_data[number]['long_sim_data'] = long_sim_data
    simulation_data[number]['long_sim_data_causal'] = long_sim_data_causal
    simulation_data[number]['long_sim_data_naive'] = long_sim_data_naive

    print('Simulation number', number, 'is complete!')
    print('==========================================')
    print('==========================================')

### Causal Effect Estimation

In [None]:
# Average change in the predicted probabilities of the auto alternatives
# (mode ids 1-3) for every simulation run:
#   naive effect     - simulated-data model, distance-only perturbation
#   estimated effect - simulated-data model, causal-graph perturbation
#   true effect      - original model, causal-graph perturbation
auto_alt_ids = [1, 2, 3]
effect_rows = []

for number in sim_number:

    initial_data = simulation_data[number]['long_sim_data']
    naive_data = simulation_data[number]['long_sim_data_naive']
    causal_data = simulation_data[number]['long_sim_data_causal']

    initial_auto = initial_data['mode_id'].isin(auto_alt_ids)
    naive_auto = naive_data['mode_id'].isin(auto_alt_ids)
    causal_auto = causal_data['mode_id'].isin(auto_alt_ids)

    naive_effect = (naive_data.loc[naive_auto, 'naive_probabilities'] -
                    initial_data.loc[initial_auto, 'init_mnl_model_sim_probs'])
    estimated_effect = (causal_data.loc[causal_auto, 'estimated_probabilities'] -
                        initial_data.loc[initial_auto, 'init_mnl_model_sim_probs'])
    true_effect = (causal_data.loc[causal_auto, 'true_probabilities'] -
                   initial_data.loc[initial_auto, 'init_mnl_model_probs'])

    effect_rows.append({'true_effect': true_effect.mean(),
                        'estimated_effect': estimated_effect.mean(),
                        'naive_effect': naive_effect.mean()})

# DataFrame.append was removed in pandas 2.0, so the rows are collected first
# and added in one step.  Rows already in causal_effects are kept, as before.
new_effects = pd.DataFrame(effect_rows, columns=causal_effects.columns)
if causal_effects.empty:
    causal_effects = new_effects
else:
    causal_effects = pd.concat([causal_effects, new_effects],
                               ignore_index=True)

### Generating Plots

In [None]:
# Two histograms of the average causal effects across simulation runs: the
# true effect against the naive effect, then against the estimated effect.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so that the figures stay unchanged.
for figure_size, other_column, other_label, other_colour, figure_title in [
        ((20, 10), 'naive_effect', 'Naive Effect', '#DC3220',
         'True Effect vs. Naive Effect'),
        ((15, 5), 'estimated_effect', 'Estimated Effect', '#994F00',
         'True Effect vs. Estimated Effect')]:
    plt.figure(figsize=figure_size)
    sns.distplot(causal_effects['true_effect'], label='True Effect',
                 kde=False, color='#005AB5')
    sns.distplot(causal_effects[other_column], label=other_label,
                 kde=False, color=other_colour)
    plt.title(figure_title,
              fontdict={'fontsize': 14, 'fontweight': 'bold'})
    plt.ylabel('Frequency', rotation=90, labelpad=5,
               fontdict={'fontsize': 12, 'fontweight': 'bold'})
    plt.xlabel('Average Causal Effect',
               fontdict={'fontsize': 12, 'fontweight': 'bold'})
    plt.legend(prop={'size': 14})