# Project 3: Getting Started 

This notebook is intended to help you get off to a flying start with the cars dataset. You don't have to use this notebook and you can discard any parts you do not like, they are purely intended as a help to get started. 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import scipy.stats 
sns.set_theme()

# optimization
from scipy import optimize
import estimation as est
import clogit
import functions as fn

import statsmodels.formula.api as smf

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datsets. 

In [2]:
cars = pd.read_csv('cars.csv')
lbl_vars = pd.read_csv('labels_variables.csv')
lbl_vals = pd.read_csv('labels_values.csv')

# convert from dataframe to dict
lbl_vals = {c: lbl_vals[c].dropna().to_dict() for c in lbl_vals.columns}

In [3]:
lbl_vars.set_index('variable', inplace=True)

## Overview of the dataset

In [4]:
lbl_vars.join(cars.mean(numeric_only=True).apply(lambda x: f'{x: .2f}').to_frame('Mean'))

Unnamed: 0_level_0,label,Mean
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
ye,year (=first dimension of panel),84.5
ma,market (=second dimension of panel),3.0
co,model code (=third dimension of panel),207.5
zcode,alternative model code (predecessors and succe...,177.76
brd,brand code,16.79
type,name of brand and model,
brand,name of brand,
model,name of model,
org,"origin code (demand side, country with which c...",2.72
loc,"location code (production side, country where ...",5.17


# Set up for analysis

In [5]:
# a. pic the prince variable 
price_var = 'princ'

# b. new variable: log of price
cars['logp'] = np.log(cars[price_var])

# c. new variable: price elasticity heterogeneous for home-region 
cars['logp_x_home'] = cars[price_var] * cars['home']

### Dummy variables

For working with matrices, we want to have a column for each dummy variable. 

In [6]:
categorical_var = 'brand' # name of categorical variable
dummies = pd.get_dummies(cars[categorical_var]) # creates a matrix of dummies for each value of dummyvar
x_vars_dummies = list(dummies.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# add dummies to the dataframe 
assert dummies.columns[0] not in cars.columns, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
# cars = pd.concat([cars, dummies], axis=1)


### `x_vars`: List of regressors to be used 

In [7]:
# 
x_vars = ['logp', 'home', 'he', 'cy', 'hp', 'we', 'li'] #+ x_vars_dummies # <--- !!! choose your preferred variables here 
# x_vars = ['logp', 'home', 'he', 'sp', 'cy', 'hp', 'we', 'li'] #+ x_vars_dummies # <--- !!! choose your preferred variables here 
print(f'K = {len(x_vars)} variables selected.')

# add price elasticity heterogeneous for home-region 
x_vars_new = x_vars + ['logp_x_home']
x_vars_new_new = x_vars_new[2:]

K = 7 variables selected.


In [8]:
# a. set up the data for the estimation
K = len(x_vars)
N = cars.ma.nunique() * cars.ye.nunique()
J = 40 
x = cars[x_vars].values.reshape((N,J,K)).astype(np.float64)
# b. define new x with the interaction term
x_new = cars[x_vars_new].values.reshape((N,J,K+1)).astype(np.float64)
x_new_new = cars[x_vars_new_new].values.reshape((N,J,K-1)).astype(np.float64)
# c. define y
y = cars['s'].values.reshape((N,J))

In [9]:
if 'sp' in x_vars: 
    sp_i = x_vars.index('sp')
    y = y[~np.isnan(x[:, :, sp_i]).any(axis=1)]
    x = x[~np.isnan(x[:, :, sp_i]).any(axis=1)]
    x_new = x_new[~np.isnan(x_new[:, :, sp_i]).any(axis=1)]
    if 'logp' not in x_vars_new_new:
        x_new_new = x_new_new[~np.isnan(x_new_new[:, :, sp_i]).any(axis=1)]
        y = y[~np.isnan(x_new_new[:, :, sp_i]).any(axis=1)]

    N = len(y)


# Conditional Logit

## Estimate of x

In [10]:
# Starting values 
theta0 = clogit.starting_values(y, x)

In [11]:
# Find results
res = est.estimate(clogit.q, theta0, y, x, cov_type = 'Sandwich')

Optimization terminated successfully.
         Current function value: 3.501058
         Iterations: 26
         Function evaluations: 280
         Gradient evaluations: 35


In [12]:
# Estimate the model, standard errors, and t-values
est.print_table(x_vars,res)

Optimizer succeded after 26 iter. (280 func. evals.). Final criterion:    3.501.
Results


Unnamed: 0,theta,se,t
logp,0.2455,0.1143,2.1466
home,1.2962,0.0318,40.7915
he,-0.0077,0.0026,-2.9959
cy,0.0002,0.0001,2.7433
hp,-0.0285,0.002,-14.2986
we,0.0005,0.0001,3.4855
li,-0.0172,0.0152,-1.1315


## Estimate of x_new 

In [13]:
# Starting values 
theta0_new = clogit.starting_values(y,x_new)

In [14]:
# Find residuals
res_new = est.estimate(clogit.q, theta0_new, y, x_new, cov_type = 'Sandwich')

Optimization terminated successfully.
         Current function value: 3.498881
         Iterations: 30
         Function evaluations: 324
         Gradient evaluations: 36


In [15]:
# Estimate the model, standard errors, and t-values
est.print_table(x_vars_new,res_new)

Optimizer succeded after 30 iter. (324 func. evals.). Final criterion:    3.499.
Results


Unnamed: 0,theta,se,t
logp,0.2883,0.107,2.6949
home,1.6563,0.0559,29.6076
he,-0.007,0.0026,-2.6893
cy,0.0002,0.0001,2.7935
hp,-0.0254,0.0021,-12.048
we,0.0005,0.0001,3.616
li,-0.0182,0.0157,-1.1584
logp_x_home,-0.4965,0.0572,-8.6825


## X_new_new

In [16]:
theta0_new_new = clogit.starting_values(y,x_new_new)

In [17]:
# Find results
res_new_new = est.estimate(clogit.q, theta0_new_new, y, x_new_new, cov_type = 'Sandwich')

Optimization terminated successfully.
         Current function value: 3.539787
         Iterations: 17
         Function evaluations: 161
         Gradient evaluations: 23


In [18]:
# Estimate the model, standard errors, and t-values
est.print_table(x_vars_new_new,res_new_new)

Optimizer succeded after 17 iter. (161 func. evals.). Final criterion:     3.54.
Results


Unnamed: 0,theta,se,t
he,-0.0114,0.0028,-4.1035
cy,0.0003,0.0001,2.4938
hp,-0.0374,0.0019,-20.023
we,0.0007,0.0001,5.2033
li,-0.016,0.0187,-0.8586
logp_x_home,1.3943,0.0455,30.6304


In [38]:
def APE(x, y, res, inter: bool):

    N, J, K = x.shape

    car_home = x.copy()
    car_home[:,:,1] = 1


    x_new = np.zeros((N, J+1, K))
    x_new[:, :-1, :] = x.copy()
    x_new[:, -1, 1] = 0

    theta_new = clogit.starting_values(y,x_new)
    res_new = est.estimate(clogit.q, theta_new, y, x_new, cov_type = 'Sandwich')

    ccp_home = clogit.choice_prob(res_new['theta'], x_new)


    # x_new = x.copy()

    # for i in range(N):
    #     for j in range(J):
    #         x_new[i,j,1] = 0

    # a. compute the APE

    # car_home = x.copy()
    # car_home[:,:,1] = 1

    # x_new = np.zeros((N, J, K))
    # x_new[:, :, :] = x.copy()
    # x_new[:, :, :] = car_home

    # theta_new = clogit.starting_values(y,x_new)
    # res_new = est.estimate(clogit.q, theta_new, y, x_new, cov_type = 'Sandwich')

    # ccp_home = clogit.choice_prob(res_new['theta'], x_new)

    # x_new[:, :, :] = car_foreign

    # theta = clogit.starting_values(y,x_new)
    # res = est.estimate(clogit.q, theta, y, x_new, cov_type = 'Sandwich')

    ccp_foreign = clogit.choice_prob(res['theta'], x)

    pe = ccp_home[:,-1,:] - ccp_foreign[:,-1,:]

    ape = np.mean(pe) 

    return ape

In [39]:
APE(x, y, res, inter = False)

ValueError: operands could not be broadcast together with shapes (150,40) (150,41) 

# Estimation of Parameters of Intrest

In [19]:
# Without interaction term
fn.result(x,res['theta'],res['cov'],print_out = True,se=True,inter=False,N=N,x_vars=x_vars)

Unnamed: 0,Estimate,se,CI low,CI high,p-value
PEA,0.0355,0.001,0.0335,0.0375,0.0
MWP,5.2807,2.5045,0.3719,10.1895,0.0372
OPE,0.2392,0.1115,0.0207,0.4577,0.034
CPE,-0.0061,0.0029,-0.0117,-0.0005,0.035


In [20]:
# With interaction term
fn.result(x_new,res_new['theta'],res_new['cov'],print_out = True,se=True,inter=True, N=N, x_vars=x_vars_new)

Unnamed: 0,Estimate,se,CI low,CI high,p-value
PEA,0.0693,0.0053,0.0589,0.0797,0.0
MWP,5.7443,2.2102,1.4123,10.0763,0.0106
OPE,-0.6337,0.073,-0.7767,-0.4907,0.0
CPE,0.0193,0.0022,0.0149,0.0237,0.0


In [21]:
# With interaction term
fn.result(x_new_new,res_new_new['theta'],res_new_new['cov'],print_out = True,se=True,inter=True, N=N, x_vars=x_vars_new_new)

Unnamed: 0,Estimate,se,CI low,CI high,p-value
PEA,0.9894,0.0015,0.9864,0.9924,0.0
MWP,0.0236,0.0113,0.0015,0.0458,0.039
OPE,2.7212,0.089,2.5468,2.8956,0.0
CPE,-0.0698,0.0023,-0.0743,-0.0653,0.0
