# Project 3: Getting Started 

This notebook is intended to help you get off to a flying start with the cars dataset. You don't have to use this notebook and you can discard any parts you do not like, they are purely intended as a help to get started. 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set_theme()

# optimization
from scipy import optimize
import estimation as est
import clogit

import statsmodels.formula.api as smf

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datsets. 

In [2]:
cars = pd.read_csv('cars.csv')
lbl_vars = pd.read_csv('labels_variables.csv')
lbl_vals = pd.read_csv('labels_values.csv')

# convert from dataframe to dict
lbl_vals = {c: lbl_vals[c].dropna().to_dict() for c in lbl_vals.columns}

In [3]:
lbl_vars.set_index('variable', inplace=True)

## Overview of the dataset

In [4]:
lbl_vars.join(cars.mean(numeric_only=True).apply(lambda x: f'{x: .2f}').to_frame('Mean'))

Unnamed: 0_level_0,label,Mean
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
ye,year (=first dimension of panel),84.5
ma,market (=second dimension of panel),3.0
co,model code (=third dimension of panel),207.5
zcode,alternative model code (predecessors and succe...,177.76
brd,brand code,16.79
type,name of brand and model,
brand,name of brand,
model,name of model,
org,"origin code (demand side, country with which c...",2.72
loc,"location code (production side, country where ...",5.17


# Set up for analysis

In [5]:
# Pic the prince variable 
price_var = 'princ'

# new variable: log of price
cars['logp'] = np.log(cars[price_var])

# new variable: price elasticity heterogeneous for home-region 
cars['logp_x_home'] = cars[price_var] * cars['home']

### Dummy variables

For working with matrices, we want to have a column for each dummy variable. 

In [6]:
categorical_var = 'brand' # name of categorical variable
dummies = pd.get_dummies(cars[categorical_var]) # creates a matrix of dummies for each value of dummyvar
x_vars_dummies = list(dummies.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# add dummies to the dataframe 
assert dummies.columns[0] not in cars.columns, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
cars = pd.concat([cars, dummies], axis=1)

### `x_vars`: List of regressors to be used 

In [7]:
x_vars = ['logp', 'home', 'cy', 'hp', 'we', 'li'] + x_vars_dummies # <--- !!! choose your preferred variables here 
print(f'K = {len(x_vars)} variables selected.')

# add price elasticity heterogeneous for home-region 
x_vars_new = x_vars + ['logp_x_home']

K = 38 variables selected.


In [8]:
K = len(x_vars)
N = cars.ma.nunique() * cars.ye.nunique()
J = 40 
x = cars[x_vars].values.reshape((N,J,K))
x_new = cars[x_vars_new].values.reshape((N,J,K+1))
y = np.log(cars['s'].values.reshape((N,J)))

# standardize x
# x = ((x - x.mean(0).mean(0))/(x.std(0).std(0)))

In [9]:
# x[x == False] = 0.0
# x[x == True] = 1.0

## Rescaling parameters

### To prevent values optimizer issues

In [10]:
# ['logp', 'home', 'cy', 'hp', 'we', 'li'] 

# Now in cy/1000
cy_i = x_vars.index('cy')
x[:,:,cy_i] = x[:,:,cy_i]/10_000
x_new[:,:,cy_i] = x_new[:,:,cy_i]/10_000

# now in kW/1000
hp_i = x_vars.index('hp')
x[:,:,hp_i] = x[:,:,hp_i].astype(float)
x_new[:,:,hp_i] = x_new[:,:,hp_i].astype(float)

# Now in tons 
we_i = x_vars.index('we')
x[:,:,we_i] = x[:,:,we_i]/10_000
x_new[:,:,we_i] = x_new[:,:,we_i]/10_000

# now in kW/100
li_i = x_vars.index('li')
x[:,:,li_i] = x[:,:,li_i]/100
x_new[:,:,li_i] = x_new[:,:,li_i]/100

In [11]:
x = x.astype(float)


In [19]:
x_new = x_new.astype(float)

# Conditional Logit

## Estimate of x

In [12]:
# Starting values 
theta0 = clogit.starting_values(y, x)

In [13]:
# Find results
res = est.estimate(clogit.q, theta0, np.exp(y), x)

Optimization terminated successfully.
         Current function value: 3.473342
         Iterations: 290
         Function evaluations: 11427
         Gradient evaluations: 293


In [14]:
# Estimate the model, standard errors, and t-values
est.print_table(x_vars,res)

Optimizer succeded after 290 iter. (11427 func. evals.). Final criterion:    3.473.
Results


Unnamed: 0,theta,se,t
logp,-0.1584,11.0771,-0.0143
home,1.3619,2.631,0.5176
cy,-1.3369,91.3883,-0.0146
hp,-0.0142,0.2144,-0.0663
we,5.0702,170.0685,0.0298
li,-3.1062,149.0195,-0.0208
MCC,-1.3345,860.0832,-0.0016
VW,0.1832,12.7026,0.0144
alfa romeo,-0.7125,18.9185,-0.0377
audi,-0.1051,18.3227,-0.0057


## Estimate of x_new 

In [20]:
# Starting values 
theta0_new = clogit.starting_values(y,x_new)

In [23]:
# Find residuals
res_new = est.estimate(clogit.q, theta0_new, np.exp(y), x_new)

Optimization terminated successfully.
         Current function value: 3.473121
         Iterations: 297
         Function evaluations: 12000
         Gradient evaluations: 300


In [24]:
# Estimate the model, standard errors, and t-values
est.print_table(x_vars_new,res_new)

Optimizer succeded after 297 iter. (12000 func. evals.). Final criterion:    3.473.
Results


Unnamed: 0,theta,se,t
logp,-0.1234,11.5426,-0.0107
home,1.48,7.1957,0.2057
cy,-1.3145,92.9873,-0.0141
hp,-0.0134,0.222,-0.0603
we,5.1411,179.3579,0.0287
li,-3.1472,149.8047,-0.021
MCC,-1.3087,746.9562,-0.0018
VW,0.2037,13.3697,0.0152
alfa romeo,-0.6588,19.3619,-0.034
audi,-0.0956,18.8295,-0.0051


# Elastricities

In [31]:
thetahat = res['theta']

In [32]:
# Original choice probabilites
ccp1 = clogit.choice_prob(thetahat, x)

In [33]:
E_own   = np.zeros((N, J))
E_cross = np.zeros((N, J))
log_price = 0 

for j in range(J):
    # A. copy 
    x2 = x.copy()
    
    # B. increase price just for car j 
    rel_change_x = 1e-3
    x2[:, j, log_price] *= (1.0+rel_change_x)
    
    # C. evaluate CCPs
    ccp2 = clogit.choice_prob(thetahat, x2)
    
    # D. percentage change in CCPs 
    rel_change_y = ccp2 / ccp1 - 1.0 
    
    # E. elasticities 
    elasticity = rel_change_y / rel_change_x 
    
    E_own[:, j] = elasticity[:, j]
    
    k_not_j = [k for k in range(J) if k != j]
    E_cross[:, j] = elasticity[:, k_not_j].mean(axis=1)

In [37]:
print(f'Own-price elasticity:  {np.mean(E_own).round(4)}')
print(f'Cross-price elasticity: {np.mean(E_cross).round(4)}')
print(f'Difference in price elasticity: {np.mean(E_own).round(4) - np.mean(E_cross).round(4)}')

Own-price elasticity:  0.0555
Cross-price elasticity: -0.0017
Difference in price elasticity: 0.0572


# Estimation of Parameters of Intrest

In [35]:
# Partiel effects at the average 
def PEA(x,thetahat,inter:bool):
    
    # Average car characteristics on market i
    avg_car_i = np.mean(x, axis = 1) 
    
    avg_car_home = avg_car_i.copy()
    avg_car_home[:,1] = 1

    avg_car_for = avg_car_i.copy()
    avg_car_for[:,1] = 0

    if inter:   # If interaction term is included the data point has to be corrected for the home value as well
        avg_car_home[:,-1] = avg_car_home[:,0]
        avg_car_for[:,-1] = 0

    # For each market introduce the average car and calculate market share
    # Note: Introducing both the average car from both home and foreign 
    # in the same market will not yield significantly different results
    N, J, K = x.shape

    x_new = np.zeros((N,J+1,K))
    x_new[:,:-1,:] = x.copy()
    x_new[:,-1,:] = avg_car_for
    ccp_for = clogit.choice_prob(thetahat,x_new)

    x_new[:,-1,:] = avg_car_home
    ccp_home = clogit.choice_prob(thetahat,x_new)

    bias = np.mean(ccp_home[:,-1]-ccp_for[:,-1])
    return bias

In [None]:
# Marginal willingness to pay
def MWP(thetahat):
    return abs(thetahat[1]/thetahat[0])

In [None]:
# Elasticities
def elas_home(x,thetahat,inter):
    beta = thetahat[0]
    if inter:
        beta += thetahat[-1]
    E_own = (1-clogit.choice_prob(thetahat,x))*beta
    elas_h = np.mean(E_own[x[:,:,1]==1])
    return elas_h

def elas_for(x,thetahat,inter):
    beta = thetahat[0]
    E_own = (1-clogit.choice_prob(thetahat,x))*beta
    elas_f = np.mean(E_own[x[:,:,1]!=1])
    return elas_f
    
def elas_diff(x,thetahat,inter):
    beta = thetahat[0]
    if inter:
        beta += thetahat[-1]
    E_own_f = (1-clogit.choice_prob(thetahat,x))*thetahat[0]
    elas_f = np.mean(E_own_f[x[:,:,1]!=1])
    E_own_h = (1-clogit.choice_prob(thetahat,x))*beta
    elas_h = np.mean(E_own_h[x[:,:,1]==1])
    return abs(elas_f-elas_h)

def elas(x, thetahat,inter):
    elas_h = elas_home(x,thetahat,inter)
    elas_f = elas_for(x,thetahat,inter)
    elas_d = elas_diff(x,thetahat,inter)
    return elas_h, elas_f, elas_d

In [None]:
def properties(x, thetahat, cov, print_out:bool, se:bool, inter:bool):
    """ Aggregate function to calculate and collect all parameters of interest as well as standard errors
    """
    pea = PEA(x,thetahat,inter)
    mwp = MWP(thetahat)
    el_h, el_f, el_d = elas(x, thetahat,inter)

    # Calculate standard errors with delta method
    if se:
        # home bias
        qq0 = lambda theta: PEA(x,theta,inter)
        g0 = est.centered_grad(qq0, thetahat)
        se_home = np.sqrt(g0@cov@g0.T/N)

        # marginal willingness to pay
        qq1 = lambda theta: MWP(theta)
        g1 = est.centered_grad(qq1,thetahat)
        se_mwp = np.sqrt(g1@cov@g1.T/N)

        # elasticities
        # Home 
        qq20 = lambda theta: elas_home(x,theta,inter)
        g20 = est.centered_grad(qq20,thetahat)
        se_20 = np.sqrt(g20@cov@g20.T/N)
        # Foreign 
        qq21 = lambda theta: elas_for(x,theta,inter)
        g21 = est.centered_grad(qq21,thetahat)
        se_21 = np.sqrt(g21@cov@g21.T/N)
        # Difference
        qq22 = lambda theta: elas_diff(x,theta,inter)
        g22 = est.centered_grad(qq22,thetahat)
        se_22 = np.sqrt(g22@cov@g22.T/N)

        data = np.concatenate((np.column_stack((pea,mwp,el_h,el_f,el_d)),np.column_stack((se_home,se_mwp,se_20,se_21,se_22))),axis = 0)

    if print_out:
        df = pd.DataFrame(data = data.T,index = ['PEA','MWP','Own price elasiticity (Home)','Own price elasticity (Foreign)','Own price elasitciity (diff)'],columns = ['Estimate','se'])
        df = df.round(4)
        return df
    else:
        return data

In [None]:
properties(x,res['theta'],res['cov'],print_out = True,se=True,inter=False)

In [None]:
properties(x_new,res_new['theta'],res_new['cov'],print_out = True,se=True,inter=True)