# Project 3: Getting Started 

This notebook is intended to help you get off to a flying start with the cars dataset. You don't have to use this notebook and you can discard any parts you do not like, they are purely intended as a help to get started. 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set_theme()

# optimization
from scipy import optimize
import estimation as est
import clogit

import statsmodels.formula.api as smf

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datsets. 

In [2]:
cars = pd.read_csv('cars.csv')
lbl_vars = pd.read_csv('labels_variables.csv')
lbl_vals = pd.read_csv('labels_values.csv')

# convert from dataframe to dict
lbl_vals = {c: lbl_vals[c].dropna().to_dict() for c in lbl_vals.columns}

In [3]:
lbl_vars.set_index('variable', inplace=True)

## Overview of the dataset

In [4]:
lbl_vars.join(cars.mean(numeric_only=True).apply(lambda x: f'{x: .2f}').to_frame('Mean'))

Unnamed: 0_level_0,label,Mean
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
ye,year (=first dimension of panel),84.5
ma,market (=second dimension of panel),3.0
co,model code (=third dimension of panel),207.5
zcode,alternative model code (predecessors and succe...,177.76
brd,brand code,16.79
type,name of brand and model,
brand,name of brand,
model,name of model,
org,"origin code (demand side, country with which c...",2.72
loc,"location code (production side, country where ...",5.17


# Set up for analysis

In [5]:
# Pic the prince variable 
price_var = 'princ'

# new variable: log of price
cars['logp'] = np.log(cars[price_var])

# new variable: price elasticity heterogeneous for home-region 
cars['logp_x_home'] = cars[price_var] * cars['home']

### Dummy variables

For working with matrices, we want to have a column for each dummy variable. 

In [6]:
categorical_var = 'brand' # name of categorical variable
dummies = pd.get_dummies(cars[categorical_var]) # creates a matrix of dummies for each value of dummyvar
x_vars_dummies = list(dummies.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# add dummies to the dataframe 
assert dummies.columns[0] not in cars.columns, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
cars = pd.concat([cars, dummies], axis=1)

### `x_vars`: List of regressors to be used 

In [7]:
# 
x_vars = ['logp', 'home', 'cy', 'hp', 'we', 'li'] + x_vars_dummies # <--- !!! choose your preferred variables here 
print(f'K = {len(x_vars)} variables selected.')

# add price elasticity heterogeneous for home-region 
x_vars_new = x_vars + ['logp_x_home']

K = 38 variables selected.


In [8]:
K = len(x_vars)
N = cars.ma.nunique() * cars.ye.nunique()
J = 40 
x = cars[x_vars].values.reshape((N,J,K))
x_new = cars[x_vars_new].values.reshape((N,J,K+1))
# y = np.log(cars['s'].values.reshape((N,J)))
y = cars['s'].values.reshape((N,J))

## Rescaling parameters

### To prevent values optimizer issues

In [9]:
# ['logp', 'home', 'cy', 'hp', 'we', 'li'] 

# Now in cy/1000
cy_i = x_vars.index('cy')
x[:,:,cy_i] = x[:,:,cy_i]/1_000
x_new[:,:,cy_i] = x_new[:,:,cy_i]/1_000

# now in kW/1000
hp_i = x_vars.index('hp')
x[:,:,hp_i] = x[:,:,hp_i]/1_000
x_new[:,:,hp_i] = x_new[:,:,hp_i]/1_000

# Now in tons 
we_i = x_vars.index('we')
x[:,:,we_i] = x[:,:,we_i]/1_000
x_new[:,:,we_i] = x_new[:,:,we_i]/1_000

# now in kW/100
li_i = x_vars.index('li')
x[:,:,li_i] = x[:,:,li_i]/10
x_new[:,:,li_i] = x_new[:,:,li_i]/10

In [10]:
x = x.astype(float)
x_new = x_new.astype(float)

# Conditional Logit

## Estimate of x

In [11]:
# Starting values 
theta0 = clogit.starting_values(y, x)

In [12]:
# Find results
res = est.estimate(clogit.q, theta0, y, x, cov_type = 'Sandwich')

Optimization terminated successfully.
         Current function value: 3.473347
         Iterations: 322
         Function evaluations: 12675
         Gradient evaluations: 325


In [13]:
# Estimate the model, standard errors, and t-values
est.print_table(x_vars,res)

Optimizer succeded after 322 iter. (12675 func. evals.). Final criterion:    3.473.
Results


Unnamed: 0,theta,se,t
logp,-0.1627,0.1164,-1.398
home,1.3618,0.0322,42.2356
cy,-0.1281,0.086,-1.4904
hp,-14.2278,1.8644,-7.6313
we,0.5074,0.1796,2.8256
li,-0.3123,0.1729,-1.8058
MCC,-1.3423,0.1844,-7.2788
VW,0.185,0.0288,6.4156
alfa romeo,-0.7098,0.0608,-11.675
audi,-0.103,0.0429,-2.3984


## Estimate of x_new 

In [14]:
# Starting values 
theta0_new = clogit.starting_values(y,x_new)

In [15]:
# Find residuals
res_new = est.estimate(clogit.q, theta0_new, y, x_new, cov_type = 'Sandwich')

Optimization terminated successfully.
         Current function value: 3.473126
         Iterations: 334
         Function evaluations: 13440
         Gradient evaluations: 336


In [16]:
# Estimate the model, standard errors, and t-values
est.print_table(x_vars_new,res_new)

Optimizer succeded after 334 iter. (13440 func. evals.). Final criterion:    3.473.
Results


Unnamed: 0,theta,se,t
logp,-0.1264,0.1079,-1.1714
home,1.4803,0.0621,23.8302
cy,-0.1249,0.0793,-1.5759
hp,-13.4698,2.2573,-5.9672
we,0.5155,0.1557,3.31
li,-0.3174,0.1638,-1.937
MCC,-1.3348,0.125,-10.6779
VW,0.2028,0.0681,2.9798
alfa romeo,-0.6577,0.0737,-8.9215
audi,-0.0962,0.0674,-1.4268


# Elasticities


In [24]:
# Function to calculate elasticities
def elasticities(thetahat, x, inter):
    N, J, _ = x.shape
    ccp1 = clogit.choice_prob(thetahat, x)
    log_price = 0
    
    if inter:
        log_price = -1

    E_own = np.zeros((N, J))
    E_cross = np.zeros((N, J))

    for j in range(J):
        # A. copy 
        x2 = x.copy()

        # B. increase price just for car j 
        rel_change_x = 1e-3
        x2[:, j, log_price] += np.log(1.0 + rel_change_x)

        # C. evaluate CCPs
        ccp2 = clogit.choice_prob(thetahat, x2)

        # D. percentage change in CCPs 
        rel_change_y = ccp2 / ccp1 - 1.0 

        # E. elasticities 
        elasticity = rel_change_y / rel_change_x 

        E_own[:, j] = elasticity[:, j]

        k_not_j = [k for k in range(J) if k != j]
        E_cross[:, j] = elasticity[:, k_not_j].mean(axis=1)

    own_price_elasticity = np.mean(E_own).round(4)
    cross_price_elasticity = np.mean(E_cross).round(4)

    return own_price_elasticity, cross_price_elasticity

# Estimation of Parameters of Intrest

In [25]:
# Partiel effects at the average 
def PEA(x,thetahat,inter:bool):
    
    # Average car characteristics on market i
    avg_car_i = np.mean(x, axis = 1) 
    
    avg_car_home = avg_car_i.copy()
    avg_car_home[:,1] = 1

    avg_car_for = avg_car_i.copy()
    avg_car_for[:,1] = 0

    if inter:   # If interaction term is included the data point has to be corrected for the home value as well
        avg_car_home[:,-1] = avg_car_home[:,0]
        avg_car_for[:,-1] = 0

    N, J, K = x.shape

    x_new = np.zeros((N,J+1,K))
    x_new[:,:-1,:] = x.copy()
    x_new[:,-1,:] = avg_car_for
    ccp_for = clogit.choice_prob(thetahat,x_new)

    x_new[:,-1,:] = avg_car_home
    ccp_home = clogit.choice_prob(thetahat,x_new)

    bias = np.mean(ccp_home[:,-1]-ccp_for[:,-1])
    return bias

In [26]:
# Marginal willingness to pay
def MWP(thetahat):
    return abs(thetahat[1]/thetahat[0])

In [27]:
def properties(x, thetahat, cov, print_out:bool, se:bool, inter:bool):
    """ Aggregate function to calculate and collect all parameters of interest as well as standard errors
    """
    pea = PEA(x,thetahat,inter)
    mwp = MWP(thetahat)
    el_h, el_f = elasticities(thetahat, x, inter)
    
    # Calculate standard errors with delta method
    # if se:
    #     # home bias
    #     qq0 = lambda theta: PEA(x,theta,inter)
    #     g0 = est.centered_grad(qq0, thetahat)
    #     se_home = np.sqrt(g0@cov@g0.T/N)

    #     # marginal willingness to pay
    #     qq1 = lambda theta: MWP(theta)
    #     g1 = est.centered_grad(qq1,thetahat)
    #     se_mwp = np.sqrt(g1@cov@g1.T/N)

    #     # elasticities
    #     # Home 
    #     qq20 = lambda theta: el_h(theta,x,inter)
    #     g20 = est.centered_grad(qq20,thetahat)
    #     se_20 = np.sqrt(g20@cov@g20.T/N)
    #     # Foreign 
    #     qq21 = lambda theta: el_f(theta,x,inter)
    #     g21 = est.centered_grad(qq21,thetahat)
    #     se_21 = np.sqrt(g21@cov@g21.T/N)
    #     # Difference
    #     # qq22 = lambda theta: el_d(theta,x,inter)
    #     # g22 = est.centered_grad(qq22,thetahat)
    #     # se_22 = np.sqrt(g22@cov@g22.T/N)

    data = np.concatenate(np.column_stack((pea,mwp,el_h,el_f)),axis = 0)
        # data = np.concatenate((np.column_stack((pea,mwp,el_h[0],el_f[0],el_d[0])),np.column_stack((se_home,se_mwp,se_20,se_21,se_22))),axis = 0)

    if print_out:
        # df = pd.DataFrame(data = data.T,index = ['PEA','MWP','Own price elasiticity','Cross price elasticity'],columns = ['Estimate','se'])
        df = pd.DataFrame(data = data.T,index = ['PEA','MWP','Own price elasiticity','Cross price elasticity'],columns = ['Estimate'])
        df = df.round(4)
        return df
    else:
        return data

In [28]:
properties(x,res['theta'],res['cov'],print_out = True,se=True,inter=False)

Unnamed: 0,Estimate
PEA,0.0372
MWP,8.3696
Own price elasiticity,-0.1585
Cross price elasticity,0.0041


In [29]:
properties(x_new,res_new['theta'],res_new['cov'],print_out = True,se=True,inter=True)

Unnamed: 0,Estimate
PEA,0.047
MWP,11.7134
Own price elasiticity,-0.166
Cross price elasticity,0.0043
