In [610]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set_theme()

# optimization
from scipy import optimize

# For quick OLS
#import statsmodels.formula.api as smf

# Import our toolbox
import clogit as clogit
import mlogit_ante as mlogit
import estimation as est

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datsets. 

In [611]:
cars = pd.read_csv('cars.csv')
lbl_vars = pd.read_csv('labels_variables.csv')
lbl_vals = pd.read_csv('labels_values.csv')

# convert from dataframe to dict
lbl_vals = {c: lbl_vals[c].dropna().to_dict() for c in lbl_vals.columns}

In [612]:
lbl_vars.set_index('variable', inplace=True)

## Overview of the dataset

In [613]:
lbl_vars.join(cars.mean(numeric_only=True).apply(lambda x: f'{x: .2f}').to_frame('Mean'))

Unnamed: 0_level_0,label,Mean
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
ye,year (=first dimension of panel),84.5
ma,market (=second dimension of panel),3.0
co,model code (=third dimension of panel),207.5
zcode,alternative model code (predecessors and succe...,177.76
brd,brand code,16.79
type,name of brand and model,
brand,name of brand,
model,name of model,
org,"origin code (demand side, country with which c...",2.72
loc,"location code (production side, country where ...",5.17


In [614]:
temp = lbl_vars.join(cars.mean(numeric_only=True).apply(lambda x: f'{x: .2f}').to_frame('Mean'))
temp['label']['princ']

'=pr/(ngdp/pop): price relative to per capita income (often used in demand model)'

# Set up for analysis

In [615]:
price_var = 'princ'

In [616]:
cars['logp'] = np.log(cars[price_var])

In [617]:
# new variable: price elasticity heterogeneous for home-region 
cars['logp_x_home'] = cars[price_var] * cars['home']

### Dummy variables

For working with matrices, we want to have a column for each dummy variable. 

In [618]:
categorical_var = 'brand' # name of categorical variable
dummies = pd.get_dummies(cars[categorical_var]) # creates a matrix of dummies for each value of dummyvar
x_vars_dummies = list(dummies.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# add dummies to the dataframe 
assert dummies.columns[0] not in cars.columns, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
cars = pd.concat([cars,dummies], axis=1)

In [619]:
# NB! Let's take a look at the reference category
dummies.columns[0] #This is BMW
# Something might be going on 

'BMW'

In [620]:
# Dropping columns
cars = cars.dropna(axis=1)

### `x_vars`: List of regressors to be used 

In [621]:
x_vars = ['logp', 'home', 'logp_x_home', 'cy', 'hp', 'we', 'li', 'he'] + x_vars_dummies
# If we include dummies for the brand, the 3-D matrix x becomes singular. Why?
# 
print(f'K = {len(x_vars)} variables selected.')

K = 40 variables selected.


In [622]:
cars['brand'].nunique() #Different car brands

33

In [623]:
cars['co'].nunique() #Different car models

285

In [624]:
K = len(x_vars) #The "household" characteristics
N = cars.ma.nunique() * cars.ye.nunique() #The market-year 'i'
J = 40 #The 40 different cars
x = cars[x_vars].values.reshape((N,J,K))

In [625]:
x[0, 0, 1]

0.0

In [626]:
y = cars['s'].values #The market shares 
y = y.reshape((N,J)) #The market shares in market-year N for J=40 different cars

In [627]:
# Let's look at a particular 'market-year' to see what is going on

test = cars[(cars['ma']==1) & (cars['ye']==70)] 


# Interpretation: Each 'market-year', the top 40 most selling cars make up about 99.99% of all cars sold.
    # Each market-year share column (called "s" in the data) sums to one


In [628]:
test

Unnamed: 0,ye,ma,co,zcode,brd,type,brand,model,org,loc,...,seat,skoda,suzuki,tal/hillman,tal/matra,tal/simca,tal/sunb,talbot,toyota,volvo
0,70,1,15,14,2,audi 100/200,audi,100/200,2,4,...,0,0,0,0,0,0,0,0,0,0
1,70,1,26,35,4,citroen 2 CV 6 - 2 CV 4,citroen,2CV6,1,3,...,0,0,0,0,0,0,0,0,0,0
2,70,1,36,36,4,citroen dyane,citroen,dyane,1,3,...,0,0,0,0,0,0,0,0,0,0
3,70,1,64,67,7,fiat 128,fiat,128,3,5,...,0,0,0,0,0,0,0,0,0,0
4,70,1,71,80,8,ford escort,ford,escort,2,4,...,0,0,0,0,0,0,0,0,0,0
5,70,1,134,159,16,mercedes 200-300,mercedes,200,2,4,...,0,0,0,0,0,0,0,0,0,0
6,70,1,165,197,19,opel kadett,opel,kadet,2,4,...,0,0,0,0,0,0,0,0,0,0
7,70,1,172,194,19,opel rekord,opel,record,2,4,...,0,0,0,0,0,0,0,0,0,0
8,70,1,186,202,20,peugeot 504,peugeot,504,1,3,...,0,0,0,0,0,0,0,0,0,0
9,70,1,187,207,20,peugeot 304,peugeot,304,1,3,...,0,0,0,0,0,0,0,0,0,0


In [629]:
test['s'].sum() #The share of total sales for the top 40 most cars sum to one

0.9999999999999979

$$
u_{i j h}=\mathbf{x}_{i j} \boldsymbol{\beta}_o+\varepsilon_{i j h}, \quad j=1, \ldots, J
$$

where: 

- $i$ is the $\textit{country-year}$ pair
- $j$ is the alternative car
- $h$ is the household

First off: Are we: 

    1. interested in the marginal utility of a car's characteristic (conditional logit) or 
    
    2. the change in utility of car $j$ relative to car 1 given a change in household characteristics?

In this assignment, we are examning home bias - that is the propensity to choose a car manufactured in the home country. We are therefore interested in 1) and will use a conditional logit model.

In [630]:
x.shape

(150, 40, 40)

In [631]:
# The starting values to be passed to our optimizer
theta_start = clogit.starting_values(y, x)
theta_start.shape

(40,)

In [632]:
u = clogit.util(theta_start, x)
u.shape

(150, 40)

In [633]:
# Our conditional choice probabilities
# For coefficients ('theta') starting at zero, these must be equal to zero
# Intuition: No utility is gained by any car characteristics, thus market shares must be equal. Let's check this.
ccp = clogit.choice_prob(theta_start, x)
(ccp == 1/J) # all choice probs are equal to each other.

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [634]:
ccp

array([[0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025],
       [0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025],
       [0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025],
       ...,
       [0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025],
       [0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025],
       [0.025, 0.025, 0.025, ..., 0.025, 0.025, 0.025]])

In [635]:
# In the first market-year, what are the choice-probs? (Given individuals place no weight on any car characteristics)
ccp.sum(axis=1) #Summing over the columns J equal one. Goods

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [636]:
print(ccp.shape)
np.unique(ccp) #Checking if we have any NaN

(150, 40)


array([0.025])

In [637]:
temp_df = pd.DataFrame(x[50]) #Looking at the 50th market-year
temp_df.columns=x_vars
temp_df.head()

Unnamed: 0,logp,home,logp_x_home,cy,hp,we,li,he,MCC,VW,...,seat,skoda,suzuki,tal/hillman,tal/matra,tal/simca,tal/sunb,talbot,toyota,volvo
0,-0.43865,0.0,0.0,1300.0,63.0,940.0,7.8,135.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.226922,0.0,0.0,1800.0,66.0,1050.0,7.6,139.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.143563,0.0,0.0,1600.0,75.0,1080.0,8.5,138.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.407682,0.0,0.0,2000.0,110.0,1425.0,9.8,141.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.891653,1.0,0.409978,1000.0,33.0,640.0,5.0,135.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [638]:
res = est.estimate(clogit.q2, theta_start, y, x)

  ll_i = np.sum(y*np.log(ccp),axis=1, keepdims=False) #Conducting element-wise multiplication and summing over columns (car alternatives)
  df = fun(x) - f0
  ll_i = np.sum(y*np.log(ccp),axis=1, keepdims=False) #Conducting element-wise multiplication and summing over columns (car alternatives)


Optimization terminated successfully.
         Current function value: 3.472063
         Iterations: 307
         Function evaluations: 12883
         Gradient evaluations: 314


In [639]:
res['theta']

array([-1.69073894e-01,  1.45987521e+00, -1.40314657e-01, -1.19939499e-04,
       -1.39493298e-02,  6.60411771e-04, -3.72314564e-02, -1.13843288e-02,
       -1.20737170e+00,  2.11808466e-01, -6.74183045e-01, -1.05530076e-01,
       -4.32039677e-01, -4.38987999e-01, -7.11760477e-01, -9.15691153e-02,
        5.75311014e-02, -2.60254958e-01, -6.62326934e-01, -1.31041674e+00,
       -8.85436401e-01, -3.41803419e-01,  5.06043834e-01, -4.32003932e-01,
       -1.29140069e-01,  1.22592059e-02, -4.17758806e-02,  5.36114041e-02,
       -3.67581760e-01, -6.89864687e-01, -6.85071878e-01, -7.92354337e-01,
       -1.07853555e+00, -1.23367151e+00, -2.46205693e+00, -3.94831094e-01,
       -3.95032854e+00, -3.78729311e-01, -2.43115850e-02, -1.46205456e-02])

In [640]:
res['se'].shape

(40,)

In [641]:
tab_1 = pd.DataFrame(res['theta'], index=x_vars)
tab_1

Unnamed: 0,0
logp,-0.169074
home,1.459875
logp_x_home,-0.140315
cy,-0.00012
hp,-0.013949
we,0.00066
li,-0.037231
he,-0.011384
MCC,-1.207372
VW,0.211808


In [642]:
theta_hat = res['theta']

ccp_res = clogit.choice_prob(theta_hat, x)

In [649]:
own_price_elas=(1-ccp_res)*res['theta'][0]

In [650]:
own_price_elas.mean()

-0.164847046684782

In [558]:
theta_hat

array([-1.28710841e+00,  1.35287385e+00, -6.03408430e-05, -6.76435466e-03,
        1.04151445e-03, -4.10670431e-02, -9.94273106e-03, -1.19005749e+00,
        1.44340155e-01, -7.79484838e-01, -1.05946374e-01, -5.26993725e-01,
       -7.09021756e-01, -8.09258828e-01, -2.41322931e-01, -7.25280122e-02,
       -4.15710773e-01, -9.42957188e-01, -1.52654608e+00, -9.56835817e-01,
       -5.31458366e-01,  6.77207474e-01, -6.02592126e-01, -2.98415505e-01,
       -1.10808279e-01, -1.62827634e-01, -6.30631670e-02, -5.08875081e-01,
       -6.53225221e-01, -8.62164551e-01, -1.06878249e+00, -1.18967911e+00,
       -1.43544032e+00, -2.33444394e+00, -5.90915138e-01, -4.01407356e+00,
       -5.76277338e-01, -2.10913823e-01, -5.78961811e-02])

In [559]:
ccp_res.shape

(150, 40)

In [560]:
theta_hat.shape

(39,)

In [561]:
(ccp_res*(1-ccp_res)*theta_hat[0]).mean()

-0.030965891511816664

In [562]:
market_share = cars['s'].values.reshape((N,J))
market_share[0] # For the first market-year, what are the observed market shares of the 40 different cars?

array([0.01129646, 0.01464355, 0.02803195, 0.03263421, 0.03974679,
       0.02786459, 0.04393066, 0.0460226 , 0.02091936, 0.01025049,
       0.03598131, 0.02991469, 0.03347098, 0.01861823, 0.03075147,
       0.03911921, 0.05857422, 0.01255162, 0.00920452, 0.00878613,
       0.00920452, 0.01548033, 0.01945501, 0.03127445, 0.01799065,
       0.01380678, 0.00962291, 0.03012388, 0.03179743, 0.01045968,
       0.01108726, 0.02259291, 0.02322049, 0.02510324, 0.02656759,
       0.05899261, 0.01589872, 0.02091936, 0.03233297, 0.02175614])

In [563]:
ccp_res[0] # For the first market-year, what are the estimated market shares of the 40 different cars given our control variables?

array([0.01732509, 0.02253426, 0.02327225, 0.02574827, 0.03758245,
       0.02341019, 0.03041833, 0.02699023, 0.01625888, 0.02042037,
       0.03822229, 0.02575881, 0.03424574, 0.02923568, 0.02339779,
       0.02416328, 0.04340487, 0.02458078, 0.01825473, 0.01835161,
       0.01410599, 0.02223997, 0.01272561, 0.01926162, 0.02370462,
       0.02059064, 0.01649524, 0.03393284, 0.03343699, 0.01831916,
       0.02185908, 0.03308168, 0.02730683, 0.02019078, 0.0368947 ,
       0.02384392, 0.01954934, 0.03703371, 0.01584704, 0.02600437])

In [566]:
E_own   = np.zeros((N, J))
E_cross = np.zeros((N, J))
dpdx    = np.zeros((N, J))
k_price = 0

for j in range(J):
    # A. copy 
    x2 = x.copy()
    
    # B. increase price just for car j 
    rel_change_x = 1e-3
    x2[:, j, k_price] *= (1+rel_change_x) # Fill in 

    # C. evaluate CCPs
    ccp2 = clogit.choice_prob(theta_hat, x2) # Fill in 
    
    # D. percentage change in CCPs 
    rel_change_y = ccp2/ccp_res-1 # Fill in 
    
    # E. elasticities 
    elasticity = rel_change_y/rel_change_x # Fill in 
    
    E_own[:, j] = elasticity[:, j] # Fill in 
    
    k_not_j = [k for k in range(J) if k != j] # indices for all other cars than j -> this list changes as we loop through j
    E_cross[:, j] = elasticity[:, k_not_j].mean(axis=1) # Fill in: Avg. among the cars k_not_j, taking the average over axis=1 (the cars, not the individuals!)

In [567]:
print(f'Own-price elasticity:  {np.mean(E_own).round(4)}')
print(f'Cross-price elasticity: {np.mean(E_cross).round(4)}')

Own-price elasticity:  -0.9497
Cross-price elasticity: 0.0224


In [568]:
E_own   = np.zeros((N, J))
E_cross = np.zeros((N, J))
dpdx    = np.zeros((N, J))
k_price = 2

for j in range(J):
    # A. copy 
    x2 = x.copy()
    
    # B. increase price just for car j 
    rel_change_x = 1e-3
    x2[:, j, k_price] *= (1+rel_change_x) # Fill in 

    # C. evaluate CCPs
    ccp2 = clogit.choice_prob(theta_hat, x2) # Fill in 
    
    # D. percentage change in CCPs 
    rel_change_y = ccp2/ccp_res-1 # Fill in 
    
    # E. elasticities 
    elasticity = rel_change_y/rel_change_x # Fill in 
    
    E_own[:, j] = elasticity[:, j] # Fill in 
    
    k_not_j = [k for k in range(J) if k != j] # indices for all other cars than j -> this list changes as we loop through j
    E_cross[:, j] = elasticity[:, k_not_j].mean(axis=1) # Fill in: Avg. among the cars k_not_j, taking the average over axis=1 (the cars, not the individuals!)

In [569]:
print(f'Own-price elasticity:  {np.mean(E_own).round(4)}')
print(f'Cross-price elasticity: {np.mean(E_cross).round(4)}')

Own-price elasticity:  -0.0788
Cross-price elasticity: 0.0019


In [570]:
def marg_effect_discrete(betahat, x0, x1):
    # Calculate the probabilities of x1 and x0, using betahat of the latest model estimation.
    # Then substract the probability vector of x0 from the probability vector of x1.
    prob_x1 = clogit.choice_prob(betahat, x1) # Fill in: choice probability matrix at x1 
    prob_x0 = clogit.choice_prob(betahat, x0) # Fill in: choice probability matrix at x0
    return (prob_x1 - prob_x0)

In [571]:
# Consider some person
x0 = x.copy()
x0[:, :, 1]=0

x1 = x0.copy()

x1[:, :, 1] = 1.0 

In [572]:
marg_effect_discrete(theta_hat, x0, x1).shape

(150, 40)

---

Old stuff

---

In [573]:
np.linalg.matrix_rank(x) # something is not right when you include the car-brand dummies -> rank-condition is not fulfilled -> matrix becomes singular
# Reference category is 'BMW'
# Could be that none of the top 40 cars sold in a given year and market is home-made and/or that they are BMW's -> some column(s) then become(s) zero.

array([21, 21, 23, 21, 23, 24, 24, 23, 23, 21, 22, 22, 23, 21, 21, 21, 22,
       22, 22, 21, 21, 20, 21, 21, 22, 21, 21, 22, 22, 22, 20, 22, 22, 21,
       23, 23, 22, 19, 19, 19, 20, 20, 20, 19, 20, 21, 20, 20, 21, 21, 20,
       20, 19, 19, 20, 20, 21, 20, 20, 19, 20, 21, 21, 23, 21, 20, 20, 21,
       23, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 22, 22, 22, 23, 24, 24,
       23, 24, 23, 23, 22, 20, 21, 22, 19, 20, 20, 20, 20, 20, 20, 21, 21,
       21, 20, 20, 21, 21, 20, 21, 18, 20, 20, 20, 21, 22, 23, 24, 23, 26,
       24, 22, 22, 24, 26, 24, 24, 23, 20, 21, 20, 21, 23, 21, 21, 20, 21,
       21, 21, 21, 20, 20, 21, 20, 21, 21, 20, 22, 22, 23, 21],
      dtype=int64)

In [574]:
# So in each year, let's define 40 different choices from j=0,...,40 
# NB! Remember that one particular car model may appear multiple times over different years and/or markets.
# This is slightly confusing -> basically, we model the 40 different choices, even though the same choice may appear again later

In [575]:
# Repeating the choice of 40 different cars in each market-year --> 6000 obs
# Making a variable that starts from 0,..., J-1
# Each market-year, agents are subject to 40 different discrete market choices
cars['y'] = np.resize(np.arange(0,J), N*J)
y = cars['y'].values.reshape((N,J))

In [576]:
y[:, 0] #40 different choices vary for each market-year

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [577]:
res_alt = est.estimate(clogit.q, theta_start, y[:, 0], x)

         Current function value: 0.976207
         Iterations: 95
         Function evaluations: 5972
         Gradient evaluations: 149
Failed to compute std. errs.: got error "Singular matrix"


In [578]:
theta_res_alt = res_alt['theta']

ccp_res_alt = clogit.choice_prob(theta_res_alt, x)

In [579]:
#cars['y'] = np.resize(np.arange(0, M*T), N*J)
#y = cars['y'].values.reshape((N,J))

#cars['y'] = pd.factorize(cars['co'].values)[0]
#y = pd.factorize(cars['co'].values)[0]
#y = y.reshape((N,J))

#cars['brand']=pd.factorize(cars['brand'])[0]
#y = cars['co'].values.reshape((N,J))
#cars['co']=pd.factorize(cars['co'])[0]