In [47]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set_theme()

# optimization
from scipy import optimize

# For quick OLS
#import statsmodels.formula.api as smf

# Import our toolbox
import clogit as clogit
import mlogit_ante as mlogit
import estimation as est

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datasets.

In [48]:
# Load the car panel together with its variable-label and value-label tables.
cars = pd.read_csv('cars.csv')
lbl_vars = pd.read_csv('labels_variables.csv')
lbl_vals = pd.read_csv('labels_values.csv')

# Turn the value-label table into a dict of dicts: one entry per column,
# mapping row index -> label, with missing entries left out.
lbl_vals = {
    col: labels.dropna().to_dict()
    for col, labels in lbl_vals.items()
}

In [49]:
# Index the variable labels by variable name.
# Done without `inplace=True` and behind a guard so the cell can be re-run:
# once 'variable' has become the index, a second set_index would raise a KeyError.
if 'variable' in lbl_vars.columns:
    lbl_vars = lbl_vars.set_index('variable')

## Overview of the dataset

In [50]:
# Variable labels side by side with the mean of every numeric column
# (means are formatted as text with two decimals).
lbl_vars.join(
    cars
    .mean(numeric_only=True)
    .apply(lambda avg: f'{avg: .2f}')
    .to_frame('Mean')
)

Unnamed: 0_level_0,label,Mean
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
ye,year (=first dimension of panel),84.5
ma,market (=second dimension of panel),3.0
co,model code (=third dimension of panel),207.5
zcode,alternative model code (predecessors and succe...,177.76
brd,brand code,16.79
type,name of brand and model,
brand,name of brand,
model,name of model,
org,"origin code (demand side, country with which c...",2.72
loc,"location code (production side, country where ...",5.17


In [51]:
# Number of distinct values in column `s` (the column later reshaped into `market_share`).
cars['s'].nunique()

5986

# Set up for analysis

In [52]:
# Name of the column used as the price measure when building `logp`.
# NOTE(review): 'princ' is presumably price relative to income — confirm against labels_variables.csv.
price_var = 'princ'

In [53]:
# Natural log of the selected price column; 'logp' is the price regressor listed in x_vars.
cars['logp'] = np.log(cars[price_var])

In [54]:
# New variable: lets the price elasticity differ for home-market cars.
# The interaction is built from the *log* price so that it matches both the
# `logp` regressor and the column name (the price level was used before).
cars['logp_x_home'] = cars['logp'] * cars['home']

### Dummy variables

For working with matrices, we want to have a column for each dummy variable. 

In [55]:
categorical_var = 'brand'  # categorical column that is expanded into indicator columns
dummies = pd.get_dummies(cars[categorical_var])  # one indicator column per distinct value

# The first column serves as the reference category and is therefore left out of the regressor list.
x_vars_dummies = dummies.columns[1:].tolist()

# Guard against running this cell twice, then append the indicator columns to the data.
assert dummies.columns[0] not in cars.columns, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
cars = pd.concat([cars, dummies], axis=1)

In [56]:
# NB! Let's take a look at the reference category (the first dummy column, omitted from x_vars_dummies)
dummies.columns[0] # This is 'BMW'
# Something might be going on:
# NOTE(review): 'BMW' is upper-case while the other brand columns visible in the data
# (e.g. 'skoda', 'toyota', 'volvo') are lower-case; check the brand column for
# inconsistent casing — TODO confirm.

'BMW'

### `x_vars`: List of regressors to be used 

In [57]:
# Regressors used in the model. The brand dummies (x_vars_dummies) are left out on purpose:
# if we include dummies for the brand, the 3-D matrix x becomes singular. Why?
# NOTE(review): in the x[50] slice printed further down, tax, pop and inc take the same value
# for every car, so they may not be separately identified in a conditional logit — TODO confirm.
x_vars = [
    'logp', 'home', 'cy', 'hp', 'we', 'li',
    'tax', 'pop', 'inc',
]
print(f'K = {len(x_vars)} variables selected.')

K = 9 variables selected.


In [58]:
# Drop every *column* (axis=1) that contains at least one missing value; rows are kept.
# NOTE(review): this overwrites `cars`; any regressor in x_vars holding a NaN would be
# removed here and fail later when x is built — confirm none does.
cars = cars.dropna(axis=1)

In [59]:
cars['brand'].nunique() # number of distinct car brands

33

In [60]:
cars['co'].nunique() # number of distinct car models (model codes)

285

In [61]:
# Hard-coded panel dimensions.
# NOTE(review): the next cell recomputes N from the data and sets J again, so these
# values act as a reference only. The mnemonics may be swapped: the overview table
# reports a mean of 3.0 for `ma` and 84.5 for `ye`, which suggests 5 markets and
# 30 years — TODO confirm.
M = 30
T = 5
N = M*T  # number of market-year pairs
J = 40   # number of alternatives (cars) per market-year

In [62]:
K = len(x_vars)  # number of regressors
N = cars.ma.nunique() * cars.ye.nunique()  # number of market-year pairs 'i'
J = 40  # number of alternatives (cars) in every market-year

# The reshapes below cut the rows into N consecutive blocks of J cars each, so
# they are only meaningful if every block belongs to exactly one market-year.
# Verify that explicitly instead of silently relying on the row order of the CSV.
assert len(cars) == N * J, f'Expected N*J = {N * J} rows, found {len(cars)}.'
market_year = cars[['ma', 'ye']].values.reshape((N, J, 2))
assert (market_year == market_year[:, :1, :]).all(), \
    'Rows are not grouped in blocks of J cars per market-year; sort by market and year first.'

# Regressors as a 3-D array: (market-year, alternative, regressor).
x = cars[x_vars].values.reshape((N, J, K))

# y holds the position 0, ..., J-1 of each car within its market-year, i.e. it
# indexes the alternatives; it is not an observed choice.
cars['y'] = np.resize(np.arange(0, J), N*J)
y = cars['y'].values.reshape((N, J))

In [63]:
# Distinct alternative indices stored in y.
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])

In [64]:
# y was reshaped to (N, J): one row per market-year, one column per alternative.
y.shape

(150, 40)

In [65]:
# Alternative indices of the second market-year (row 1).
y[1,:]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])

In [66]:
# Rows of one market-year (market 1, year 70), used for the sanity checks that follow.
test = cars.loc[cars['ma'].eq(1) & cars['ye'].eq(70)]

In [67]:
# Display the rows of the selected market-year.
test

Unnamed: 0,ye,ma,co,zcode,brd,type,brand,model,org,loc,...,skoda,suzuki,tal/hillman,tal/matra,tal/simca,tal/sunb,talbot,toyota,volvo,y
0,70,1,15,14,2,audi 100/200,audi,100/200,2,4,...,0,0,0,0,0,0,0,0,0,0
1,70,1,26,35,4,citroen 2 CV 6 - 2 CV 4,citroen,2CV6,1,3,...,0,0,0,0,0,0,0,0,0,1
2,70,1,36,36,4,citroen dyane,citroen,dyane,1,3,...,0,0,0,0,0,0,0,0,0,2
3,70,1,64,67,7,fiat 128,fiat,128,3,5,...,0,0,0,0,0,0,0,0,0,3
4,70,1,71,80,8,ford escort,ford,escort,2,4,...,0,0,0,0,0,0,0,0,0,4
5,70,1,134,159,16,mercedes 200-300,mercedes,200,2,4,...,0,0,0,0,0,0,0,0,0,5
6,70,1,165,197,19,opel kadett,opel,kadet,2,4,...,0,0,0,0,0,0,0,0,0,6
7,70,1,172,194,19,opel rekord,opel,record,2,4,...,0,0,0,0,0,0,0,0,0,7
8,70,1,186,202,20,peugeot 504,peugeot,504,1,3,...,0,0,0,0,0,0,0,0,0,8
9,70,1,187,207,20,peugeot 304,peugeot,304,1,3,...,0,0,0,0,0,0,0,0,0,9


In [68]:
# Number of distinct alternative indices within this market-year.
test['y'].nunique()

40

In [69]:
# Sum of `s` over the cars of this market-year.
test['s'].sum()

0.9999999999999979

$$
u_{i j h}=\mathbf{x}_{i j} \boldsymbol{\beta}_o+\varepsilon_{i j h}, \quad j=1, \ldots, J
$$

where: 

- $i$ is the $\textit{country-year}$ pair
- $j$ is the alternative car
- $h$ is the household

First off, are we:

1. interested in the marginal utility of a car's characteristic (conditional logit), or

2. interested in the change in utility of car $j$ relative to car 1 given a change in household characteristics?

In this assignment, we are examining home bias, that is, the propensity to choose a car manufactured in the home country. We are therefore interested in 1) and will use a conditional logit model.

In [70]:
# x was reshaped to (N, J, K): market-year, alternative, regressor.
x.shape

(150, 40, 9)

In [71]:
# The starting values to be passed to our optimizer, obtained from the toolbox.
theta_start = clogit.starting_values(y, x)
# Shape of the starting vector (displayed as the cell output).
theta_start.shape

(9,)

In [72]:
# Evaluate the toolbox's utility function at the starting values.
u = clogit.util(theta_start, x)
# Shape of the result (displayed as the cell output).
u.shape

(150, 40)

In [73]:
# Our conditional choice probabilities.
# For coefficients ('theta') starting at zero, every alternative yields the same
# utility, so all choice probabilities must equal 1/J (they are equal to each other, not zero).
# Intuition: No utility is gained by any car characteristics, thus market shares must be equal. Let's check this.
ccp = clogit.choice_prob(theta_start, x)
# Compare with a tolerance: an exact `==` on floats can fail because of rounding.
np.isclose(ccp, 1/J) # element-wise check that all choice probs are equal to 1/J

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [74]:
# Sum the choice probabilities over the alternatives (axis=1) for every market-year;
# each row of probabilities should add up to one.
# (These are the probabilities at the starting values, where no weight is placed on any car characteristic.)
ccp.sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [75]:
print(ccp.shape)
np.unique(ccp) # distinct probability values; the recorded output holds only 0.025 (= 1/J) and no NaN

(150, 40)


array([0.025])

In [76]:
# Shapes of the starting values, the outcome array and the regressor array.
print(theta_start.shape, y.shape, x.shape)

(9,) (150, 40) (150, 40, 9)


In [77]:
# Rank of the regressor matrix of every market-year (one rank per slice of the 3-D array x).
np.linalg.matrix_rank(x) # something is not right when you include the car-brand dummies -> rank-condition is not fulfilled -> matrix becomes singular
# Reference category is 'BMW'
# Could be that none of the top 40 cars sold in a given year and market is home-made and/or that they are BMW's -> column(s) becomes zero.
# NOTE(review): the recorded ranks (6 or 7) are already below K = 9 although the brand dummies
# are not part of x_vars. In the x[50] slice shown below, tax, pop and inc are identical for all
# cars, so these columns are presumably collinear within a market-year — TODO confirm.

array([6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64)

In [78]:
# Regressor matrix of a single market-year (index 50), labelled with the regressor names.
test2 = pd.DataFrame(x[50], columns=x_vars)
test2

Unnamed: 0,logp,home,cy,hp,we,li,tax,pop,inc
0,-0.43865,0.0,1300.0,63.0,940.0,7.8,0.25,56730000.0,29513.0262
1,-0.226922,0.0,1800.0,66.0,1050.0,7.6,0.25,56730000.0,29513.0262
2,-0.143563,0.0,1600.0,75.0,1080.0,8.5,0.25,56730000.0,29513.0262
3,0.407682,0.0,2000.0,110.0,1425.0,9.8,0.25,56730000.0,29513.0262
4,-0.891653,1.0,1000.0,33.0,640.0,5.0,0.25,56730000.0,29513.0262
5,-0.473815,1.0,1400.0,53.0,900.0,6.9,0.25,56730000.0,29513.0262
6,0.053274,1.0,2000.0,85.0,1280.0,8.3,0.25,56730000.0,29513.0262
7,-1.134503,0.0,750.0,25.0,700.0,5.6,0.25,56730000.0,29513.0262
8,-0.986177,0.0,900.0,33.0,740.0,6.2,0.25,56730000.0,29513.0262
9,-0.58227,0.0,1400.0,53.0,945.0,7.1,0.25,56730000.0,29513.0262


In [79]:
# Fit the model: the toolbox estimator receives the criterion clogit.q, the starting values,
# an outcome vector and the regressors.
# NOTE(review): y[:, 0] is the first column of y, and every row of y is 0, ..., J-1, so the
# outcome passed here is 0 for all market-years. If est.estimate treats it as the chosen
# alternative, the fit ignores the observed market shares `s` — TODO confirm against clogit.q.
res = est.estimate(clogit.q, theta_start, y[:, 0], x)

Optimization terminated successfully.
         Current function value: 3.192185
         Iterations: 21
         Function evaluations: 270
         Gradient evaluations: 27


In [80]:
# Estimated coefficient vector (presumably in the order of x_vars — confirm).
res['theta']

array([ 6.42855096e+00,  1.21097154e-01,  2.04601799e-04,  8.99830539e-03,
       -8.71445571e-03, -3.25885047e-01, -1.20239951e-06, -1.85070248e-07,
        0.00000000e+00])

In [81]:
# Keep the estimated coefficients under their own name.
theta_res = res['theta']

# Choice probabilities implied by the estimated coefficients.
ccp_res = clogit.choice_prob(theta_res, x)

In [82]:
# Display the choice probabilities at the estimated coefficients.
ccp_res

array([[0.03042623, 0.00955542, 0.01248148, ..., 0.00516778, 0.00748987,
        0.00374895],
       [0.0280647 , 0.04121252, 0.03539991, ..., 0.02544964, 0.01216756,
        0.00995351],
       [0.07507805, 0.02401308, 0.02165588, ..., 0.01201194, 0.02049831,
        0.00643405],
       ...,
       [0.01987101, 0.0575612 , 0.03038645, ..., 0.02091369, 0.00750684,
        0.00701568],
       [0.02411644, 0.06776901, 0.01009842, ..., 0.00275334, 0.00756803,
        0.01486795],
       [0.0339414 , 0.09698301, 0.00933931, ..., 0.01292417, 0.02049899,
        0.01357216]])

In [83]:
# Column `s` arranged as (N, J) so it lines up with the choice-probability arrays.
market_share = cars['s'].values.reshape((N,J))
# Values of the first market-year (row 0).
market_share[0]

array([0.01129646, 0.01464355, 0.02803195, 0.03263421, 0.03974679,
       0.02786459, 0.04393066, 0.0460226 , 0.02091936, 0.01025049,
       0.03598131, 0.02991469, 0.03347098, 0.01861823, 0.03075147,
       0.03911921, 0.05857422, 0.01255162, 0.00920452, 0.00878613,
       0.00920452, 0.01548033, 0.01945501, 0.03127445, 0.01799065,
       0.01380678, 0.00962291, 0.03012388, 0.03179743, 0.01045968,
       0.01108726, 0.02259291, 0.02322049, 0.02510324, 0.02656759,
       0.05899261, 0.01589872, 0.02091936, 0.03233297, 0.02175614])

In [84]:
# Predicted choice probabilities of the first market-year (row 0), for comparison with market_share[0].
ccp_res[0]

array([0.03042623, 0.00955542, 0.01248148, 0.0094963 , 0.00351616,
       0.0139783 , 0.00733293, 0.00256075, 0.00382393, 0.01444181,
       0.00562273, 0.00490313, 0.00911352, 0.00927682, 0.01277412,
       0.017501  , 0.00455071, 0.01176699, 0.52052067, 0.03567286,
       0.08219593, 0.01358915, 0.00242163, 0.00813832, 0.00734331,
       0.01284128, 0.01010135, 0.00880175, 0.00637684, 0.01262221,
       0.03729414, 0.00764773, 0.01604368, 0.0044836 , 0.0073164 ,
       0.00343477, 0.00362545, 0.00516778, 0.00748987, 0.00374895])

In [85]:
# Shape of the observed-share array.
market_share.shape

(150, 40)

In [86]:
# Shape of the predicted-probability array; it must match market_share for element-wise products.
ccp_res.shape

(150, 40)

### How do we calculate the log-likelihood contribution according to the notation in the assignment?

In [87]:
# Element-wise product of the observed shares and the log choice probabilities.
# NOTE(review): this uses `ccp`, the probabilities computed at the starting values,
# not `ccp_res` from the estimated coefficients — confirm that this is intended.
log_like_market = market_share * np.log(ccp)

In [88]:
# Shape of the element-wise contributions.
log_like_market.shape

(150, 40)

In [89]:
# Display the element-wise contributions.
log_like_market

array([[-0.04167127, -0.05401831, -0.10340648, ..., -0.07716901,
        -0.11927243, -0.08025577],
       [-0.06970409, -0.06428441, -0.09806126, ..., -0.04512321,
        -0.11367343, -0.09678501],
       [-0.03003248, -0.08001481, -0.06243417, ..., -0.05524576,
        -0.04033048, -0.10681655],
       ...,
       [-0.08623794, -0.03200978, -0.03263872, ..., -0.04278096,
        -0.06317241, -0.03111557],
       [-0.07847612, -0.03822282, -0.02797553, ..., -0.03170546,
        -0.07697698, -0.05334074],
       [-0.10661032, -0.03589541, -0.09121368, ..., -0.13260145,
        -0.04959585, -0.03861339]])

In [90]:
# Shape of the choice probabilities at the starting values.
ccp.shape

(150, 40)

In [91]:
# Shape of the observed-share array (same as ccp, so the two can be multiplied element-wise).
market_share.shape

(150, 40)

In [92]:
# Share-weighted log-likelihood contributions at the estimated coefficients:
# observed share times the log of the predicted choice probability.
# The log was missing before, so a plain product of shares and probabilities
# ended up under a log-likelihood name.
log_like_market = market_share * np.log(ccp_res)

In [93]:
# Shape of the element-wise contributions.
log_like_market.shape

(150, 40)