In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set_theme()

# optimization
from scipy import optimize

# For quick OLS
#import statsmodels.formula.api as smf

# Import our toolbox
import clogit as clogit
import estimation as est

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datsets. 

In [2]:
cars = pd.read_csv('cars.csv')
lbl_vars = pd.read_csv('labels_variables.csv')
lbl_vals = pd.read_csv('labels_values.csv')

# convert from dataframe to dict
lbl_vals = {c: lbl_vals[c].dropna().to_dict() for c in lbl_vals.columns}

In [3]:
lbl_vars.set_index('variable', inplace=True)

## Overview of the dataset

In [4]:
lbl_vars.join(cars.mean(numeric_only=True).apply(lambda x: f'{x: .2f}').to_frame('Mean'))

Unnamed: 0_level_0,label,Mean
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
ye,year (=first dimension of panel),84.5
ma,market (=second dimension of panel),3.0
co,model code (=third dimension of panel),207.5
zcode,alternative model code (predecessors and succe...,177.76
brd,brand code,16.79
type,name of brand and model,
brand,name of brand,
model,name of model,
org,"origin code (demand side, country with which c...",2.72
loc,"location code (production side, country where ...",5.17


In [5]:
cars['pr']

0       149875.0
1        59937.5
2        64937.5
3        87375.0
4        81125.0
          ...   
5995     14495.0
5996     13005.0
5997      8500.0
5998     14025.0
5999      7495.0
Name: pr, Length: 6000, dtype: float64

In [6]:
cars['s'].nunique()

5986

# Set up for analysis

In [7]:
price_var = 'princ'

In [8]:
cars['logp'] = np.log(cars[price_var])

In [9]:
# new variable: price elasticity heterogeneous for home-region 
cars['logp_x_home'] = cars[price_var] * cars['home']

### Dummy variables

For working with matrices, we want to have a column for each dummy variable. 

In [10]:
categorical_var = 'brand' # name of categorical variable
dummies = pd.get_dummies(cars[categorical_var]) # creates a matrix of dummies for each value of dummyvar
x_vars_dummies = list(dummies.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# add dummies to the dataframe 
assert dummies.columns[0] not in cars.columns, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
cars = pd.concat([cars,dummies], axis=1)

### `x_vars`: List of regressors to be used 

In [11]:
x_vars = ['logp', 'home', 'cy', 'hp', 'we', 'li'] + x_vars_dummies + ['li', 'sp', 'cy', 'tax'] # <--- !!! choose your preferred variables here 
print(f'K = {len(x_vars)} variables selected.')

K = 42 variables selected.


In [12]:
cars['brand'].nunique() #Different car brands

33

In [13]:
cars['co'].nunique() #Different car models

285

In [14]:
cars['co'].unique()

array([ 15,  26,  36,  64,  71, 134, 165, 172, 186, 187, 196, 212, 213,
       214, 217, 269, 291, 406, 407, 410, 412, 417, 418, 422, 429, 430,
       431, 435, 437, 439, 468, 488, 490, 491, 497, 521, 524, 536, 541,
       544,  35,  77, 170, 314, 315, 469, 537,  61,  76, 423, 440, 460,
        13,  50,  62, 158, 197, 276, 495, 150, 268, 288, 464,   9,  30,
        63, 229, 285, 297,  18,  37,  80, 139, 184, 284, 525, 540,  70,
        83, 122, 141, 209, 183, 208, 306,  21,  32,  60, 123, 138, 206,
       234, 181, 277, 294,  51, 153, 286, 204, 218,  28,  52,  73, 164,
       205,  55, 133, 149, 178,  72, 200,  74, 179, 199, 243, 298,  27,
       168, 180, 270,  53, 166, 198,  29, 295,  54, 121, 154, 203,  38,
       177, 188, 247, 293,  39, 136, 137, 190, 215, 216, 264,  69,  78,
       818, 303, 800, 804, 850, 851, 875, 876, 882, 893, 808, 910, 901,
       924, 953, 900, 914, 936, 951, 978, 980, 400, 402, 419, 434, 453,
       455, 458, 481, 500, 118, 499, 401, 526, 210, 409, 185, 41

In [15]:
cars['type'].nunique()

323

In [16]:
cars['zcode'].nunique()

207

In [17]:
cars['brand'].unique()

array(['audi', 'citroen', 'fiat', 'ford', 'mercedes', 'opel', 'peugeot',
       'renault', 'rover', 'toyota', 'VW', 'BMW', 'daf', 'nissan',
       'tal/simca', 'volvo', 'talbot', 'mazda', 'alfa romeo', 'honda',
       'mitsubishi', 'seat', 'suzuki', 'lancia', 'tal/matra', 'skoda',
       'daewoo', 'hyundai', 'MCC', 'innocenti', 'tal/sunb', 'tal/hillman',
       'saab'], dtype=object)

In [49]:
K = len(x_vars) #The "household" characteristics
N = cars.ma.nunique() * cars.ye.nunique() #The market-year 'i'
J = 40 #The 40 different cars
x = cars[x_vars].values.reshape((N,J,K))


cars['y'] = np.resize(np.arange(0,J), N*J)
y = cars['y'].values.reshape((N,J))

#cars['brand']=pd.factorize(cars['brand'])[0]
#y = cars['co'].values.reshape((N,J))
#cars['co']=pd.factorize(cars['co'])[0]

In [48]:
cars['y'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])

In [51]:
y.shape

(150, 40)

In [52]:
y[:, 0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [19]:
cars[x_vars]

Unnamed: 0,logp,home,cy,hp,we,li,MCC,VW,alfa romeo,audi,...,tal/matra,tal/simca,tal/sunb,talbot,toyota,volvo,li.1,sp,cy.1,tax
0,0.122399,0,1760.0,59.0,1050,8.900000,0,0,0,1,...,0,0,0,0,0,0,8.900000,156.0,1760.0,0.250
1,-0.794100,0,435.0,17.5,560,5.500000,0,0,0,0,...,0,0,0,0,0,0,5.500000,102.0,435.0,0.250
2,-0.713977,0,435.0,19.0,590,5.500000,0,0,0,0,...,0,0,0,0,0,0,5.500000,104.0,435.0,0.250
3,-0.417193,0,1116.0,40.5,785,8.000000,0,0,0,0,...,0,0,0,0,0,0,8.000000,135.0,1116.0,0.250
4,-0.491411,0,1098.0,29.5,825,8.200000,0,0,0,0,...,0,0,0,0,0,0,8.200000,125.0,1098.0,0.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,-0.053386,0,1397.0,59.0,1020,8.400000,0,0,0,0,...,0,0,0,0,0,0,8.400000,170.0,1397.0,0.175
5996,-0.161856,1,1388.0,54.0,1070,7.400000,0,0,0,0,...,0,0,0,0,0,0,7.400000,171.0,1388.0,0.175
5997,-0.587124,0,1124.0,44.0,910,7.166667,0,0,0,0,...,0,0,0,0,0,0,7.166667,158.0,1124.0,0.175
5998,-0.086349,0,1587.0,73.0,1195,9.100000,0,0,0,0,...,0,0,0,0,1,0,9.100000,195.0,1587.0,0.175


In [59]:
y.T.shape

(40, 150)

In [53]:
test = cars[(cars['ye']==70) & (cars['ma']==2)]

In [58]:
test['y'].nunique()

40

In [23]:
test['s'].sum()

0.999999999999998

In [24]:
np.arange(0,5).shape

(5,)

In [25]:
cars.shape

(6000, 86)

In [26]:
M = 5
T = 30
N = M*T

In [27]:
cars['ma'].unique()

array([1, 2, 3, 4, 5], dtype=int64)

In [28]:
cars['ye'].unique()

array([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
       87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], dtype=int64)

In [29]:
#y = np.log(cars['s'].values.reshape((N,J)))

# standardize x
# x = ((x - x.mean(0).mean(0))/(x.std(0).std(0)))

### Understanding the sorting 

Just to be sure that we understand the relation between the pandas dataframe and the numpy 3d array, consider the following: 

In [30]:
# let's check that we get the same row from x as we can find in the original pandas dataframe
# we'll pick the first 5 "observations"
j = 1
k = 0 
x[:5, j, k] == cars.groupby(['ma','ye']).nth(j)[x_vars[k]].head(5).values

array([ True,  True,  True,  True,  True])

In [31]:
# ... and let's check it for the 5 first cars (in the first market)
k = 0
x[0, :5, k] == cars[x_vars[k]].head(5).values
# note that with i = 3 (4th element), x[i,t,k] gives ma=1 and ye=73 (first market, fourth year)
x[3, :5, k] == cars.query('(ma == 1) & (ye == 73)')[x_vars[k]].head(5).values

array([ True,  True,  True,  True,  True])

In [32]:
# and let's print out some rows along with some labels 
obs_labs = cars[['ma', 'ye', 'type', 's']].values.reshape(N,J,4) # notice that we are extracting the values from the dataframe in the same way as we did for x

i=3 # obs. index 3 is the first market in the fourth (3+1) year, i.e. 73
print(obs_labs[i,:5,:])

i = 130 # obs. index 130 is the 5th country (130/30>4) and the 11th year (130%30 = index 10)
print(obs_labs[i,:5,:])

[[1 73 'audi 80/90' 0.0198967806548532]
 [1 73 'audi 100/200' 0.0115738123314003]
 [1 73 'citroen 2 CV 6 - 2 CV 4' 0.020470221461224]
 [1 73 'citroen GSA/GSX' 0.0231960844492545]
 [1 73 'citroen dyane' 0.0232687741289353]]
[[5 80 'alfasud' 0.0061322294468038]
 [5 80 'citroen GSA' 0.0097859984028077]
 [5 80 'fiat 127' 0.0082314207084408]
 [5 80 'fiat 131F' 0.0099803206146036]
 [5 80 'ford fiesta' 0.0781217905939526]]


... and just checking that we can find those same columns in the pandas dataframe

In [33]:
cars.query('(ma == 5) & (ye == 80) & (type == "ford fiesta")')

Unnamed: 0,ye,ma,co,zcode,brd,type,brand,model,org,loc,...,skoda,suzuki,tal/hillman,tal/matra,tal/simca,tal/sunb,talbot,toyota,volvo,y
5204,80,5,70,82,8,ford fiesta,ford,fiesta,7,12,...,0,0,0,0,0,0,0,0,0,4


$$
u_{i j h}=\mathbf{x}_{i j} \boldsymbol{\beta}_o+\varepsilon_{i j h}, \quad j=1, \ldots, J
$$

where: 

- $i$ is the $\textit{country-year}$ pair
- $j$ is the alternative car
- $h$ is the household

First off: Are we: 

    1. interested in the marginal utility of a car's characteristic (conditional logit) or 
    
    2. the change in utility of car $j$ relative to car 1 given a change in household characteristics?

In this assignment, we are examning home bias - that is the propensity to choose a car manufactured in the home country. We are therefore interested in 1) and will use a conditional logit model.

In [34]:
y.shape

(150, 40)

In [35]:
x.shape

(150, 40, 42)

In [36]:
# The starting values to be passed to our optimizer
theta_start = clogit.starting_values(y, x)
theta_start.shape

(42,)

In [37]:
u = clogit.util(theta_start, x)
u

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [38]:
# Our conditional choice probabilities
# For coefficients ('theta') starting at zero, these must be equal to zero
# Intuition: No utility is gained by any car characteristics, thus market shares must be equal. Let's check this.
ccp = clogit.choice_prob(theta_start, x)
(ccp == 1/J) # all choice probs are equal to each other.

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [39]:
theta_start.shape

(42,)

In [40]:
x.shape

(150, 40, 42)

In [41]:
y[0, :]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])

In [42]:
y[1,:].shape

(40,)

In [43]:
y.shape

(150, 40)

In [61]:
y.T[:, 0]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])

In [62]:
y.T[:, 1]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])

In [44]:
x.shape

(150, 40, 42)

In [60]:
est.estimate(clogit.q, theta_start, y.T[:, 0], x)

IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (150,) (40,) 