# Project 3: Getting Started 

This notebook is intended to help you get off to a flying start with the cars dataset. You don't have to use this notebook and you can discard any parts you do not like, they are purely intended as a help to get started. 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import scipy.stats 
sns.set_theme()

# optimization
from scipy import optimize
import estimation as est
import clogit
import functions as fn

import statsmodels.formula.api as smf

# Read in data

The dataset, `cars.csv`, contains cleaned and processed data. If you want to make changes, the notebook, `materialize.ipynb`, creates the data from the raw source datsets. 

In [2]:
cars = pd.read_csv('cars.csv')
lbl_vars = pd.read_csv('labels_variables.csv')
lbl_vals = pd.read_csv('labels_values.csv')

# convert from dataframe to dict
lbl_vals = {c: lbl_vals[c].dropna().to_dict() for c in lbl_vals.columns}

In [3]:
lbl_vars.set_index('variable', inplace=True)

## Overview of the dataset

In [4]:
lbl_vars.join(cars.mean(numeric_only=True).apply(lambda x: f'{x: .2f}').to_frame('Mean'))

Unnamed: 0_level_0,label,Mean
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
ye,year (=first dimension of panel),84.5
ma,market (=second dimension of panel),3.0
co,model code (=third dimension of panel),207.5
zcode,alternative model code (predecessors and succe...,177.76
brd,brand code,16.79
type,name of brand and model,
brand,name of brand,
model,name of model,
org,"origin code (demand side, country with which c...",2.72
loc,"location code (production side, country where ...",5.17


# Set up for analysis

In [5]:
# a. pic the prince variable 
price_var = 'princ'

# b. new variable: log of price
cars['logp'] = np.log(cars[price_var])

# c. new variable: price elasticity heterogeneous for home-region 
cars['logp_x_home'] = cars[price_var] * cars['home']

### Dummy variables

For working with matrices, we want to have a column for each dummy variable. 

In [6]:
categorical_var = 'brand' # name of categorical variable
dummies = pd.get_dummies(cars[categorical_var]) # creates a matrix of dummies for each value of dummyvar
x_vars_dummies = list(dummies.columns[1:].values) # omit a reference category, here it is the first (hence columns[1:])

# add dummies to the dataframe 
assert dummies.columns[0] not in cars.columns, f'It looks like you have already added this dummy to the dataframe. Avoid duplicates! '
# cars = pd.concat([cars, dummies], axis=1)


### `x_vars`: List of regressors to be used 

In [7]:
# 
# x_vars = ['logp', 'home', 'cy', 'hp', 'we', 'li'] + x_vars_dummies # <--- !!! choose your preferred variables here 
x_vars = ['logp', 'home', 'cy', 'sp', 'hp', 'we', 'li'] #+ x_vars_dummies # <--- !!! choose your preferred variables here 
print(f'K = {len(x_vars)} variables selected.')

# add price elasticity heterogeneous for home-region 
x_vars_new = x_vars + ['logp_x_home']
x_vars_new_new = x_vars_new[2:]

K = 7 variables selected.


In [8]:
# a. set up the data for the estimation
K = len(x_vars)
N = cars.ma.nunique() * cars.ye.nunique()
J = 40 
x = cars[x_vars].values.reshape((N,J,K)).astype(np.float64)
# b. define new x with the interaction term
x_new = cars[x_vars_new].values.reshape((N,J,K+1)).astype(np.float64)
x_new_new = cars[x_vars_new_new].values.reshape((N,J,K-1)).astype(np.float64)
# c. define y
y = cars['s'].values.reshape((N,J))

In [9]:
if 'sp' in x_vars: 
    sp_i = x_vars.index('sp')
    y = y[~np.isnan(x[:, :, sp_i]).any(axis=1)]
    x = x[~np.isnan(x[:, :, sp_i]).any(axis=1)]
    x_new = x_new[~np.isnan(x_new[:, :, sp_i]).any(axis=1)]
    # if 'logp' not in x_vars_new_new:
    #     x_new_new = x_new_new[~np.isnan(x_new_new[:, :, sp_i]).any(axis=1)]
    #     y = y[~np.isnan(x_new_new[:, :, sp_i]).any(axis=1)]

    N = len(y)

if 'ac' in x_vars: 
    sp_i = x_vars.index('ac')
    y = y[~np.isnan(x[:, :, sp_i]).any(axis=1)]
    x = x[~np.isnan(x[:, :, sp_i]).any(axis=1)]
    x_new = x_new[~np.isnan(x_new[:, :, sp_i]).any(axis=1)]
    # if 'logp' not in x_vars_new_new:
    #     x_new_new = x_new_new[~np.isnan(x_new_new[:, :, sp_i]).any(axis=1)]
    #     y = y[~np.isnan(x_new_new[:, :, sp_i]).any(axis=1)]

    N = len(y)


# Conditional Logit

## Estimate of x

In [10]:
# Starting values 
theta0 = clogit.starting_values(y, x)

In [11]:
# Find results
res = est.estimate(clogit.q, theta0, y, x, cov_type = 'Sandwich')

         Current function value: 3.499129
         Iterations: 24
         Function evaluations: 492
         Gradient evaluations: 60


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


In [12]:
# Estimate the model, standard errors, and t-values
est.print_table(x_vars,res)

Optimizer failed after 24 iter. (492 func. evals.). Final criterion:    3.499.
Results


Unnamed: 0,theta,se,t
logp,0.0941,0.1127,0.8352
home,1.3082,0.0325,40.2263
cy,0.0002,0.0001,2.5776
sp,0.0098,0.0023,4.3146
hp,-0.035,0.0029,-11.9743
we,0.0005,0.0001,3.3491
li,-0.014,0.0158,-0.8861


## Estimate of x_new 

In [13]:
# Starting values 
theta0_new = clogit.starting_values(y,x_new)

In [14]:
# Find residuals
res_new = est.estimate(clogit.q, theta0_new, y, x_new, cov_type = 'Sandwich')

         Current function value: 3.496764
         Iterations: 30
         Function evaluations: 669
         Gradient evaluations: 73


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


In [15]:
# Estimate the model, standard errors, and t-values
est.print_table(x_vars_new,res_new)

Optimizer failed after 30 iter. (669 func. evals.). Final criterion:    3.497.
Results


Unnamed: 0,theta,se,t
logp,0.1421,0.1017,1.3965
home,1.687,0.0577,29.232
cy,0.0002,0.0001,2.2776
sp,0.0099,0.0023,4.3417
hp,-0.0319,0.003,-10.5636
we,0.0005,0.0001,3.5996
li,-0.0161,0.0174,-0.9269
logp_x_home,-0.5239,0.0605,-8.6635


# Estimation of Parameters of Intrest

In [16]:
# Without interaction term
fn.result(x,res['theta'],res['cov'],print_out = True,se=True,inter=False,N=N,x_vars=x_vars)

Unnamed: 0,Estimate,se,CI low,CI high,p-value
PEA,0.0358,0.001,0.0337,0.0378,0.0
OPE,0.0917,0.1098,-0.1235,0.3069,0.4056
CPE,-0.0024,0.0028,-0.0079,0.0031,0.3958


In [17]:
# With interaction term
fn.result(x_new,res_new['theta'],res_new['cov'],print_out = True,se=True,inter=True, N=N, x_vars=x_vars_new)

Unnamed: 0,Estimate,se,CI low,CI high,p-value
PEA,0.0719,0.0057,0.0608,0.0831,0.0
OPE,-0.6694,0.0772,-0.8208,-0.518,0.0
CPE,0.0204,0.0024,0.0158,0.025,0.0
