In [1]:
import pymc3 as pm
import numpy as np
import scipy.stats as stats
import scipy.special as special
import matplotlib.pyplot as plt
import arviz as az
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf 
import numpy.matlib
import theano.tensor as tt
import warnings

from sklearn import preprocessing
from theano import shared
from scipy.special import expit as logistic
from scipy.special import softmax
from scipy.stats import betabinom
from matplotlib.patches import Ellipse, transforms

In [2]:
%config InlineBackend.figure_format = 'retina'
warnings.simplefilter(action='ignore', category=(FutureWarning, UserWarning))
az.style.use('arviz-darkgrid')

In [3]:
RANDOM_SEED = 42
np.random.seed(42)

In [4]:
import sys, IPython, scipy, matplotlib, pandas, seaborn, patsy, platform, theano, sklearn, statsmodels
print("""This notebook was created using:
Python {}
IPython {}
PyMC3 {}
ArviZ {}
NumPy {}
SciPy {}
Pandas {}
Seaborn {}
Patsy {}
Matplotlib {}
Theano {}
Sklearn {}
Statsmodels {}\n""".format(sys.version[:5], 
                             IPython.__version__, 
                             pm.__version__, 
                             az.__version__, 
                             np.__version__, 
                             scipy.__version__, 
                             pandas.__version__, 
                             seaborn.__version__, 
                             patsy.__version__, 
                             matplotlib.__version__, 
                             theano.__version__, 
                             sklearn.__version__, 
                             statsmodels.api.__version__))

This notebook was created using:
Python 3.7.4
IPython 7.13.0
PyMC3 3.9.2
ArviZ 0.9.0
NumPy 1.18.2
SciPy 1.4.1
Pandas 1.0.3
Seaborn 0.10.0
Patsy 0.5.1
Matplotlib 3.2.1
Theano 1.0.4
Sklearn 0.22.2.post1
Statsmodels 0.11.1



### 1. In the Trolleydata—data(Trolley)—we saw how education level (modeled as an ordered category) is associated with responses. Is this association causal? One plausible confound is that education is also associated with age, through a causal process: People are older when they finish school than when they begin it.

### Reconsider the Trolley data in this light. Draw a DAG that represents hypothetical causal relationships among response, education, and age. Which statical model or models do you need to evaluate the causal influence of education on responses? Fit these models to the trolley data. What do you conclude about the causal relationships among these three variables?

In [5]:
df = pd.read_csv('Data/Trolley.csv', ';')
df

Unnamed: 0,case,response,order,id,age,male,edu,action,intention,contact,story,action2
0,cfaqu,4,2,96;434,14,0,Middle School,0,0,1,aqu,1
1,cfbur,3,31,96;434,14,0,Middle School,0,0,1,bur,1
2,cfrub,4,16,96;434,14,0,Middle School,0,0,1,rub,1
3,cibox,3,32,96;434,14,0,Middle School,0,1,1,box,1
4,cibur,3,4,96;434,14,0,Middle School,0,1,1,bur,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9925,ilpon,3,23,98;299,66,1,Graduate Degree,0,1,0,pon,0
9926,ilsha,6,15,98;299,66,1,Graduate Degree,0,1,0,sha,0
9927,ilshi,7,7,98;299,66,1,Graduate Degree,0,1,0,shi,0
9928,ilswi,2,18,98;299,66,1,Graduate Degree,0,1,0,swi,0


In [6]:
df['edu_new'] = pd.Categorical(
    df['edu'].values,
    categories=[
        "Elementary School",
        "Middle School",
        "Some High School",
        "High School Graduate",
        "Some College",
        "Bachelor's Degree",
        "Master's Degree",
        "Graduate Degree",
    ],
    ordered=True,
)

In [9]:
df['edu_new'] = df['edu_new'].cat.codes

In [11]:
R = df['response'].values - 1
A = df['action'].values
I = df['intention'].values
C = df['contact'].values
E = df['edu_new'].values
age = preprocessing.scale(df['age'].values)

In [17]:
# define model: R ~ ordered logistic regression
with pm.Model() as m7_1:
    
    # fixed priors
    bA = pm.Normal('bA', mu=0, sigma=0.5)
    bI = pm.Normal('bI', mu=0, sigma=0.5)
    bC = pm.Normal('bC', mu=0, sigma=0.5)
    bE = pm.Normal('bE', mu=0, sigma=0.5)
    bAge = pm.Normal('bAge', mu=0, sigma=0.5)
    bIC = pm.Normal('bIC', mu=0, sigma=0.5)
    bIA = pm.Normal('bIA', mu=0, sigma=0.5)
    
    kappa = pm.Normal('kappa', mu=0, sigma=1.5, shape=6,
                      transform=pm.distributions.transforms.ordered,
                      testval=np.arange(6))
    
    delta = pm.Dirichlet('delta', np.repeat(2.0, 7), shape=7)
    delta_j = tt.concatenate([tt.zeros(1), delta])
    delta_j_cumulative = tt.cumsum(delta_j)
    
    # linear model
    BI = bI + bIA*A + bIC*C
    phi = bE*delta_j_cumulative[E] + bA*A + bC*C + BI*I + bAge*age 

    # likelihood
    R_est = pm.OrderedLogistic('R_est', phi, cutpoints=kappa, observed=R)
    
    trace_m7_1 = pm.sample(1000, tune=1000, random_seed=RANDOM_SEED)
    idata_m7_1 = az.from_pymc3(trace_m7_1)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO (theano.gof.compilelock): Refreshing lock /Users/yifei-wang/.theano/compiledir_Darwin-18.7.0-x86_64-i386-64bit-i386-3.7.4-64/lock_dir/lock
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [delta, kappa, bIA, bIC, bAge, bE, bC, bI, bA]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 336 seconds.
INFO (theano.gof.compilelock): Refreshing lock /Users/yifei-wang/.theano/compiledir_Darwin-18.7.0-x86_64-i386-64bit-i386-3.7.4-64/lock_dir/lock
There were 4 divergences after tuning. Increase `target_accept` or reparameterize.
The acceptance probability does not match the target. It is 0.7141393029226992, but should be close to 0.8. Try to increase the number of tuning steps.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
The number of effective samples is smaller than 25% for some parameters.


In [18]:
# show model summary
varnames = ['bAge','bE','bIC','bIA','bC','bI','bA']
az.summary(idata_m7_1, varnames, kind='stats', round_to=2, hdi_prob=0.89)

Unnamed: 0,mean,sd,hdi_5.5%,hdi_94.5%
bAge,-0.1,0.02,-0.14,-0.07
bE,0.23,0.12,0.1,0.37
bIC,-1.24,0.1,-1.39,-1.09
bIA,-0.43,0.08,-0.56,-0.31
bC,-0.34,0.07,-0.45,-0.23
bI,-0.29,0.06,-0.38,-0.2
bA,-0.48,0.05,-0.56,-0.39


You may recall from the chapter that education has a negative effect in the model without age. Now that we include age, education has a positive influence (with some overlap with zero). So age has indeed soaked up some of the previous influence assigned to education. The back-door may be real.

I’d summarize this model, assuming this DAG is true, as saying that age causes people to give slightly lower responses. This could be a cohort effect, and not a causal influence of age. Either way, it is small. Education seems to cause higher responses (more approval). This suggests that education trains people to see some or all of the features A,I,C as more permissible. A model that interacted education with each might shed more light on things. Remember: A DAG doesn’t say whether you need an interaction effect or not. That is a separate problem.