In [1]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import arviz as az

import stan_jupyter as stan
import pandas as pd

In [2]:
def waic(fit):

    log_lik = [n.mean() for n in fit['log_lik']]

    lppd = np.log(np.exp(log_lik).mean(axis=0)).sum()

    p_waic = np.var(log_lik, axis=0).sum()

    waic = -2*lppd + 2*p_waic

    return round(waic, 3)

## Question 1

### FROM https://github.com/pymc-devs/resources/blob/master/Rethinking_2/Chp_06.ipynb ###

In [3]:
def inv_logit(x):
    return np.exp(x) / (1 + np.exp(x))


def sim_happiness(N_years=100, seed=1234):
    np.random.seed(seed)

    popn = pd.DataFrame(np.zeros((20 * 65, 3)), columns=["age", "happiness", "married"])
    popn.loc[:, "age"] = np.repeat(np.arange(65), 20)
    popn.loc[:, "happiness"] = np.repeat(np.linspace(-2, 2, 20), 65)
    popn.loc[:, "married"] = np.array(popn.loc[:, "married"].values, dtype="bool")

    for i in range(N_years):
        # age population
        popn.loc[:, "age"] += 1
        # replace old folk with new folk
        ind = popn.age == 65
        popn.loc[ind, "age"] = 0
        popn.loc[ind, "married"] = False
        popn.loc[ind, "happiness"] = np.linspace(-2, 2, 20)

        # do the work
        elligible = (popn.married == 0) & (popn.age >= 18)
        marry = np.random.binomial(1, inv_logit(popn.loc[elligible, "happiness"] - 4)) == 1
        popn.loc[elligible, "married"] = marry

    popn.sort_values("age", inplace=True, ignore_index=True)

    return popn



In [4]:
popn = sim_happiness()

df = popn.copy()
df["married"] = df["married"].astype(
    int
)  # this is necessary before using az.summary, which doesn't work with boolean columns.
az.summary(df.to_dict(orient="list"), kind="stats", round_to=2)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%
age,32.0,18.77,0.0,61.0
happiness,-0.0,1.21,-2.0,1.79
married,0.28,0.45,0.0,1.0


__Model 6.9__

In [5]:
df.married +=1

In [6]:
df = df[df.age>17]
df["age_adj"] = (df.age - 18) / (65-18)

In [7]:
model_data = {"age":df.age_adj.tolist(),
              "happiness":df.happiness.tolist(),
              "mid":df.married.tolist(),
              "N":len(df),
              "MIDS":df.married.nunique()}

with open("models/w4_1.stan") as f:
    model_code = f.read()

In [8]:
%%capture
posterior = stan.build(model_code, model_data)

In [9]:
%%capture
fit = posterior.sample(num_chains=4, num_samples=1000)

In [10]:
az.summary(fit)[0:4]

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha[0],-0.196,0.067,-0.324,-0.072,0.002,0.001,1720.0,2250.0,1.0
alpha[1],1.22,0.092,1.057,1.402,0.002,0.002,1690.0,2048.0,1.0
beta,-0.725,0.123,-0.96,-0.506,0.003,0.002,1546.0,2180.0,1.0
sigma,1.021,0.024,0.978,1.066,0.0,0.0,2659.0,2403.0,1.0


In [11]:
m6_9 = az.from_pystan(fit, log_likelihood="log_lik")

__Model 6.10__

In [12]:
model_data = {"age":df.age_adj.tolist(),
              "happiness":df.happiness.tolist(),
              "N":len(df)}

with open("models/w4_1b.stan") as f:
    model_code = f.read()

In [13]:
%%capture
posterior = stan.build(model_code, model_data)

In [14]:
%%capture
fit = posterior.sample(num_chains=4, num_samples=1000)

In [15]:
az.summary(fit)[0:3]

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha,-0.004,0.078,-0.144,0.149,0.002,0.001,1523.0,1783.0,1.0
beta,0.006,0.138,-0.247,0.264,0.003,0.002,1566.0,1961.0,1.0
sigma,1.217,0.029,1.167,1.274,0.001,0.0,2061.0,2221.0,1.0


In [16]:
m6_10 = az.from_pystan(fit, log_likelihood="log_lik")

In [18]:
compare_dict = {"with marriage ID": m6_9, "pooled": m6_10}
az.compare(compare_dict)



Unnamed: 0,rank,loo,p_loo,d_loo,weight,se,dse,warning,loo_scale
with marriage ID,0,-1355.55052,3.729923,0.0,0.978383,18.671466,0.0,False,log
pooled,1,-1518.745009,2.362766,163.19449,0.021617,13.65272,16.535683,False,log


In [19]:
az.compare(compare_dict, ic="waic")



Unnamed: 0,rank,waic,p_waic,d_waic,weight,se,dse,warning,waic_scale
with marriage ID,0,-1355.550497,3.729901,0.0,0.978383,18.67146,0.0,False,log
pooled,1,-1518.745035,2.362791,163.194538,0.021617,13.652721,16.535677,False,log


According to the WAIC and PSIS values, the model that differentiates between married and unmarried observations should make better out-of-sample predictions. 

The model believes that someone at the minimum age in the sample (18) is considerably more likely to be happy if they are married. Both married and unmarried individuals become less happy as they age. 

## Question 2

In [36]:
df = pd.read_csv("../data/foxes.csv", delimiter=";")

df["std_food"] = (df.avgfood - df.avgfood.mean()) / df.avgfood.std()

model_data = {"N":len(df),
              "weight":df.weight.tolist(),
              "food":df.std_food.tolist()}

with open("models/w4_2.stan") as f:
    model_code = f.read()

In [37]:
%%capture
posterior = stan.build(model_code, model_data)

In [38]:
%%capture
fit = posterior.sample(num_chains=4, num_samples=1000)

In [39]:
m4_2 = az.from_pystan(fit, log_likelihood="log_lik")

In [40]:
df["std_groupsize"] = (df.groupsize - df.groupsize.mean()) / df.groupsize.std()

model_data = {"N":len(df),
              "weight":df.weight.tolist(),
              "food":df.std_food.tolist(),
              "group_size":df.std_groupsize.tolist()}

with open("models/w4_2b.stan") as f:
    model_code = f.read()

In [41]:
%%capture
posterior = stan.build(model_code, model_data)

In [42]:
%%capture
fit = posterior.sample(num_chains=4, num_samples=1000)

In [43]:
m4_2b = az.from_pystan(fit, log_likelihood="log_lik")

In [44]:
compare_dict = {"Pooled": m4_2, "Stratified by group size": m4_2b}
az.compare(compare_dict)



Unnamed: 0,rank,loo,p_loo,d_loo,weight,se,dse,warning,loo_scale
Stratified by group size,0,-181.41623,3.790638,0.0,0.865069,7.915418,0.0,False,log
Pooled,1,-186.386485,2.423869,4.970255,0.134931,6.654062,3.638411,False,log


In [45]:
az.compare(compare_dict, ic="waic")

See http://arxiv.org/abs/1507.04544 for details


Unnamed: 0,rank,waic,p_waic,d_waic,weight,se,dse,warning,waic_scale
Stratified by group size,0,-181.406179,3.780587,0.0,0.865757,7.91283,0.0,True,log
Pooled,1,-186.383721,2.421105,4.977542,0.134243,6.653564,3.636088,False,log


In [46]:
az.summary(m4_2b)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha,4.523,0.105,4.323,4.713,0.002,0.001,3100.0,2522.0,1.0
beta,0.671,0.230,0.234,1.102,0.005,0.004,2063.0,1991.0,1.0
gamma,-0.786,0.229,-1.231,-0.370,0.005,0.004,2061.0,1976.0,1.0
sigma,1.135,0.077,0.991,1.280,0.001,0.001,2754.0,2189.0,1.0
mu[0],4.430,0.234,4.012,4.898,0.004,0.003,3684.0,3542.0,1.0
...,...,...,...,...,...,...,...,...,...
log_lik[111],-1.108,0.070,-1.231,-0.972,0.001,0.001,2769.0,2284.0,1.0
log_lik[112],-1.140,0.072,-1.269,-1.001,0.001,0.001,2648.0,2336.0,1.0
log_lik[113],-1.384,0.188,-1.732,-1.066,0.004,0.003,2490.0,2687.0,1.0
log_lik[114],-1.708,0.263,-2.208,-1.258,0.005,0.004,2513.0,2572.0,1.0


In this case, the model that stratifies by group size will be a better out-of-sample predictor according to both information criteria. Since both predictors are standardized, the intercept `alpha` represents expected weight when both predictors are at their mean values. The beta coefficient means a standard deviation change in the average quantity of food available corresponds to an expected weight of +.67 units. The gamma coefficient means that a standard deviation change in group size corresponds to an expected weight of -.79 units. The standard deviations for both parameters are quite large, which suggests the model is not particularly certain about the magnitude of the effects, though it is quite certain about the direction.

## Question 3