In [None]:
import numpy as np
import pandas as pd
import pymc3 as pm
from scipy import stats
from sklearn.metrics import mean_squared_error
import theano
import random



import plotly.graph_objects as go

# 1

> The weights listed below were recorded in the !Kung census, but heights were not recorded for these individuals. Provide predicted heights and 89% compatibility intervals for each of these individuals. That is, fill in the table below, using model-based predictions.

In [None]:
# Weights (kg) of the individuals whose heights are missing from the census.
to_predict = [
    45, 40, 65, 31, 53,
]

First we want to load the data used to fit our model:

In [None]:
# The file is semicolon-delimited, so the separator has to be given explicitly.
howell = pd.read_csv(
    '../../data/Howell1.csv',
    sep=';',
)
# Preview the first rows as the cell output.
howell.head()

Filter to only adults and center weight (subtract the mean adult weight):

In [None]:
# Keep rows with age >= 18. The .copy() makes an independent frame, so adding
# columns to `howell_adults` later does not write into `howell`.
howell_adults = howell.loc[howell['age'] >= 18].copy()

In [None]:
# Mean-center adult weight. Despite the column name, the values are only
# shifted by the mean; they are not divided by the standard deviation.
howell_adults['std_weight'] = howell_adults['weight'].sub(howell_adults['weight'].mean())

Define and fit model:

In [None]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same draws.
RANDOM_SEED = 42

with pm.Model() as model_1:
    # Wrap predictor and outcome in pm.Data containers so the predictor values
    # can be replaced later (out-of-sample prediction) without rebuilding the model.
    weight = pm.Data('weight', howell_adults['std_weight'].values)
    height = pm.Data('height', howell_adults['height'].values)

    # Priors. Because `std_weight` is mean-centered, `alpha` is the expected
    # height at the mean adult weight; `beta` is the slope, `sigma` the residual sd.
    alpha = pm.Normal('alpha', mu=178, sd=20)
    beta = pm.Normal('beta', mu=0, sd=10)
    sigma = pm.Uniform('sigma', lower=0, upper=50)

    # Linear model for the mean, Gaussian likelihood on the observed heights.
    mu = alpha + beta * weight
    height_hat = pm.Normal('height_hat', mu=mu, sd=sigma, observed=height)

    # Prior predictive draws, posterior draws, then in-sample posterior
    # predictive draws -- all seeded for reproducibility.
    prior = pm.sample_prior_predictive(random_seed=RANDOM_SEED)
    posterior_1 = pm.sample(draws=1000, tune=1000, random_seed=RANDOM_SEED)
    posterior_pred_1 = pm.sample_posterior_predictive(posterior_1, random_seed=RANDOM_SEED)

In [None]:
# alpha=.11 gives 89% HPD bounds, matching the 89% intervals the exercise asks
# for and the alpha used for the predictive intervals elsewhere in this notebook.
pm.summary(posterior_1, alpha=.11).round(2)

In [None]:
# Trace plot of the posterior draws for model_1; the trailing ';' suppresses
# the textual repr of the returned object so only the figure is shown.
pm.traceplot(posterior_1);

In the code above, after fitting the model we also used draws from the posterior to predict heights for each supplied weight point. We can average the predictions for each point to get a predictive point estimate, and assess our model's fit that way:

In [None]:
# Point prediction per adult: average the posterior predictive draws along axis 0.
height_hats = np.mean(posterior_pred_1['height_hat'], axis=0)

# Observed height on the x-axis against the predicted height on the y-axis.
fig = go.Figure()
fig.add_trace(go.Scatter(x=howell_adults['height'], y=height_hats, mode='markers'))

# update_layout returns the figure, so leaving it as the last expression
# renders the plot as the cell output.
fig.update_layout(
    xaxis=dict(title=dict(text="height")),
    yaxis=dict(title=dict(text="height_hat")),
)

In [None]:
# Root-mean-squared error of the point predictions, with the arguments in
# scikit-learn's (y_true, y_pred) order; the value is symmetric in the two.
np.sqrt(mean_squared_error(howell_adults['height'], height_hats))

The RMSE is only about 5 cm - not bad!

We can also get the 89% HPDI of the posterior predictive draws for each individual in the sample:

In [None]:
# alpha=0.11 -> 89% highest-posterior-density bounds of the predictive draws.
pm.hpd(posterior_pred_1['height_hat'], alpha=0.11)

Now we can predict for our out-of-sample weight points. We need to center them (subtract the mean weight, exactly as we did for the training data) and add them as data to the model:

In [None]:
# Dimensions of the in-sample posterior predictive array.
np.shape(posterior_pred_1['height_hat'])

In [None]:
# Center the new weights with the SAME mean that was used to build `std_weight`
# for the training data (the adult-only mean). Using the mean of the full data
# set would shift the predictor relative to what the model was fit on and bias
# every predicted height.
x_test = np.array(to_predict) - howell_adults.weight.mean()

# Point the model's `weight` data container at the new predictor values.
weight.set_value(x_test)

In [None]:
# Fixed seed so the out-of-sample predictions are reproducible on re-run.
RANDOM_SEED = 42

# Draw 500 posterior predictive samples from model_1 using the posterior trace.
# NOTE: this rebinds `posterior_pred_1`, replacing the in-sample predictions
# produced when the model was fit.
posterior_pred_1 = pm.sample_posterior_predictive(
    trace=posterior_1,
    samples=500,
    model=model_1,
    random_seed=RANDOM_SEED,
)

In [None]:
# Results table: one row per weight to predict, in the order given.
test = pd.DataFrame({'weight': to_predict})

In [None]:
# Predicted height per row: mean of the posterior predictive draws along axis 0.
test['height_hat'] = np.mean(posterior_pred_1['height_hat'], axis=0)

In [None]:
# 89% HPD bounds (alpha=0.11) of the predictive draws, one row per weight.
# An 89% interval runs from 5.5% to 94.5%, so the upper column is labelled
# 'hpd94.5'.
hpd_bounds = pd.DataFrame(
    pm.hpd(posterior_pred_1['height_hat'], alpha=0.11),
    columns=['hpd5.5', 'hpd94.5'],
    index=test.index,
)

# Drop any bounds left over from a previous run of this cell before joining,
# so re-running it does not raise on overlapping column names.
test = test.drop(columns=hpd_bounds.columns, errors='ignore').join(hpd_bounds)

test

# 2

> Model the relationship between height (cm) and the natural logarithm of weight (log-kg): `log(weight)`. Use the entire `Howell1` data frame, all 544 rows, adults and non-adults. Use any model type from Chapter 4 that you think useful: an ordinary linear regression, a polynomial or a spline. Plot the posterior predictions against the raw data.

In [None]:
# Log-weight shifted by the log of the mean weight. The offset is
# log(mean(weight)), not mean(log(weight)), so the column is zero at the
# mean weight rather than having mean zero itself.
howell['log_std_weight'] = np.log(howell['weight']).sub(np.log(howell['weight'].mean()))

In [None]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same draws.
RANDOM_SEED = 42

with pm.Model() as model_2:
    # Wrap predictor and outcome in pm.Data containers (all rows of `howell`).
    log_weight = pm.Data('log_weight', howell['log_std_weight'].values)
    height = pm.Data('height', howell['height'].values)

    # Priors: intercept, slope on the shifted log-weight, residual sd.
    alpha = pm.Normal('alpha', mu=178, sd=20)
    beta = pm.Normal('beta', mu=0, sd=10)
    sigma = pm.Uniform('sigma', lower=0, upper=50)

    # Mean height is linear in log-weight; Gaussian likelihood on observed height.
    # The observed variable is registered under the name 'log_height_hat'.
    mu = alpha + beta * log_weight
    height_hat = pm.Normal('log_height_hat', mu=mu, sd=sigma, observed=height)

    # Prior predictive draws, posterior draws, then posterior predictive
    # draws -- all seeded for reproducibility.
    prior = pm.sample_prior_predictive(random_seed=RANDOM_SEED)
    posterior_2 = pm.sample(draws=1000, tune=1000, random_seed=RANDOM_SEED)
    posterior_pred_2 = pm.sample_posterior_predictive(posterior_2, random_seed=RANDOM_SEED)