In [None]:
import pandas as pd
import numpy as np
import pymc3 as pm
from random import choices
from scipy import stats

import plotly.express as px
import plotly.graph_objects as go

## Intro

Working through the globe-tossing example from the lecture/book:

In [None]:
# Globe-tossing data and the assumed proportion of water.
n = 9    # number of total throws
k = 6    # number of 'water' hits
p = 0.7  # proportion of water

# Seed NumPy's global RNG so that the np.random draws made in later cells
# give the same results on every Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)

In [None]:
# with the parameters above, draw n single-throw outcomes; each draw is 1
# with probability p and 0 otherwise
np.random.binomial(n=1, p=p, size=n)

In [None]:
# probability mass function: likelihood of k hits in n throws when each
# throw hits with probability p
stats.binom(n, p).pmf(k)

Grid approximation:

In [None]:
# number of points at which to evaluate the parameter space for p
grid_points = 100
# evenly spaced candidate values of p on [0, 1], both endpoints included
grid_p = np.linspace(start=0, stop=1, num=grid_points)

In [None]:
# flat (uniform) prior: every grid value of p gets the same weight
prior = np.ones(shape=grid_points)

In [None]:
# likelihood at each grid value:
# probability of k hits in n throws if p were equal to that grid value
grid_lik = stats.binom.pmf(k, n, p=grid_p)

In [None]:
# unstandardized posterior at each grid value = prior x likelihood
# (identical to the likelihood here because the prior is uniform)
grid_unstd_post = prior * grid_lik

In [None]:
# normalize: rescale the posterior so its values over the grid sum to 1
grid_post = grid_unstd_post / np.sum(grid_unstd_post)

In [None]:
# plot the grid approximation of the posterior distribution
fig = go.Figure()
fig.add_trace(go.Scatter(x=grid_p, y=grid_post))

# the title reports the number of grid points used for the approximation
fig.update_layout(
    title=dict(text=f"n = {grid_points}"),
    xaxis=dict(title=dict(text="probability of water")),
    yaxis=dict(title=dict(text="posterior probability")),
)
fig.show()

In [None]:
# the posterior can be sampled too: draw grid values of p, each picked
# with its posterior probability
n_picks = 5000
samples = np.random.choice(grid_p, size=n_picks, p=grid_post)

In [None]:
# scatter of the sampled p values against their draw index
fig = go.Figure(
    data=go.Scatter(x=np.arange(n_picks), y=samples, mode='markers')
)
fig.show()


In [None]:
# histogram of the sampled p values
fig = go.Figure()

fig.add_trace(
    go.Histogram(
        x = samples
    )
)

# show the figure explicitly, like every other plotting cell, instead of
# relying on the return value of add_trace being echoed by the notebook
fig.show()

Defined Boundary Intervals:

In [None]:
# What is the posterior probability that less than 50% of the earth is covered with water?
# The mean of the boolean mask is the share of posterior samples below .5
# (vectorized; no Python-level loop over the array).
(samples < .5).mean()

In [None]:
# What is the posterior probability that between 50 and 75% of the earth is covered with water?
# The mean of the boolean mask is the share of posterior samples strictly
# between .5 and .75 (vectorized; no Python-level loop over the array).
((samples > .5) & (samples < .75)).mean()

Intervals of Defined Mass (Compatibility Intervals):

In [None]:
# What is the bottom 80% interval of the posterior?
# Its upper bound is the 80th percentile of the samples: the value below
# which 80% of the sampled p values fall.
np.percentile(samples, 80)

__Percentile Intervals__ assign equal probability mass to each tail

In [None]:
# What is the middle 80% interval of the posterior?
# Bounded by the 10th and 90th percentiles of the samples, which leaves
# 10% of the sampled p values in each tail.
np.percentile(samples, [10, 90])

__Highest Posterior Density Interval__ computes narrowest interval containing the specified probability mass. Better suited for highly asymmetrical distributions.

In [None]:
# What is the densest 80% interval of the posterior?
# presumably alpha = .2 is the probability mass left outside the interval (1 - .8)
# NOTE(review): the keyword accepted by pm.stats.hpd depends on the installed
# PyMC3 version — confirm `alpha` is still supported
pm.stats.hpd(samples, alpha = .2)

## 1

> 1. Suppose the globe tossing data had turned out to be 8 water in 15 tosses.  
Construct the posterior distribution, using grid approximation. Use the same flat prior as before.

In [None]:
# observed data for problem 1: (tosses, waters)
n, k = 15, 8

In [None]:
n_grid = 1000
# candidate values of p, evenly spaced on [0, 1]
grid_p = np.linspace(0, 1, num=n_grid)
# flat prior: the same weight at every grid point
grid_prior = np.ones(shape=n_grid)
# likelihood of k waters in n tosses at each candidate p
grid_lik = stats.binom.pmf(k, n, p=grid_p)

In [None]:
# posterior over the grid: prior x likelihood, rescaled to sum to 1
posterior = grid_prior * grid_lik
posterior /= posterior.sum()

In [None]:
# plot the grid approximation of the posterior distribution
fig = go.Figure()
fig.add_trace(go.Scatter(x=grid_p, y=posterior))

# the title reports the number of grid points used for the approximation
fig.update_layout(
    title=dict(text=f"n = {n_grid}"),
    xaxis=dict(title=dict(text="probability of water")),
    yaxis=dict(title=dict(text="posterior probability")),
)
fig.show()

The posterior is approximately normal, centered slightly above $p = .5$ (its mode is at $8/15 \approx .53$).

## 2

> Start over in $1$, but now use a prior that is zero below $p = 0.5$ and a constant above $p = 0.5$. This corresponds to prior information that a majority of the Earth’s surface is water. What difference does the better prior make? If it helps, compare posterior distributions (using both priors) to the true value $p = 0.7$.

In [None]:
# same data as problem 1: (tosses, waters)
n, k = 15, 8

In [None]:
n_grid = 1000
# candidate values of p, evenly spaced on [0, 1]
grid_p = np.linspace(0, 1, num=n_grid)
# step prior: 0 for grid values below .5, a constant 1 from .5 upward
grid_prior = np.where(grid_p >= .5, 1, 0)

In [None]:
# likelihood of k waters in n tosses at each candidate p
grid_lik = stats.binom.pmf(k, n, p=grid_p)
# posterior under the truncated prior: prior x likelihood, rescaled to sum to 1
posterior2 = grid_prior * grid_lik
posterior2 = posterior2 / np.sum(posterior2)

In [None]:
# plot both grid-approximated posteriors, with a dashed line at p = 0.7
fig = go.Figure()

# name each trace so the legend says which prior produced which curve
fig.add_trace(
    go.Scatter(
        x = grid_p, y = posterior, name = "flat prior"
    )
)

fig.add_trace(
    go.Scatter(
        x = grid_p, y = posterior2, name = "truncated prior"
    )
)

fig.update_layout(
    title=go.layout.Title(
        text=f"n = {n_grid}"
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text="probability of water",
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text="posterior probability",
        )
    ),
    shapes=[
        # Vertical line at p = 0.7. yref="paper" makes y0/y1 fractions of the
        # plot height, so the line spans the full plot whatever the y scale.
        go.layout.Shape(
            type="line",
            x0=.7,
            x1 = .7,
            yref = "paper",
            y0 = 0,
            y1 = 1,
            line=dict(
                color="green",
                width=1,
                dash = 'dash'
            )
        ),
    ]
)
fig.show()

As we would expect, a truncated prior leads to a truncated posterior. Plotting the two posteriors together also shows that the posterior under the truncated prior is taller (spikier): it must still sum to 1, so all of its probability mass is concentrated on $p \geq 0.5$.

In [None]:
# Posterior means computed exactly on the grid as sum(p * posterior(p)),
# rather than estimated from a small random sample, so the comparison is
# the same on every run.
print((grid_p * posterior).sum())   # flat prior
print((grid_p * posterior2).sum())  # truncated prior

The posterior mean for the second distribution (with the informed prior) yields a result closer to the 'true' value of $p=.7$, indicating that the truncated prior indeed produces more accurate inference.

## 3

> This problem is more open-ended than the others. Feel free to collaborate on the solution. Suppose you want to estimate the Earth's proportion of water very precisely. Specifically, you want the 99% percentile interval of the posterior distribution of $p$ to be only $0.05$ wide. This means the distance between the upper and lower bound of the interval should be $0.05$. How many times will you have to toss the globe to do this? I won’t require a precise answer. I’m honestly more interested in your approach.

We can iterate over a range of toss counts and compute the width of the 99% interval for each one to get a sense of the scale:

In [None]:
# For each number of tosses: simulate data, build the grid posterior under a
# flat prior, sample from it, and record the 99% percentile interval.
p = .7 # to generate data
n_grid = 1000
post_samples = 5000
n_vals = [50, 100, 200, 500, 1000, 1500, 2000, 2500, 3000, 5000]
intervals = np.zeros([len(n_vals), 2])

# The grid and the flat prior do not depend on the number of tosses, so they
# are built once. Loop-local names are kept distinct from the notebook-level
# n, k, grid_prior, posterior and samples so earlier cells can be re-run
# afterwards without picking up values from this simulation.
sim_grid_p = np.linspace(0, 1, n_grid)
sim_prior = np.ones(n_grid)

for i, n_tosses in enumerate(n_vals):
    # sample to get data: number of waters in n_tosses tosses
    k_water = np.random.binomial(n_tosses, p)

    # grid approx
    sim_lik = stats.binom.pmf(k_water, n_tosses, p = sim_grid_p)
    sim_posterior = sim_prior * sim_lik
    sim_posterior = sim_posterior / sim_posterior.sum()

    # sample from posterior
    sim_samples = np.random.choice(sim_grid_p, post_samples, p = sim_posterior)

    # 99% percentile interval: 0.5% of the samples left in each tail
    intervals[i] = np.percentile(sim_samples, [0.5, 99.5])

# one row per number of tosses; columns 0 and 1 are the lower and upper bound
res = pd.DataFrame(intervals, index = n_vals)
res['width'] = res[1] - res[0]
res
    

For a true proportion of $p = 0.7$, somewhere between 2000 and 2500 tosses of the globe will produce a posterior whose 99% percentile interval is narrower than .05.