# Notebook for generating examples in lecture 5

In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

In [None]:
# Draw 100 binomial counts (10 trials each, success probability 1/6).
# The global RNG is seeded first so the sample is reproducible.
np.random.seed(5)
X_sample = np.random.binomial(10, 1 / 6, 100)

In [None]:
# Show the raw simulated counts as the cell output.
X_sample

In [None]:
# Frozen binomial distribution matching the simulation: 10 trials, p = 1/6.
sixes = stats.binom(n=10, p=1 / 6)

In [None]:
# Stem-style plot of the probability mass function over all outcomes 0..10:
# a red marker at each probability plus a vertical line down to zero.
x = np.arange(11)
_pmf = sixes.pmf(x)  # evaluate once, reuse for markers and stems
plt.plot(x, _pmf, 'ro', ms=8)
plt.vlines(x, 0, _pmf, colors='r', lw=4)
plt.show()

In [None]:
# Empirical counterpart of the PMF plot: how often each outcome 0..10
# occurs in X_sample. minlength=11 keeps a (zero) bin for outcomes that
# never appear in the sample.
_cnt = np.bincount(X_sample, minlength=11)
_loc = np.arange(11)
plt.plot(_loc, _cnt, 'ko', ms=8)
plt.vlines(_loc, 0, _cnt, colors='k', lw=4)
# Pad the x-axis by half a unit so the end stems are not drawn on the frame.
plt.xlim(-0.5, 10.5)

## Central limit theorem - confidence interval

In [None]:
# Sample mean and unbiased (ddof=1) sample variance.
np.mean(X_sample), np.var(X_sample, ddof=1)

In [None]:
# CLT-based 95% confidence interval for the population mean: model the
# sample mean as Normal(sample mean, standard error).
_loc = np.mean(X_sample)
_std = np.sqrt(np.var(X_sample, ddof=1) / X_sample.size)  # standard error

# The interval bounds are the 2.5% and 97.5% quantiles of that normal.
_sampling_dist = stats.norm(loc=_loc, scale=_std)
_ci95_low = _sampling_dist.ppf(0.025)
_ci95_high = _sampling_dist.ppf(0.975)

print(_loc, _std)
print(_ci95_low, _ci95_high)

In [None]:
# Recompute the CLT estimate so this cell can run on its own, then plot the
# normal density of the sample mean with the mean and 95% CI bounds marked.
_loc = np.mean(X_sample)
_std = np.sqrt(np.var(X_sample, ddof=1) / X_sample.size)  # standard error

_sampling_dist = stats.norm(loc=_loc, scale=_std)
_ci95_low = _sampling_dist.ppf(0.025)
_ci95_high = _sampling_dist.ppf(0.975)

## coarse estimates (mean +/- 2 standard errors), kept for reference:
# _ci95_low = _loc - 2*_std
# _ci95_high =_loc + 2*_std

# Fixed x-grid on which the density is evaluated.
_xx = np.arange(1.3, 2.1, 0.01)
_pdf = _sampling_dist.pdf(_xx)

fig = plt.figure(dpi=100)
plt.plot(_xx, _pdf)
plt.axvline(x=_loc, color='g', label='sample mean: %.2f' %(_loc))
# Both CI bounds share the dashed style; only position, colour and label differ.
for _bound, _color, _label in (
    (_ci95_low, 'tab:pink', '95%%CI Low: %.2f'),
    (_ci95_high, 'tab:orange', '95%%CI High: %.2f'),
):
    plt.axvline(x=_bound, color=_color, ls='--', label=_label % (_bound))
plt.title('CLT estimation of population mean')
plt.xlabel('Value for population mean')
plt.ylabel('PDF')
plt.legend(loc=2)
plt.show()

## Small samples with *t*-distribution
*Note: this requires that the population follows a normal distribution.*
We use a sample of newborn baby weights in Hong Kong:
3.08 ± 0.39 kg (mean ± standard deviation)

In [None]:
# Draw a reproducible small sample of 10 values from Normal(3.08, 0.39),
# then show it as the cell output.
np.random.seed(0)
X_small_sample = np.random.normal(3.08, 0.39, 10)
X_small_sample

In [None]:
# Same sample rounded to two decimals for display.
[round(weight, 2) for weight in X_small_sample]

In [None]:
# 95% confidence interval for the population mean from a small sample,
# using Student's t distribution with n - 1 degrees of freedom.
# Alternative input kept from the original notebook:
# X_small_sample = X_sample[:10]
_n = len(X_small_sample)
_df = _n - 1  # degrees of freedom of the t distribution
_loc = X_small_sample.mean()
_std = np.sqrt(X_small_sample.var(ddof=1) / _n)  # standard error of the mean

# Report the sample standard deviation with ddof=1 so it matches the
# unbiased variance used for the standard error above.
print(_loc, _std, X_small_sample.std(ddof=1))

_ci95_low = stats.t.ppf(0.025, df=_df, loc=_loc, scale=_std)
_ci95_high = stats.t.ppf(0.975, df=_df, loc=_loc, scale=_std)

# Fixed x-grid on which the t density is evaluated.
_xx = np.arange(2.6, 3.9, 0.01)
_pdf = stats.t.pdf(_xx, df=_df, loc=_loc, scale=_std)

fig = plt.figure(dpi=100)
plt.plot(_xx, _pdf)
plt.axvline(x=_loc, color='g', label='sample mean: %.2f' %(_loc))
plt.axvline(x=_ci95_low, color='tab:pink', ls='--',
            label='95%%CI Low: %.2f' %(_ci95_low))
plt.axvline(x=_ci95_high, color='tab:orange', ls='--',
            label='95%%CI High: %.2f' %(_ci95_high))
plt.legend(loc=2)
plt.ylabel('PDF')
plt.xlabel('Value for population mean')
# Derive df for the title from the data so it stays right if the sample changes.
plt.title('Estimation of population mean with t distribution (df=%d)' % _df)
plt.show()

## Bootstrapping Confidence Interval

In [None]:
np.random.seed(3)
X_bs_samples = np.random.choice(X_sample, replace=True, size=(100, 1000))
X_bs = X_bs_samples.mean(axis=0)

In [None]:
# Mean and unbiased (ddof=1) variance of the bootstrap means.
X_bs.mean(), X_bs.var(ddof=1)

In [None]:
# Percentile bootstrap 95% interval: 2.5% and 97.5% quantiles of the means.
tuple(np.quantile(X_bs, q=[0.025, 0.975]))

In [None]:
# Histogram of the bootstrap means, marked with the original sample mean,
# the mean of the bootstrap means, and the percentile 95% interval.
_low_bound = np.quantile(X_bs, q=0.025)
_high_bound = np.quantile(X_bs, q=0.975)

fig = plt.figure(dpi=100)
plt.hist(X_bs, bins=20, alpha=0.7)
plt.axvline(x=X_sample.mean(), color='g',
            label='sample mean: %.2f' %(X_sample.mean()))
plt.axvline(x=X_bs.mean(), color='r',
            label='mean of bootstrap: %.2f' %(X_bs.mean()))
plt.axvline(x=_low_bound, color='tab:pink',
            ls='--', label='95%%CI Low: %.2f' %(_low_bound))
# Label typo fixed: "Hihg" -> "High".
plt.axvline(x=_high_bound, color='tab:orange',
            ls='--', label='95%%CI High: %.2f' %(_high_bound))
plt.xlabel("Bootstrap mean")
plt.ylabel("Frequency")
# Take the iteration count from the data so the title cannot go stale.
plt.title("Distribution of bootstrap mean | %d iterations" % len(X_bs))
plt.legend(loc=2)
plt.show()

## Theoretical confidence interval

In [None]:
# Simulate 1000 independent sample sets from the binomial population itself
# (one set of 100 draws per column) and take the mean of each set.
np.random.seed(3)
X_reps = np.random.binomial(10, 1 / 6, (100, 1000))
X_mean_reps = np.mean(X_reps, axis=0)

In [None]:
# Central 95% range of the simulated sample means.
tuple(np.quantile(X_mean_reps, q=[0.025, 0.975]))

In [None]:
# Histogram of the simulated sample means, marked with their overall mean
# and the central 95% range (2.5% and 97.5% quantiles).
_low_bound = np.quantile(X_mean_reps, q=0.025)
_high_bound = np.quantile(X_mean_reps, q=0.975)

fig = plt.figure(dpi=100)
plt.hist(X_mean_reps, bins=20, alpha=0.7)
plt.axvline(x=X_mean_reps.mean(), color='r',
            label='mean:%.2f' %(X_mean_reps.mean()))
plt.axvline(x=_low_bound, color='tab:pink',
            ls='--', label='95%%CI Low: %.2f' %(_low_bound))
# Label typo fixed: "Hihg" -> "High".
plt.axvline(x=_high_bound, color='tab:orange',
            ls='--', label='95%%CI High: %.2f' %(_high_bound))
plt.xlabel("Sample mean")
plt.ylabel("Frequency")
# Take the number of sample sets from the data so the title cannot go stale.
plt.title("True distribution of sample mean | %d sample sets"
          % len(X_mean_reps))
plt.legend(loc=2)
plt.show()

## Example of sampling with replacement

In [None]:
# Toy data set: the integers 0..9, shown as the cell output.
X_unif = np.arange(0, 10)
X_unif

In [None]:
# One resample of size 10 drawn with replacement, so values may repeat.
# No seed is set here, so the output differs from run to run.
np.random.choice(X_unif, size=10, replace=True)

In [None]:
# 1000 resamples of X_unif with replacement, one per column. The row count
# comes from X_unif so each resample always matches the data size.
X_unif_bs = np.random.choice(X_unif, replace=True, size=(len(X_unif), 1000))

In [None]:
# Histogram of the mean of each resample (each column of X_unif_bs).
_unif_bs_means = X_unif_bs.mean(axis=0)

fig = plt.figure(dpi=100)
plt.hist(_unif_bs_means, bins=30)
plt.title("Distribution of bootstrap mean | 1000 iterations")
plt.xlabel("Bootstrap mean")
plt.ylabel("Frequency")
plt.show()