### Problem

In this example, we create a dataset by sampling from a normal distribution. We then estimate a confidence interval for its mean using both the z distribution and the t distribution.

We also run a simulation experiment to demonstrate the meaning of a confidence interval.

In [2]:
# Import python libraries
import numpy as np
from scipy.stats import norm, t

# Seed NumPy's global random number generator so the random draws below
# are the same on every fresh run of the notebook.
np.random.seed(1)

In [5]:
# Draw a sample of size N from a normal distribution with mean mu
# and standard deviation sigma.

N, mu, sigma = 1000, 5, 2

# randn() returns standard-normal draws; scaling by sigma and shifting
# by mu turns them into Normal(mu, sigma^2) draws.
X = mu + sigma * np.random.randn(N)
print(X[:5])

[4.69352768 0.13498297 6.01596867 4.35193534 1.97784678]


In [13]:
# z confidence interval

# Point estimates: sample mean and sample standard deviation (ddof=1).
mu_hat, sigma_hat = np.mean(X), np.std(X, ddof=1)

# 2.5% and 97.5% quantiles of the standard normal distribution.
z_left, z_right = norm.ppf(0.025), norm.ppf(0.975)

# Interval endpoints: estimate plus quantile times the standard error.
lower, upper = (mu_hat + z * sigma_hat / np.sqrt(N) for z in (z_left, z_right))

print(f'The 95% z-distribution confidence interval is [{lower:.2f}, {upper:.2f}]')

The 95% z-distribution confidence interval is [4.93, 5.18]


In [12]:
# t confidence interval

# Sample mean and sample standard deviation with Bessel's correction (ddof=1).
mu_hat = X.mean()
sigma_hat = X.std(ddof=1)

# 2.5% and 97.5% quantiles of Student's t with N - 1 degrees of freedom.
t_left, t_right = (t.ppf(q, df=N - 1) for q in (0.025, 0.975))
lower, upper = (mu_hat + crit * sigma_hat / np.sqrt(N) for crit in (t_left, t_right))

print(f'The 95% t-distribution confidence interval is [{lower:.2f}, {upper:.2f}]')

The 95% t-distribution confidence interval is [4.93, 5.18]


In [14]:
# The interpretation of confidence interval

def experiment(n=None, mean=None, std=None, alpha=0.05):
    """Run one coverage trial for the t confidence interval of a normal mean.

    Draws a fresh sample from Normal(mean, std^2), builds the two-sided
    (1 - alpha) t-distribution confidence interval for the mean from that
    sample, and reports whether the interval contains the true mean.

    Parameters
    ----------
    n : int, optional
        Sample size. Defaults to the notebook-level ``N``.
    mean : float, optional
        True mean of the sampling distribution. Defaults to the
        notebook-level ``mu``.
    std : float, optional
        True standard deviation of the sampling distribution. Defaults to
        the notebook-level ``sigma``.
    alpha : float, optional
        Significance level; the interval has confidence level ``1 - alpha``.
        The default 0.05 gives the 95% interval.

    Returns
    -------
    bool
        True if the true mean lies strictly inside the interval.

    Raises
    ------
    ValueError
        If ``n < 2`` (the sample standard deviation and the t distribution
        need at least one degree of freedom) or ``alpha`` is not in (0, 1).
    """
    # Resolve the defaults at call time (not at definition time) so that
    # re-running the data cell with new N / mu / sigma is picked up here.
    n = N if n is None else n
    mean = mu if mean is None else mean
    std = sigma if std is None else std

    if n < 2:
        raise ValueError(f'n must be at least 2, got {n}')
    if not 0 < alpha < 1:
        raise ValueError(f'alpha must be in (0, 1), got {alpha}')

    # Use a local name that does not shadow the notebook-level dataset X.
    sample = np.random.randn(n) * std + mean
    mu_hat = np.mean(sample)
    sigma_hat = np.std(sample, ddof=1)

    # Lower- and upper-tail quantiles of Student's t with n - 1 degrees of freedom.
    t_left = t.ppf(alpha / 2, df=n - 1)
    t_right = t.ppf(1 - alpha / 2, df=n - 1)
    lower = mu_hat + t_left * sigma_hat / np.sqrt(n)
    upper = mu_hat + t_right * sigma_hat / np.sqrt(n)
    return (mean > lower) and (mean < upper)

def multi_experiment(M):
    """Repeat the coverage experiment M times and return the hit rate.

    Calls experiment() M times and returns the mean of the outcomes,
    i.e. the fraction of trials in which it returned True.
    """
    outcomes = []
    for _ in range(M):
        outcomes.append(experiment())
    return np.mean(outcomes)

# Estimate the empirical coverage over 10,000 independent trials.
multi_experiment(10_000)

0.9506

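**Observe:** Out of 10000 trials, the confidence interval contains the true mean approximately 95% of the time. Each trial draws a new sample and therefore produces a different interval; "95% confidence" describes how often this procedure captures the fixed true mean over repeated sampling, not the probability that any single computed interval contains it.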