## All of Statistics - Chapter 7 Exercise 3
 
  - Generate 100 observations from a N(0,1) distribution.
  - Compute a 95 percent confidence band for the cdf F (as described in the appendix).
  - Repeat this 1000 times and see how often the confidence band contains the true distribution function. 
  - Repeat using data from a Cauchy distribution.

In [1]:
import numpy as np
from scipy.stats import norm, cauchy
import plotly.express as px
import plotly.graph_objects as go

### Task 1: Generate 100 observations from a N(0,1) distribution.

In [2]:
# number of observations
n = 100

# alpha and epsilon
# confidence interval as in equation 7.3, page 99
alpha = 0.05
epsilon = np.sqrt(
    (1/(2*n)) * np.log(2/alpha)
)

# generating 100 observations from a N(0,1) distribution
sample = np.sort(norm.rvs(loc = 0, scale = 1, size = 100, random_state = 42))


In [5]:
# plotting the histogram of the sample
fig = px.histogram(x = sample, nbins = 50)
fig.update_layout(title = '100 обзервации од распределба N(0,1) distribution')
fig.show()

In [8]:
# distribution function (cdf function)
# Definition 7.1; Equation 7.1
Fn = lambda x: sum(sample < x) / n
Fn_hat = np.array(list(map(Fn, sample)))

In [9]:
# plotting the estimation of F
fig = px.line(
    x = sample, y = Fn_hat,
    title = 'Estimation of F with the empirical distribution function'
)
fig.show()

### Task 2: Compute a 95 percent confidence band for the cdf F

In [10]:
# lower bound
L_of_x = lambda x: max(Fn(x) - epsilon, 0)
L = np.array(list(map(L_of_x, sample)))

# upper bound
U_of_x = lambda x: min(Fn(x) + epsilon, 1)
U = np.array(list(map(U_of_x, sample)))

([0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.00418985 0.01418985 0.02418985 0.03418985
 0.04418985 0.05418985 0.06418985 0.07418985 0.08418985 0.09418985
 0.10418985 0.11418985 0.12418985 0.13418985 0.14418985 0.15418985
 0.16418985 0.17418985 0.18418985 0.19418985 0.20418985 0.21418985
 0.22418985 0.23418985 0.24418985 0.25418985 0.26418985 0.27418985
 0.28418985 0.29418985 0.30418985 0.31418985 0.32418985 0.33418985
 0.34418985 0.35418985 0.36418985 0.37418985 0.38418985 0.39418985
 0.40418985 0.41418985 0.42418985 0.43418985 0.44418985 0.45418985
 0.46418985 0.47418985 0.48418985 0.49418985 0.50418985 0.51418985
 0.52418985 0.53418985 0.54418985 0.55418985 0.56418985 0.57418985
 0.58418985 0.59418985 0.60418985 0.61418985 0.62418985 0.63418985
 0.64418985 0.65418985 0.66418985 0.67418985 0.68418985 0.69418985
 0.70418985 0.71418985 0.72418985 0.73418985 0.74418985 0.75418985
 0.7641898

In [15]:
# plotting the 95% confidence interval for the esimate of the cdf
fig = go.Figure()

fig.add_trace(go.Scatter(
    x = sample, y = Fn_hat, name = 'Fn_hat'
))

fig.add_trace(go.Scatter(
    x = sample, y = U, name = 'Upper bound'
))

fig.add_trace(go.Scatter(
    x = sample, y = L, name = 'Lower bound'
))

fig.update_layout(
    title = '95% confidence bound for the estimate of the cdf'
)

fig.show()

### Task 3:
Repeat this 1000 times and see how often
the confidence band contains the true distribution function.

In [20]:
# true distribution function
x = np.linspace(-3,3,100)
true_dist_func = [norm.cdf(x_i, loc = 0, scale = 1) for x_i in x]

fig = px.line(
    x = x, y = true_dist_func,
    title = 'True distribution function for N(0,1)'
)
fig.show()

In [29]:
from tqdm import tqdm
num_repeats = 1000
num_contained = 0

for i in tqdm(range(num_repeats)):
    # generating 100 observations from a N(0,1) distribution
    sample = np.sort(norm.rvs(loc = 0, scale = 1, size = 100))

    # estimate of the cdf
    Fn_hat = np.array(list(map(Fn, sample)))

    # lower and upper bounds
    L = np.array(list(map(L_of_x, sample)))
    U = np.array(list(map(U_of_x, sample)))

    # true cdf
    true_dist_func = np.array(list(map(norm.cdf, sample)))

    # generating graphs
    # fig = go.Figure()
    # fig.add_trace(go.Scatter(
    #     x = sample, y = true_dist_func, name = 'CDF'
    # ))
    # fig.add_trace(go.Scatter(
    #     x = sample, y = L, name = 'L'
    # ))
    # fig.add_trace(go.Scatter(
    #     x = sample, y = U, name = 'U'
    # ))
    # fig.show()

    contained = (true_dist_func >= L).all() and (true_dist_func <= U).all()
    num_contained += int(contained)

print(f"{num_contained / num_repeats * 100}% of the confidence intervalse contain"
      f"the true CDF")


100%|██████████| 1000/1000 [01:37<00:00, 10.24it/s]


95.19999999999999% of the confidence intervalse containthe true CDF


### Task 4: Repeat using data from a Cauchy distribution.


### Task 4.1: Generate 100 observations from a Cauchy(0,1) distribution.

In [52]:
# number of observations
n = 100

# alpha and epsilon
# confidence interval as in equation 7.3, page 99
alpha = 0.05
epsilon = np.sqrt(
    (1/(2*n)) * np.log(2/alpha)
)

# generating 100 observations from a Cauchy(0,1) distribution
sample = np.sort(cauchy.rvs(loc = 0, scale = 1, size = 100, random_state = 42))

In [53]:
# distribution function (cdf function)
# Definition 7.1; Equation 7.1
Fn = lambda x: sum(sample < x) / n
Fn_hat = np.array(list(map(Fn, sample)))

In [54]:
fig = px.line(
    x = sample, y = Fn_hat,
    title = 'Estimation of F with the empirical distribution function'
)
fig.show()

### Task 4.2: Compute a 95 percent confidence band for the cdf F

In [55]:
# lower bound
L_of_x = lambda x: max(Fn(x) - epsilon, 0)
L = np.array(list(map(L_of_x, sample)))

# upper bound
U_of_x = lambda x: min(Fn(x) + epsilon, 1)
U = np.array(list(map(U_of_x, sample)))

In [56]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x = sample, y = Fn_hat, name = 'Fn_hat'
))

fig.add_trace(go.Scatter(
    x = sample, y = U, name = 'Upper bound'
))

fig.add_trace(go.Scatter(
    x = sample, y = L, name = 'Lower bound'
))

fig.update_layout(
    title = '95% confidence bound for the estimate of the cdf'
)

fig.show()

### Task 3:
Repeat this 1000 times and see how often the confidence
band contains the true distribution function.

In [58]:
# true distribution function
x = np.linspace(-5,5,100)
true_dist_func = [cauchy.cdf(x_i, loc = 0, scale = 1) for x_i in x]

fig = go.Figure()
fig.add_trace(go.Scatter(
    x = x, y = true_dist_func, name = 'CDF',
))
fig.update_layout(
    title = 'True cauchy CDF'
)
fig.show()

In [61]:
from tqdm import tqdm
num_repeats = 1000
num_contained = 0

for i in tqdm(range(num_repeats)):
    # generating 100 observations from a N(0,1) distribution
    sample = np.sort(cauchy.rvs(loc = 0, scale = 1, size = 100, random_state = 42))

    # estimate of the cdf
    Fn_hat = np.array(list(map(Fn, sample)))

    # lower and upper bounds
    L = np.array(list(map(L_of_x, sample)))
    U = np.array(list(map(U_of_x, sample)))

    # true cdf
    true_dist_func = np.array(list(map(cauchy.cdf, sample)))

    # generating graphs
    # fig = go.Figure()
    # fig.add_trace(go.Scatter(
    #     x = sample, y = true_dist_func, name = 'CDF'
    # ))
    # fig.add_trace(go.Scatter(
    #     x = sample, y = L, name = 'L'
    # ))
    # fig.add_trace(go.Scatter(
    #     x = sample, y = U, name = 'U'
    # ))
    # fig.show()

    contained = (true_dist_func >= L).all() and (true_dist_func <= U).all()
    num_contained += int(contained)

print(f"{num_contained / num_repeats * 100}% of the confidence "
      f"intervalse contain the true CDF")

100%|██████████| 1000/1000 [01:53<00:00,  8.82it/s]


100.0% of the confidence intervalse contain the true CDF
