# 2.1 Frequentist Inference

### Setup

In [1]:
import numpy as np
import pandas as pd

Load data

In [2]:
# dataset is a single column; extract it as a Series
gfr = pd.read_csv("../data/gfr.txt", delimiter=" ", header=None, names=["gfr"])
gfr = gfr.gfr

In [3]:
gfr.head()

0    108
1     91
2     62
3     59
4     84
Name: gfr, dtype: int64

### Table 2.1

#### Mean

In [4]:
mean = gfr.mean()

In [5]:
n_gfr = gfr.shape[0]
se_mean = np.sqrt(((gfr - mean)**2).sum() / (n_gfr * (n_gfr - 1)))

#### Bootstrap samples

In [6]:
def winsorize25(values):
    q25, q75 = values.quantile(q=[0.25, 0.75])
    values = values.clip(lower=q25, upper=q75)
    return values.mean()

In [7]:
win_boots = []
med_boots = []
for _ in range(1000):
    boot_data = gfr.sample(n=gfr.shape[0], replace=True)
    win_boots.append(winsorize25(boot_data))
    med_boots.append(boot_data.median())

#### Winsorized mean

In [8]:
winsorized = winsorize25(gfr)

In [9]:
se_wins = np.var(win_boots, ddof=1)

#### Median

In [10]:
median = gfr.median()

In [11]:
se_median = np.var(med_boots, ddof=1)

#### Table

In [12]:
tmplt = "{:<20}  {:>0.2f}  {:>0.2f}"
print("                   Estimate    SE")
print("-" * 35)
print(tmplt.format("mean", mean, se_mean))
print(tmplt.format("25% Winsorized mean", winsorized, se_wins))
print(tmplt.format("median", median, se_median))

                   Estimate    SE
-----------------------------------
mean                  54.27  0.94
25% Winsorized mean   52.81  0.84
median                52.00  0.77


All of these values are slightly different from the book's values. For the bootstrap SE that's expected. For the others, I'm not sure why they're different. In particular, in repeated runs of the boostrap samples, the Winsorized mean usually has higher SE than the median.