In [None]:
from datascience import *
%matplotlib inline

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

import warnings
warnings.simplefilter("ignore")

# Lecture 19 

## Percentiles ##

In [None]:
# Manually compute the 55th percentile.
x = make_array(43, 20, 51, 7, 28, 34)

In [None]:
# Step 1. Sort the data


In [None]:
# Percentile table: the k-th row (k = 1..len(x)) pairs the percentile rank
# 100*k/len(x) with the k-th entry of the sorted data.
# NOTE: `sorted_x` is not defined anywhere in the cells shown here; it is
# expected to come from the "Step 1. Sort the data" cell above, so fill that
# cell in before running this one.
ptbl = Table().with_columns(
    'Percentile', 100*(np.arange(1, len(x)+1))/len(x), 
    'Element', sorted_x
)
ptbl

In [None]:
# Step 2. Figure out where 55th percentile would be.
p = 55
ind = ...

The manual calculation is a bit confusing and brittle (try p=0).   
Instead use the `percentile` function.

In [None]:
# Alternatively: One line of code
percentile(55, x)

In [None]:
percentile?

In [None]:
ptbl

In [None]:
percentile(55, x)

<br><br><br><br>

---

<center> return to slides </center>

---

<br><br>

## Discussion Question

In [None]:
# The discussion-question cells below all refer to this array as `s`.
s = make_array(1, 3, 5, 7, 9)

In [None]:
percentile(10, s) == 0

In [None]:
percentile(39, s) == percentile(40, s)

In [None]:
percentile(40, s) == percentile(41, s)

In [None]:
percentile(50, s) == 5

<br><br><br><br>

---

<center> return to slides </center>

---

<br><br>

## Inference: Estimation


To demonstrate the process of estimating a parameter, let's examine data from 2019 public records in San Francisco (data from the SF Open Portal).  For the purposes of this exercise, we will assume that this is a census of all the compensation data, that is, it contains the compensation data for all public workers.

In [None]:
sf = Table.read_table('san_francisco_2019.csv')
sf.show(3)

We are interested in looking at `Total Compensation`.  Let's first make a histogram to see the distribution of these values.

In [None]:
# Who made the most money


In [None]:
# Who made the least money


In [None]:
# Minimum salary threshold: $15/hr * 20 hr/wk * 50 weeks

min_salary = ...
# Keep only the rows that meet the minimum salary, dropping those below it
# (presumably on 'Total Compensation'; the filter is a placeholder to fill in)
sf = ...

In [None]:
# Population Distribution

sf_bins = np.arange(0, 726000, 25000)
sf.hist('Total Compensation', bins=sf_bins)

### The Population Parameter 

Here we have access to the population so we can compute parameters directly.

For example, say we are interested in the median compensation.  We can compute it on the data. 

In [None]:
# Parameter: Median Total Compensation 

pop_median = percentile(50, sf.column('Total Compensation'))
pop_median

In many real-world settings, you may not have access to the population; instead, you would take a random sample.

Suppose we sample 400 people from our population. 

In [None]:
# An Empirical Distribution

our_sample = ...
our_sample.hist('Total Compensation', bins=sf_bins)

We can use the sample median (statistic) as an estimate of the population parameter. 

In [None]:
# Estimate: Median of a Sample

percentile(50, our_sample.column('Total Compensation'))

But in the real world we won't be able to keep going back to the population. How do we generate a new random sample *without going back to the population?*

<br><br><br><br>

---

<center> return to slides </center>

---

<br><br>


## Variability of the Estimate

If we could get additional samples from the population, how much variability would there be in our estimate of the median?

In [None]:
def generate_sample_median(samp_size):
    """Return the median of one fresh random sample from the population.

    Skeleton to be completed: the body is still a placeholder. Per the
    surrounding notes, it is intended to draw `samp_size` rows from the
    population table and return that sample's median 'Total Compensation'
    (presumably via `percentile(50, ...)`, as elsewhere in this notebook).
    """
    ...
    return ...

In [None]:
generate_sample_median(400)

<br><br><br>
## Quantifying Uncertainty

Because we have access to the population, we can simulate many samples from the population. 

In [None]:
sample_medians = make_array()

for i in np.arange(1000):
    ...
    

In [None]:
med_bins = np.arange(120000, 160000, 2000)
Table().with_column('Sample Medians', sample_medians).hist(bins=med_bins)

plots.ylim(-0.000005, 0.00014)
plots.scatter(pop_median, 0, color='red');

But in the real world we won't be able to keep going back to the population.  How do we generate a new random sample *without going back to the population?*

<br><br><br><br>

---

<center> return to slides </center>

---

<br><br>


# Bootstrap

Sample randomly
 - from the original sample
 - with replacement
 - the same number of times as the original sample size

<br><br>
**Step 1** Sample from the original sample, with replacement, the same number of times as the original sample size.

In [None]:
# Default behavior of tbl.sample:
# at random with replacement,
# the same number of times as rows of tbl

bootstrap_sample = our_sample.sample()


In [None]:
bootstrap_sample.hist('Total Compensation', bins=sf_bins)
percentile(50, bootstrap_sample.column('Total Compensation'))

**Step 2** Compute statistic on bootstrap sample. 

In [None]:
...

**Repeat** the sampling process many times

In [None]:
def one_bootstrap_median():
    """Return the median 'Total Compensation' of one bootstrap resample.

    Takes no arguments. The resampling step is still a placeholder (`...`);
    it is presumably meant to resample `our_sample` at random, with
    replacement, at its original size (the default `tbl.sample()` behavior
    described earlier in this notebook).
    """
    # draw the bootstrap sample
    resample = ...
    # return the median total compensation in the bootstrap sample
    return percentile(50, resample.column('Total Compensation'))

In [None]:
one_bootstrap_median()

In [None]:
# Simulate the bootstrap: collect one median per resample, then pack them
# all into a single array in one step.
num_repetitions = 1000
simulated_medians = [one_bootstrap_median() for _ in range(num_repetitions)]
# Appending onto an empty make_array() keeps the same array type as
# growing the array one element at a time.
bstrap_medians = np.append(make_array(), simulated_medians)

In [None]:
resampled_medians = Table().with_column('Bootstrap Sample Median', bstrap_medians)
median_bins=np.arange(120000, 160000, 2000)
resampled_medians.hist(bins = median_bins)

# Plotting parameters; you can ignore this code
parameter_green = '#32CD32'
plots.ylim(-0.000005, 0.00014)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2)
plots.title('Bootstrap Medians and the Parameter (Green Dot)');

<br><br><br><br>

---

<center> return to slides </center>

---

<br><br>

## Percentile Method: Middle 95% of the Bootstrap Estimates 

In [None]:
# Endpoints of the middle 95% of the bootstrap estimates: leaving 5% outside
# the interval means cutting 2.5% from each tail, i.e. the 2.5th and 97.5th
# percentiles.
left = ...  # Use percentile
right = ... # Use percentile

# Display the interval as [left, right]
make_array(left, right)

In [None]:
resampled_medians.hist(bins = median_bins)

# Plotting parameters; you can ignore this code
plots.ylim(-0.000005, 0.00014)
plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=3, zorder=1)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2);