In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Lecture 15 

## Review

In [None]:
# Load the flight-delay table, then add a 'Row' column numbering the rows
# 0, 1, 2, ... and move it to the front so each row has a visible identifier.
united = Table.read_table('united.csv')
united = united.with_column('Row', np.arange(united.num_rows)).move_to_start('Row')

In [None]:
# (Population) Parameter
np.median(united.column('Delay'))

In [None]:
# (Sample) Statistic
np.median(united.sample(10).column('Delay'))

In [None]:
# (Sample) Statistic
np.median(united.sample(100).column('Delay'))

### Probability & Empirical Distributions of a Statistic

We use the following [steps to simulate a statistic (CIT 10.3.3)](https://inferentialthinking.com/chapters/10/3/Empirical_Distribution_of_a_Statistic.html#simulating-a-statistic). Note that you can replace the sample size with other values, and the sample median with other calculations, to look at other statistics.

* **Step 1: Decide which statistic to simulate**  
We have decided to simulate the median of a random sample of size 10 drawn from the population of flight delays.

* **Step 2: Define a function that returns one simulated value of the statistic**  
Draw a random sample of the sample size and compute the statistic, here the median, of the sample.

* **Step 3: Decide how many simulated values to generate.**  
Here we will do 2,000 repetitions.

* **Step 4: Use a `for` loop to generate an array of simulated values**  
Create an empty array to collect the values. The body of the loop generates one simulated value of the statistic and appends it to the collection array.

The simulation may take some time because it is doing multiple repetitions and samples. 

In [None]:
def sample_median(size):
    """Return one simulated value of the statistic: a sample median delay.

    size: number of rows to draw at random from the `united` table.
    Returns the median of the 'Delay' column of that random sample.
    """
    random_sample = united.sample(size)
    delays = random_sample.column('Delay')
    return np.median(delays)

In [None]:
sample_median(10)

In [None]:
num_simulations = 2000

In [None]:
# Step 4: start from an empty collection array, then append one simulated
# median (sample size 10) per repetition.
sample_medians = make_array()

for i in np.arange(num_simulations):
    sample_medians = np.append(sample_medians, sample_median(10))

In [None]:
Table().with_column('Sample medians (size=10)', sample_medians).hist(bins=20)

#### Empirical Distributions of a Statistic (Overlaid)

In [None]:
# Run the same simulation for three sample sizes so their empirical
# distributions can be compared side by side.
num_simulations = 2000

# One empty collection array per sample size.
sample_medians_10 = make_array()
sample_medians_100 = make_array()
sample_medians_1000 = make_array()

for i in np.arange(num_simulations):
    # Each repetition draws a fresh sample of each size, in the order 10, 100, 1000.
    sample_medians_10 = np.append(sample_medians_10, sample_median(10))
    sample_medians_100 = np.append(sample_medians_100, sample_median(100))
    sample_medians_1000 = np.append(sample_medians_1000, sample_median(1000))

In [None]:
sample_medians = Table().with_columns('Size 10', sample_medians_10, 
                                      'Size 100', sample_medians_100,
                                      'Size 1000', sample_medians_1000)

In [None]:
sample_medians.hist(bins = np.arange(-5, 30))

## Simulation of a Statistic 

We can take this idea and apply it to a new problem.  Let's look at the Billboard charts data in the 2010s, from lab 8. 

In [None]:
billboard = Table.read_table('billboard-2010.csv')
billboard.show(5)

#### Empirical Distribution of a Statistic

Let's say we are interested in the __*average number of weeks a song is on the chart*__.

First, we can calculate this for the entire data set. 

In [None]:
np.mean(billboard.column('Weeks.on.chart'))

We can also calculate this on a sample. 

In [None]:
np.mean(billboard.sample(50).column('Weeks.on.chart'))

Now, we will use the steps above to simulate a statistic: the mean number of weeks on the chart for a random sample of size 1000.

In [None]:
def sample_mean(size):
    # Exercise: return the mean of 'Weeks.on.chart' for one random sample of
    # `size` rows drawn from `billboard`.
    return ...

In [None]:
# Simulate the statistic

sample_means = ... 



We can display the empirical distribution of the statistic.

In [None]:
simulated_means = Table().with_column('Sample Means', sample_means)
simulated_means.hist(bins = np.arange(11.5,15.5, 0.2))

Observe this distribution, referencing the mean across the entire data set. 

<details><summary>Click for Solution</summary>

```python
def sample_mean(size):
    """Return the mean 'Weeks.on.chart' of one random sample of `size` rows from `billboard`."""
    # np.mean, not np.median: the statistic being simulated is the sample mean.
    return np.mean(billboard.sample(size).column('Weeks.on.chart'))

# Simulate
num_repetitions = 5000

sample_means = make_array()

for i in np.arange(num_repetitions):
    sample_means = np.append(sample_means, sample_mean(1000))
```

</details>

<br><br><br><br>

---

<center> return to slides </center>

---

<br><br>

## Swain vs. Alabama ##

In [None]:
population_proportions = ...
population_proportions

In [None]:
sample_proportions(...)

In [None]:
def panel_proportion():
    return ...

In [None]:
panel_proportion()

In [None]:
panels = make_array()

for i in np.arange(10000):
    ...

In [None]:
Table().with_column(
    'Number of Black People on Panel of 100', panels
).hist(bins=np.arange(5.5,40.))

# Plotting details; ignore this code
plots.ylim(-0.002, 0.09)
plots.scatter(8, 0, color='red', s=30);

*Think about this happening by chance*

<br><br><br><br>

---

<center> return to slides </center>

---

<br><br>

## Mendel and Pea Flowers ##

In [None]:
predicted_proportions = make_array(.75, .25)
sample_proportions(929, predicted_proportions)

In [None]:
## Mendel had 929 plants, of which 709 had purple flowers
observed_purples = 709 / 929
observed_purples

In [None]:
def purple_flowers():
    return ...

purple_flowers()

In [None]:
purples = make_array()

for i in np.arange(10000):
    ...

In [None]:
# Observed distance 


<br><br><br><br>

---

<center> return to slides </center>

---

<br><br>