In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.mlab import csv2rec
import numpy as np
from numpy import mean, sqrt, std
import matplotlib
from scipy.stats import norm as ndist

# stats60 specific
from code.utils import (sample_density, 
                        probability_histogram)
from code.week1 import (stylized_density, 
                        standardize_right,
                        standardize_left,
                        standardize_interval,
                        CAdensity,
                        normal_curve,
                        SD_rule_of_thumb_normal,
                        percentile_figure)
from code.week2 import pearson_lee
from code import dice
from code.probability import (BoxModel, 
                              SampleMean, 
                              SampleSD,
                              Binomial)
figsize = (8,8)

## Summarizing continuous data

### California population by age

Age group | Count | Percentage
--- | --- | ---
0-20 | 10000000 | 29%
20-55 | 17500000 | 17500000 / 34000000 = 52%
55-75 | 4500000 | 13%
75+ | 2000000 | 6%
**Total** | 34000000 | 100 %


## Histogram

We summarize this information graphically with a special
type of bar graph called a **histogram.**

- Area of a bar corresponds to percentage.
- Total area (percentage) should be 100%.
- Height of bars is called **density**.
- Area of each bar is
    $$
    \text{area of bar = (height of bar) * (width of bar) }
    $$



In [None]:
%%capture
def CAdensity(figsize=None):
    """Draw the California age histogram and return tools for reading it.

    Parameters
    ----------
    figsize : tuple (width, height), optional
        Size handed to ``plt.figure``.  When omitted, the notebook-level
        ``figsize`` setting is used, exactly as the zero-argument form did.

    Returns
    -------
    hist_fig : the matplotlib figure holding the histogram.
    dens : second value returned by ``sample_density``; other cells call it
        on an array of ages to get the heights used for shading.
    area : function ``area(a, b)`` giving ``100 * (CDF(b) - CDF(a))`` rounded
        to one decimal, where ``CDF`` is the third value returned by
        ``sample_density``.
    """
    if figsize is None:
        # The parameter shadows the notebook-wide setting of the same name,
        # so the global has to be looked up explicitly.
        figsize = globals()['figsize']
    bins = [0,20,55,75,100]
    # Percentage of the population in each bin (29 + 52 + 13 + 6 = 100).
    count = [29,52,13,6]
    # One age inside each bin, repeated once per percentage point, so the
    # 100-entry sample reproduces the percentages in the table.
    age_in_bin = [10, 30, 60, 80.]
    data = np.repeat(age_in_bin, count)
    hist_fig = plt.figure(figsize=figsize)
    hist_plot, dens, CDF = sample_density(data, bins=bins, alpha=0.5, ax=hist_fig.gca(),
                            facecolor='gray')
    hist_plot.set_ylabel('Percentage per year (%/year)', fontsize=20)
    hist_plot.set_xlabel('Age (years)', fontsize=20)
    hist_plot.set_title('California population by age groups', fontsize=22)
    def area(a, b):
        # Percentage of the population with age between a and b.
        return np.round(100*(CDF(b) - CDF(a)), 1)
    return hist_fig, dens, area

hist_fig, CAdens, CAarea = CAdensity()

In [None]:
hist_fig

### Interpreting a histogram

$$\text{Percentage in 20-55 age group} = \left(1.5 \, \frac{\%}{\text{year}} \right) * \left( 35 \, \text{years} \right) \approx 52 \%$$

In [None]:
%%capture
# Redraw the California histogram on a fresh figure and shade the 20-55 bar.
hist_fig2, dens2, area2 = CAdensity()
ax = hist_fig2.gca()
# Endpoints sit just inside the bin; presumably this keeps dens2 from being
# evaluated exactly on a bin edge -- TODO confirm against sample_density.
interval = np.linspace(20.001,54.999,20)
# Fill between the x-axis (0*interval) and the density height over the bin.
ax.fill_between(interval, 0*interval, dens2(interval), 
                facecolor='yellow', hatch='/')


In [None]:
hist_fig2

In [None]:
CAarea(20,55)

## Histograms from a list of numbers

Given a *sample*, i.e. a list of numbers $X=[X_1,..., X_n]$ 
and a set of break points $[B_1,..., B_k]$ we can form a histogram by
computing percentages for a bin $[B_j,B_{j+1})$

      percentage of numbers in X greater than or equal to B[j] 
      but less than B[j+1]


The following is a sample of women's heights recorded in the early 20th
century in a study by Karl Pearson:

In [None]:
mother = pearson_lee().M
mother[:10]

In [None]:
len(mother)

Now, let's make some bins:

In [None]:
# Break points 50, 51, ..., 79 (inches).  Stored as a real list so that the
# slice below displays the values themselves on Python 3 as well; a bare
# range would be shown as ``range(50, 55)``.
binpoints = list(range(50, 80))
binpoints[:5]

In [None]:
%%capture
# Histogram of the mothers' heights using the break points chosen above.
mother_fig = plt.figure(figsize=figsize)
# No ax is passed; presumably sample_density draws on the current figure,
# i.e. mother_fig created on the previous line -- TODO confirm.
sample_density(mother, bins=binpoints, facecolor='gray')
ax = mother_fig.gca()
ax.set_xlabel('Height (inches)', fontsize=15)
ax.set_ylabel('Percentage per inch (%/inch)', fontsize=15)

In [None]:
mother_fig

## Continuous histograms

The flat histograms we drew above can be thought of as
*approximations* to a *continuous histogram*.

If the population of people we used gets larger and larger, the histogram
might settle down to a curve.


In [None]:
%%capture
# 50,000 draws from the standard normal.  No seed is set, so the histograms
# built from this sample differ slightly from run to run.
sample = np.random.standard_normal(50000)
plt.figure(figsize=figsize)
# Only the first 10,000 draws are used here; the full sample is used for the
# finer histogram further down.
hist_finer, _, CDF_finer = sample_density(sample[:10000], bins=50)
hist_finer.set_xlabel('Units', fontsize=20)
hist_finer.set_ylabel('Density (% per unit change)', fontsize=20)

In [None]:
hist_finer.figure

In [None]:
%%capture
# ndist (scipy.stats.norm) is imported in the notebook's first cell, so it is
# not imported again here.
# Grid on which the normal curve is drawn; reused by the finer histogram cell.
X = np.linspace(-4,4,101)
# Overlay the standard normal density on the 50-bin histogram.
hist_finer.plot(X, ndist.pdf(X), c='k', linewidth=4)
hist_finer.set_title('With more data, the discrete histogram converges to a curve', fontsize=15, color='red')


In [None]:
hist_finer.figure

In [None]:
%%capture
# Same picture with all 50,000 draws and 200 bins.
plt.figure(figsize=figsize)
hist_finest, _, CDF_finest = sample_density(sample, bins=200)
hist_finest.set_xlabel('Units', fontsize=20)
hist_finest.set_ylabel('Density (% per unit)', fontsize=20)
hist_finest.set_title('Even finer resolution...', fontsize=15, color='red')
# X is the grid defined in the earlier overlay cell.
hist_finest.plot(X, ndist.pdf(X), c='k', linewidth=4)

In [None]:
hist_finest.figure

### Shape of a histogram

This histogram is **skewed right (i.e. long right tail)**

In [None]:
%%capture
# Hand-drawn (xkcd) style picture of a right-skewed histogram.
with plt.xkcd():
    skew_right = plt.figure(figsize=figsize)
    # Mixture of 25,000 Beta(1, 2.2) draws and 5,000 Beta(1, 1) draws.
    # NOTE(review): this rebinds `sample`, which held the normal draws used by
    # the histogram cells above; re-running those cells afterwards would plot
    # this mixture instead.
    sample = list(np.random.beta(1, 2.2, size=25000)) + list(np.random.beta(1,1, size=5000)) 
    stylized_density(sample, ax=skew_right.gca())

In [None]:
skew_right

## Other important visual summaries

### Scatter plot

A plot with two axes:

- X-axis is the *independent variable*;
- Y-axis is the *dependent variable.*

In [None]:
%%capture
# Scatter plot of log wage against years of education, in xkcd style.
with plt.xkcd():
    wage_fig = plt.figure(figsize=figsize)
    wage_ax = wage_fig.gca()
    # NOTE(review): csv2rec comes from matplotlib.mlab and, as far as I know,
    # is no longer shipped by recent matplotlib releases -- confirm the
    # installed version still provides it before running.
    # Path is relative to the directory the notebook is run from.
    wage = csv2rec('data/wage.csv')
    wage_ax.scatter(wage['education'], wage['logwage'], facecolor='red', s=50)
    wage_ax.set_xlabel('Education (years)', fontsize=20)
    wage_ax.set_ylabel('log wage (log $)', fontsize=20)
    wage_ax.set_title('Scatterplot', fontsize=20)

In [None]:
wage_fig

## Numeric summaries

In the last set of notes, we saw *histograms* as a useful
way to summarize a *sample* of numbers.

In this set of notes, we will look at numeric summaries that boil
down the histogram to a set of a few numbers.

In particular, we will look at:
    
- **average** or **mean**;
- **SD** and **SD+**;
- **median**.
    
Along the way, we will discuss **standardized units.**

## Average

### Definition

The average of a list of numbers equals their sum, divided by how many there are. Average is also called the *mean*.

<font color="#820000">
Example: Compute the average of the sample: [1,4,6,7,8].
</font>
<br>
<font color="blue">
The answer is
$$\text{Average}([1,4,6,7,8]) = \frac{1+4+6+7+8}{5} = \frac{26}{5} = 5.2$$
</font>

In [None]:
mean([1,4,6,7,8])

## Median

This is another summary of a sample or a histogram. Like 
the average, it gives a quantitative *center* or *location* to the sample.


**The median
 of a histogram is the number with half the area to the left and half the area to the right.**
 

In [None]:
%%capture
# Re-import the packaged CAdensity: an earlier cell redefined CAdensity in the
# notebook without a figsize argument, and that version would reject the
# keyword used below.
from code.week1 import CAdensity
# Same three return values as before: figure, density function, area function.
CAhist, CAdens, CAarea = CAdensity(figsize=figsize)

In [None]:
CAhist

In [None]:
# 29% of the population is under 20, so the median needs another 21
# percentage points out of the 52% in the 20-55 bin.  Within a bar the area
# grows linearly, so go 0.21/0.52 of the way across the bin's 35 years.
CAmedian = 20 + (0.21 / 0.52) * (55-20) 
# Show the median together with the area to its left as a check.
CAmedian, CAarea(0, CAmedian)

In [None]:
%%capture
# Shade everything to the left of the median on the California histogram.
ax = CAhist.gca()
interval = np.linspace(0.,CAmedian,501)
# Fill between the x-axis and the density height from age 0 up to the median.
ax.fill_between(interval, 0*interval, CAdens(interval), 
                facecolor='blue', hatch='/')

In [None]:
CAhist

According to this histogram, the median age of Californians is about 34.1 years.



### Median of a list of numbers

Defined similarly to a histogram: put half the data on the left and half the
data on the right.

1. Sort the numbers from smallest to largest.
2.  If the length of the list is odd, the median is the middle entry of the sorted values.
3. Else, the median is the average of the two middle entries.

<font color="#820000">
Example: median of $[1,4,2,9,8]$
</font>

1. The sorted values are [1,2,4,8,9].

2. Since the length of the list is 5, the median is the middle entry of the sorted values. **The median is 4.**

<font color="#820000">
Example: median of $[1,11,3,7,8,3]$
</font>

1. The sorted values are [1,3,3,7,8,11].
2. Since the length of the list is 6, the median is the average of the middle entries. **The median is (3+7)/2=5.**

### Median and average

Examples

- **What is the mean of $[3,7,4,11,5]$? The median?**

- **What is the mean of $[3,37,4,41,5]$? The median?**

- **What do these examples tell us about the mean and median?**

This tells us that the mean is more sensitive than the median to changes away from 
the center. Statisticians call this insensitivity of the median *robustness*.

## Standard deviation

### Concept:

* A measure of **spread**. The larger the SD the more *spread out* the sample is.
* The SD says how far numbers on a list are away from their average.
* Its units are the original units of the list.
* It is never negative, and it is zero only when all entries on the list are equal.
* Most entries on the list will be somewhere around one SD away from the average. Very few will be more than two or three SDs away.


### Rules of thumb based on SD

* **Roughly 68 % of the entries on a list are within one SD of the average.**
* **Roughly 95 % of the entries on a list are within two SDs of the average.**
* These rules hold for many data sets, not all.

### Rules of thumb

In [None]:
%%capture
sd_fig1 = plt.figure(figsize=figsize)
ax_sd1 = SD_rule_of_thumb_normal(1, facecolor='gray', alpha=0.5)

In [None]:
sd_fig1

### Root Mean Square

### Root Mean Square

    r.m.s.(list) = square_root(mean([value^2 for value in list]))
                 = mean([value^2 for value in list])**(1/2)


Example: `r.m.s.([0, 5, 8, -3])`

1. First compute `mean([0^2,5^2,8^2,(-3)^2]) = mean([0,25,64,9]) = 98/4 = 24.5`.

2. Take square root: $\sqrt{24.5} \approx 4.9$.

3. **The answer is 4.9.**

In [None]:
sqrt(mean([v**2 for v in [0,5,8,-3]]))

### Computing the SD

Now that we've learnt about the rule of thumb, let's compute the
SD of a list.

* Given a list, define 

      deviations from average(list) = [entry - average(list) for entry in list]
    
    
* The SD is computed as 

      SD(list) = r.m.s.(deviations from average(list))
    
* In $\Sigma$ notation: 
$$
\text{SD}([X_1,\dots,X_n]) = \sqrt{\frac{1}{n}\sum_{i=1}^n (X_i-\bar{X})^2}.$$

### Standard deviation

Example: Compute SD([20,30,25,25])

1. Compute the average: 

    average([20,30,25,25]) = (20+30+25+25)/4 = 100/4 = 25
    
2. Compute deviations from average:

    deviations from average([20,30,25,25]) = [20-25, 30-25, 25-25, 25-25] = [-5, 5, 0, 0]
    
3. Compute the root mean square of this last list:
$$
    \text{r.m.s.}([-5, 5, 0, 0]) = \sqrt{((-5)^2 + 5^2 + 0^2 + 0^2)/4} = \sqrt{50/4} \approx 3.5 $$
    
4. **The answer is 3.5.**

### SD versus SD$^+$

- Some calculators (and software) compute a different version of SD than we will use.
- The difference depends on the length of the list.
- If the length of the list is $n$, then 
$$
\text{SD(list)} = \sqrt{\frac{n-1}{n}} \cdot \text{SD$^+$(list)}.
$$

- In $\Sigma$ notation
$$
\text{SD$^+$}([X_1, \dots, X_n]) = \sqrt{\frac{1}{n-1}\sum_{i=1}^n (X_i-\bar{X})^2}.
$$


## Mean and SD under change of units

- Suppose we have a list of 50 measurements in "old units"

       [m_1, ..., m_50]

- We want to convert to new units

       [t_1, ...., t_50]

- The transformation of units can be represented as:

       t = a * m + b
       
- Then, the mean or average transforms like

       mean([t[i] for i in range(50)]) = 
                a * mean([m[i] for i in range(50)]) + b
       
- The SD transforms like:

       SD([t[i] for i in range(50)]) =
                |a| * SD([m[i] for i in range(50)])

## Standardized units

- Many calculations we will do involve converting a *list* or *sample* to
standardized units.

- The procedure involves both `average(list)` and `SD(list)`.


### Standardizing a list of numbers

- Subtracting the average of a list of numbers from each entry yields a new list with average 0.
- Dividing each entry in a list of numbers by its SD yields a new list with SD 1.
- Combining the two operations yields a *standardized* list, where entries are in *standard units*.
- Standard units describe how many SDs each entry is above or below the average of the list.

### Rule for standardizing a *list*

The standardized list is

    [(entry - mean(list)) / SD(list) for entry in list]
    
### Example: standardize the list [161 , 166 , 171 , 172 ]

#### Table for computing average and SD

Entry | Data | Deviation | Deviation$^2$
------|-----|-----------|---------------
1     |  161 |    161-167.5=-6.5     |    (-6.5)^2=42.25
2     |  166 |     166-167.5=-1.5     |    (-1.5)^2=2.25
3     |  171 |     171-167.5=3.5     |    3.5^2=12.25
4     |  172 |     172-167.5=4.5     |    4.5^2=20.25
Total |  670 |    (not needed, but always 0)       |    77

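Average=670/4=167.5, SD = $\sqrt{77/4}=\sqrt{19.25}\approx 4.4$.

Dividing each deviation by the SD gives the standardized list: $[-6.5, -1.5, 3.5, 4.5] / 4.4 \approx [-1.48, -0.34, 0.80, 1.03]$.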

### Standardized units in $\Sigma$ notation

* If our list is $X=[X_1, \dots, X_n]$ and we call our standardized list $$Z=[Z_1, \dots, Z_n].$$
* Then, $$Z_i = \frac{X_i - \bar{X}}{\text{SD}(X)}.$$

In [None]:
%%capture
normal_fig = plt.figure(figsize=(10,10))
ax = normal_curve()
ax.set_title('The normal curve', fontsize=20)


In [None]:
normal_fig

## Normal approximation for data

If we assume data follow the normal curve, then we can use Table
A-104 to approximate percentages.


### Example: heights of freshmen at Stanford

The heights of the male freshmen at Stanford averaged 68 inches with an SD of 3 inches. Use the normal curve to estimate the percentage of these men between 62 inches and 71 inches.



### Normal approximation for data

#### Step 1: draw the interval

In [None]:
%%capture
with plt.xkcd():
    bar_fig = plt.figure(figsize=(10,3))
    ax = bar_fig.gca() 
    standardize_interval(62,71,68,3, units='Inches', 
                        facecolor='#820000')
    

In [None]:
bar_fig

#### Step 2: add the average to the interval

In [None]:
%%capture
with plt.xkcd():
    bar_fig = plt.figure(figsize=(10,3))
    ax = bar_fig.gca() 
    standardize_interval(62,71,68,3, units='Inches', include_mean=True,
                         facecolor='#820000')
    

In [None]:
bar_fig

#### Step 3: standardize the endpoints

In [None]:
%%capture
with plt.xkcd():
    bar_fig = plt.figure(figsize=(10,3))
    ax = bar_fig.gca() 
    standardize_interval(62,71,68,3, units='Inches', include_mean=True,
                         facecolor='#820000',
                         standardized=True)
    

In [None]:
bar_fig

#### Step 4: look up percentage using Table A-104


## Percentiles

### Definition

* The 1st percentile of a histogram is the number with 1 % of the area to the left and 99 % of the area to the right.
* The 10th percentile of a histogram is the number with 10 % of the area to the left and 90 % of the area to the right.
* **The median is the 50th percentile.**
* The **first quartile**
   is the 25th percentile; the **third quartile**
   is the 75th percentile.
* The **inter-quartile range**
   is the difference between the third and first quartiles.

### Estimating percentiles

Among all applicants to Stanford the Math SAT scores averaged 560 with an SD of 100. The scores followed the normal distribution quite well. Estimate the 90th percentile of the score distribution.
Solution

**The estimated quantile is** 

     560 + 100 * 90th quantile of normal curve 
     = 560 + 100 * 1.28 
     = 688
     
**This is essentially the inverse of forming standardized units.**
   
We find the 90th quantile by looking at Table A-104 and finding an entry
with Area about 80%. **Why?** Above, my calculator computed this to be 1.28.

### Example: SAT scores

SAT scores usually follow a normal curve. Assume the average score is 1100 with an SD of 150. **What percent of students score above 1400?**

As we are told the scores follow a normal curve, we
convert 1400 to standardized units and find the area
above 1400.

In [None]:
%%capture
with plt.xkcd():
    SAT_fig = plt.figure(figsize=(10,5))
    standardize_right(1400, 1100, 150, units="SAT")

In [None]:
SAT_fig

In [None]:
%%capture
with plt.xkcd():
    SAT_fig = plt.figure(figsize=(10,6))
    standardize_right(1400, 1100, 150, standardized=True, units='SAT')

In [None]:
SAT_fig

In [None]:
%%capture
# 1400 is (1400 - 1100) / 150 = 2 standard units, so shade the normal curve
# to the right of 2.
percentile_fig4 = plt.figure(figsize=(10,10))
ax = normal_curve()
# The shading stops at 4; the plot is simply cut off there.
interval = np.linspace(2., 4, 101)
ax.fill_between(interval, 0*interval, ndist.pdf(interval),
                hatch='o', color='green', alpha=0.5)
ax.set_title('Area is about 2.5%', color='green', fontsize=20)

In [None]:
percentile_fig4

### Estimating percentiles using normal curve

**What is the 60th percentile of the scores?**

In [None]:
%%capture
sixty_fig = plt.figure(figsize=(10,10))
percentile_figure(0.6, 1100, 150, units="SAT", hatch='+', facecolor='red')

In [None]:
sixty_fig

In [None]:
%%capture
sixty_fig = plt.figure(figsize=(10,10))
percentile_figure(0.6, 1100, 150, 
                  standardized=True,
                  units="SAT", hatch='+', facecolor='red')

In [None]:
sixty_fig

### Treatment and control groups

* In the smoking study: **treatment = smokers**, **control = non-smokers**.
* In a vaccine study: **treatment = patients who are vaccinated**, **control = non-vaccinated**.

* Ideally, the only difference between **treatment** and **control** is whether or not they receive the treatment.

### Randomized controlled experiments

* The best way to establish smoking causes lung cancer is a *randomized controlled* experiment.
* In such a study, patients would be assigned randomly to smoking or non-smoking group.
* Obviously, these experiments are not always possible: we can't force people to smoke.

### Why randomize?

* In the polio vaccine example described in the book, wealthy families whose children were more vulnerable to polio also were more likely to volunteer for vaccination.
* If **treatment** is assigned based on whoever volunteers, this could bias the experiment against the vaccine, i.e. its apparent effectiveness is diminished.

* This means there will be differences between **treatment** and **control** groups other than just the vaccine.

## NFIP study


Group | Size | Rate
--- | --- | ---
Treatment | 225,000 | 25
Control | 725,000 | 54
No consent | 125,000 | 44

### Placebo effect and blinding

* Subjects in the **control** group should be given a "treatment" with no effect. That is, they should ideally be *blinded*.
* Why? So the response is not due to the idea of a vaccine, but the vaccine itself.
* In the vaccination example, children were given an injection of salt and water.
* This treatment is called a **placebo**.

## Smoking & socio-economic status

Smoking is related to socio-economic status. 

As a consequence, smokers:
* tend to be in  lower socio-economic status groups with less access to medical care;
* will tend to have higher incidence of some diseases based on this fact alone.

## Association is not causation

* In children, shoe size is associated with reading ability.
* However, having big feet does not cause children to score high on reading tests.

## Confounding

The big problem with observational studies is **confounding**

      Confounding means there is a difference between 
      the treatment and control groups – other than treatment – 
      which affects the response being studied. 
      
      A confounder is a third variable, associated with 
      exposure and with disease.


## Confounding in reading ability

In our example with shoe size and reading ability, **age is a confounder.**

- Both shoe size and reading ability are associated with age.
- As children age, their feet grow.
- As children age, their reading improves.

### Measurement


Measurement error (excerpt from book):

      No matter how carefully it was made, 
      a measurement could have come out a bit differently.

**But how much?**


- The best way to find out is to replicate the measurement.

- The SD of the replicates estimates the likely size of the chance error in a single measurement.

## Measurement model

The basic measurement model is

     individual measurement = exact value + chance error


### Repeated measurements

This is a situation in which an experiment is 
repeated several times.

Produces a list with $n$ entries 

$$\text{individual measurement}_i = \text{exact value} + \text{chance error}_i$$

### Greek notation

- Call our list of measurements $[M_1, \dots, M_n]$. 
- Then, $$M_i = \mu + \epsilon_i$$

### Histogram of the measurements

- Our measurement model says that the only thing changing between measurements is the chance error.
- The histogram of the measurements will be the histogram of the *chance error*, shifted by the *exact value*.
- In standard units, the histogram of the *measurements* will look like the histogram of the *chance error* in standard units.
- If the normal curve fits the histogram of the *chance error* well, it will fit the histogram of the *measurements* well too.

### Example: Weighing an apple

* Suppose we have an apple that weighs exactly 8 ounces.
* Experiment: weigh the apple 100 different times.
* If we know the exact weight of the apple is 8 ounces, we can find the chance errors.

### Histogram of apples

In [None]:
%%capture
# Simulated weighings: exact value 8 plus normal chance error with SD 1.2.
# 5,000 draws are simulated and no seed is set, so the title's numbers vary
# slightly between runs.
apple = np.random.standard_normal(5000)*1.2 + 8
apple_fig = plt.figure(figsize=(10,10))
apple_ax = apple_fig.gca()
sample_density(apple, bins=25, ax=apple_ax, alpha=0.5, 
             facecolor='green')

# Hide the density scale; only the shape and location matter here.
apple_ax.set_yticks([])
apple_ax.set_xlabel('Weight of apple (ounces)', fontsize=20)
apple_ax.set_title('Mean: %0.1f, SD: %0.1f' % (apple.mean(), apple.std()),
                   fontsize=20)

In [None]:
apple_fig

### Histogram of chance error

In [None]:
%%capture
# Chance error = measurement - exact value (8 ounces).
# NOTE(review): the regression section of this notebook later defines a
# function that is also called `error`, which rebinds this name.
error = apple - 8
error_fig = plt.figure(figsize=(10,10))
error_ax = error_fig.gca()
sample_density(error, bins=25, ax=error_ax, alpha=0.5, 
             facecolor='red')

# Same presentation as the apple histogram: no density scale on the y-axis.
error_ax.set_yticks([])
error_ax.set_xlabel('Chance error (ounces)', fontsize=20)
error_ax.set_title('Mean: %0.1f, SD: %0.1f' % (error.mean(), error.std()),
                   fontsize=20)


In [None]:
error_fig

The chance error *averages out* to be about 0.

## Outliers

- Not all individual measurements fit the normal curve.
- This could be because the histogram of measurements *shouldn’t*
   fit the normal curve exactly ...
- Or, an error was made in some of the measurements ...
- Usually impossible to tell which ...

### Bias or systematic error

Conceptual definition (excerpt from book):

       Bias affects all measurements the same way, pushing them 
       in the same direction. Chance errors change from measurement 
       to measurement, sometimes up and sometimes down.


## Measurement model

In light of the possibility of bias, we should
rephrase our measurement model as:

     individual measurement = exact value + bias + chance error

### Greek notation

- Call the bias $B$. Then
$$M = \mu + B + \epsilon$$


**A measurement has three parts:**

- **True value.** This is what we care about.
- **Chance error.** This is something unavoidable in the measurement process. With many measurements, this should average out.
- **Bias.** This is undesirable.

## Correlation (Chapters 8 and 9)

In the early 1900s Karl Pearson collected data on [heights](http://www.stat.cmu.edu/~roeder/stat707/=data/=data/data/Rlibraries/alr3/html/heights.html) of mothers
and daughters.

In [None]:
height = pearson_lee()
height.draw()


In [None]:
height.figure


There is a *positive association* between the two.



### What can correlation tell us? 

- From the plot, daughters born to taller mothers tend to be taller. 

- It has more concrete information that allows us to estimate
the *average* height of daughters born to mothers of a *given height*.

In [None]:
height.strip = 65
height.draw()
height.figure


What if we want to guess daughter’s height when mother = 65 in?

### Average within a strip

In [None]:
height.axes.set_xlim([64.5,65.5])
height.axes.set_title('Zooming in on the strip...', color='red', fontsize=15)
height.figure


We see some variability within the strip: the scatter plot is not exactly a line.

However, we can compute the average height within a given strip.

In [None]:
height.axes.set_title('The average within the strip is %0.1f' % height.mean_strip, fontsize=15)
height.figure

In [None]:
height.axes.set_xlim([54,72])
height.figure

Let's collect these averages within many strips

In [None]:
%%capture
# Average daughter's height within a strip centred at each whole inch of
# mother's height from 56 to 68.
averages = []
height.draw()
mother_heights = range(56,69)
# The loop variable is deliberately not called `mother`: that name holds the
# sample of mothers' heights loaded earlier in the notebook and must not be
# overwritten by an integer.
for strip_center in mother_heights:
    # Setting the strip selects the mothers near this height; mean_strip is
    # then read back for that strip.
    height.strip = strip_center
    averages.append(height.mean_strip)

# Clear the strip before drawing the summary.
height.strip = None
height.axes.plot(mother_heights, averages, linewidth=5, c='k')
height.axes.set_title("Relationship is almost a straight line.", fontsize=15)
height.axes.scatter(mother_heights, averages, s=300, c='yellow', label='Average(strip)')
height.axes.legend(loc='lower right') 

In [None]:
height.figure

In [None]:
height.axes.set_title('Slope of the line is predicted by correlation...', fontsize=15)
height.axes.figure

## Correlation

### Conceptual definition

- A numerical summary of a scatterplot, i.e. a pair of lists.
- If there is a strong association between two variables, then knowing one helps a lot in predicting the other. But when there is a weak association, information about one variable does not help much in guessing the other.

The *correlation coefficient*, $r$, is a measure of the strength of this association.
* $r=+1$ if the variables are perfectly positively associated.
* $r=-1$ if the variables are perfectly negatively associated.

## Computing $r$, the correlation coefficient

* Given two lists, $X, Y$, convert them each to standardized units. Call these new lists $Z_X, Z_Y$.
* Make a new list $Z_{XY}$ whose entries are the products of the entries of $Z_X, Z_Y$.
* Then, $r = \text{average}(Z_{XY}).$
* Another way:
$$
   r = \frac{\text{average(products $X, Y$)} - \text{average}(X) \times \text{average}(Y)}{\text{SD}(X) \times \text{SD}(Y)}.
   $$
   

## Example

Take X = [1,4,6,9,3],
Y = [-2,2,8,0,1].

 $$\begin{aligned}
   \bar{X} &= 4.6 & \text{SD}(X) &= 2.72 \\
   \bar{Y} &= 1.8 & \text{SD}(Y) &= 3.37 \\
   \end{aligned}$$
   
The only thing new to compute is $\overline{XY}$. 
$$XY = [-2,8,48,0,3], \qquad \overline{XY}=(-2+8+48+0+3)/5=11.4$$

Therefore
$$ r = \frac{11.4 - 4.6 * 1.8}{2.72 * 3.37} \approx 0.34$$

In [None]:
# The two lists from the worked example above.
X = [1, 4, 6, 9, 3]
Y = [-2, 2, 8, 0, 1]

# average(products of X and Y), computed once and reused below.
mean_of_products = mean([x*y for x,y in zip(X,Y)])
print (mean(X), mean(Y), std(X), std(Y), mean_of_products)

# r = (average(XY) - average(X) * average(Y)) / (SD(X) * SD(Y))
numerator = mean_of_products - mean(X) * mean(Y)
denominator = std(X) * std(Y)
R = numerator / denominator

# Exact value next to the hand calculation with rounded inputs.
R, (11.4 - 4.6 * 1.8)/(2.72 * 3.37)

### Properties of correlation

* Correlation is unitless.
* Changing units of $X$ or $Y$ does not change the correlation.
* Correlation does not change if we interchange $X$ and $Y$: it is *symmetric*.

In [None]:
height.draw()
height.axes.set_title("$r=%0.2f$" % np.corrcoef([height.D, height.M])[0,1], fontsize=20)

In [None]:
%%capture
# Same data with the axes interchanged: daughters on x, mothers on y.
swapped = plt.figure(figsize=(8,8))
ax = swapped.gca()
ax.scatter(height.D, height.M, c='red', s=100, edgecolor='gray')
ax.set_ylabel("Mother's height (inches)", fontsize=15)
ax.set_xlabel("Daughter's height (inches)", fontsize=15)
# The title uses the same corrcoef call as the unswapped plot, so the two
# titles show the same r.
ax.set_title("$r=%0.2f$" % np.corrcoef([height.D, height.M])[0,1], fontsize=20)

In [None]:
swapped

**This plot also illustrates the important principle:**

     Correlation is not causality!
     
Why?


### The SD line

* The SD line passes through the point of averages $(\bar{X}, \bar{Y})$ and has $\text{slope(SD line)} = \frac{\text{SD}(Y)}{\text{SD}(X)} \times \text{sign}(r(X,Y))$
* For every one standardized unit increase of $X$, the SD line changes by one standardized unit of $Y$. The direction of change is positive if $X$ and $Y$ are positively correlated, and negative if they are negatively correlated.

### Correlation

In [None]:
height.SDline()
height.figure


The point cloud seems to cluster around the SD line

Let's look at those means we computed within each strip above.

While the points are almost on a line, **they do not lie on the SD line.**

In [None]:
height.axes.plot(mother_heights, averages, linewidth=5, c='k', label='Strip averages')
height.axes.scatter(mother_heights, averages, s=300, c='yellow', label='Average(strip)')
height.axes.legend(loc='lower right', scatterpoints=1) 

In [None]:
height.figure

# Regression (Chapters 10-12)

In [None]:
%%capture
# Fresh scatter plot for the regression section, with the SD line and the
# strip average at 65 inches marked.
hfig = plt.figure(figsize=figsize)
height = pearson_lee()
# Point the plotting object at the figure created above (sets a private
# attribute of the object).
height._figure = hfig
height.strip = 65
height.SDline()
height.axes.scatter([65], [height.mean_strip], s=300, c='yellow', label='Average(strip)')
height.axes.set_title('The average within the strip is %0.1f' % height.mean_strip, fontsize=15)
# Global matplotlib setting: affects every legend drawn after this line.
matplotlib.rcParams['legend.scatterpoints'] = 1
height.axes.legend(loc='lower right') 
# Keep the two samples under plain names for the cells that follow.
daughter = height.D
mother = height.M

In [None]:
height.figure

Note that the average in the strip at 65in is below the SD line. Why?


## Regression line

- Instead of the SD line we choose a line that minimizes the "vertical distances" from each point to the line.
- The quality of a line is measured by the r.m.s. of these distances, called **residuals**.
- Each choice of slope / intercept yields a new set of residuals.
- The regression line has the residuals with smallest r.m.s.
- This is using the **method of least squares** to choose the slope and intercept.

## Regression line

- The **regression line** is the line whose slope and intercept have the smallest
`r.m.s.` for the vertical deviations from that line.

- Define the **point of averages** of two lists $X$ and $Y$ as

      point of averages(X, Y) = (average(X), average(Y))

- The regression line passes through the point of averages and has slope:
$$\text{slope} = r(X,Y) \times \frac{SD(Y)}{SD(X)}.$$

- The intercept of the regression line is
 
      intercept = average(Y) - slope * average(X)

The regression line has better `r.m.s.` for the vertical deviations!



In [None]:
%%capture

def error(D, a, b, X=None):
    """
    Root-mean-square vertical distance between the points and the line a*X + b.

    Parameters
    ----------
    D : array
        Observed values of the variable being predicted (vertical coordinate).
    a, b : float
        Slope and intercept of the candidate line.
    X : array, optional
        Values of the predictor (horizontal coordinate). When omitted, the
        notebook-level variable `M` is used, which is what this function
        always did before; passing `X` explicitly removes the hidden
        dependency on a global that is deleted at the end of the cell.

    Returns
    -------
    float
        sqrt(sum((D - (a*X + b))**2) / len(D)).
    """
    if X is None:
        # Backward-compatible fallback: existing calls error(D, a, b) rely on
        # the global `M` being defined at call time.
        X = M
    residuals = D - (a * X + b)
    return np.sqrt(np.sum(residuals**2) / D.shape[0])

# Compare the r.m.s. error of the SD line with that of the regression line.
# `M` has to be a module-level name here because error() reads it.
D = daughter
M = mother
r = np.corrcoef(D, M)[0, 1]

# SD line: slope SD(D)/SD(M), passing through the point of averages.
slope_SD = D.std() / M.std()
intercept_SD = D.mean() - slope_SD * M.mean()

# Regression line: same point of averages, slope shrunk by the correlation.
slope_r = r * D.std() / M.std()
intercept_r = D.mean() - slope_r * M.mean()

height.axes.set_title(
    'Error(SD line)=%0.1f, Error(regression)=%0.1f' % (
        error(D, slope_SD, intercept_SD),
        error(D, slope_r, intercept_r)),
    fontsize=15)
height.regline(draw=False)
height.axes.legend(loc='upper left')
# Remove the short aliases so they do not linger in the notebook namespace.
del D, M

In [None]:
# The previous cell ran under %%capture, so display the updated figure explicitly.
height.figure

Let's look at those strip averages we computed in the notes on correlation.

In [None]:
# Average daughter height within the strip at each mother height 56, 57, ..., 68.
mother_heights = range(56,69)
averages = []
# The loop variable is deliberately NOT called `mother`: that name holds the
# array of mother heights (assigned from height.M) which the regression cell
# reads, and overwriting it with an integer breaks any re-run of that cell.
for strip_height in mother_heights:
    # Setting the strip must precede reading `mean_strip` for that strip.
    height.strip = strip_height
    averages.append(height.mean_strip)
height.axes.scatter(mother_heights, averages, s=300, c='yellow')

In [None]:
# Display the figure now that the strip averages have been added.
height.figure

**The regression line almost sits on top of the strip averages!**


## Working with regression problems

 The following quantities are enough to do all regression problems
* $\text{average}(X), \text{SD}(X)$
* $\text{average}(Y), \text{SD}(Y)$
* $r(X,Y)$

### Blood alcohol example

It is believed that the more alcohol there is in a person’s blood stream, the slower is that person’s reaction times. An experiment with 10 subjects yields
- average amount of alcohol in blood $0.14\%$ with SD $0.04\%$;
- average reaction time 0.42 seconds, SD 0.1 seconds;
- correlation coefficient 0.8.

**Predict the reaction time of a person with an amount of alcohol of 0.22%.**

#### Answer

- We first compute the slope and intercept:
    $$\begin{aligned}
     \text{slope} &= \frac{0.8 \times 0.1}{0.04} = 2.0 \frac{\text{seconds}}{\%}     \\
     \text{intercept} &= 0.42 - 2. \times 0.14 = 0.14 \, \text{seconds}
     \end{aligned}$$ 
     
     Plugging in an alcohol level of 0.22 yields a predicted time of
     $$2 \times 0.22 + 0.14 = 0.58 \, \text{seconds}.$$
     
- Another way to arrive at the same answer is to note that $(0.14,0.42)$
is the point of averages, which is on the regression line. For any other value of blood alcohol, $B$, the corresponding $y$-value (reaction time) on the regression line is 
$$
0.42 + (B - 0.14) * 2.
$$
Substituting $B=0.22$ yields
$$
0.42 + (0.22-0.14)*2. = 0.42 + 0.16 = 0.58.
$$

## Regression fallacy

* Note that someone in the 20th percentile of `blood alcohol` had predicted 27th percentile in `reaction time`.

* This is a general phenomenon, Galton referred to it as "regression to mediocrity."

### Test-retest version of regression fallacy (from book)

In a test-retest situation, usually the bottom group on the 
first test will show some improvement, and the top group will fall back.

It can also be seen from the regression of `daughter` on `mother`...

In [None]:
# Redisplay the mother/daughter figure for the regression-fallacy discussion.
height.figure

The height of a daughter whose mother is 2 SD above the average height of mothers is only $0.49*2 \approx 1$ SD above the average height of daughters.

## Prediction and regression

- While there are two regression lines, the right way to remember which to use is **what do you want to predict?**
- The variable you want to predict goes on the vertical axis ($Y$-axis).
- The variable you want to base your prediction on goes on the horizontal axis ($X$-axis).
- In many situations, it will be more natural to predict one variable instead of another.

# Accuracy of prediction (Chapter 11)

- In discussing experiments, we discussed the average of a set of measurements.

- These can be used to predict a new measurement: our best guess is just the average of the previous measurements.

- A similar calculation is possible with regression.

## SD as a measure of accuracy

- The SD of the set of measurements gives us some idea of how accurate our prediction is 
$$\text{SD(list) = r.m.s.(deviations from average)}$$
- If our prediction is the average, then 
$$\text{SD(list) = r.m.s.(deviations from predictions)}$$
- With regression, we have a new way to predict: using the regression line.

## Accuracy of prediction in regression

- In regression, we are using more information: the fact that tall mothers tend to give birth to taller daughters (but not quite as tall).
- This should improve the accuracy of our prediction of the daughter’s height (which uses the mother’s height).
- Our regression line was chosen to minimize
   the r.m.s. of the residuals among all lines
   $$
   \begin{aligned}
   \text{r.m.s.}(\text{regression $Y$ on $X$}) &= \text{r.m.s.(residuals)} \\
   &= \sqrt{\text{average}(\text{residuals}^2)} \\
   \end{aligned}
   $$
- This r.m.s. is never larger than the SD of
   the dependent variable ($Y$) alone (the two are equal only when $r=0$)
   $$
   \text{r.m.s.}(\text{regression $Y$ on $X$}) = \sqrt{1-r^2} \times \text{SD}(Y)
   $$
- In mother/daughter example, this factor is 2.3 / 2.6 $\approx$ 87% = $\sqrt{1 - 0.49^2}$.

## Example: using regression r.m.s. in vertical strips

* Given the following information:


Variable | Average | SD
---------|---------|----
mother   | 62.4in    | 2.3in
daughter   | 63.8in   | 2.6in

Correlation $r=0.49$.

**Of mothers of height 66in, what percentage of their daughters will have height above 67in?**

### Answer

* Slope of regression line is $$\text{slope} = 0.49 \times \frac{2.6}{2.3} = 0.54$$
* The average height of daughters of mothers of height 66in is $$63.8 + 0.54 \times (66 - 62.4) = 65.7$$
* The SD is taken to be r.m.s. of regression $$\sqrt{1 - 0.49^2} \times 2.6 = 0.87 \times 2.6 = 2.3.$$
* 67 in corresponds to 
$$(67-65.7)/2.3=0.6$$ standardized units.
* From the normal table (like Table A-104), the percentage is roughly 27%.

## Probability

### Frequency definition of chances

* If you flip a fair coin many times, the long-run proportion of heads will be 50%.
* Rolling a fair 6-sided die,
will result in a long-run proportion of 1’s of 
$$1/6=16 \frac{2}{3}\%.$$

In [None]:
# Run one trial of the die-rolling chance process (its return value is not
# the last expression, so it is not shown), then display the process itself.
dice.roll_one_die.trial()
dice.roll_one_die

In [None]:
# Display a sample of 10 trials from the die-rolling process.
dice.roll_one_die.sample(10)

### Probability as frequency

In [None]:
# Empirical proportion of 1's among 5000 rolls, shown next to the
# theoretical chance 1/6.
dice_sample = dice.roll_one_die.sample(5000)
sum(1 for d in dice_sample if d == 1) / 5000., 1 / 6.

## Some rules of probability

- *Range of values:* Chances are between 0 % and 100 %.

- *Opposites:* The chance of something equals 100 % minus the chance of the opposite thing. 

### Example: 

The chance of not getting a 1 when rolling a die is $(100 - 16 \frac{2}{3})\% = 83 \frac{1}{3} \%$.


## Example

Suppose our experiment consists of drawing a ticket out of a hat with 20
tickets in it. We are going to draw 3 tickets.

- Describe the hat after each draw if we draw *with replacement*. 
What are the possible outcomes of the experiment?

         Before and after each draw, the hat has 20 tickets in it.
         The possible outcomes are triples of numbers from 1 to 20:
         (1,1,4),(2,3,4), etc.

-  Describe the hat after each draw if we draw *without replacement*. 
What are possible outcomes of the experiment?

         After the first draw, the hat has 19 tickets in it,
         after the second 18, and after the third 17.
         The possible outcomes are all triples of numbers from
         1 to 20 but there can be no ties: (1,1,4) is impossible.

## Conditional probability

* Observing some information can *change*
   the chances of something.
* We already saw this in the marble example. If drawing without replacement, suppose the first draw was red. What are the chances a blue marble is drawn on the second draw?
* What if we draw with replacement?
* In these examples, we are *given*
   that the first draw was red. These chances are *conditioned*
   on knowing the first draw was red.

## Multiplication rule

The chance that two things will both happen equals the chance that the first will happen, multiplied by the chance that the second will happen *given* the first has happened.

### Example

- *In the box with 3 blue and 2 red marbles, what is the probability the first blue marble drawn is on the second draw when drawing without replacement?*
- If the first blue marble drawn was the second, then we know
     - the first was red;
     - the second was blue.
- By the multiplication rule 
$$\mathrm{chances} = \frac{2}{5} \times \frac{3}{4} = \frac{3}{10}$$

## Mathematical notation

* "first blue drawn is on the second draw" is called an *event*;
* "first draw is red" and "second draw is blue" are also events;
* We usually write $P$ for "chances". For an event $E$ 
$$ P(E) = \mathrm{chances} \ E \ \mathrm{occurs}. $$
* Conditional probability of an event $A$ given $B$, i.e.  
the chances $A$ occurs given $B$ occurs, is
written as 
$$
P(A \vert B).
$$

- Multiplication rule can be written as 
$$P(A \cap B) = P(A \, \text{and} \, B) = P(A \vert B) * P(B). $$

## Law of total mass

* The chances of *something* occurring are 100%.

* Example: when we drew marbles, the chances we draw a marble whose color is blue or red is 100 %.
* In mathematical notation, we often use $S$ for "something" or the "sample space" $$P(S) = 100\% \qquad (= 1)$$.


## Mass function

- What is `mass_function`? 

- It is a description of the probabilities of the various possible
outcomes.

- Later, the book will call this a *probability histogram*.

## Law of total mass

- When drawing from `small` without replacement, we will draw a blue ball within the first three draws. 
$$P(\text{one of the first three balls is blue}) = 100 \%$$

- Let's verify the law of total mass 
 $$\begin{aligned}
     P(\text{first blue ball is on 1st draw }) &= \frac{3}{5} \\
     P(\text{first blue ball is on 2nd draw}) &= \frac{2}{5} \times \frac{3}{4} = \frac{3}{10}  \\
     P(\text{first blue ball is on 3rd draw}) &= \frac{2}{5} \times \frac{1}{4} \times \frac{3}{3} = \frac{1}{10}  \\
  \end{aligned}$$
- Summing the probabilities $$\frac{3}{5} + \frac{3}{10} +
 \frac{1}{10} = 1.$$

## Addition rule

**When can we add probabilities of different events?**


We can add probabilities of events when the events are *disjoint*
   or *mutually exclusive*

### Example 

When rolling a die, the events $E_1= \,${roll is 4} , $E_2=\, ${roll is 3} are mutually exclusive because the result of the roll cannot be 4 and 3 simultaneously.

### Mutually exclusive events

In [None]:
%%capture
# Venn-style picture of two events that do not overlap.
disjoint = plt.figure(figsize=figsize)
ax = disjoint.gca()
cir = matplotlib.patches.Circle

# Two translucent circles of radius 0.4 whose centres are about 0.99 apart,
# so they share no area.
E1 = cir(xy=(0.5, 0.5), radius=0.4, ec="black", facecolor='yellow', lw=2, alpha=0.4)
E2 = cir(xy=(-0.2, -0.2), radius=0.4, ec="black", facecolor='blue', lw=2, alpha=0.4)
ax.add_patch(E1)
ax.add_patch(E2)

# Fix the viewport and hide the ticks: the coordinates carry no meaning.
ax.set_xlim(-0.7, 1)
ax.set_ylim(-0.7, 1)
ax.set_xticks([])
ax.set_yticks([])
ax.set_title('Two mutually exclusive events. Addition OK.',
             color='green', fontsize=15)

In [None]:
# The figure was built under %%capture; display it here.
disjoint

### Non-mutually exclusive events

In [None]:
%%capture
# Venn-style picture of two events that overlap.
nondisjoint = plt.figure(figsize=figsize)
ax = nondisjoint.gca()
cir = matplotlib.patches.Circle

# Two translucent circles of radius 0.4 whose centres are about 0.42 apart,
# so they intersect.
E1 = cir(xy=(0.5, 0.5), radius=0.4, ec="black", facecolor='yellow', lw=2, alpha=0.4)
E2 = cir(xy=(0.2, 0.2), radius=0.4, ec="black", facecolor='blue', lw=2, alpha=0.4)
ax.add_patch(E1)
ax.add_patch(E2)

# Fix the viewport and hide the ticks: the coordinates carry no meaning.
ax.set_xlim(-0.7, 1)
ax.set_ylim(-0.7, 1)
ax.set_xticks([])
ax.set_yticks([])
ax.set_title('Two non-mutually exclusive events. Addition not OK.',
             color='red', fontsize=15)


In [None]:
# The figure was built under %%capture; display it here.
nondisjoint

## Addition rule

- If the events $E_1, E_2$ are mutually exclusive, then 
$$ P(E_1 \ \mathrm{or} \ E_2 \  \mathrm{occurs}) = P(E_1) + P(E_2).$$
- This rule works for more than two: if $[E_1, \dots, E_n]$ are mutually exclusive, then $$ P(E_1  \ \mathrm{or} \  E_2  \ \mathrm{or} \  \dots  \ \mathrm{or} \  E_n) = \sum_{i=1}^nP(E_i).$$

- The events $E_1, E_2$ are mutually exclusive if $E_1 \cap E_2$ is empty.
- We often write "$E_1$ or $E_2$" as $E_1 \cup E_2$ and 
"$E_1$ and $E_2$" as $E_1 \cap E_2$.

- From the Venn diagram, we can deduce the general form of the addition rule
$$P(E_1 \cup E_2) = P(E_1) + P(E_2) - P(E_1 \cap E_2).$$

- There are also rules that involve more than 2 events.

In [None]:
%%capture
# Venn-style picture of three mutually overlapping events.
# `cir` (the Circle patch class) is bound in the earlier Venn-diagram cells.
three_events = plt.figure(figsize=figsize)
ax = three_events.gca()

# One translucent circle per event, all of radius 0.4.
E1 = cir(xy=(0.5, 0.5), radius=0.4, ec="black", facecolor='yellow', lw=2, alpha=0.4)
E2 = cir(xy=(0.2, 0.2), radius=0.4, ec="black", facecolor='blue', lw=2, alpha=0.4)
E3 = cir(xy=(0.2, 0.5), radius=0.4, ec="black", facecolor='red', lw=2, alpha=0.4)
ax.add_patch(E1)
ax.add_patch(E2)
ax.add_patch(E3)

# Fix the viewport and hide the ticks: the coordinates carry no meaning.
ax.set_xlim(-0.3, 1)
ax.set_ylim(-0.3, 1)
ax.set_xticks([])
ax.set_yticks([])
ax.set_title('Three intersecting events.', fontsize=15)


In [None]:
# The figure was built under %%capture; display it here.
three_events

$$
\begin{aligned}
P(A \cup B \cup C) &= P(A) + P(B) + P(C) \\
& - P(A \cap B) - P(A \cap C) - P(B \cap C) \\
& + P(A \cap B \cap C)
\end{aligned}
$$

## Multiplication rule & independence

* Intuitively, an event $A$ is independent of $B$ if given $B$, the odds of $A$ are unaffected.
* In mathematical notation, we express this notion as 
$$P(A \vert B)=P(A)$$
* If this is true, we say $A$ and $B$ are *independent*.
* Otherwise, $A$ and $B$ are *dependent*.
* The multiplication rule, combined with independence tells us 
$$P(A \cap B) = P(A \vert B) * P(B) = P(A) * P(B).$$

## Example

Let's go back to drawing marbles from a box.

* When drawing marbles *with replacement*
   the events $$\begin{aligned}
 A &= \mathrm{first \ draw \    is \ red} \\
 B &= \mathrm{second \ draw \ is \ blue}
 \end{aligned}$$ are *independent*
  
* We can even conclude that the draws are independent in this case.
* When drawing *without replacement*
   the events $A$ and $B$ are dependent. 

## Counting and probability

* When performing an experiment where each outcome is equally likely, we can compute probabilities by counting.
* Example: when rolling two dice, what is the probability of obtaining a sum of 9?
* We call these counting problems *combinatorial*.
* For such experiments $$P(E) = \frac{\# E}{\# S}$$ where $S$ is the set of all possible outcomes (our *sample space*).

## Example

What are the chances the sum will be greater than or equal to 7?

In [None]:
%%capture
# Register a dice example whose event is "the two indexed values sum to at least 7".
# NOTE(review): `ij` is presumably the (first die, second die) pair passed in
# by dice.dice_example -- confirm in code.dice.
dice.examples['sum geq 7'] = dice.dice_example(event_spec = lambda ij: ij[0]+ij[1]>=7)


In [None]:
# Display the example registered in the previous (captured) cell.
dice.examples['sum geq 7']

There are 21 outcomes whose sum is greater than or equal to 7. Therefore, the chances are $\frac{21}{36}=\frac{7}{12}$.

In [None]:
# Display the mass function attached to the "sum >= 7" example.
dice.examples['sum geq 7'].mass_function

## Complement of an event

* Formally, the "opposite" rule is the rule of *complements*.
* We write the complement of an event $E$ as $E^c$ 
$$P(\mathrm{not} \, E) = P(E^c).$$
* The rule of *complements*
   says $$P(E^c) = 1 - P(E)$$

### An event $E$ and its complement $E^c$

In [None]:
%%capture
# Picture of an event E (yellow circle) inside the sample space S (grey
# background); everything outside the circle is the complement E^c.
# `cir` (the Circle patch class) is bound in the earlier Venn-diagram cells.
complement = plt.figure(figsize=figsize)
ax = complement.gca()
ax.set_alpha(0.4)
ax.patch.set_facecolor('#cccccc')

E1 = cir(xy=(0.5, 0.5), radius=0.4,
         ec="black", facecolor='yellow', lw=2, alpha=0.4)
ax.add_patch(E1)

# Fix the viewport and hide the ticks: the coordinates carry no meaning.
ax.set_xlim(-0.3, 1)
ax.set_ylim(-0.3, 1)
ax.set_xticks([])
ax.set_yticks([])

# Labels: E at the circle's centre, E^c in the lower-left corner outside it,
# and S placed in figure coordinates near the bottom-right.
t = ax.text(0.5, 0.5, '$E$', ha='center', va='center', size=40, color='black')
t = ax.text(-0.05, -0.05, r'$E^c$', ha='center', va='center', size=40, color='black')
complement.text(0.8, 0.04, '$S$', size=40, ha='center', va='center')

In [None]:
# The figure was built under %%capture; display it here.
complement

## Properties of complements

* For any event $E$, $E$ and $E^c$ are mutually exclusive.
* For any event $E$, $S = E \cup E^c$.
* For any two events $A, B$ 
$$\begin{aligned}
     B &= B \cap S \\
     &=   B \cap (A \cup A^c) \\
     &=   (B \cap A) \cup (B \cap A^c)
     \end{aligned}$$ where $B \cap A$ and $B \cap A^c$ are mutually exclusive.

### Using complements

* For the `small` box, if we draw with replacement, what are the chances it will take less than 5 draws to draw 1st blue marble?
* If $E$={takes less than 5 draws to draw 1st blue marble}, then 
$$\begin{aligned}
     E^c &=\{\text{takes 5 or more draws to draw 1st blue marble}\} \\
     &=\{\text{first 4 draws are red}\} \\
     \end{aligned}$$
* By independence, $$
     P(\text{first 4 draws are red}) = \left(\frac{2}{5}\right)^4
     $$
* Therefore, $$P(\text{takes less than 5 draws to draw 1st blue marble}) = 1 -  \left(\frac{2}{5}\right)^4 = 97\%$$

## Bayes’ theorem

- Credited to [Reverend Thomas Bayes](http://en.wikipedia.org/wiki/Thomas_Bayes)
- The foundation of an important sub-branch of statistics: *Bayesian statistics.*
- Given two events $A$ and $B$ $$\begin{aligned}
     P(A \vert B) &= \frac{P(B \, \mathrm{and} \,  A)}{P(B)} \\
     &= \frac{P(A \cap B)}{P(B)} \\
     &= \frac{P(B \vert A)\times P(A)}{P(B)}
     \end{aligned}$$
- The formula is a direct consequence of the multiplication rule.
- Even though it is used in *Bayesian statistics*, it is just part of 
the calculus of probability so even non-Bayesians use it.

### Alternate versions

* By the properties of complements $$\begin{aligned}
     P(B) &= P(B \cap A) + P(B \cap A^c) \\
     &=  P(B \vert A) \times P(A) + P(B \vert A^c) \times P(A^c)
     \end{aligned}$$
* Another version of Bayes’ theorem $$\begin{aligned}
     P(A \vert B) &= \frac{P(B \vert A) \times P(A)}{P(B \vert A) \times P(A) + P(B \vert A^c) \times P(A^c)     } \\
     \end{aligned}$$

## Diagnostic testing

* Suppose a patient from some population is tested for a disease based on some diagnostic test.
* The prevalence of the disease is 0.1% in the population.
* If a patient has the disease, the test result is positive with probability 95 %. (*True positive*)
* If a patient does not have the disease, the test result is positive with probability 1 %. (*False positive*)
* What is the probability a patient has the disease given a positive test result? What if the false positive rate were 0.1%?


* Let $$\begin{aligned}
     D &= \{\text{patient has disease}\}     \\
     T^+ &= \{\text{test result is positive}\}     \\
     \end{aligned}$$
* We are given $$\begin{aligned}
     P(D) &= 0.001 \\
     P(T^+ \vert D) &= 0.95 \\
     P(T^+ \vert D^c) &= 0.01 \\
     \end{aligned}$$
* We want to compute $P(D \vert T^+)$.

* By Bayes’ theorem $$\begin{aligned}
     P(D \vert T^+) &= \frac{P(T^+ \vert D) \times P(D)}{P(T^+ \vert D) \times P(D) + P(T^+ \vert D^c) \times P(D^c)} \\
     &= \frac{0.95 \times 0.001}{0.95 \times 0.001 + 0.01 \times 0.999} \\
     &= 8.7 \%
     \end{aligned}$$
* If the test makers improve their false positive rate to 0.001 then $$\begin{aligned}
     P(D \vert T^+)
     &= \frac{0.95 \times 0.001}{0.95 \times 0.001 + 0.001 \times 0.999} \\
     &= 48.7 \%
     \end{aligned}$$

## Binomial formula



## Factorial

For an integer, $n$ its *factorial* is 
$$n! = n \times (n-1) \times (n-2) \times \dots \times 3 \times 2 \times 1.$$

Example: $5! = 5 \times 4 \times 3 \times 2 \times 1 = 120$.

**There are $n!$ orderings of $n$ distinct objects.**

## Drawing $k$ balls out of $n$, in order

The number of ways of drawing $k$ balls without replacement (in order) from $n$ is $$\frac{n!}{(n-k)!}$$

Example: the number of ways of choosing 2 balls from 7 in order is 
$$\frac{7!}{ 5!} = 7 \times 6 = 42.$$

## Binomial coefficient

The number of ways of drawing $k$ balls without replacement and ignoring order from $n$ is 
$$\frac{n!}{k! \times (n-k)!}$$

Example: the number of ways of choosing 2 balls from 7 ignoring order is 
$$\frac{7!}{5! 2!} = \frac{7 \times 6 }{2}  = 21.$$

We define the **binomial coefficient** as
$$\binom{n}{k} = \binom{n}{n-k} = \frac{n!}{k! \times (n-k)!}.$$

### Example

- When flipping a coin 10 times, how many outcomes are there with 7 heads?

- We can represent this as drawing 7 out of a possible 10 slots for the heads, without order. There are $\binom{10}{7} = \frac{10 \times 9 \times 8}{3 \times 2 \times 1} = 120 \
     \text{outcomes}$

## Independent trials

1. A **trial**  is an experiment with two possible outcomes:  **success**
    or  **failure**.
2. The same experiment is repeated *independently*.
3. Each time the experiment is repeated, the chances of success are the same: $p$.

## Binomial formula for computing probabilities

When performing $n$ independent trials, each with probability of success $p$, the probability of observing exactly $k$ successes is 
$$\binom{n}{ k} { p^k} {(1-p)^ {n-k}}.$$

## Roulette

<img src="http://introductorystats.files.wordpress.com/2010/11/roulette-wheel.jpg">

Source: http://introductorystats.wordpress.com.

## Example

- Suppose we bet on  5
   for 6 spins of the roulette wheel. 
   
- What are the chances we win exactly 3 times?

1. Make a box model.
2. For your box, compute the probability of success of each trial. Call this $p$.
3.  **The answer is $\binom{6}{3} p^3 (1-p)^3.$**

# Chance Variability (Chapters 16 and 17)

In these chapters, we consider the behaviour of the
average when drawing from a box multiple times.

Two key concepts are:
- *expected value*: what is the "average" value when we draw from a box?
- *SE (standard error)*: 
     - How variable is a draw from a box?
     - How variable is the average of several draws?

## The law of averages

* There is 18/38 chance of winning 10\$, and 20/38 chance of losing 10\$.
* On average, each bet we "gain" $$\frac{18}{38} \times 10\$ + \frac{20}{38} \times (-10\$) = -\frac{1}{19} \times 10\$ \approx -0.52\$ $$
* This is the average of the 38 outcomes in our "box model".
* Our average winnings after 20 bets is approximately -10.50\$ so we should finish on average with about 89.50 \$.
* Our average winnings after 100 bets is approximately -52\$ so we should finish on average with about 48 \$.
* Our average winnings after 1000 bets is approximately -520\$ so we should finish on average about 420\$ in debt.

## Expected value and standard error

### Sum of draws

* Draw a ticket (with replacement) from a box of balls with values assigned to them (i.e. 10\$, -10\$).

* Repeat this process $n$ times and compute the sum of all the results, calling this the **sum of draws**.

* On average, the **sum of draws**
   should be about 
   
        n * average(values in the box)
   
* The (theoretical) average of the **sum of draws** is

        "theoretical average"(sum of n draws) = n * average(values in the box)
        
* We call this "theoretical average" the "expected value"

        expected(sum of n draws) = n * average(values in the box)
        
* Roulette example

        expected(sum of 100 bets of 10$ on RED) = -52$

## Chance error

* Of course, we don’t always end up with 48\$ after one hundred bets.
* I simulated the entire experiment 10000 times and recorded the results in `winnings_sample100`.
* A reasonable guess for how close to 48\$ we would be is
      
      SD(winnings_sample100) = 100$

* Even though, on average, we should have 48\$ after 100 bets, our winnings can fluctuate on the order of 100\$.
* Even though, on average, we should have -420\$ after 1000 bets, our winnings can fluctuate on the order of 315\$.

### Chance error

* We define the *chance error*
   of the experiment by 
   
      sum of n draws = expected(sum of n draws) + chance error(sum of n draws)
      
* Example:
     - We are going to flip a fair coin 100 times and record the number of heads.
     - After 100 flips we observe 56 heads.
     - The chance error in these 100 draws is 6 because the expected number of heads is 50.

### Square root rule


* The **sum of draws**
   should be near the average but likely to be off by 
   $$\text{SE(sum of n draws)} = \sqrt{n} \times \text{SD(values in the box)}$$
* We call this the *standard error*. It measures the typical size of *chance error*.

### Difference between SD and SE

- **SD is for data.** It is a function that takes a list of numbers and returns a number.
- **SE is for chance.** It takes a chance process like drawing 10 balls from a box of numbers and returns a number.

## Expected value and SE for average of draws

- The **average of n draws** (or sample average) is

         average of n draws = (sum of n draws) / n
         
- The expected value of the average of n draws is

         expected(average of n draws) = average(box)
         
- The SE for the average of n draws is

         SE(average of n draws) = SD(box) / sqrt(n)

# Example: box = [1,2,3,4]



In [None]:
# Box model over the four tickets 1, 2, 3, 4; the last line performs one
# trial and displays its result.
box = [1,2,3,4]
model = BoxModel(box)
model.trial()

We can take a sample of size 5 from our box:

In [None]:
# Display a sample of size 5 from the box model.
model.sample(5)

Let's make two new chance processes: one computes the sample mean of 10 draws
from the box, the other computes the sample SD of 10 draws from the box.

In [None]:
# Two chance processes built on `model`, each parameterised by 10.
# NOTE(review): presumably the second argument is the number of draws per
# trial -- confirm in code.probability.
sample_mean = SampleMean(model, 10)
sample_SD = SampleSD(model, 10)
# Run one trial of each and display the pair of results.
sample_mean.trial(), sample_SD.trial()


The expected value for this chance process is the mean of the values in the box.

In [None]:
# Expected value of the sample mean = average of the tickets in the box.
expected_mean = np.mean(box)
# Show it next to the same average computed by hand. The divisor is the
# number of tickets, taken from the box itself so the check stays correct if
# the box is changed (float() keeps true division on Python 2).
expected_mean, sum(box) / float(len(box))

The typical size of chance error for this chance process (i.e. its SE or standard error) is:

In [None]:
# SE of the average of 10 draws: SD(box) / sqrt(number of draws).
# The 10 matches the value passed to SampleMean above; np.std with its
# default arguments divides by the number of values (ddof=0).
SE_mean = np.std(box) / np.sqrt(10)
SE_mean

So, a trial from `sample_mean` will be off by about 0.35 or so.

## Mathematical notation

- We call a draw from a box a **random variable**, labelling it $X$, say.

- Drawing with replacement at random from the box gives us *independent* random variables, $X_1, \dots, X_n$

- The sum of $n$ draws can be written as
$$
\text{sum of $n$ draws} = \sum_{i=1}^n X_i   \quad (\text{with $X_i$ independent draws from our box})
$$

- The average of $n$ draws can be written as
$$
\text{average of $n$ draws} = \frac{1}{n}\sum_{i=1}^n X_i.
$$

### Why is the notation the same as for lists (or samples) of numbers?
   
- This is similar to the relationship between SE and SD.

- One thing refers to sample values (SD, or the sample average), while the other refers to the chance process.


### Example

* In our  RED
   roulette example, $V_1=10 \$, V_2=-10\$ $ and $p=18/38$.
* The shortcut says that $$\text{SD(values in the box)} = 20 \$ \times \sqrt{\frac{18}{38} \times \frac{20}{38}} \approx 10\$.$$
* The square root rule says that $$\text{SE(sum of 100 bet results)} = \sqrt{100} \times 10\$ \approx 100\$.$$
* The square root rule says that $$\text{SE(sum of 1000 bet results)} = \sqrt{1000} \times 10\$ \approx 315\$.$$

# Probability histogram & normal approximation

- This chapter considers a histogram
approximation to what we have been calling the `mass_function`
of the `sum of draws`.

- It turns out, that with enough draws, the sample histogram begins to follow
the normal curve.

## Probability histogram for tossing a fair coin

- When tossing a fair coin, there is 1/2 probability of getting 1 head, 1/2 of getting 0 heads.
- We can make a histogram with a rectangle of width 1, area 1/2 around 0, and an identical rectangle around 1.

In [None]:
# Box model with two tickets, 'H' and 'T'; display its mass function.
coin_trial = BoxModel(['H','T'])
coin_trial.mass_function

## Difference between probability histogram and mass function

- The `mass_function` tells us the exact chances of either 0 or 1 success
in our trial.

- The probability histogram is based on breaking the numbers into bins (like
we did with sample data earlier). 

- It then finds all the `mass` in those bins.

- In our example, we have two bins: [-0.5,0.5) and [0.5,1.5).

- There is a chance of 1/2 for the number of successes to be in the first bin, and 
a chance of 1/2 for it to be in the second.

- Just as in a histogram for data, areas of bars represent percentages (chances).


## Probability histogram: law of large numbers

* Choose an experiment (e.g. tossing a fair coin twice and counting the number of heads, $H$).
* Repeat the experiment 500 times creating a list $[H_1, H_2, \dots, H_{500}].$
* The frequentist view of probability tells us that the histogram of the list $[H_1, H_2, \dots, H_{500}]$ should look like the *probability histogram*.
* Or, the empirical histogram *converges* to the probability histogram.
* We call this the *Law of Large Numbers*
  

In [None]:
%%capture
# Probability histogram for the number of heads in 5 tosses of `coin_trial`.
# Figures are stored in `tosses`, keyed by the number of tosses, so that
# later cells can display them.
tosses = {}
ntoss = 5
tosses[ntoss] = plt.figure(figsize=figsize)
# No figure or axes is passed to probability_histogram.
# NOTE(review): it presumably draws on the current figure created just above,
# and `ndraws=500` presumably sets the size of a simulated sample drawn
# alongside -- confirm in code.utils.
# The bin edges -0.5, 0.5, ..., 5.5 give unit-width bins centred on 0..5.
probability_histogram(Binomial(ntoss, coin_trial, ['H']),
                                    bins=np.arange(7)-0.5,
                                    alpha=0.5, facecolor='gray',
                                    xlabel='Number of heads',
                                    ylabel='% per heads',
                                    ndraws=500)
tosses[ntoss].gca().set_xlim([-0.6,5.6])
tosses[ntoss].gca().legend()


In [None]:
# The figure was built under %%capture; display it here.
tosses[5]

### Probability histogram of successes

In [None]:
%%capture
# Probability histogram for the number of heads in 100 tosses of `coin_trial`,
# stored in the `tosses` dict created by the 5-toss cell.
ntoss = 100
tosses[ntoss] = plt.figure(figsize=figsize)
# No figure or axes is passed to probability_histogram.
# NOTE(review): it presumably draws on the current figure created just above
# -- confirm in code.utils.
# The bin edges 34.5, 35.5, ..., 63.5 give unit-width bins centred on 35..63.
probability_histogram(Binomial(ntoss, coin_trial, ['H']),
                                    bins=np.arange(35,65)-0.5,
                                    alpha=0.5, facecolor='gray',
                                    xlabel='Number of heads',
                                    ylabel='% per heads',
                                    ndraws=500)
tosses[ntoss].gca().set_xlim([35,65])
tosses[ntoss].gca().legend()


In [None]:
# The figure was built under %%capture; display it here.
tosses[100]

The probability histogram looks a lot like a normal curve!

**This is not an accident!**

## Normal approximation

### Central limit theorem

* When making many independent draws from a box, the central limit theorem says that we can use the normal curve to approximate probabilities of things for the **sum of draws**
  .
* Specifically, the normal curve applies to 
$$\frac{\text{ sum of draws} - \text{expected( sum of draws)}}{\text{SE( sum of draws)}}$$

We can now compute:
- $\text{average( sum of 100 draws)} = 100 \times (-0.52)\$ = -52\$ $
- $$\text{SE( sum of  100 draws)} = \sqrt{100} \times 360 \times \sqrt{\frac{1}{38} \times \frac{37}{38}} \approx 576\$ $$
- Finishing with more than 200\$ means the **sum of draws** was greater than 100\$ .
- In standardized units, this is $$\frac{100-(-52)}{576} \approx 0.26$$

In [None]:
%%capture
# Draw the standardization picture in matplotlib's xkcd (hand-drawn) style.
with plt.xkcd():
    winnings_stand = plt.figure(figsize=(10,5))
    # 100, -52 and 576 are the cutoff, expected value and SE worked out in
    # the text above.
    # NOTE(review): argument order (value, average, SD) is assumed from those
    # numbers -- confirm in code.week1.standardize_right.
    standardize_right(100, -52, 576, units="Total amount", standardized=True,
                      data=False)

In [None]:
# The figure was built under %%capture; display it here.
winnings_stand

In [None]:
%%capture
# Shade the area under the standard normal curve to the right of the
# standardized cutoff and report that area in the title.
normal_fig = plt.figure(figsize=figsize)
ax = normal_curve()
# Standardized units for a sum of draws of 100 with expected value -52 and
# SE 576. One value drives both the shading and the reported area; previously
# the shading started at 0.26 while the title used 0.27.
z_cutoff = (100 - (-52)) / 576.
# Stop the shading at 4, beyond which the normal density is negligible.
interval = np.linspace(z_cutoff, 4, 101)
ax.fill_between(interval, 0*interval, ndist.pdf(interval),
                hatch='+', color='green', alpha=0.5)
# sf(z) = 1 - cdf(z) is the upper-tail area.
ax.set_title('The green area is %0.1f%%' % (100 * ndist.sf(z_cutoff)), fontsize=20, color='green')


In [None]:
# The figure was built under %%capture; display it here.
normal_fig

## Continuity correction

- When using the normal approximation to the sum of draws, we sometimes use the *continuity correction*.
- This means we might add or subtract 1/2 at the endpoints.

- For example 
   - {observing more than 40 heads in 100 flips} = {observing more than 40.5 heads in 100 flips}
   - {observing less than 45 heads in 100 flips} = {observing less than 44.5 heads in 100 flips}
   - {observing exactly 40 heads in 100 flips} = {observing between 39.5 and 40.5 heads in 100 flips}
   - {observing greater than or equal to 41 heads but less than 52 heads in 100 flips } = {observing between 40.5 and 51.5 heads in 100 flips}
   
   - {observing greater than 41 heads but less than 52 heads in 100 flips } = {observing between 41.5 and 51.5 heads in 100 flips}

## Less than 45 heads with continuity correction

In [None]:
# Redisplay the probability histogram for 100 tosses built earlier.
tosses[100]

In [None]:
%%capture
# Draw the standardization picture in matplotlib's xkcd (hand-drawn) style.
with plt.xkcd():
    # Expected number of heads in 100 fair tosses, and sqrt(100 * 0.5**2) = 5.
    avg = 50
    sd = sqrt(0.5**2*100)
    heads_stand = plt.figure(figsize=(10,5))
    # 44.5 is "less than 45 heads" after the continuity correction.
    # NOTE(review): argument order (value, average, SD) is assumed --
    # confirm in code.week1.standardize_left.
    standardize_left(44.5, avg, sd, units="Heads", standardized=True,
                     data=False)

In [None]:
# SE of the number of heads in 100 tosses via the square root rule:
# sqrt(n) * (big - small) * sqrt(p * (1 - p)) for a 0/1 box with p = 1/2.
SE = sqrt(100) * (1-0) * sqrt(1/2. * 1/2.)
# Function-call form of print: same output on Python 2 for a single value,
# and valid on Python 3 (the statement form `print SE` is a SyntaxError there).
print(SE)
# Display the standardization figure built in the previous (captured) cell.
heads_stand

In [None]:
%%capture
# Shade the area under the standard normal curve to the left of -1.10,
# i.e. (44.5 - 50) / 5 standardized units, and report it in the title.
normal_fig = plt.figure(figsize=figsize)
ax = normal_curve()
# Start the shading at -4, below which the normal density is negligible.
interval = np.linspace(-4, -1.10, num=101)
ax.fill_between(interval, interval * 0, ndist.pdf(interval),
                alpha=0.5, color='green', hatch='+')
ax.set_title('The green area is %0.1f%%' % (100 * ndist.cdf(-1.10)),
             color='green', fontsize=20)


In [None]:
# The figure was built under %%capture; display it here.
normal_fig

## Observing exactly 40 heads using continuity correction

The standardized units are

- (39.5 - 50) / 5 = -2.1
- (40.5 - 50) / 5 = -1.9

In [None]:
%%capture
# Shade the area under the standard normal curve between -2.1 and -1.9, the
# standardized versions of 39.5 and 40.5 heads, and report it in the title.
normal_fig = plt.figure(figsize=figsize)
ax = normal_curve()
interval = np.linspace(-2.1, -1.9, num=101)
ax.fill_between(interval, interval * 0, ndist.pdf(interval),
                alpha=0.5, color='green', hatch='+')
# Area of the band = cdf(upper end) - cdf(lower end).
ax.set_title('The green area is %0.1f%%' % (100 * (ndist.cdf(-1.9) - ndist.cdf(-2.10))),
             color='green', fontsize=20)



In [None]:
# The figure was built under %%capture; display it here.
normal_fig

Compare this to the true value:

$$
\binom{100}{40} \left(\frac{1}{2} \right)^{40} \left(\frac{1}{2}\right)^{60}
$$

In [None]:
# Exact value to compare with the normal approximation above.
# NOTE(review): assumes `mass_function` can be indexed by the number of
# successes, so [40] is the chance of exactly 40 heads in 100 tosses --
# confirm in code.probability.Binomial.
Binomial(100, coin_trial, ['H']).mass_function[40]

## Central limit theorem

* The central limit theorem applies to **sum of draws**.
* The number of draws should be reasonably large.
* The more lopsided the values are, the more draws needed for reasonable approximation (compare the approximations of rolling  5
   in roulette to flipping a fair coin).
* It is another type of *convergence*
  : as the number of draws grows, the normal approximation gets better.

## Take away 

- If the box is lopsided, convergence to normal curve may be slower.

- But it still happens (and can be used)!

## How many samples should we take?

- The normal approximation works when we take enough
samples.
- But how many should we take?
- There have been various rules proposed...
- For counts, a [rule of thumb](http://en.wikipedia.org/wiki/Binomial_distribution#Normal_approximation) says the Normal approximation to the Binomial is OK when $np \geq k$ and $n(1-p) \geq k$
where $k$ is of the order of 5 or 10.
- **For concreteness, we take $k=10$.**
