In [None]:
%matplotlib inline
%load_ext rpy2.ipython
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from numpy import mean, median, sqrt, std
from matplotlib.mlab import csv2rec

# stats60 specific
from code.utils import sample_density
from code.week1 import (stylized_density, 
                        standardize_interval, 
                        standardize_right,
                        find_percentile,
                        percentile_figure,
                        normal_curve)
# Default (width, height) handed to plt.figure(figsize=...) by later cells.
figsize = (8, 8)
# Shorthand for scipy's normal distribution; called without loc/scale its
# pdf/cdf/ppf methods describe the standard normal curve.
ndist = scipy.stats.norm

## Standardized units

- Many calculations we will do involve converting a *list* or *sample* to
standardized units.

- The procedure involves both `average(list)` and `SD(list)`.

### Standardizing a list of numbers

- Subtracting the average of a list of numbers from each entry yields a new list with average 0.
- Dividing each entry in a list of numbers by its SD yields a new list with SD 1.
- Combining the two operations yields a **standardized list**, where entries are in **standard units**.
- Standard units describe how many SDs each entry is above or below the average of the list.

### Rule for standardizing a *list*

The standardized list is

    [(entry - mean(list)) / SD(list) for entry in list]
    
### Example: standardize the list [161 , 166 , 171 , 172 ]

#### Table for computing average and SD

Entry | Data | Deviation | Deviation$^2$
------|-----|-----------|---------------
1     |  161 |    161-167.5=-6.5     |    (-6.5)^2=42.25
2     |  166 |     166-167.5=-1.5     |    (-1.5)^2=2.25
3     |  171 |     171-167.5=3.5     |    3.5^2=12.25
4     |  172 |     172-167.5=4.5     |    4.5^2=20.25
Total |  670 |    (not needed, but always 0)       |    77

Average=670/4=167.5, SD = $\sqrt{77/4}=\sqrt{19.25}\approx 4.4$.

### Standardizing the list

The standardized list can be added as a new column to the table.

Entry | Data | Deviation | Deviation$^2$ | Standardized data
------|-----|-----------|----------------|-------------------
1     |  161 |    161-167.5=-6.5     |    (-6.5)^2=42.25 | -6.5/4.4=-1.5
2     |  166 |     166-167.5=-1.5     |    (-1.5)^2=2.25 | -1.5/4.4=-0.3
3     |  171 |     171-167.5=3.5     |    3.5^2=12.25    | 3.5 / 4.4=0.8
4     |  172 |     172-167.5=4.5     |    4.5^2=20.25    | 4.5 / 4.4 = 1.0
Total |  670 |    (not needed, but always 0)       |    77


### Standardized units in $\Sigma$ notation

* If our list is $X=[X_1, \dots, X_n]$ and we call our standardized list $$Z=[Z_1, \dots, Z_n].$$
* Then, $$Z_i = \frac{X_i - \bar{X}}{\text{SD}(X)}.$$

In [None]:
%%capture
# Build the reference figure once; later cells re-display it and draw on it.
normal_fig = plt.figure(figsize=(6, 6))
# normal_curve is a course helper (code.week1); presumably it draws the
# standard normal density on the current figure and returns the Axes --
# TODO confirm against the helper.
ax = normal_curve()
ax.set_title('The normal curve', fontsize=20)


### The normal curve

- Many (but not all) histograms follow the normal curve after standardizing.

- Because standardizing doesn't change the general shape of the
histogram, the original histogram must look like the "normal curve".

In [None]:
normal_fig

### Rules of thumb for SD

Our rules of thumb for SD come from the normal curve:

* The area under the normal curve between -1 and +1 is about $68\%$.
* The area under the normal curve between -2 and +2 is about $95\%$.
* The area under the normal curve between -3 and +3 is about $99.7\%$.

### Sample rows of Table A-104


$z$ | Height | Area
--- | --- | ---
0.70 | 31.23 | 51.61
1.00 | 24.20 | 68.27
2.00 | 5.40 | 95.45

### Using Table A-104

- The height at `z= 0.7` is 31.23% (per standardized unit).

- The curve is symmetric, so the height at `z=-0.7` is also 31.23%.

In [None]:
# Mark the height of the normal curve at z = -0.7 and z = +0.7 with two thin
# red bars drawn on the existing reference figure.
ax = normal_fig.gca()
locs = [-.7, .7]
# Pass the z-values themselves and ask for centring explicitly.  The earlier
# version shifted x left by half the bar width, which only centres a bar when
# bar() aligns on the left edge; Matplotlib >= 2.0 centres by default, so the
# manual shift left the bars sitting to the left of +/-0.7.
ax.bar(locs, [ndist.pdf(loc) for loc in locs], width=0.05,
       align='center', color='red')


In [None]:
normal_fig

### Using table A-104


In [None]:
%%capture
# Shade the area under the normal curve between z = -0.7 and z = 0.7.
fill_1 = plt.figure(figsize=(5, 5))
ax = fill_1.gca()
normal_curve(ax=ax)
interval = np.linspace(-0.7, 0.7, num=101)
# Lower edge of the shaded band is the x-axis, upper edge is the density.
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                facecolor='yellow', hatch='/')


In [None]:
fill_1

The area between $z=-0.7$ and $z=0.7$ is 51.61%. This is the 
entry in Table A-104.

Other areas can be computed by symmetry and other considerations.

In [None]:
%%capture
# Shade the area under the normal curve from z = -0.7 to z = 0.
fill_2 = plt.figure(figsize=(6, 6))
ax = normal_curve()
interval = np.linspace(-0.7, 0., num=101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                facecolor='yellow', hatch='/')


In [None]:
fill_2

The area between $z=-0.7$ and $z=0$ is ?????


In [None]:
%%capture
# Shade the right tail beyond z = 0.7; the grid stops at z = 4, which stands
# in for infinity on the plot.
fill_3 = plt.figure(figsize=(6, 6))
ax = normal_curve()
interval = np.linspace(0.7, 4, num=101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                facecolor='yellow', hatch='/')


In [None]:
fill_3

The area between $z=0.7$ and $z=\infty$ is ?????


In [None]:
%%capture
# Shade both tails: z > 0.7 and, by mirroring the same grid, z < -0.7.
fill_4 = plt.figure(figsize=(6, 6))
ax = normal_curve()
interval = np.linspace(0.7, 4, num=101)
for tail in (interval, -interval):
    ax.fill_between(tail, 0 * tail, ndist.pdf(tail),
                    facecolor='#820000', hatch='/')


In [None]:
fill_4

<font color="#820000">
Area = ????
</font>

In [None]:
%%capture
# Shade everything to the left of z = 0.7 (the grid starts at z = -4).
fill_5 = plt.figure(figsize=(6, 6))
ax = normal_curve()
interval = np.linspace(-4, 0.7, num=101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                facecolor='#820000', hatch='/')


In [None]:
fill_5

<font color="#820000">
Area = ????
</font>

In [None]:
%%capture
# Shade the area between z = -2 and z = 1.  The figure is shown again in the
# heights example, where 62 and 71 inches (average 68, SD 3) standardize to
# exactly these endpoints.
height_fig = plt.figure(figsize=(6, 6))
ax = normal_curve()
interval = np.linspace(-2, 1, num=101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                facecolor='#820000', hatch='/')


In [None]:
height_fig

It will be easiest to first break the region into two parts,
each of which we can use Table A-104 for.


In [None]:
%%capture
# Same region as before, split at z = 0 into two pieces that can each be read
# off the table: [-2, 0] in blue and [0, 1] in green.
height_fig_split = plt.figure(figsize=(6, 6))
ax = normal_curve()
for (z_lo, z_hi), piece_hatch, piece_color in [((-2, 0), '+', 'blue'),
                                               ((0, 1), 'o', 'green')]:
    interval = np.linspace(z_lo, z_hi, 101)
    ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                    hatch=piece_hatch, facecolor=piece_color)


In [None]:
height_fig_split

<font color="green">
Area = 68%/2 = 34%
</font>
<br>
<font color="blue">
Area = 95%/2 = 47.5%
</font>
<br>

## Normal approximation for data

- If we assume data follow the normal curve, then we can use Table
A-104 to approximate percentages.


### Example: heights of freshmen at Stanford

- The heights of the male freshmen at Stanford averaged 68 inches with an SD of 3 inches. 

- Use the normal curve to estimate the percentage of these men between 62 inches and 71 inches.



### Normal approximation for data

#### Step 1: draw the interval

In [None]:
%%capture

# Step 1 sketch, drawn in hand-drawn xkcd style: the raw interval in inches.
with plt.xkcd():
    bar_fig = plt.figure(figsize=(10, 3))
    ax = bar_fig.gca()
    # 62, 71, 68, 3 are the example's interval endpoints, average and SD;
    # presumably the helper takes them in that order -- verify in code.week1.
    standardize_interval(62, 71, 68, 3,
                         facecolor='#820000', units='Inches')
    

In [None]:
bar_fig

#### Step 2: add the average to the interval

In [None]:
%%capture
# Step 2 sketch: the same interval, now asking the helper to include the mean.
with plt.xkcd():
    bar_fig = plt.figure(figsize=(10, 3))
    ax = bar_fig.gca()
    standardize_interval(62, 71, 68, 3,
                         facecolor='#820000',
                         include_mean=True,
                         units='Inches')
    

In [None]:
bar_fig

#### Step 3: standardize the endpoints

In [None]:
%%capture
# Step 3 sketch: same interval and mean, with the helper's standardized
# option switched on.
with plt.xkcd():
    bar_fig = plt.figure(figsize=(10, 3))
    ax = bar_fig.gca()
    standardize_interval(62, 71, 68, 3,
                         facecolor='#820000',
                         include_mean=True,
                         standardized=True,
                         units='Inches')
    

In [None]:
bar_fig

#### Step 4: look up percentage using Table A-104


In [None]:
height_fig

In [None]:
# Title the split figure with the area between z = -2 and z = 1 computed
# from the normal CDF, then display it.
a = height_fig_split.gca()
# Round before formatting: '%d' on its own truncates, turning 81.86 into 81
# instead of 82.  np.round matches the later SAT title cell.
a.set_title('Area is approximately %d %%'
            % np.round(100 * (ndist.cdf(1) - ndist.cdf(-2))),
            fontsize=20, color='red')
height_fig_split

### The normal approximation doesn’t always work

In [None]:
%%capture
# A two-humped sample for which the normal approximation is a poor fit.
with plt.xkcd():
    bimodal = plt.figure(figsize=(6, 6))
    # 50000 t-distributed draws around 0, then 30000 more shifted up by 7.
    left_hump = np.random.standard_t(40, size=50000)
    right_hump = np.random.standard_t(30, size=30000) + 7
    sample = list(left_hump) + list(right_hump)
    stylized_density(sample, ax=bimodal.gca())


In [None]:
bimodal

In [None]:
%%capture
# Rebuild an approximate sample of household incomes from a table of binned
# counts, then hand it to the course helper sample_density for plotting.
income_fig = plt.figure(figsize=figsize)
data = csv2rec('data/householdincome2006.csv', delimiter=';')
# Bin edges in dollars: 10k steps up to 100k, then 150k, 200k and 300k.
breaks = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300]) * 1000
sample = []
for bin_lo, bin_hi in zip(breaks, breaks[1:]):
    # Table rows whose (lower, upper) range overlaps this bin.
    in_bin = (data['lower'] < bin_hi) & (data['upper'] >= bin_lo)
    # Spread that many households uniformly across the bin.
    n_households = int(data['count'][in_bin].sum())
    sample.extend(np.random.uniform(bin_lo, bin_hi, size=n_households))
# Work in thousands of dollars from here on.
sample = np.array(sample) / 1000
# The last edge is already 300000, so no further adjustment of `breaks` is needed.
ax, density, CDF = sample_density(sample, bins=breaks / 1000.,
                                  histtype='stepfilled', facecolor='gray')
ax.set_xlabel('Income (1000$)')


In [None]:
income_fig

## Percentiles

* The 1st percentile of a histogram is the number with 1 % of the area to the left and 99 % of the area to the right.
* The 10th percentile of a histogram is the number with 10 % of the area to the left and 90 % of the area to the right.
* **The median is the 50th percentile.**
* The **first quartile**
   is the 25th percentile; the **third quartile**
   is the 75th percentile.
* The **inter-quartile range**
   is the difference between the third and first quartiles.

### Percentiles

Let's look at the percentiles of our income histogram.

In [None]:
# (percentile, value) pairs read off the income density; the values are
# truncated to whole numbers by int().
[(pct, int(find_percentile(density, pct / 100.)))
 for pct in (2, 25, 50, 75, 90, 99)]

The interquartile range is 58,000. The median is 47,000.

### Percentiles for the normal curve

In [None]:
# (percentile, z-value) pairs for the standard normal curve.
[(pct, ndist.ppf(pct / 100.))
 for pct in (2, 25, 50, 75, 90, 95, 97.5, 99)]

The interquartile range is about 1.35. The median is 0.

In [None]:
%%capture
# Shade the 95% of the area lying to the left of the 95th percentile and
# point an arrow at that percentile on the z-axis.
percentile_fig = plt.figure(figsize=(10, 10))
ax = normal_curve()
z_95 = ndist.ppf(0.95)
interval = np.linspace(-4, z_95, 101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                alpha=0.5, color='green', hatch='+')
ax.set_title('The green area is 95%', fontsize=20, color='green')
# Label text sits just below the axis, centred under the arrow tip.
ax.annotate('95th percentile=1.65',
            xy=(z_95, 0),
            xytext=(z_95, -0.1),
            fontsize=20,
            arrowprops={'facecolor': 'black'},
            horizontalalignment='center')

In [None]:
percentile_fig

In [None]:
# Add the complementary right tail (above the 95th percentile) in red to the
# axes left over from the previous figure cell, and retitle.
interval = np.linspace(ndist.ppf(0.95), 4, num=101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                alpha=0.5, color='red', hatch='o')
ax.set_title('The red area is 5%', fontsize=20, color='red')


In [None]:
percentile_fig

In [None]:
%%capture
# Shade the 97.5% of the area lying to the left of the 97.5th percentile and
# point an arrow at that percentile on the z-axis.
percentile_fig2 = plt.figure(figsize=(10, 10))
ax = normal_curve()
z_975 = ndist.ppf(0.975)
interval = np.linspace(-4, z_975, 101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                alpha=0.5, color='green', hatch='+')
ax.set_title('The green area is 97.5%', fontsize=20, color='green')
ax.annotate('97.5th percentile=1.96',
            xy=(z_975, 0),
            xytext=(z_975, -0.1),
            fontsize=20,
            arrowprops={'facecolor': 'black'},
            horizontalalignment='center')

In [None]:
percentile_fig2

In [None]:
# Add the complementary right tail (above the 97.5th percentile) in red to
# the axes left over from the previous figure cell, and retitle.
interval = np.linspace(ndist.ppf(0.975), 4, num=101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                alpha=0.5, color='red', hatch='o')
ax.set_title('The red area is 2.5%', fontsize=20, color='red')


In [None]:
percentile_fig2

In [None]:
%%capture
# Shade the central 95%: from the 2.5th to the 97.5th percentile.
percentile_fig3 = plt.figure(figsize=(10, 10))
ax = normal_curve()
z_upper = ndist.ppf(0.975)
interval = np.linspace(ndist.ppf(0.025), z_upper, 101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                alpha=0.5, color='#1C2045', hatch='+')
ax.set_title('The blue area is 95%', fontsize=20, color='#1C2045')
ax.annotate('97.5th percentile=1.96',
            xy=(z_upper, 0),
            xytext=(z_upper, -0.1),
            fontsize=20,
            arrowprops={'facecolor': 'black'},
            horizontalalignment='center')

In [None]:
percentile_fig3

In [None]:
# Add both outer tails in red to the axes left over from the previous figure
# cell: first above the 97.5th percentile, then below the 2.5th.
for z_from, z_to in [(ndist.ppf(0.975), 4), (-4, ndist.ppf(0.025))]:
    interval = np.linspace(z_from, z_to, 101)
    ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                    alpha=0.5, color='#8c1515', hatch='o')

ax.set_title('Each red area is 2.5%', fontsize=20, color='#8c1515')


In [None]:
percentile_fig3

### Estimating percentiles

- Among all applicants to Stanford the Math SAT scores averaged 560 with an SD of 100. The scores followed the normal distribution quite well. 

- Estimate the 90th percentile of the score distribution.

- **The estimated percentile is**

     560 + 100 * 90th percentile of normal curve 
        = 560 + 100 * 1.28 
        = 688
     
**This is the inverse of forming standardized units.**
   
- We find the 90th percentile of the normal curve by looking at Table A-104 and finding an entry
with Area about 80%. **Why?** 

- Above, my calculator computed this to be 1.28.

### Example: SAT scores

- SAT scores usually follow a normal curve. 

- Assume the average score is 1100 with an SD of 150. 

- **What percent of students score above 1400?**

As we are told the scores follow a normal curve, we
convert 1400 to standardized units and find the area
above 1400.

In [None]:
%%capture
# xkcd-style sketch for the "above 1400" question; 1400, 1100 and 150 are the
# cutoff, average and SD from the example.
with plt.xkcd():
    SAT_fig = plt.figure(figsize=(10, 5))
    standardize_right(1400, 1100, 150, units='SAT')

In [None]:
SAT_fig

In [None]:
%%capture
# Same sketch as before, with the helper's standardized option switched on.
with plt.xkcd():
    SAT_fig = plt.figure(figsize=(10, 6))
    standardize_right(1400, 1100, 150,
                      units='SAT', standardized=True)

In [None]:
SAT_fig

In [None]:
%%capture
# Shade the right tail beyond z = 2, i.e. (1400 - 1100) / 150 in standard units.
percentile_fig4 = plt.figure(figsize=figsize)
ax = normal_curve()
interval = np.linspace(2., 4, num=101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                alpha=0.5, color='green', hatch='o')
ax.set_title('Area is about 2.5%', fontsize=20, color='green')

In [None]:
percentile_fig4

### Example: SAT scores

- **What percent of students score between 1000 and 1200?**



In [None]:
%%capture
# xkcd-style sketch for the "between 1000 and 1200" question (average 1100,
# SD 150 as in the example).
with plt.xkcd():
    SAT_fig = plt.figure(figsize=(12, 7))
    standardize_interval(1000, 1200, 1100, 150, units='SAT')

In [None]:
SAT_fig

In [None]:
%%capture
# Same sketch with the helper's standardized option switched on.  The float
# literals 1000. and 1100. are kept exactly as in the original call.
with plt.xkcd():
    SAT_fig = plt.figure(figsize=(12, 7))
    standardize_interval(1000., 1200, 1100., 150,
                         fontsize=14,
                         standardized=True,
                         units='SAT')

In [None]:
SAT_fig

In [None]:
%%capture
# Shade between z = -0.67 and z = 0.67 (100 / 150 in standard units) and put
# the rounded area, computed from the normal CDF, in the title.
percentile_fig5 = plt.figure(figsize=figsize)
ax = normal_curve()
interval = np.linspace(-0.67, 0.67, num=101)
ax.fill_between(interval, 0 * interval, ndist.pdf(interval),
                alpha=0.5, color='orange', hatch='o')
middle_area = ndist.cdf(0.67) - ndist.cdf(-0.67)
ax.set_title('Area is about %d%%' % np.round(100 * middle_area),
             color='orange', fontsize=20)

In [None]:
percentile_fig5

### Estimating percentiles using normal curve

**What is the 60th percentile of the scores?**

In [None]:
%%capture
# Figure for the 60th-percentile question; 0.6, 1100 and 150 are the target
# fraction, average and SD from the example.
sixty_fig = plt.figure(figsize=figsize)
percentile_figure(0.6, 1100, 150,
                  facecolor='red', hatch='+', units='SAT')

In [None]:
sixty_fig

In [None]:
%%capture
# Same 60th-percentile figure with the helper's standardized option switched on.
sixty_fig = plt.figure(figsize=figsize)
percentile_figure(0.6, 1100, 150,
                  facecolor='red',
                  hatch='+',
                  standardized=True,
                  units='SAT')

In [None]:
sixty_fig