In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from numpy import mean, std, sqrt
from matplotlib.mlab import csv2rec
import matplotlib
from pylab import poly_between

# stats60 specific

from code.week2 import pearson_lee
# Shared (width, height) handed as figsize= to the plt.figure(...) calls in later cells.
figsize = (8,8)

# Regression (Chapters 10-12)

In [None]:
%%capture
# Build the mother/daughter plot on a figure of the shared size.
hfig = plt.figure(figsize=figsize)
height = pearson_lee()
# NOTE(review): assigns the private `_figure` attribute of the object returned by
# pearson_lee(); presumably `.figure` / `.axes` then use this figure -- confirm.
height._figure = hfig
# Select the strip at 65 before `mean_strip` is read below.
height.strip = 65
height.SDline()
# Mark the strip average with a large yellow dot and report it in the title.
height.axes.scatter([65], [height.mean_strip], s=300, c='yellow', label='Average(strip)')
height.axes.set_title('The average within the strip is %0.1f' % height.mean_strip, fontsize=15)
# Set before the legend is created so the legend shows a single marker per scatter entry.
matplotlib.rcParams['legend.scatterpoints'] = 1
height.axes.legend(loc='lower right') 
# Later cells refer to the two columns by these names.
daughter = height.D
mother = height.M

In [None]:
# Bare last expression: displays the figure built in the captured cell above.
height.figure

Note that the average in the strip at 65in is below the SD line. Why?


## Regression line

- Instead of the SD line we choose a line that minimizes the "vertical distances" from each point to the line.
- The quality of a line is measured by the r.m.s. of these distances, called **residuals**.
  
- Each choice of slope / intercept yields a new set of residuals.
- The regression line has the residuals with smallest r.m.s.
- This is using the **method of least squares**
   to choose the slope and intercept.

In [None]:
# Candidate line: daughter = 20 + 0.5 * mother.
intercept, slope = 20, 0.5
fitted = intercept + slope * mother
residuals = daughter - fitted
# r.m.s. of the residuals; left as the last expression so the notebook displays it.
sqrt(sum(residuals**2) / len(mother))

In [None]:
# Second candidate: same slope, intercept raised to 30.
intercept, slope = 30, 0.5
fitted = intercept + slope * mother
residuals = daughter - fitted
# r.m.s. of the residuals for this choice of line (displayed as the cell output).
sqrt(sum(residuals**2) / len(mother))

### Intercept=4, Slope=0.3

In [None]:
%%capture
# Scatter plot of the small sample data set used in the next few cells.
sample_fig = plt.figure(figsize=figsize)
sample_ax = sample_fig.gca()

# Only the 'x' and 'y' columns are needed; the loaded table itself is dropped.
sample_table = csv2rec('data/sample_regression.csv')
X, Y = sample_table['x'], sample_table['y']
del sample_table

sample_ax.scatter(X, Y, c='yellow', s=150)
sample_ax.set_xlabel('X', fontsize=15)
sample_ax.set_ylabel('Y', fontsize=15)

In [None]:
# Display the sample scatter plot.
sample_fig

In [None]:
# Candidate line y = intercept + slope * x on the sample data.
# Change these two numbers to try another line: the arrows, the drawn line and
# the title are all derived from them.
slope, intercept = 0.3, 4
a = slope*X + intercept  # value of the line at each X

# One vertical red arrow per point, from the point to the line (the residual).
for x_i, y_i, line_i in zip(X, Y, a):
    sample_ax.arrow(x_i, y_i, 0, line_i - y_i, color='red')

SSE = np.sum((a-Y)**2)

# The line is drawn from `slope` / `intercept` instead of repeating the literals
# 0.3 and 4, so it cannot drift out of sync with the arrows and the title.
sample_ax.plot([X.min(), X.max()],
               [slope*X.min()+intercept, slope*X.max()+intercept],
               'r-', linewidth=3)
sample_ax.set_title('Error(slope=%0.1f, intercept=%0.1f): %0.2f' % (slope, intercept, sqrt(SSE / X.shape[0])),
             fontsize=15)

In [None]:
# Display sample_fig again, now carrying the red line, arrows and error title added above.
sample_fig


The error is the r.m.s. of the **lengths** of the vertical arrows.


In [None]:
# The same error computed directly: r.m.s. of the vertical gaps to the line y = 0.3*x + 4.
vertical_gaps = Y - (0.3*X + 4)
sqrt(sum(vertical_gaps**2) / len(X))

## SD line

In [None]:
%%capture
# Same scatter on a fresh figure, this time with the SD line.
SD_fig = plt.figure(figsize=figsize)
sample_ax = SD_fig.gca()
sample_ax.scatter(X,Y, c='yellow', s=150)
sample_ax.set_xlabel('X', fontsize=15)
sample_ax.set_ylabel('Y', fontsize=15)

# SD line: slope SD(Y)/SD(X), intercept chosen so it passes through (mean X, mean Y).
SDslope = Y.std() / X.std()
SDintercept = Y.mean() - SDslope * X.mean()
a = SDslope*X + SDintercept

# Blue vertical arrow from every point to the SD line.
for x_i, y_i, line_i in zip(X, Y, a):
    sample_ax.arrow(x_i, y_i, 0, line_i - y_i, color='blue')

SSE = np.sum((a-Y)**2)

# Draw the line across the observed X range and report its r.m.s. error.
x_lo, x_hi = X.min(), X.max()
sample_ax.plot([x_lo, x_hi],
               [SDslope*x_lo+SDintercept, SDslope*x_hi+SDintercept],
               color='blue', linestyle='-', linewidth=3)
sample_ax.set_title('Error(SD line): %0.2f' % np.sqrt(SSE / X.shape[0]), fontsize=15)

In [None]:
# Display the SD-line figure.
SD_fig

## Regression line

- The **regression line** is the line whose slope and intercept have the smallest
`r.m.s.` for the vertical deviations from that line.

- Define the **point of averages** of two lists $X$ and $Y$ as

      point of averages(X, Y) = (average(X), average(Y))

- The regression line passes through the point of averages and has slope:
$$\text{slope} = r(X,Y) \times \frac{SD(Y)}{SD(X)}.$$

- The intercept of the regression line is
 
      intercept = average(Y) - slope * average(X)

In [None]:
# Mark (average(X), average(Y)) with a big red '+'. `sample_ax` is whichever axes
# the most recently run figure cell bound to that name.
point_of_averages = (mean(X), mean(Y))
avg_x, avg_y = point_of_averages
sample_ax.scatter(avg_x, avg_y,
                  marker='+', color='red', s=500, linewidth=5,
                  label='Point of averages')
sample_ax.legend(loc='lower right')


In [None]:
# Display SD_fig again after the point of averages and legend were added.
SD_fig

### Fit for regression line

In [None]:
%%capture
# Fresh figure with the same scatter, now with the regression line.
reg_fig = plt.figure(figsize=figsize)
sample_ax = reg_fig.gca()
sample_ax.scatter(X,Y, c='yellow', s=150)
sample_ax.set_xlabel('X', fontsize=15)
sample_ax.set_ylabel('Y', fontsize=15)

# slope = r * SD(Y) / SD(X); the intercept puts the line through the point of averages.
r_xy = np.corrcoef([X,Y])[0,1]
reg_slope = r_xy * Y.std() / X.std()
reg_intercept = Y.mean() - reg_slope * X.mean()
a = reg_slope*X + reg_intercept

# Red vertical arrow from every point to the regression line.
for x_i, y_i, line_i in zip(X, Y, a):
    sample_ax.arrow(x_i, y_i, 0, line_i - y_i, color='red')

SSE = np.sum((a-Y)**2)

# Draw the line across the observed X range and report its r.m.s. error.
x_lo, x_hi = X.min(), X.max()
sample_ax.plot([x_lo, x_hi],
               [reg_slope*x_lo+reg_intercept, reg_slope*x_hi+reg_intercept],
               color='red', linestyle='-', linewidth=3)
sample_ax.set_title('Error(regression line): %0.2f' % np.sqrt(SSE / X.shape[0]), fontsize=15)


In [None]:
# Display the regression-line figure.
reg_fig

The regression line has better `r.m.s.` for the vertical deviations!

In this example, the difference is not huge. Let's look back at the 
mother / daughter heights.

In [None]:
%%capture

def error(D, a, b, M=mother):
    """Return the r.m.s. of the residuals D - (a*M + b).

    Parameters
    ----------
    D : array
        Observed values of the dependent variable.
    a, b : float
        Slope and intercept of the line being scored.
    M : array, optional
        Values of the independent variable. Defaults to the `mother` array as it
        is when this cell runs, so existing `error(D, a, b)` calls keep working
        without depending on a notebook-global named `M`.
    """
    F = a*M+b
    return np.sqrt(np.sum((D-F)**2)/D.shape[0])

# Correlation between daughters' and mothers' heights.
r = np.corrcoef([daughter, mother])[0,1]

# SD line: slope SD(D)/SD(M), through the point of averages.
slope_SD = daughter.std() / mother.std()
intercept_SD = daughter.mean() - slope_SD * mother.mean()

# Regression line: slope r * SD(D)/SD(M), through the point of averages.
# slope_r / intercept_r are reused by the residual plot near the end of the notebook.
slope_r = r * daughter.std() / mother.std()
intercept_r = daughter.mean() - slope_r * mother.mean()

height.axes.set_title('Error(SD line)=%0.1f, Error(regression)=%0.1f' %
            (error(daughter, slope_SD, intercept_SD),
            error(daughter, slope_r, intercept_r)), fontsize=15)
height.regline(draw=False)
height.axes.legend(loc=('upper left'))

In [None]:
# Display the mother/daughter figure with the updated title and legend.
height.figure

Let's look at those strip averages we computed in the notes on correlation.

In [None]:
# Average daughter height within the strip at each mother height from 56 to 68.
mother_heights = range(56,69)
averages = []
# The loop variable is deliberately NOT called `mother`: that name holds the
# mothers' height array used by other cells and must not be overwritten by an int.
for strip_height in mother_heights:
    height.strip = strip_height
    averages.append(height.mean_strip)
height.axes.scatter(mother_heights, averages, s=300, c='yellow')

In [None]:
# Display the figure again with the strip averages drawn on it.
height.figure

**The regression line almost sits on top of the strip averages!**


## Working with regression problems

 The following quantities are enough to do all regression problems
* $\text{average}(X), \text{SD}(X)$
* $\text{average}(Y), \text{SD}(Y)$
* $r(X,Y)$

### Blood alcohol example

It is believed that the more alcohol there is in a person’s bloodstream, the slower that person’s reaction time is. An experiment with 10 subjects yields
- average amount of alcohol in blood $0.14\%$ with SD $0.04\%$;
- average reaction time 0.42 seconds, SD 0.1 seconds;
- correlation coefficient 0.8.

**Predict the reaction time of a person with an amount of alcohol of 0.22%.**

#### Answer

- We first compute the slope, intercept 
    $$\begin{aligned}
     \text{slope} &= \frac{0.8 \times 0.1}{0.04} = 2.0 \frac{\text{seconds}}{\%}     \\
     \text{intercept} &= 0.42 - 2. \times 0.14 = 0.14 \, \text{seconds}
     \end{aligned}$$ 
     
     Plugging in an alcohol level of 0.22 yields a predicted time of
     $$2 \times 0.22 + 0.14 = 0.58 \, \text{seconds}.$$
     
- Another way to arrive at the same answer is to note that $(0.14,0.42)$
is the point of averages, which is on the regression line. For any other value of blood alcohol, $B$, the corresponding $y$-value (reaction time) on the regression line is 
$$
0.42 + (B - 0.14) \times 2.
$$
Substituting $B=0.22$ yields
$$
0.42 + (0.22-0.14) \times 2 = 0.42 + 0.16 = 0.58.
$$

### Blood alcohol example

**Find the regression line for regressing reaction time on alcohol level.**


#### Answer

Having already computed the slope and intercept of this line, the regression line is described by the equation

       reaction_time = 0.14 + 2 * alcohol level


### Blood alcohol example

**Predict the reaction time of a person whose alcohol level is at the 20th percentile. What percentile does that correspond to in reaction time?**

#### Answer

The 20th percentile of blood alcohol is (using normal approximation
and the [tail normal table](http://www.stanford.edu/class/stats60/Tables/Tail%20normal%20table.html)) $$\begin{aligned}
     0.14 + 0.04 \times \text{20th percentile of normal}
     &  = 0.14 + 0.04 \times (-0.84) \\
     & = 0.11
     \end{aligned}$$ 
     
So, we predict a reaction time of **$0.14 + 2 \times 0.11 = 0.36$.**

The second part of the question asks us to find roughly what percentile this corresponds to. We will use the normal approximation. 

Converting to standardized units, we see a reaction time of 0.36 is
$$
\frac{0.36-0.42}{0.1} = -0.6.
$$
The [tail normal table](http://www.stanford.edu/class/stats60/Tables/Tail%20normal%20table.html)  tells us this is roughly the 27th percentile.

### Blood alcohol example

**Predict the amount of alcohol a person has in her bloodstream if the reaction time is 0.37 seconds.**

#### Answer

We must find the regression line for regressing `blood_alcohol` on `reaction time.`

It passes through the point of averages and has slope

       slope = 0.8 * 0.04 / 0.1 = 0.32
       
The intercept is

       intercept = 0.14 - 0.32 * 0.42 = 0.006
       
Substituting `reaction_time=0.37` yields 

        blood_alcohol = 0.006 + 0.32 * 0.37 = 0.124.

## Regression fallacy

* Note that someone in the 20th percentile of `blood alcohol` had predicted 27th percentile in `reaction time`.

* This is a general phenomenon; Galton referred to it as "regression to mediocrity."

### Test-retest version of regression fallacy (from book)

In a test-retest situation, usually the bottom group on the 
first test will show some improvement, and the top group will fall back.

It can also be seen from the regression of `daughter` on `mother`...

In [None]:
# Display the mother/daughter figure once more for the regression-fallacy discussion.
height.figure

The height of a daughter whose mother is 2 SD above the average height of mothers is predicted to be only $0.49 \times 2 \approx 1$ SD above the average height of daughters.

## There are two regression lines:

- One line has `mother` as independent variable, `daughter` as dependent variable.

- The other line has `daughter` as independent variable, `mother` as dependent variable.

In [None]:
%%capture
# Start again from a fresh object: this rebinds `height`, so the annotated figure
# from the earlier cells is no longer reachable through this name.
height = pearson_lee()
height.SDline()
# NOTE(review): regline / invregline are project helpers; judging by the labels they
# add the daughter-on-mother and mother-on-daughter regression lines -- confirm.
height.regline(draw=False, label='D on M')
height.invregline(draw=False, label='M on D')
height.axes.legend(loc='lower right')


In [None]:
# Display the new figure with the SD line and the two labelled lines.
height.figure

## Prediction and regression

- While there are two regression lines, the right way to remember which to use is **what do you want to predict?**
- The variable you want to predict goes on the vertical axis ($Y$-axis).
- The variable you want to base your prediction on goes on the horizontal axis ($X$-axis).
- In many situations, it will be more natural to predict one variable instead of another.

# Accuracy of prediction (Chapter 11)

- In discussing experiments, we discussed the average of a set of measurements.

- These can be used to predict a new measurement: our best guess is just the average of the previous measurements.

- A similar calculation is possible with regression.

## SD as a measure of accuracy

- The SD of the set of measurements gives us some idea of how accurate our prediction is 
$$\text{SD(list) = r.m.s.(deviations from average)}$$
- If our prediction is the average, then 
$$\text{SD(list) = r.m.s.(deviations from predictions)}$$
- With regression, we have a new way to predict: using the regression line.

## Accuracy of prediction in regression

- In regression, we are using more information: the fact that tall mothers tend to give birth to taller daughters (but not quite as tall).
- This should improve the accuracy of our prediction of the daughter’s height (which uses the mother’s height).
- Our regression line was chosen to minimize
   the r.m.s. of the residuals among all lines
   $$
   \begin{aligned}
   \text{r.m.s.}(\text{regression $Y$ on $X$}) &= \text{r.m.s.(residuals)} \\
   &= \sqrt{\text{average}(\text{residuals}^2)} \\
   \end{aligned}
   $$
- This r.m.s. is never larger than the SD of
   the dependent variable ($Y$) alone (and is strictly smaller whenever $r \neq 0$)
   $$
   \text{r.m.s.}(\text{regression $Y$ on $X$}) = \sqrt{1-r^2} \times \text{SD}(Y)
   $$
- In the mother/daughter example, this factor is $\sqrt{1 - 0.49^2} \approx 87\%$, so the r.m.s. of the regression is about $0.87 \times 2.6 \approx 2.3$ inches.

### SD is based on residuals

In [None]:
%%capture
# Predict every Y by average(Y): the "line" is horizontal and its r.m.s. error is SD(Y).
mean_fig = plt.figure(figsize=figsize)
sample_ax = mean_fig.gca()
sample_ax.scatter(X,Y, c='yellow', s=150)
sample_ax.set_xlabel('X', fontsize=15)
sample_ax.set_ylabel('Y', fontsize=15)

meanY = np.mean(Y)

# Blue vertical arrow from every point to the horizontal line at average(Y).
for x_i, y_i in zip(X, Y):
    sample_ax.arrow(x_i, y_i, 0, meanY - y_i, color='blue')

SSE = np.sum((meanY-Y)**2)

x_span = [X.min(), X.max()]
sample_ax.plot(x_span, [meanY, meanY],
               color='blue', linestyle='-', linewidth=3)
sample_ax.set_title('Error(average line): %0.2f' % np.sqrt(SSE / X.shape[0]), fontsize=15)


In [None]:
# Display the average-line figure.
mean_fig

### So is r.m.s.(regression)

In [None]:
# Display the regression-line figure again, for comparison with the average-line figure.
reg_fig

## r.m.s. of regression

- If the data cloud is football shaped, then the r.m.s.
of residuals gives an estimate of the spread in each vertical strip around the regression line.
- This is *homoscedastic*
   scatter.
- If the data cloud has a different shape, the scatter is called *heteroscedastic* and the regression r.m.s. is not useful within a vertical strip.

### Homoscedastic scatter

In [None]:
%%capture
# Simulated football-shaped cloud: the noise has the same SD (w) at every X.
homoscedastic = plt.figure(figsize=figsize)
ax = homoscedastic.gca()
n = 300
# Seeded, cell-local generator: the figure is the same on every Restart & Run All
# and the global np.random state is left untouched.
rng_homo = np.random.RandomState(0)
Xhomo = rng_homo.standard_normal(n)
Xhomo.sort()
w = 3
Yhomo = 4.5 * Xhomo + 1 + rng_homo.standard_normal(n) * w

# Two hatched vertical strips (blue around x=-1, green around x=+1) to compare spread.
xf, yf = poly_between([-1.25,-0.75], [-20,-20], [20, 20])
ax.fill(xf, yf, facecolor='blue', alpha=0.4, hatch='/', label='_nolegend_')

xf, yf = poly_between([0.75,1.25], [-20,-20], [20, 20])
ax.fill(xf, yf, facecolor='green', alpha=0.4, hatch='/', label='_nolegend_')

ax.set_yticks([])
ax.set_xticks([])
ax.scatter(Xhomo, Yhomo, c='red')
ax.set_ylim([-12,17])
ax.set_title('Homoscedastic: football shaped', fontsize=15, color='green')

In [None]:
# Display the homoscedastic example.
homoscedastic

The spread (as measured by SD) within the green and blue strips is about the same.

### Heteroscedastic scatter

In [None]:
%%capture
# Simulated cloud whose noise SD grows from 1 to 6 along the sorted X values.
n = 300
# Seeded, cell-local generator: the figure is the same on every Restart & Run All
# and the global np.random state is left untouched.
rng_hetero = np.random.RandomState(1)
Xhetero = rng_hetero.standard_normal(n)
Xhetero.sort()
w = np.linspace(1,6,n)
Yhetero = 0.5 * Xhetero + 1 + rng_hetero.standard_normal(n) * w

heteroscedastic = plt.figure(figsize=figsize)
ax = heteroscedastic.gca()

# Two hatched vertical strips (blue around x=-1, green around x=+1) to compare spread.
# Drawn through `ax` (not the pyplot state machine), like the homoscedastic cell.
xf, yf = poly_between([-1.25,-0.75], [-20,-20], [20, 20])
ax.fill(xf, yf, facecolor='blue', alpha=0.4, hatch='/', label='_nolegend_')

xf, yf = poly_between([0.75,1.25], [-20,-20], [20, 20])
ax.fill(xf, yf, facecolor='green', alpha=0.4, hatch='/', label='_nolegend_')

ax.set_yticks([])
ax.set_xticks([])
ax.scatter(Xhetero, Yhetero, c='red')
ax.set_ylim([-10,15])
ax.set_title('Heteroscedastic: NOT football shaped', fontsize=15, color='red')

In [None]:
# Display the heteroscedastic example.
heteroscedastic

Spread (as measured by SD) is different between green and blue strips.

## Example: using regression r.m.s. in vertical strips

* Given the following information:


Variable | Average | SD
---------|---------|----
mother   | 62.4in    | 2.3in
daughter   | 63.8in   | 2.6in

Correlation $r=0.49$.

**Of mothers of height 66in, what percentage of their daughters will have height above 67in?**

### Answer

* Slope of regression line is $$\text{slope} = 0.49 \times \frac{2.6}{2.3} = 0.54$$
* The average height of daughters of mothers of height 66in is $$63.8 + 0.54 \times (66 - 62.4) = 65.7$$
* The SD is taken to be r.m.s. of regression $$\sqrt{1 - 0.49^2} \times 2.6 = 0.87 \times 2.6 = 2.3.$$
* 67 in corresponds to 
$$(67-65.7)/2.3=0.6$$ standardized units.
* From the normal table (like Table A-104), the percentage is roughly 27%.

### Transforming variables

In [None]:
%%capture
# Wage against years of education, on the original (dollar) scale.
wage_fig = plt.figure(figsize=figsize)
wage_ax = wage_fig.gca()
wage = csv2rec('data/wage.csv')

# The file stores log(wage); exponentiate to get back to dollars.
education_years = wage['education']
wage_dollars = np.exp(wage['logwage'])
wage_ax.scatter(education_years, wage_dollars, facecolor='red', s=50)

wage_ax.set_xlabel('Education (years)', fontsize=20)
wage_ax.set_ylabel('Wage ($)', fontsize=20)
wage_ax.set_title('SD seems to vary by strips', fontsize=15, color='red')


In [None]:
# Display the wage-vs-education scatter plot.
wage_fig

A transformation such as `log` of dependent or independent variable
can sometimes make point cloud look more football shaped.

In [None]:
%%capture
# Same data with the dependent variable left on the log scale.
logwage_fig = plt.figure(figsize=figsize)
logwage_ax = logwage_fig.gca()

education_years = wage['education']
log_wage = wage['logwage']
logwage_ax.scatter(education_years, log_wage, facecolor='red', s=50)

logwage_ax.set_xlabel('Education (years)', fontsize=20)
logwage_ax.set_ylabel('log(Wage) (log $)', fontsize=20)
logwage_ax.set_title('More football shaped', fontsize=15, color='green')


In [None]:
# Display the log-wage scatter plot.
logwage_fig

## Other uses of residuals

- Recall that the residuals are the deviations from  the 
regression line.

- Besides taking their r.m.s. to estimate SD within a strip, 
they tell us whether our regression model is reasonable or not.

- With only one independent variable, we can see reasonably well
from the scatter plot, but it is possible to have *more than one* independent
variable (c.f. [STATS191](http://stats191.stanford.edu)).

- A common thing to do is to plot the residuals.

In [None]:
%%capture
# Load the quadratic example. This rebinds X and Y (previously the sample data).
D = csv2rec('data/quadratic_example.csv')
quadratic = plt.figure(figsize=figsize)
ax = quadratic.gca()
X, Y = D['x'], D['y']
ax.scatter(X,Y, c='r', s=40)

# Slope and intercept of the regression of Y on X; the residual plot in a later cell uses them.
m = Y.std() * np.corrcoef([X,Y])[0,1] / X.std()
b = Y.mean() - X.mean() * m

# Correlation from the averages; the line is drawn in SD units around the point
# of averages: 3.5 SDs of X on either side, r * 3.5 SDs of Y.
r = ((X*Y).mean() - X.mean() * Y.mean()) / (X.std() * Y.std())
x_ends = [X.mean()-3.5*X.std(), X.mean()+3.5*X.std()]
y_ends = [Y.mean()-r*3.5*Y.std(), Y.mean()+r*3.5*Y.std()]
ax.plot(x_ends, y_ends, '-', linewidth=3, color='black')

ax.set_xlabel('X', fontsize=15)
ax.set_ylabel('Y', fontsize=15)
ax.set_title("Regression line doesn't fit point cloud...", color='red', fontsize=15)

In [None]:
# Display the quadratic example with its regression line.
quadratic

Let's look at the residuals

In [None]:
%%capture
# Residual plot for the quadratic example.
quadratic_resid = plt.figure(figsize=figsize)
ax =  quadratic_resid.gca()
# residual = observed - fitted = Y - (m*X + b). The intercept b is part of the
# fitted value, so it has to be subtracted along with m*X.
ax.scatter(X, Y - (m*X + b), c='r', s=40)
ax.set_xlabel('X', fontsize=15)
ax.set_ylabel('Residuals after regressing Y on X', fontsize=15)

In [None]:
# Display the residual plot for the quadratic example.
quadratic_resid

Compare this to residuals in mother-daughter regression

In [None]:
%%capture
# Residuals of the daughter-on-mother regression, using slope_r / intercept_r
# computed in an earlier cell.
resid = plt.figure(figsize=figsize)
ax = resid.gca()

daughter_residuals = height.D - height.M*slope_r -intercept_r
ax.scatter(height.M, daughter_residuals,
           c='yellow', s=height.marker_size, edgecolor='gray')

ax.set_xlabel("Mother's height (inches)", fontsize=15)
ax.set_ylabel('Residual after regressing Daughter on Mother (inches)', fontsize=15)



In [None]:
# Display the mother/daughter residual plot.
resid