In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def standard_units(arr):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as its deviation from the average of
    ``arr``, measured in multiples of the standard deviation of ``arr``
    (as computed by ``np.std`` with its default settings).
    """
    deviations = arr - np.average(arr)
    return deviations / np.std(arr)

def correlation(t, x, y):
    """Return the correlation coefficient between two columns of a table.

    ``t`` must provide ``column(label)``; ``x`` and ``y`` are the labels of
    the two columns. The result is the average of the element-wise products
    of the two columns after each is converted to standard units.
    """
    products = standard_units(t.column(x)) * standard_units(t.column(y))
    return np.average(products)

def slope(t, x, y):
    """Return the slope of the regression line for estimating y from x.

    Computed as r * (SD of y) / (SD of x), where r is the correlation
    between columns ``x`` and ``y`` of table ``t``.
    """
    # Evaluated left to right, i.e. (r * SD_y) / SD_x.
    return correlation(t, x, y) * np.std(t.column(y)) / np.std(t.column(x))

def intercept(t, x, y):
    """Return the intercept of the regression line for estimating y from x.

    Computed as (mean of y) - slope * (mean of x), so that the line passes
    through the point of averages of columns ``x`` and ``y`` of table ``t``.
    """
    line_slope = slope(t, x, y)
    mean_of_x = np.mean(t.column(x))
    mean_of_y = np.mean(t.column(y))
    return mean_of_y - line_slope * mean_of_x

def fitted_values(t, x, y):
    """Return an array of the regression estimates at all the x values.

    Each estimate is slope * x + intercept, with the slope and intercept
    taken from the regression of column ``y`` on column ``x`` of table ``t``.
    """
    return slope(t, x, y) * t.column(x) + intercept(t, x, y)

def residuals(t, x, y):
    """Return an array of residuals: observed y minus the fitted value.

    The fitted values come from ``fitted_values(t, x, y)``, one per row
    of table ``t``.
    """
    return t.column(y) - fitted_values(t, x, y)

In [None]:
def plot_fitted(t, x, y):
    """Draw a scatter plot of y and of the fitted values against x.

    Builds a three-column table (``x``, ``y``, 'Fitted Value') from ``t``
    and calls ``scatter(0)`` on it, so the first column (``x``) is the one
    passed as the scatter's column argument. Returns nothing.
    """
    fitted = fitted_values(t, x, y)
    xy_with_fit = t.select(x, y).with_columns('Fitted Value', fitted)
    xy_with_fit.scatter(0)

# Variance Decomposition

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
# Read the Galton data and keep the two height columns under shorter labels.
galton = Table.read_table('galton.csv')
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight'),
)
# Attach the regression estimate and its residual for every row.
heights = heights.with_columns(
    'Fitted Value', fitted_values(heights, 'MidParent', 'Child'),
    'Residual', residuals(heights, 'MidParent', 'Child'),
)
heights

In [None]:
# Child height against midparent height, together with the fitted values.
plot_fitted(heights, x='MidParent', y='Child')

In [None]:
# Same plot as above, plus a horizontal line at the average child height.
plot_fitted(heights, x='MidParent', y='Child')
ave_child = np.mean(heights.column('Child'))
# [64, 76] is the hard-coded horizontal extent of the line.
plots.plot([64, 76], [ave_child] * 2);

No matter what the shape of the scatter plot, the variance of the observed values of $y$ is the sum of the variance of the fitted values and the variance of the residuals.

$$
\mbox{variance of }y ~=~ \mbox{variance of fitted value} + \mbox{variance of residual}
$$

In [None]:
# Variance of the residuals, computed directly instead of squaring the SD.
np.var(residuals(heights, 'MidParent', 'Child'))

In [None]:
# Variance of the fitted values, computed directly instead of squaring the SD.
np.var(heights.column('Fitted Value'))

In [None]:
# Variance of the fitted values plus variance of the residuals;
# compare with the variance of the observed 'Child' values.
np.var(heights.column('Fitted Value')) + np.var(residuals(heights, 'MidParent', 'Child'))

In [None]:
# Variance of the observed child heights, computed directly instead of squaring the SD.
np.var(heights.column('Child'))

No matter what the shape of the scatter plot, the SD of the residuals is a fraction of the SD of the observed values of $y$. The fraction is $\sqrt{1-r^2}$, where $r$ is the correlation between $x$ and $y$.

$$
\mbox{SD of residuals} ~=~ \sqrt{1 - r^2} \cdot \mbox{SD of }y
$$

In [None]:
# Correlation between midparent and child heights; used in sqrt(1 - r^2).
r = correlation(heights, x='MidParent', y='Child')
r

In [None]:
# SD of y scaled by sqrt(1 - r^2): the formula's value for the SD of the residuals.
np.std(heights.column('Child')) * np.sqrt(1 - r ** 2)

In [None]:
# SD of the residuals computed directly from the residuals themselves.
np.std(residuals(heights, x='MidParent', y='Child'))

## Discussion question

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
# Part 1
# SD of residuals = sqrt(1 - r^2) * SD of y
# NOTE: this rebinds `r`, replacing the correlation computed from `heights`
# earlier in the notebook; re-run that cell before re-running cells that use it.
r = 0.6
sd_y = 15
# Use the named inputs so that changing r or sd_y actually changes the answer.
sd_resid = np.sqrt(1 - r ** 2) * sd_y
sd_resid

In [None]:
# Part 2
# Hint: Chebychev's Inequality -- at least 1 - 1/2^2 = 75% of the data
# lies within 2 SDs of the mean.
# NOTE(review): 12.0 looks like the residual SD from Part 1 -- confirm.
2 * 12.0