In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def standard_units(x):
    """Convert the values in x to standard units.

    Each value is re-expressed as its distance from the average of x,
    measured in standard deviations of x (np.std, i.e. the population SD).
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

def correlation(t, label_x, label_y):
    """Return the correlation coefficient r between two columns of table t.

    r is the mean of the element-wise products of the two columns after
    each has been converted to standard units.
    """
    x_su = standard_units(t.column(label_x))
    y_su = standard_units(t.column(label_y))
    return np.mean(x_su * y_su)

def slope(t, label_x, label_y):
    """Return the slope of the regression line for predicting label_y from label_x.

    The slope is r * SD(y) / SD(x), where r is the correlation between
    the two columns of table t.
    """
    sd_x = np.std(t.column(label_x))
    sd_y = np.std(t.column(label_y))
    return correlation(t, label_x, label_y) * sd_y / sd_x

def intercept(t, label_x, label_y):
    """Return the intercept of the regression line for predicting label_y from label_x.

    The intercept is mean(y) - slope * mean(x), so the line passes through
    the point of averages of the two columns of table t.
    """
    mean_x = np.mean(t.column(label_x))
    mean_y = np.mean(t.column(label_y))
    return mean_y - slope(t, label_x, label_y) * mean_x

In [None]:
def draw_line(slope=0, intercept=0, x=None, color='r'):
    """Draw the line y = slope*x + intercept on the current matplotlib axes.

    slope, intercept: coefficients of the line to draw.
    x: array of x-values at which to evaluate the line. When None, the
       current x-axis limits of the active axes are used as the endpoints.
    color: color passed through to plots.plot.
    """
    if x is None:
        # axis() returns (xmin, xmax, ymin, ymax); only the x-limits are needed.
        # Building x inside this branch keeps a caller-supplied x intact
        # (previously this ran unconditionally and raised NameError on x1
        # whenever x was passed in).
        x1, x2, _, _ = plots.gca().axis()
        x = make_array(x1, x2)
    y = x*slope + intercept
    plots.plot(x, y, color=color)

In [None]:
def demographics_errors(slope, intercept):
    """Plot the demographics scatter, a candidate line, and four sample errors.

    Draws the 'College%' vs 'Median Income' scatter plot of the global
    `demographics` table, overlays the line y = slope*x + intercept for x
    from 5 to 75, and draws a red vertical segment from each of four fixed
    sample points to that line.
    """
    # Four hard-coded (College%, Median Income) points; the original author
    # describes them as random points from the original data.
    sample_points = [[14.7, 33995], [19.1, 61454], [50.7, 71183], [59.5, 105918]]

    def line_height(x_value):
        # Height of the candidate line at x_value (scalar or array).
        return slope * x_value + intercept

    demographics.scatter('College%', 'Median Income', alpha=0.5)

    # The candidate line with the requested slope and intercept.
    x_range = make_array(5, 75)
    plots.plot(x_range, line_height(x_range), lw=4)

    # One red vertical segment per sample point, from the point to the line.
    for college_pct, income in sample_points:
        plots.plot([college_pct, college_pct],
                   [income, line_height(college_pct)],
                   color='r', lw=4)

# Errors in Estimation

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
def fitted_values(t, x, y):
    """Return an array of the regression estimates at all the x values.

    t is a table; x and y are the labels of the predictor and response
    columns. Each estimate is slope * x_value + intercept for the
    regression line of y on x.
    """
    x_values = t.column(x)
    return slope(t, x, y) * x_values + intercept(t, x, y)

In [None]:
# Load the district demographics table from CSV and preview its first 5 rows.
demographics = Table.read_table('district_demographics2016.csv')
demographics.show(5)

In [None]:
# Keep only the columns used by the cells that follow here. This rebinds
# `demographics`, so the dropped columns are no longer available afterwards.
demographics = demographics.drop('State', 'District', 'Percent voting for Clinton')
demographics.show(5)

In [None]:
# Scatter plot with 'College%' on the x-axis and 'Median Income' on the y-axis.
demographics.scatter('College%', 'Median Income')

In [None]:
# Correlation coefficient r between 'College%' and 'Median Income'.
correlation(demographics, 'College%', 'Median Income')

In [None]:
# Slope and intercept of the regression line for predicting 'Median Income'
# from 'College%'. The trailing tuple is the cell's displayed output.
regression_slope = slope(demographics, 'College%', 'Median Income')
regression_intercept = intercept(demographics, 'College%', 'Median Income')
regression_slope, regression_intercept

In [None]:
# Regression estimate of 'Median Income' for every row of the table.
predicted = fitted_values(demographics, 'College%', 'Median Income')

In [None]:
# Attach the regression estimates as a 'Linear Prediction' column.
# scatter is given only the x label here; presumably datascience then plots
# every remaining column against 'College%' (verify against Table.scatter docs).
demographics = demographics.with_column('Linear Prediction', predicted)
demographics.scatter('College%')

In [None]:
# Error for each row = actual 'Median Income' minus the regression estimate.
actual = demographics.column('Median Income')
errors = actual - predicted

In [None]:
# Display the table with an 'Error' column added. The result is not assigned
# back to `demographics` in this cell.
demographics.with_column('Error', errors)

In [None]:
# Average of the errors (positive and negative errors can cancel out).
np.mean(errors)

In [None]:
# Root mean squared error: square the errors, average them, take the square root.
np.mean(errors ** 2) ** 0.5

In [None]:
# Plot the regression line together with the four sample errors.
demographics_errors(regression_slope, regression_intercept)

In [None]:
# Same plot for a hand-picked line: slope 1500, intercept 20000.
demographics_errors(1500, 20000)

## Takes any slope, any intercept

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
# Same plot for a hand-picked line: slope -1000, intercept 75000.
demographics_errors(-1000, 75000)

## Root Mean Square Error

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
def show_demographics_rmse(slope, intercept):
    """Plot the line with the given slope and intercept and print its RMSE.

    Calls demographics_errors to draw the plot, then computes the root mean
    squared error of the line's predictions of 'Median Income' from
    'College%' over the global `demographics` table and prints it rounded
    to 2 decimal places.
    """
    demographics_errors(slope, intercept)

    college = demographics.column('College%')
    income = demographics.column('Median Income')

    # Residual = actual income minus the height of the line at that x.
    residuals = income - (slope * college + intercept)
    rmse = np.mean(residuals ** 2) ** 0.5
    print("Root mean squared error:", round(rmse, 2))

In [None]:
# Plot and RMSE for a hand-picked line: slope -1000, intercept 75000.
show_demographics_rmse(-1000, 75000)

In [None]:
# Plot and RMSE for a hand-picked line: slope 1500, intercept 20000.
show_demographics_rmse(1500, 20000)

In [None]:
# Plot and RMSE for the regression line itself, for comparison with the
# hand-picked lines.
show_demographics_rmse(regression_slope, regression_intercept)