# Lecture 21 - Residuals 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
plt.style.use('fivethirtyeight')

%matplotlib inline

## Regression line vs. other lines 

In [None]:
def standard_units(x):
    """Rescale x to standard units: subtract its mean, then divide by its SD (np.std)."""
    centered = x - np.mean(x)
    spread = np.std(x)
    return centered / spread

def correlation(df, x, y):
    """Correlation coefficient between df[x] and df[y].

    Both columns are converted to standard units; the result is the mean
    of their element-wise products.
    """
    products = standard_units(df[x]) * standard_units(df[y])
    return np.mean(products)

def slope(df, x, y):
    """Slope of the regression line for df[y] on df[x]: r * SD(y) / SD(x)."""
    # Same left-to-right evaluation as (r * SD(y)) / SD(x).
    return correlation(df, x, y) * np.std(df[y]) / np.std(df[x])
    
def intercept(df, x, y):
    """Intercept of the regression line for df[y] on df[x]: mean(y) - slope * mean(x)."""
    xs, ys = df[x], df[y]
    return np.mean(ys) - slope(df, x, y) * np.mean(xs)

def fitted_values(df, x, y):
    """Regression estimates of df[y] evaluated at every value in df[x].

    Computes slope * df[x] + intercept, so the result has the same shape
    as df[x].
    """
    line_slope, line_intercept = slope(df, x, y), intercept(df, x, y)
    return line_slope * df[x] + line_intercept

### Example: 2016 election dataset 

In [None]:
# Load the 2016 election dataset and preview its first ten rows.
demographics = pd.read_csv(filepath_or_buffer='data/district_demographics2016.csv')
demographics.head(n=10)

In [None]:
# Keep only the predictor and response columns; .copy() so that adding a
# column below does not touch `demographics`.
predict_voting = demographics[['Median Income', 'Percent voting for Clinton']].copy()
# Regression estimate of 'Percent voting for Clinton' at each row's 'Median Income'
# (replaces the `...` placeholder, which stored Ellipsis and broke the plot below).
predict_voting['Fitted'] = fitted_values(
    predict_voting, 'Median Income', 'Percent voting for Clinton')

In [None]:
# Observed values and regression estimates drawn on the same axes.
plt.scatter(predict_voting['Median Income'], predict_voting['Percent voting for Clinton'],
            label='Percent voting for Clinton')
# Legend label fixed: was misspelled 'Fitten Line'.
plt.scatter(predict_voting['Median Income'], predict_voting['Fitted'],
            label='Fitted Line')
plt.xlabel('Median Income')
plt.ylabel('Percent voting for Clinton')
plt.legend();

In [None]:
# Keep only the predictor and response columns; .copy() so that adding a
# column below does not touch `demographics`.
predict_income = demographics[['College%', 'Median Income']].copy()
# Regression estimate of 'Median Income' at each row's 'College%'
# (replaces the `...` placeholder, which stored Ellipsis and broke the plot below).
predict_income['Fitted'] = fitted_values(predict_income, 'College%', 'Median Income')

In [None]:
# Observed median income, then the regression estimates, against College%.
plt.scatter(x=predict_income['College%'], y=predict_income['Median Income'],
            label='Median Income')
plt.scatter(x=predict_income['College%'], y=predict_income['Fitted'],
            label='Fitted Line')
plt.xlabel('College%')
plt.ylabel('Median Income')
plt.legend();

<br><br><br>

---
<center>Return to Slides, Slide 6</center>

---

<br><br><br>

## Residuals

In [None]:
# Work on a frame without the 'State', 'District' and
# 'Percent voting for Clinton' columns.
demos = demographics.drop(columns=['State', 'District', 'Percent voting for Clinton'])
demos.head(n=5)

In [None]:
def residuals(df, x, y):
    """Residuals of the regression of df[y] on df[x]: observed y minus fitted value."""
    return df[y] - fitted_values(df, x, y)

In [None]:
# Regression of 'Median Income' on 'College%': store the estimate and the
# residual (observed minus estimate) for every row. These replace the `...`
# placeholders, which stored Ellipsis and broke the scatter plot that follows.
demos['Fitted Value'] = fitted_values(demos, 'College%', 'Median Income')
demos['Residual'] = residuals(demos, 'College%', 'Median Income')
demos.head(5)

In [None]:
# Observed values, fitted values and residuals all share one set of axes.
plt.scatter(x=demos['College%'], y=demos['Median Income'],
            label='Median Income')
plt.scatter(x=demos['College%'], y=demos['Fitted Value'],
            label='Fitted Line')
plt.scatter(x=demos['College%'], y=demos['Residual'],
            label='Residual')
plt.xlabel('College%')
plt.ylabel('Median Income')
plt.legend();

In [None]:
def plot_residuals(df, x, y):
    """Plot the regression of df[y] on df[x], then a scatter plot of its residuals.

    Parameters
    ----------
    df : DataFrame holding the data.
    x, y : column labels of the predictor and the response.

    Side effect: adds (or overwrites) the 'Fitted Value' and 'Residual'
    columns of `df` in place.
    """
    df['Fitted Value'] = fitted_values(df, x, y)
    df['Residual'] = residuals(df, x, y)
    plt.scatter(df[x], df[y], label=y)
    plt.scatter(df[x], df['Fitted Value'], label='Fitted Line')
    # Label the axes with the columns actually plotted; these were hard-coded
    # to 'College%' / 'Median Income', which mislabelled every other dataset.
    plt.xlabel(x)
    plt.ylabel(y)
    plt.legend()

    # Residual plot: residuals against the predictor.
    df.plot.scatter(x=x, y='Residual')

In [None]:
# Note: plot_residuals writes 'Fitted Value' and 'Residual' columns into `demographics`.
plot_residuals(df=demographics, x='College%', y='Median Income')

In [None]:
family_heights = pd.read_csv(filepath_or_buffer='data/family_heights.csv')
# Average of the 'father' and 'mother' columns for each row.
parents = (family_heights['father'] + family_heights['mother']) / 2
heights = pd.DataFrame({'Parent Average': parents, 'Child': family_heights['child']})
plot_residuals(df=heights, x='Parent Average', y='Child')

<br><br><br>

---
<center>Return to Slides, Slide 8</center>

---

<br><br><br>

## Dugongs

In [None]:
# Load the dugong dataset and preview its first five rows.
dugong = pd.read_csv(filepath_or_buffer='data/dugong.csv')
dugong.head(n=5)

In [None]:
...

In [None]:
...

In [None]:
...

<br><br><br>

---
<center>Return to Slides, Slide 9</center>

---

<br><br><br>

## US Women

In [None]:
# Load the us_women dataset and preview its first five rows.
us_women = pd.read_csv(filepath_or_buffer='data/us_women.csv')
us_women.head(n=5)

In [None]:
...

In [None]:
...

In [None]:
...

## Average of Residuals 

In [None]:
...

In [None]:
...

In [None]:
...

In [None]:
...

In [None]:
...