In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def r_scatter(r):
    """Draw a scatter plot of 1000 random points whose correlation is roughly r.

    The horizontal values and an independent noise term are both drawn from a
    standard normal distribution. The vertical values mix the two as
    r*x + sqrt(1 - r**2)*noise, which is what makes the correlation of the
    plotted cloud land near r. The formula takes the square root of
    1 - r**2, so r is expected to lie between -1 and 1.

    The draws use NumPy's global random state, so the picture differs from
    call to call.
    """
    # The construction below is a helper detail; callers only need to pick r.
    plots.figure(figsize=(5, 5))
    sample_size = 1000
    horizontal = np.random.normal(0, 1, sample_size)
    noise = np.random.normal(0, 1, sample_size)
    noise_weight = np.sqrt(1 - r**2)
    vertical = r * horizontal + noise_weight * noise
    plots.scatter(horizontal, vertical, color='darkblue', s=20)
    # Same fixed window on both axes so plots for different r are comparable.
    for set_limits in (plots.xlim, plots.ylim):
        set_limits(-4, 4)

In [None]:
def standard_units(x):
    """Convert any array of numbers to standard units.

    Each value is expressed as its distance from the average of x, measured
    in standard deviations of x (np.std with its default settings).
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

In [None]:
def correlation(t, x, y):
    """Return the correlation between two columns of a table.

    t is a table; x and y are column labels. Both columns are converted to
    standard units and the result is the average of their element-wise
    products.
    """
    x_su, y_su = (standard_units(t.column(label)) for label in (x, y))
    return np.average(x_su * y_su)

# Watch out for: nonlinearity, outliers, and ecological correlations

**Please run all cells before this cell, including the import cell at the top of the notebook.**

## 1. Nonlinearity

In [None]:
# x runs symmetrically from -4 to 4 in steps of 0.5 (the 4.1 stop keeps 4.0 in the range).
new_x = np.arange(-4, 4.1, 0.5)
# y is an exact function of x (y = x**2), but the relationship is a parabola, not a line.
nonlinear = Table().with_columns('x', new_x, 'y', new_x**2)
nonlinear.scatter('x', 'y', s=50, color='r')

In [None]:
# The points are symmetric about x = 0, so the positive and negative products of
# standard units cancel: r is (essentially) 0 even though y is fully determined by x.
correlation(nonlinear, 'x', 'y')

## 2. Outliers

In [None]:
# Four points lying exactly on the line y = x.
line = Table().with_columns('x', make_array(1, 2, 3, 4), 'y', make_array(1, 2, 3, 4))
line.scatter('x', 'y', s=50, color='r')

In [None]:
# Every point sits on one upward-sloping line, so r is 1 (up to floating-point rounding).
correlation(line, 'x', 'y')

In [None]:
# The same four points on y = x, plus a single outlier at (5, 0).
outlier = Table().with_columns('x', make_array(1, 2, 3, 4, 5), 'y', make_array(1, 2, 3, 4, 0))
outlier.scatter('x', 'y', s=50, color='r')

In [None]:
# With (5, 0) added, the products of deviations from the means sum to 0,
# so that one point drags r from 1 down to (essentially) 0.
correlation(outlier, 'x', 'y')

In [None]:
# Five points on y = x, except that the middle one is pulled down to (3, -2).
outlier2 = Table().with_columns('x', make_array(1, 2, 3, 4, 5), 'y', make_array(1, 2, -2, 4, 5))
outlier2.scatter('x', 'y', s=50, color='r')

In [None]:
# For this data r works out to 1/sqrt(3), about 0.577: one off-line point out of
# five is enough to pull r well below 1.
correlation(outlier2, 'x', 'y')

## 3. Ecological correlations

In [None]:
# Read sat2014.csv (a relative path, so it is resolved against the notebook's
# working directory) and order the rows by the 'State' column.
sat2014 = Table.read_table('sat2014.csv').sort('State')
# Preview only the first 4 rows rather than the whole table.
sat2014.show(4)

In [None]:
# 'Critical Reading' on the horizontal axis, 'Math' on the vertical; one point per row of sat2014.
sat2014.scatter('Critical Reading', 'Math')

In [None]:
# r is computed across the rows of sat2014. NOTE(review): each row presumably holds a
# state-level average (confirm against the CSV); if so this is an ecological
# correlation and need not match the correlation among individual students.
correlation(sat2014, 'Critical Reading', 'Math')