# Lecture 18 - Correlation and Linear Regression 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
# Apply the 'fivethirtyeight' style sheet to the matplotlib figures drawn below.
plt.style.use('fivethirtyeight')

%matplotlib inline

In [None]:
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 points on a new 5x5 figure with both axes fixed to [-4, 4].
    x and z are independent standard normal samples, and y is built as
    r*x + sqrt(1 - r**2)*z.

    Parameters
    ----------
    r : float
        Target correlation. sqrt(1 - r**2) is only real for -1 <= r <= 1.
    """
    plt.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Mix x with independent noise z; the weights r and sqrt(1 - r**2)
    # keep y at unit variance while giving it correlation r with x.
    y = r*x + (np.sqrt(1-r**2))*z
    plt.scatter(x, y, color='darkblue', s=20)
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)

## Guessing the Future 

In [None]:
# Load the family heights CSV and preview its first five rows.
families = pd.read_csv('data/family_heights.csv')
families.head(5)

In [None]:
# Exercise placeholder: replace `...` with the parent average height for each row.
parent_avgs = ...
# Pair each parent average with the 'child' column of the families table.
heights = pd.DataFrame({'Parent Average': parent_avgs, 
                        'Child': families['child']})

In [None]:
# Exercise placeholder: replace `...` with the arguments for the scatter plot.
heights.plot.scatter(...)

Let's say we are interested in predicting the height of a child whose parents' average height is 68 inches. 

We can examine the data points near a parent average height of 68 inches. 

In [None]:
# Keep the rows whose parent average lies in [67.5, 68.5):
# lower bound included, upper bound excluded.
nearby = heights[(heights['Parent Average'] >= 67.5) & (heights['Parent Average'] < 68.5)]
# Mean child height among those rows.
nearby_mean = nearby['Child'].mean()
nearby_mean

In [None]:
heights.plot.scatter(x='Parent Average', y='Child')
# Red vertical lines at x = 67.5 and x = 68.5 mark the edges of the window.
plt.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plt.plot([68.5, 68.5], [50, 85], color='red', lw=2)
# Magenta dot: the mean child height inside the window, drawn at x = 68.
plt.scatter(68, nearby_mean, color='magenta', s=50);

### Predict Child Height

Using this idea, we can create a function that will predict the height of a child given the parent's average height. 

In [None]:
def predict_child(h):
    """Predict the height of a child whose parents have an average height of h.

    The prediction is the mean 'Child' value over the rows of the global
    `heights` table whose 'Parent Average' lies in [h - 0.5, h + 0.5)
    (lower bound included, upper bound excluded).
    """
    parent_avgs = heights['Parent Average']
    lo, hi = h - 1/2, h + 1/2
    in_window = (parent_avgs >= lo) & (parent_avgs < hi)
    return heights.loc[in_window, 'Child'].mean()

Now we can create a new DataFrame that has the predicted height for each parent / child combination. 

In [None]:
# Work on a copy so adding a column does not modify `heights` itself.
heights_with_predictions = heights.copy()
# Exercise placeholder: replace `...` with the predicted child height for each row.
heights_with_predictions['Prediction'] = ...
heights_with_predictions.head(5)

In [None]:
# First layer: the observed child heights.
plt.scatter(heights_with_predictions['Parent Average'], 
            heights_with_predictions['Child'], 
            label='Child')
# Second layer: the predicted heights at the same parent averages.
plt.scatter(heights_with_predictions['Parent Average'], 
            heights_with_predictions['Prediction'], 
            label='Prediction')

plt.xlabel('Parent Average')
plt.ylabel('Child')
plt.title('Scatter Plot of Parent Average vs. Child Height')

# Add a legend
plt.legend()

# Show the plot
plt.show()


*Back to Slides*

## Association

In [None]:
# Load the hybrid cars CSV and preview its first five rows.
hybrid = pd.read_csv('data/hybrid.csv')
hybrid.head(5)

In [None]:
# Display the table ordered from highest to lowest msrp (hybrid itself is not modified).
hybrid.sort_values('msrp', ascending = False)

Let's examine the relationship between several numeric variables.

In [None]:
# Exercise placeholder: replace `...` with the columns to compare.
hybrid.plot.scatter(...);

In [None]:
# Exercise placeholder: replace `...` with a second pair of columns to compare.
hybrid.plot.scatter(...);

We can look solely at SUVs. 

In [None]:
# Exercise placeholder: replace `...` with the rows of `hybrid` that are SUVs.
suv = ...
# Report the table's dimensions.
suv.shape

In [None]:
# Exercise placeholder: replace `...` with the columns to compare for SUVs.
suv.plot.scatter(...);

In [None]:
# Exercise placeholder: replace `...` with a second pair of SUV columns to compare.
suv.plot.scatter(...);

*Back to Slides*

## Correlation Coefficient 

In [None]:
def standard_units(x):
    """Convert any array of numbers to standard units.

    Each value is re-expressed as its distance from the average of x,
    measured in multiples of np.std(x).
    """
    deviations = x - np.average(x)
    spread = np.std(x)
    return deviations / spread

In [None]:
# SUV mpg against msrp, both converted to standard units.
plt.scatter(standard_units(suv['mpg']), standard_units(suv['msrp']))
# Use the same [-3, 3] window on both axes (the second call previously
# repeated xlim, leaving the y-axis limits unset).
plt.xlim(-3, 3)
plt.ylim(-3, 3)
plt.xlabel('mpg (standard units)')
plt.ylabel('msrp (standard units)');

In [None]:
# SUV acceleration against msrp, in the original units.
suv.plot.scatter(x='acceleration', y='msrp');

In [None]:
# SUV acceleration against msrp, both converted to standard units.
plt.scatter(standard_units(suv['acceleration']), 
            standard_units(suv['msrp']))
# Use the same [-3, 3] window on both axes (the second call previously
# repeated xlim, leaving the y-axis limits unset).
plt.xlim(-3, 3)
plt.ylim(-3, 3)
plt.xlabel('acceleration (standard units)')
plt.ylabel('msrp (standard units)');

## Correlation 


In [None]:
# With r = -1 the noise weight sqrt(1 - r**2) is 0, so every point lies on the line y = -x.
r_scatter(-1)

In [None]:
# A small hand-made dataset: x is 1 through 6, y holds six corresponding values.
x = np.arange(1, 7, 1)
y = np.array([2, 3, 1, 5, 2, 7])

In [None]:
# Scatter plot of the raw x and y values.
plt.scatter(x, y, s=30, color='red')

In [None]:
# The same points after converting both variables to standard units.
plt.scatter(standard_units(x), standard_units(y), s=30, color='red')

In [None]:
# Tabulate the raw values, their standard units, and the row-by-row product
# of the two standard-unit columns.
df = pd.DataFrame({'x': x, 'y': y, 
                   'x (standard units)': standard_units(x), 
                   'y (standard units)': standard_units(y), 
                   'product of standard units':  standard_units(x)*standard_units(y)})
df

*Back to Slides*

Here, we introduced the correlation coefficient: 

\begin{align}
r 
& = \text{Mean}\left(\text{StandardUnits}(x) *  \text{StandardUnits}(y)\right)\\
& = \frac{1}{n} \sum_{i=1}^n \text{StandardUnits}(x_i) *  \text{StandardUnits}(y_i)\\
& = \frac{1}{n}\sum_{i=1}^n \left( \frac{x_i - \text{Mean}(x)}{\text{Stdev}(x)} \right) * \left( \frac{y_i - \text{Mean}(y)}{\text{Stdev}(y)} \right) \\
\end{align}

In [None]:
# r is the mean, across rows, of the product of the two standard-unit columns
r = np.average(df['x (standard units)'] * df['y (standard units)'])
r

In [None]:
def correlation(df, x, y):
    """Return the correlation coefficient of two columns of a DataFrame.

    df is a DataFrame; x and y are column labels. The result is the
    average of the element-wise product of the two columns after each
    has been converted with standard_units.
    """
    products = standard_units(df[x]) * standard_units(df[y])
    return np.average(products)

In [None]:
# Same computation as the hand-built r, now via the helper function.
correlation(df, 'x', 'y')

In [None]:
# Built-in pandas function 
# Restricted to the first two columns (x and y); returns their pairwise correlation table.

df.iloc[:,0:2].corr()

### Examples 

In [None]:
# Exercise placeholder: replace `...` with the SUV columns to plot.
suv.plot.scatter(...);

In [None]:
# Exercise placeholder: replace `...` with the two column labels to correlate.
correlation(suv, ...)

In [None]:
# Exercise placeholder: replace `...` with a second pair of SUV columns to plot.
suv.plot.scatter(...);

In [None]:
# Exercise placeholder: replace `...` with the column labels for the second pair.
correlation(suv, ...)

### Switching Axes 

In [None]:
# Correlation with 'x' passed first and 'y' second.
correlation(df, 'x', 'y')

In [None]:
# x on the horizontal axis, y on the vertical axis.
df.plot.scatter(x='x', y='y', s=30, color='red');

In [None]:
# Axes swapped: y on the horizontal axis, x on the vertical axis.
df.plot.scatter(x='y', y='x', s=30, color='blue');

In [None]:
# Same two columns with the argument order swapped; the product of standard
# units does not depend on the order, so the value is unchanged.
correlation(df, 'y', 'x')

*Back to Slides*

## Caution on Interpreting Correlation 

### Nonlinearity 

In [None]:
# y is exactly the square of newx: a perfect but nonlinear relationship.
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = pd.DataFrame({'newx': new_x, 'y': new_x**2})
nonlinear.plot.scatter('newx', 'y', s=30, color='r');

In [None]:
# Correlation measures linear association only, so compute it for the squared data.
correlation(nonlinear, 'newx', 'y')

### Outliers 

In [None]:
# Four points lying exactly on the line y = x.
line = pd.DataFrame({'x': np.array([1, 2, 3, 4]), 
                     'y': np.array([1, 2, 3, 4])})
line.plot.scatter(x='x', y='y', s=30, color='r');

In [None]:
# x and y are identical here, so r is 1 (up to floating-point rounding).
correlation(line, 'x', 'y')

In [None]:
# The same four points on y = x plus one extra point at (5, 0).
outlier = pd.DataFrame({'x': np.array([1, 2, 3, 4, 5]), 
                        'y': np.array([1, 2, 3, 4, 0])})
outlier.plot.scatter(x='x', y='y', s=30, color='r');

In [None]:
# For these five points the products of deviations sum to zero, so the single
# extra point takes r from 1 down to 0 (up to floating-point rounding).
correlation(outlier, 'x', 'y')

### Ecological Correlations 

In [None]:
# Load the SAT scores CSV and order the rows by state.
# sort_values returns a new table, so the result is bound directly instead of
# mutating with inplace=True.
sats = pd.read_csv('data/sat_scores.csv').sort_values('State')
sats.head(10)

In [None]:
# One point per row of `sats`: Critical Reading score against Math score.
sats.plot.scatter(x='Critical Reading', y='Math');

In [None]:
# Correlation between the two score columns across the rows of `sats`.
correlation(sats, 'Critical Reading', 'Math')

In [None]:
def rate_code(x):
    """Map a numeric rate x to one of four labels.

    Returns 'low' for x <= 25, 'low-moderate' for x <= 50,
    'moderate-high' for x <= 75, and 'high' for anything else.
    Each upper bound is inclusive.
    """
    # Ordered (inclusive upper bound, label) pairs; the first match wins.
    # All compound labels use a hyphen so they read consistently in legends.
    bands = ((25, 'low'), (50, 'low-moderate'), (75, 'moderate-high'))
    for upper, label in bands:
        if x <= upper:
            return label
    return 'high'

In [None]:
# Label every row from its 'Participation Rate' value. Applying rate_code to
# the single column avoids building a full row object per call.
rate_codes = sats['Participation Rate'].apply(rate_code)

In [None]:
# Attach the labels to the table as a new 'Rate Code' column.
sats['Rate Code'] = rate_codes
sats.head(10)

In [None]:
# Critical Reading against Math, with points colored by their 'Rate Code' label.
sns.scatterplot(data=sats, x="Critical Reading", y="Math", hue="Rate Code");

In [None]:
# Rows labeled 'low'.
sats[sats['Rate Code'] == 'low']

In [None]:
# Rows labeled 'high'.
sats[sats['Rate Code'] == 'high']

## Prediction Lines 

Let's build an intuition about the relationship between the slope of the nearest neighbor line and the correlation coefficient.

We will again use the heights data. 

In [None]:
# Order the rows by parent average. sort_values returns a new table, so the
# name is rebound instead of mutating with inplace=True.
heights = heights.sort_values('Parent Average')
heights.head(10)

We will build a slightly more robust Nearest Neighbor predictor. 

In [None]:
def nn_heights(parent_average, window=0.5):
    """Nearest-neighbor prediction of child height for a given parent average.

    Averages the 'Child' values of the rows in the global `heights` table
    whose 'Parent Average' lies in
    [parent_average - window, parent_average + window)
    (lower bound included, upper bound excluded).

    Returns np.nan when no row falls inside that range.
    """
    lo, hi = parent_average - window, parent_average + window
    parent_avgs = heights['Parent Average']
    neighbors = heights.loc[(parent_avgs >= lo) & (parent_avgs < hi), 'Child']
    # With no rows in range there is nothing to average, so report
    # NaN ("not a number", a special floating point value).
    return np.mean(neighbors) if len(neighbors) else np.nan

Make predictions at many different parent heights, not just the heights in the dataset. 

In [None]:
# Grid of parent averages starting at 61 in steps of 0.2 and stopping before 74.
test_heights = pd.DataFrame({"Parent Average": np.arange(61,74,0.2)})
# One nearest-neighbor prediction (default window) per grid value.
test_heights["NN Prediction"] = test_heights["Parent Average"].apply(nn_heights)

In [None]:
# Plot it, using plotly 
# Base layer: the observed (parent average, child) points.
fig = px.scatter(heights, x="Parent Average", y="Child", height=600)
# Overlay: the nearest-neighbor predictions on the test grid.
fig.add_scatter(x=test_heights["Parent Average"], 
                y=test_heights["NN Prediction"], name="NN Prediction")

It will be easier to start in standard units. 

In [None]:
# Transform the heights data to standard units
su_heights = pd.DataFrame({'Parent Average': standard_units(heights['Parent Average']), 
                           'Child': standard_units(heights['Child'])})

# Transform the predictions to standard units, using the mean and SD of the
# heights data (not of the test grid) so both tables share one scale.
# np.std is used because it is the SD that standard_units divides by;
# pandas' .std() defaults to ddof=1 and would give a slightly different scale.
su_test_heights = (
    pd.DataFrame({'Parent Average': 
                  (test_heights['Parent Average'] - heights['Parent Average'].mean()) 
                  / np.std(heights['Parent Average']), 
                  'NN Prediction': 
                  (test_heights['NN Prediction'] - heights['Child'].mean()) 
                  / np.std(heights['Child'])}))

# Plot it 
fig = px.scatter(su_heights, x="Parent Average", y="Child", height=600)
fig.add_scatter(x=su_test_heights["Parent Average"], 
                y=su_test_heights["NN Prediction"], name="NN Prediction")


Computing the correlation we get: 

In [None]:
# Correlation between parent average and child height, from the original-unit table.
correlation(heights, "Parent Average", "Child")

What happens if we draw a line with that slope: 

In [None]:
# Correlation computed from the standard-unit table.
r = correlation(su_heights, "Parent Average", "Child")
fig = px.scatter(su_heights, x="Parent Average", y="Child", height=600)
fig.add_scatter(x=su_test_heights["Parent Average"], 
                y=su_test_heights["NN Prediction"], 
                name="NN Prediction")
# Line through the origin with slope r, evaluated on a grid that starts at -3
# and advances in steps of 0.1, stopping before 4.
fig.add_scatter(x=np.arange(-3,4,0.1), y= r * np.arange(-3,4,0.1), 
                name=f"Line(y={np.round(r,4)} x)")

### The Relationship between Correlations and NN Predictions

Here we examine the relationship between the nearest neighbor prediction "line" and the correlation for several synthetic datasets.

In [None]:
def make_correlated_data(r, n=500):
    """Generate a table with columns x and y whose correlation is approximately r.

    x is n standard normal draws; y mixes x with n further independent
    standard normal draws as r*x + sqrt(1 - r**2)*noise.
    Returns a DataFrame with n rows.
    """
    x = np.random.normal(0, 1, n)
    noise = np.random.normal(0, 1, n)
    # This weighting is the "magic" that samples from a multivariate Gaussian
    mixed = r*x + (np.sqrt(1-r**2))*noise
    return pd.DataFrame({"x": x, "y": mixed})

In [None]:
#  You don't need to understand all the parts of this function.
def make_correlation_and_line_plot(r):
    """ 
    Generates a plot of synthetic data with a correlation coefficient r
    along with the nearest neighbor predictions, 
    a line with the slope r and intercept 0, and the reference line y = x.

    Returns the plotly figure.
    """
    # Make synthetic data, sorted by x
    example = make_correlated_data(r).sort_values("x")
    
    # Compute nearest neighbor predictions
    def nn_prediction_example(x_val):
        """ Predicts y-value for x_val as the mean y of the rows of the
        example table with x in [x_val - 0.25, x_val + 0.25); NaN if none. """
        neighbors = example.loc[
            (example['x'] >= x_val - 0.25) & (example['x'] < x_val + 0.25), "y"
        ]
        if len(neighbors) == 0:
            return np.nan
        return np.mean(neighbors)

    # The predictor only needs the x value, so apply it to that column
    # directly rather than to whole rows.
    example["NN Prediction"] = example["x"].apply(nn_prediction_example)
    
    # Generate Plots.
    x = example["x"]
    fig = px.scatter(example, x="x", y="y", height=600)
    fig.add_scatter(x=x, y=example["NN Prediction"], 
                    name="NN Prediction", line_color="red")
    fig.add_scatter(x=x, y= r * x, name=f"Line(y={r} x)")
    fig.add_scatter(x=x, y=x, line_color="gray", line_dash="dot", name="Line(y=x)")
    return fig

#### Correlation of 0.90 

In [None]:
# Strong positive association: the slope-r line has slope 0.90.
make_correlation_and_line_plot(r=0.90)

#### Correlation of 0.6 

In [None]:
# Moderate positive association: the slope-r line has slope 0.60.
make_correlation_and_line_plot(r=0.60)

#### Correlation of 0.2

In [None]:
# Weak positive association: the slope-r line has slope 0.20.
make_correlation_and_line_plot(r=0.20)

#### Correlation of 0

In [None]:
# With r = 0, y is generated as pure noise and the slope-r line is flat (y = 0).
make_correlation_and_line_plot(r=0)

#### Correlation of -0.60

In [None]:
# Negative association: the slope-r line slopes downward with slope -0.60.
make_correlation_and_line_plot(r=-0.60)

<br><br><br>

---

<center>Return to Slides</center>

---

<br><br><br>

## Defining the linear regression line

In standard units we developed a simple equation for the regression line:

\begin{align}
\text{SU}(y_\text{predicted}) = r * \text{SU}(x_\text{new})
\end{align}

where $r$ is the correlation coefficient and $\text{SU}$ is the standard units:

\begin{align}
\text{SU}(y_\text{predicted}) & = \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} \\
\text{SU}(x_\text{new}) &= \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}
\end{align}



Here we use $x_\text{new}$ to indicate a new $x$ value for which we want to make a prediction  $y_\text{predicted}$.

We would like to express this line in the original units of the data.  We can do that by substituting the definition of standard units:

\begin{align}
\frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} = r *  \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}
\end{align}

While this equation does describe a line, it would look a little nicer in the form:

\begin{align}
y_\text{predicted} = \text{slope} * x_\text{new}  + \text{intercept}
\end{align}

Let's do some algebra to get that equation:
$$
\require{color}
\definecolor{comment}{RGB}{200,100,50}
\begin{align}
\frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} &= r *  \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}\\
\frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} &= r * \frac{1}{\text{Stdev}(x)} x_\text{new} - r * \frac{1}{\text{Stdev}(x)}\text{Mean}(x)  & \color{comment} \text{Expanding the right side}\\
y_\text{predicted} - \text{Mean}(y) &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)} x_\text{new} - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x) &  \color{comment} \text{Multiplying by $\text{Stdev}(y)$}\\
y_\text{predicted} &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)} x_\text{new} + \text{Mean}(y) - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x) &  \color{comment} \text{Adding $\text{Mean}(y)$}\\
y_\text{predicted} &= \left(r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\right) x_\text{new} + \left(\text{Mean}(y) - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x)\right) &  \color{comment} \text{Rearranging Terms}
\end{align}
$$

This means we can define the slope and intercept as:
\begin{align}
\text{slope} &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\\
\text{intercept} & = \text{Mean}(y) - \text{slope} * \text{Mean}(x)
\end{align}

<br><br><br>

## Implementing Linear Regression

Using the above equations implement the slope and intercept functions:

In [None]:
def slope(t, x, y):
    """Computes the slope of the regression line"""
    # Exercise placeholder: the body is intentionally left as `...`, so this
    # returns None until an implementation is filled in.
    ...

<br><details><br>
    
```python
def slope(t, x, y):
    """Computes the slope of the regression line"""
    r = correlation(t, x, y)
    y_sd = np.std(t[y])
    x_sd = np.std(t[x])
    return r * y_sd / x_sd
```

<br></details><br>

In [None]:
def intercept(t, x, y):
    """Computes the intercept of the regression line"""
    # Exercise placeholder: the body is intentionally left as `...`, so this
    # returns None until an implementation is filled in.
    ...

<br><details><br>
    
```python
def intercept(t, x, y):
    """Computes the intercept of the regression line"""
    x_mean = np.mean(t[x])
    y_mean = np.mean(t[y])
    return y_mean - slope(t, x, y)*x_mean
```

<br></details><br>

Testing it out 

In [None]:
# Synthetic data generated with a target correlation of 0.5.
example = make_correlated_data(0.5)
slope(example, 'x', 'y')

<br><br>

Computing the slope and intercept for the heights dataset: 

In [None]:
# Exercise placeholders: replace each `...` with the slope and intercept of the
# regression line for the heights data.
heights_slope = ...
heights_intercept = ...
[heights_slope, heights_intercept]

<br><details><br>
    
```python
heights_slope = slope(heights, 'Parent Average', 'Child')
heights_intercept = intercept(heights, 'Parent Average', 'Child')
[heights_slope, heights_intercept]
```

<br></details><br>

Adding the regression predictions: 

<br><details><br>
    
```python
heights["Regression Prediction"] = heights_slope * heights["Parent Average"] + heights_intercept
heights.head(10)
```

<br></details><br>

In [None]:
# Base layer: the observed (parent average, child) points.
fig = px.scatter(heights, x="Parent Average", y="Child", height=600)
# Overlay 1: the nearest-neighbor predictions on the test grid.
fig.add_scatter(x=test_heights["Parent Average"], 
                y=test_heights["NN Prediction"], name="NN Prediction")
line_name = f"y = {np.round(heights_slope,2)} x + {np.round(heights_intercept,2)}"
# Overlay 2: the regression line, evaluated directly from the slope and
# intercept so this cell does not depend on a "Regression Prediction" column
# having been added to `heights` beforehand.
fig.add_scatter(x=heights["Parent Average"], 
                y=heights_slope * heights["Parent Average"] + heights_intercept,
                name=line_name)