# Lecture 20 - Least Squares

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
plt.style.use('fivethirtyeight')

%matplotlib inline

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.offline as po
po.init_notebook_mode()

In [None]:
from scipy import optimize
import functools
import math

In [None]:
# Some functions for plotting. You don't have to understand how any
# of the functions in this cell work, since they use things we 
# haven't learned about in Data 8.

def resize_window(lim=3.5):
    """Set both axes of the current matplotlib plot to the range [-lim, lim]."""
    # pyplot is imported as `plt` in this notebook; the name `plots` is never
    # defined, so the previous body raised NameError when called.
    plt.xlim(-lim, lim)
    plt.ylim(-lim, lim)
    
def draw_line(slope=0, intercept=0, x=np.array([-4, 4]), color='r'):
    """Plot the line y = slope*x + intercept over the given x values.

    slope, intercept -- coefficients of the line
    x                -- array of x positions at which the line is evaluated
    color            -- matplotlib color for the line
    """
    y = x*slope + intercept
    # pyplot is imported as `plt` in this notebook (`plots` is undefined).
    plt.plot(x, y, color=color)
    
def draw_vertical_line(x_position, color='black'):
    """Plot a vertical segment from y = -4 to y = 4 at x = x_position."""
    # Both endpoints share the same x; the list is required because
    # np.array's second positional argument is the dtype, not another element.
    x = np.array([x_position, x_position])
    y = np.array([-4, 4])
    # pyplot is imported as `plt` in this notebook (`plots` is undefined).
    plt.plot(x, y, color=color)
    
def make_correlated_data(r):
    """Draw 1000 (x, y) pairs from NumPy's global random generator.

    x is standard normal and y = r*x + sqrt(1 - r**2)*z, where z is a second,
    independently drawn standard normal sample. Returns the tuple (x, y).
    """
    n_points = 1000
    x = np.random.normal(0, 1, n_points)
    noise = np.random.normal(0, 1, n_points)
    noise_weight = np.sqrt(1 - r**2)
    y = r * x + noise_weight * noise
    return x, y

def r_table(r):
    """
    Build a 1000-row DataFrame with columns 'x' and 'y' produced by
    make_correlated_data(r), i.e. points whose correlation is roughly r.

    NumPy's global random generator is reseeded with 8 on every call, so the
    same r always yields the same table.
    """
    np.random.seed(8)
    xs, ys = make_correlated_data(r)
    return pd.DataFrame(dict(x=xs, y=ys))

In [None]:
def demographics_errors(slope, intercept):
    """Scatter College% against Median Income from the notebook-level
    `demographics` frame, overlay the line slope*x + intercept, and draw the
    vertical error of four sample points as red segments.
    """
    # Four convenient (College%, Median Income) points from the original data
    sample = [[14.7, 33995], [19.1, 61454], [50.7, 71183], [59.5, 105918]]
    demographics.plot.scatter(x='College%', y='Median Income', alpha=0.5)
    # The candidate line, drawn across College% values 5 to 75
    line_x = np.array([5, 75])
    line_y = slope * line_x + intercept
    plt.plot(line_x, line_y, lw=4)
    # One red segment per sample point, from the point to the line
    for college_pct, income in sample:
        predicted_income = slope * college_pct + intercept
        plt.plot([college_pct, college_pct], [income, predicted_income], color='r', lw=4)

In [None]:
def show_demographics_rmse(slope, intercept):
    """Draw the error plot for the line slope*x + intercept and print its
    root mean squared error over every row of `demographics`, rounded to two
    decimals.
    """
    demographics_errors(slope, intercept)
    college_pct = demographics['College%']
    income = demographics['Median Income']
    squared_errors = (income - (slope * college_pct + intercept)) ** 2
    rmse = np.mean(squared_errors) ** 0.5
    print("Root mean squared error:", round(rmse, 2))

In [None]:
def fitted_values(df, x, y):
    """Return the regression estimates of column y at every value of column x.

    The line's coefficients come from slope(df, x, y) and intercept(df, x, y).
    """
    return slope(df, x, y) * df[x] + intercept(df, x, y)

# Slope & Intercept


In [None]:
def standard_units(x):
    """Convert numbers to standard units: subtract the mean, then divide by
    the standard deviation (both computed with NumPy)."""
    center = np.mean(x)
    spread = np.std(x)
    return (x - center) / spread

In [None]:
def correlation(df, x, y):
    """Correlation between columns x and y of df: the mean of the products of
    the two columns after each is converted to standard units."""
    products = standard_units(df[x]) * standard_units(df[y])
    return np.mean(products)

In [None]:
def slope(df, x, y):
    """Computes the slope of the regression line for predicting column y from
    column x of df.

    In original units the slope is r * SD(y) / SD(x), where r is the
    correlation between the two columns.
    """
    r = correlation(df, x, y)
    return r * np.std(df[y]) / np.std(df[x])

def intercept(df, x, y):
    """Computes the intercept of the regression line for predicting column y
    from column x of df.

    The regression line passes through the point of averages, so the
    intercept is mean(y) - slope * mean(x).
    """
    return np.mean(df[y]) - slope(df, x, y) * np.mean(df[x])

In [None]:
example = r_table(0.5)
slope(example, 'x', 'y')

## Heights Data 

In [None]:
families = pd.read_csv('data/family_heights.csv')

parent_avgs = (families["father"] + families["mother"])/2
heights = pd.DataFrame({'MidParent': parent_avgs, 
                        'Child': families['child']})

In [None]:
heights.head()

In [None]:
def nn_heights(parent_average, window=0.5):
    """Average 'Child' height among rows of the notebook-level `heights` frame
    whose 'MidParent' lies in [parent_average - window, parent_average + window).

    Returns np.nan (the floating point "not a number" value) when no row
    falls inside that window.
    """
    midparent = heights['MidParent']
    in_window = (midparent >= parent_average - window) & (midparent < parent_average + window)
    nearby_children = heights.loc[in_window, "Child"]
    if nearby_children.empty:  # no data near this parent height
        return np.nan
    return np.mean(nearby_children)

Make predictions at many different parent heights, not just the heights in the dataset. 

In [None]:
# Start from a copy of the MidParent column, then attach the nearest-neighbor
# prediction computed for each of those parent heights.
test_heights = heights[['MidParent']].copy()
test_heights["NN Prediction"] = test_heights['MidParent'].map(nn_heights)

In [None]:
heights_slope = ...
heights_intercept = ...
heights_slope, heights_intercept

In [None]:
test_heights['Regression Prediction'] = (
    ...
)
test_heights.head()

In [None]:
plt.scatter(test_heights['MidParent'], heights['Child'], label='Child')
plt.scatter(test_heights['MidParent'], test_heights['NN Prediction'], label='NN Pred')
plt.scatter(test_heights['MidParent'], test_heights['Regression Prediction'], label='Reg Pred')
plt.xlabel('Mid Parent Height')
plt.ylabel('Predicted Height')
plt.legend()

<br><br><br>

---
<center>Return to Slides, Slide 8</center>

---

<br><br><br>

<br><br>

---


# Making Predictions with Linear Regression


We can now compute predictions, but how good are they?  How do we know that we have a good linear fit? To study this we will consider a new dataset.

In [None]:
demographics = pd.read_csv('data/district_demographics2016.csv')
demographics.head(5)

In [None]:
plt.scatter(demographics['College%'], demographics['Median Income'])
plt.xlabel('College%')
plt.ylabel('Median Income');

In [None]:
correlation(demographics, 'College%', 'Median Income')

### Making Predictions

Here we will try to predict the income for each district as a function of the percentage of college-educated people.

In [None]:
regression_slope = ...
regression_intercept = ...
print("Slope:", regression_slope)
print("Intercept:", regression_intercept)

Make the actual predictions.

In [None]:
demographics["Linear Prediction"] = ...
demographics.head(5)

Visualizing the predictions:

In [None]:
plt.scatter(demographics['College%'], demographics['Median Income'], label='Median Income')
plt.scatter(demographics['College%'], demographics['Linear Prediction'], label='Linear Prediction')
plt.xlabel('College%')
plt.ylabel('Median Income')
plt.legend();

## Computing the Error

The error is the difference between the actual and predicted value:

$$
\text{error} = y - y_\text{predicted}
$$

In a future lecture, we will refer to this error as the **residual**.

In [None]:
actual = demographics['Median Income']
predicted = demographics["Linear Prediction"]

errors = actual - predicted

In [None]:
demographics['Error'] =  errors
demographics.head(5)

<br><br>

#### What are the districts with the largest error values?

In [None]:
demographics["Abs Error"] = np.abs(demographics["Error"])
demographics.sort_values("Abs Error", ascending=False).head(5)

#### What would a large error suggest?

<br><br><br>

## Visualizing the Errors

In [None]:
demographics_errors(regression_slope, regression_intercept)

In [None]:
# Try different slopes and intercepts. 

demographics_errors(...)

In [None]:
demographics_errors(...)


## Summarizing the Overall Error

What is the average error? 

Mean Absolute Error 

Mean Squared Error (MSE)

Root Mean Squared Error (RMSE)

<br><br>

#### Discussion Question
Assuming $y$ is income in dollars, what are the units of:
1. Mean Absolute Error
2. Mean Squared Error
3. Root Mean Squared Error

<br><br>

In [None]:
show_demographics_rmse(...)

In [None]:
show_demographics_rmse(...)

In [None]:
show_demographics_rmse(...)

<br><br><br>

---

<center> Return to Slides, Slide 10 </center>

---

<br><br><br>

## Numerical Optimization

If our goal is just to find the parameters of our line that minimize some kind of error, we can use numerical optimization tools.  Suppose we wanted to minimize the function:

$$
f(x) = \left(x - 2\right)^2 + 3
$$

In [None]:
def f(x):
    """Parabola (x - 2)**2 + 3."""
    offset = x - 2
    return offset**2 + 3

In [None]:
x = np.arange(1, 3, 0.1)
y = f(x)
px.line(x=x, y=y)

In [None]:
# from scipy import optimize
# import functools
# import math

# You don't need to understand the details of this cell.
# The function below is a thin wrapper around scipy.optimize.minimize.

def minimize(f, start=None, smooth=False, log=None, array=False, **vargs):
    """Find the arguments that minimize f using scipy.optimize.minimize.

    f      -- function to minimize; called as f(*values) unless array=True,
              in which case it receives the single array of values.
    start  -- initial guess (scalar or sequence). When omitted, one 0 per
              positional parameter of f is used.
    smooth -- when false and no 'method' keyword is given, the 'Powell'
              method is requested.
    log    -- optional callable that is handed the scipy result object.
    array  -- pass the values to f as one array instead of unpacking them.
    vargs  -- extra keyword arguments forwarded to scipy.optimize.minimize.

    Returns a single number when there is exactly one starting value,
    otherwise the array of minimizing values.
    """
    if start is None:
        assert not array, "Please pass starting values explicitly when array=True"
        n_params = f.__code__.co_argcount
        assert n_params > 0, "Please pass starting values explicitly for variadic functions"
        start = [0] * n_params
    elif not hasattr(start, '__len__'):
        start = [start]

    # scipy hands the objective one array; unpack it for ordinary functions.
    objective = f if array else functools.wraps(f)(lambda args: f(*args))

    if not smooth:
        vargs.setdefault('method', 'Powell')

    result = optimize.minimize(objective, start, **vargs)
    if log is not None:
        log(result)
    return result.x.item(0) if len(start) == 1 else result.x

# Run the optimizer once and reuse the result instead of re-minimizing for
# every line of output.
f_argmin = minimize(f)
print("x_min =", f_argmin)
print("f(x_min) =", f(f_argmin))

In [None]:
fig = px.line(x=x, y=y)
fig.add_scatter(x=[minimize(f)], y=[f(minimize(f))],
                name="Minimum", marker_color="red", marker_size=10)

Minimize works for even more complex functions.

$$
f(x) = 2 * \sin(\pi x) + x^3 + x^4 + \sin(10x)
$$

In [None]:
def complicated_function(x):
    """Evaluate 2*sin(pi*x) + x**3 + x**4 + sin(10*x) with NumPy's sin."""
    slow_wave = 2 * np.sin(x * np.pi)
    cubic = x ** 3
    quartic = x ** 4
    fast_wave = np.sin(x * 10)
    return slow_wave + cubic + quartic + fast_wave

In [None]:
x = np.arange(-1.5, 1.5, 0.01)
y2 = complicated_function(x)
px.line(x=x, y=y2)

We can still use minimize to find the minimum:

In [None]:
x_min = minimize(complicated_function)
print("x_min =", x_min)
print("f(x_min) =", complicated_function(x_min))

In [None]:
fig = px.line(x=x, y=y2)
fig.add_scatter(x=[x_min],
                y=[complicated_function(x_min)],
                name="Minimum", marker_color="red", marker_size=10)

We can even minimize multidimensional functions:

$$
\texttt{surface_function(a,b)} = -\frac{\cos\left(\pi \sqrt{(a+0.5)^2 + b^2}\right)}{(a+0.5)^2 + b^2 + 1}
$$

In [None]:
def surface_function(a, b):
    """Radially symmetric surface centered at (a, b) = (-0.5, 0).

    With d the distance from that center, returns -cos(pi*d) / (d**2 + 1).
    """
    radius = np.sqrt((a + 0.5)**2 + b**2)
    numerator = -np.cos(np.pi * radius)
    denominator = radius**2 + 1
    return numerator / denominator

In [None]:
a_min, b_min = minimize(surface_function)
[a_min, b_min]

In [None]:
xs = np.arange(-1.5, 1.5, 0.01)
ys = np.arange(-1.5, 1.5, 0.01)
# meshgrid's default 'xy' indexing returns arrays of shape (len(ys), len(xs)),
# which is the row = y, column = x layout that go.Surface expects for z.
# Evaluating directly on the grids avoids a flatten/reshape whose axis order
# is easy to get wrong, and the grids get their own names so the notebook-level
# `x` and `y` used by earlier plotting cells are not overwritten.
grid_a, grid_b = np.meshgrid(xs, ys)
zs = surface_function(grid_a, grid_b)
go.Figure(data=[
    go.Surface(x = xs, y = ys, z=zs),
    go.Scatter3d(x=[a_min], y=[b_min], z=[surface_function(a_min, b_min)])
    ], 
    layout=dict(height=1000, 
                scene_xaxis_title="a", scene_yaxis_title="b", 
                scene_zaxis_title="surface"))

<br><br><br>

---

<center> Return to Slides, Slide 10 </center>

---

<br><br><br>

## Minimizing RMSE 

We can use minimize to find the slope and intercept that minimize root mean squared error in our predictions:

In [None]:
def demographics_rmse(any_slope, any_intercept):
    """Root mean squared error of predicting 'Median Income' from 'College%'
    in `demographics` with the line any_slope*x + any_intercept."""
    college_pct = demographics['College%']
    income = demographics['Median Income']
    residuals = income - (any_slope * college_pct + any_intercept)
    mean_squared = np.mean(residuals ** 2)
    return mean_squared ** 0.5

In [None]:
demographics_rmse(...)

In [None]:
demographics_rmse(...)

In [None]:
...

How does this compare to the slope and intercept we derived earlier?

In [None]:
[regression_slope, regression_intercept]

What happens if we minimize the mean squared error instead of the root mean squared error?

In [None]:
def demographics_mse(slope, intercept):
    """Mean squared error of predicting 'Median Income' from 'College%' in
    `demographics` with the line slope*x + intercept.

    The parameter names hide the notebook's slope()/intercept() functions
    inside this body only; here they are plain numbers.
    """
    college_pct = demographics['College%']
    income = demographics['Median Income']
    residuals = income - (slope * college_pct + intercept)
    return np.mean(residuals ** 2)

In [None]:
...

What if we minimize the mean absolute error instead?

In [None]:
def demographics_mae(any_slope, any_intercept):
    """Mean absolute error of predicting 'Median Income' from 'College%' in
    `demographics` with the line any_slope*x + any_intercept."""
    college_pct = demographics['College%']
    income = demographics['Median Income']
    residuals = income - (any_slope * college_pct + any_intercept)
    return np.mean(np.abs(residuals))

In [None]:
...

This is different! 

In [None]:
mae_slope, mae_intercept = minimize(demographics_mae)
fig = px.scatter(demographics, x="College%", y="Median Income", color="State")
xtest = np.arange(0, 75, 0.1)
fig.add_scatter(x=xtest, 
                y=regression_slope * xtest + regression_intercept,
                name = f"Least Squares: {np.round(regression_slope, 2)} x + {np.round(regression_intercept)}")
fig.add_scatter(x=xtest, 
                y=mae_slope * xtest + mae_intercept,
                name = f"MAE: {np.round(mae_slope, 2)} x + {np.round(mae_intercept)}")
fig

<br><br><br>

---

## Varying the Slope and Intercept and Plotting the RMSE

In [None]:
# RMSE for 40 candidate slopes around the regression slope, holding the
# intercept fixed at the regression intercept.
alt_slopes = regression_slope + np.arange(-20, 20)
# Build the array in one step; np.append inside a loop copies the whole array
# on every iteration.
rmses = np.array([demographics_rmse(new_slope, regression_intercept)
                  for new_slope in alt_slopes])

variations = pd.DataFrame({"Slope": alt_slopes, "RMSE": rmses})
variations.head(5)

In [None]:
fig = px.scatter(variations, x="Slope", y="RMSE")
fig.add_scatter(x=[regression_slope], y=[demographics_rmse(regression_slope, regression_intercept)], 
                marker_size=10, name="Best Slope")

What if we tried to change the intercept value while using the best slope so far?

In [None]:
# RMSE for 40 candidate intercepts around the regression intercept, holding
# the slope fixed at the regression slope.
alt_intercepts = regression_intercept + np.arange(-2000, 2000, 100)
# Build the array in one step; np.append inside a loop copies the whole array
# on every iteration.
rmses = np.array([demographics_rmse(regression_slope, new_intercept)
                  for new_intercept in alt_intercepts])

variations = pd.DataFrame({"Intercept": alt_intercepts, "RMSE": rmses})
fig = px.scatter(variations, x="Intercept", y="RMSE")
fig.add_scatter(x=[regression_intercept], y=[demographics_rmse(regression_slope, regression_intercept)], 
                marker_size=10, name="Best Intercept")

What if we tried changing both the slope and the intercept at the same time?

In [None]:
# This cell is slow
alt_slopes = regression_slope + np.arange(-100, 100, 1)
alt_intercepts = regression_intercept + np.arange(-1000, 1000, 10)
# Evaluate RMSE at every (slope, intercept) pair: 200 x 200 = 40,000 calls to
# demographics_rmse. The rows are collected first and the DataFrame is built
# once, because adding rows one at a time with .loc gets slower as the frame
# grows. Row order is unchanged: slope varies slowest, intercept fastest.
grid_rows = [
    (new_slope, new_intercept, demographics_rmse(new_slope, new_intercept))
    for new_slope in alt_slopes
    for new_intercept in alt_intercepts
]
variations = pd.DataFrame(grid_rows, columns=["Slope", "Intercept", "RMSE"])

go.Figure(data=[
    go.Contour(x=variations["Slope"], y=variations["Intercept"], z=variations["RMSE"]), 
    go.Scatter(x=[regression_slope], y=[regression_intercept], marker_color="red")
],
layout=dict(width = 800,height=600, xaxis_title="Slope", yaxis_title="Intercept"))

<br><br><br>

---

<center> Return to Slides, Slide 12</center>

---

<br><br><br>

## Multiple Linear Regression

We can also use multiple variables to help predict a single variable. 

In [None]:
hybrid = pd.read_csv('data/hybrid.csv')
hybrid.head(5)

In [None]:
px.scatter_3d(
    hybrid, 
    x="mpg", y="acceleration", z="msrp",
    hover_name="vehicle", 
    color="class", 
    height=800
)

Suppose we use the model: 

$$ y = a \cdot \text{acc} + b \cdot \text{mpg} + c $$

In [None]:
def hybrid_rmse(a, b, c):
    """Root mean squared error of predicting hybrid['msrp'] with
    a*acceleration + b*mpg + c."""
    predicted = a * hybrid["acceleration"] + b * hybrid["mpg"] + c
    squared_errors = (hybrid["msrp"] - predicted) ** 2
    rmse = np.sqrt(np.mean(squared_errors))
    return rmse

In [None]:
...

In [None]:
print(f"Error: {hybrid_rmse(a, b, c):,}")

In [None]:
mpg_range = np.arange(10, 80)
acceleration_range = np.arange(5, 25)
# One prediction per (mpg, acc) pair, with mpg varying slowest. The rows are
# collected first and the DataFrame is built once, rather than being grown
# one row at a time with .loc.
predictions = pd.DataFrame(
    [(mpg, acc, a * acc + b * mpg + c)
     for mpg in mpg_range
     for acc in acceleration_range],
    columns=["mpg", "acc", "pred"],
)

In [None]:
fig = px.scatter_3d(
    hybrid, 
    x="mpg", y="acceleration", z="msrp",
    hover_name="vehicle", 
    color="class", 
    height=800
)
fig.add_surface(
    x = mpg_range, y = acceleration_range,
    z = predictions["pred"].to_numpy().reshape(len(mpg_range), len(acceleration_range)).T,
    showscale=False
)

## Fitting Non-Linear Data 

We could try to improve our predictions by defining a more complex equation: 

$$y = a \cdot \text{acc} + b \cdot \text{mpg} + c \cdot \text{acc}^2 + d \cdot \text{mpg}^2 + e$$

In [None]:
def hybrid_better_rmse(a, b, c, d, e):
    """Root mean squared error of predicting hybrid['msrp'] with
    a*acc + b*mpg + c*acc**2 + d*mpg**2 + e, where acc is the 'acceleration'
    column."""
    acc = hybrid["acceleration"]
    mpg = hybrid["mpg"]
    predicted = a*acc + b*mpg + c*acc**2 + d*mpg**2 + e
    squared_errors = (hybrid["msrp"] - predicted) ** 2
    rmse = np.sqrt(np.mean(squared_errors))
    return rmse

In [None]:
...

In [None]:
print(f"Error: {hybrid_better_rmse(a,b,c,d,e):,}")

In [None]:
mpg_range = np.arange(10, 80)
acceleration_range = np.arange(5, 25)
# One prediction per (mpg, acc) pair, with mpg varying slowest. The rows are
# collected first and the DataFrame is built once, rather than being grown
# one row at a time with .loc.
predictions = pd.DataFrame(
    [(mpg, acc, a*acc + b*mpg + c*acc**2 + d*mpg**2 + e)
     for mpg in mpg_range
     for acc in acceleration_range],
    columns=["mpg", "acc", "pred"],
)
        
fig = px.scatter_3d(
    hybrid, 
    x="mpg", y="acceleration", z="msrp",
    hover_name="vehicle", 
    color="class", 
    height=800
)
fig.add_surface(
    x = mpg_range, y = acceleration_range,
    z = predictions["pred"].to_numpy().reshape(len(mpg_range), len(acceleration_range)).T,
    showscale=False
)

### Example 2: Nonlinear Regression

In [None]:
shotput = pd.read_csv('data/shotput.csv')
shotput.head()

In [None]:
shotput.plot.scatter(x='Weight Lifted', y='Shot Put Distance')

In [None]:
def shotput_linear_rmse(any_slope, any_intercept):
    """Root mean squared error of predicting 'Shot Put Distance' from
    'Weight Lifted' in `shotput` with the line any_slope*x + any_intercept."""
    weight = shotput['Weight Lifted']
    distance = shotput['Shot Put Distance']
    residuals = distance - (any_slope * weight + any_intercept)
    return np.mean(residuals ** 2) ** 0.5

In [None]:
best_line = ...
best_line

In [None]:
weights = shotput.iloc[:,0]

In [None]:
linear_fit = ...

shotput['Best Line'] = linear_fit

In [None]:
plt.scatter(shotput['Weight Lifted'],shotput['Shot Put Distance'], label='Shot Put Distance')
plt.scatter(shotput['Weight Lifted'], shotput['Best Line'], label='Best Line')
plt.xlabel('Weight Lifted')
plt.ylabel('Shot Put Distance')
plt.legend()

### Quadratic Function 

$f(x) = ax^2 + bx + c$  

for constants $a$, $b$, and $c$.

In [None]:
def shotput_quadratic_rmse(a, b, c):
    """Root mean squared error of predicting 'Shot Put Distance' from
    'Weight Lifted' in `shotput` with the curve a*x**2 + b*x + c."""
    weight = shotput['Weight Lifted']
    distance = shotput['Shot Put Distance']
    curve = a*(weight**2) + b*weight + c
    residuals = distance - curve
    return np.mean(residuals ** 2) ** 0.5

In [None]:
best_quad = ...
best_quad

In [None]:
# x = weight lifted = 100 kg
# Then predicted shot put distance:

(-0.00104)*(100**2) + 0.2827*100 - 1.5318

In [None]:
quad_fit = ...

In [None]:
shotput['Best Quadratic Curve'] = quad_fit

In [None]:
plt.scatter(shotput['Weight Lifted'],shotput['Shot Put Distance'], label='Shot Put Distance')
plt.scatter(shotput['Weight Lifted'], shotput['Best Line'], label='Best Line')
plt.scatter(shotput['Weight Lifted'], shotput['Best Quadratic Curve'], label='Best Quad. Line')
plt.xlabel('Weight Lifted')
plt.ylabel('Shot Put Distance')
plt.legend()