# Lecture 19 - Linear Regression 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
plt.style.use('fivethirtyeight')

%matplotlib inline

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.offline as po
po.init_notebook_mode()

In [None]:
from scipy import optimize
import functools
import math

# Review - Correlation 

### Standard Units

$$
\text{StandardUnits}(x) = \frac{x - \text{Mean}(x)}{\text{Stdev}(x)} 
$$

In [None]:
def standard_units(x):
    """Rescale an array of numbers to standard units.

    Each value is expressed as its distance from the mean of `x`, measured
    in standard deviations of `x` (as computed by `np.std`).
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

### Correlation

$$
\begin{align}
r 
& = \text{Mean}\left(\text{StandardUnits}(x) *  \text{StandardUnits}(y)\right)\\
& = \frac{1}{n} \sum_{i=1}^n \text{StandardUnits}(x_i) *  \text{StandardUnits}(y_i)\\
& = \frac{1}{n}\sum_{i=1}^n \left( \frac{x_i - \text{Mean}(x)}{\text{Stdev}(x)} \right) * \left( \frac{y_i - \text{Mean}(y)}{\text{Stdev}(y)} \right) \\
\end{align}
$$

In [None]:
def correlation(df, x, y):
    """Correlation coefficient r between columns `x` and `y` of `df`.

    r is the mean of the element-wise product of the two columns after
    each has been converted to standard units.
    """
    products = standard_units(df[x]) * standard_units(df[y])
    return np.mean(products)

### Examples 

In [None]:
# Load the hybrid-car table and keep only the rows whose class is 'SUV'.
hybrid = pd.read_csv('data/hybrid.csv')
suv = hybrid[hybrid['class'] == 'SUV']

In [None]:
# Scatter plot of msrp against mpg for the SUV rows (the trailing ; hides the repr).
suv.plot.scatter(x='mpg', y='msrp');

In [None]:
# Correlation between the mpg and msrp columns of the SUV rows.
correlation(suv, 'mpg', 'msrp')

In [None]:
# Scatter plot of msrp against acceleration for the SUV rows.
suv.plot.scatter(x='acceleration', y='msrp');

In [None]:
# Correlation between the acceleration and msrp columns of the SUV rows.
correlation(suv, 'acceleration', 'msrp')

## Caution on Interpreting Correlation 

### Nonlinearity 

In [None]:
# y is exactly newx squared: a perfect, but nonlinear, relationship
# on a grid of x values symmetric around 0.
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = pd.DataFrame({'newx': new_x, 'y': new_x**2})
nonlinear.plot.scatter('newx', 'y', s=30, color='r');

In [None]:
# Correlation of the parabola data built above.
correlation(nonlinear, 'newx', 'y')

### Outliers 

In [None]:
# Four points lying exactly on the line y = x.
line = pd.DataFrame({'x': np.array([1, 2, 3, 4]), 
                     'y': np.array([1, 2, 3, 4])})
line.plot.scatter(x='x', y='y', s=30, color='r');

In [None]:
# Correlation of the four collinear points.
correlation(line, 'x', 'y')

In [None]:
# The same four collinear points plus one extra point, (5, 0), that is off the line.
outlier = pd.DataFrame({'x': np.array([1, 2, 3, 4, 5]), 
                        'y': np.array([1, 2, 3, 4, 0])})
outlier.plot.scatter(x='x', y='y', s=30, color='r');

In [None]:
# Correlation once the single off-line point is included.
correlation(outlier, 'x', 'y')

### Ecological Correlations 

In [None]:
# Load the SAT table and order its rows by the State column.
sats = pd.read_csv('data/sat_scores.csv').sort_values('State')
sats.head(10)

In [None]:
# Scatter plot of the Math column against the Critical Reading column, one point per row of sats.
sats.plot.scatter(x='Critical Reading', y='Math');

In [None]:
# Correlation between the Critical Reading and Math columns.
correlation(sats, 'Critical Reading', 'Math')

In [None]:
def rate_code(x):
    """Bucket a participation rate into one of four labels.

    Args:
        x: participation rate, compared against the cut-offs 25, 50 and 75
           (each upper bound is inclusive).

    Returns:
        'low' (x <= 25), 'low-moderate' (x <= 50), 'moderate-high' (x <= 75),
        otherwise 'high'.  A value that fails all three comparisons, which
        includes NaN, also falls through to 'high'.
    """
    if x <= 25:
        return 'low'
    elif x <= 50:
        return 'low-moderate'
    elif x <= 75:
        # Hyphenated like 'low-moderate' so the labels shown in legends are consistent.
        return 'moderate-high'
    else:
        return 'high'

In [None]:
# Label every row by its participation rate; only that one column is needed.
rate_codes = sats['Participation Rate'].apply(rate_code)

In [None]:
# Attach the labels as a new column and preview the first ten rows.
sats['Rate Code'] = rate_codes
sats.head(10)

In [None]:
# Same scatter as before, with points colored by their Rate Code label.
sns.scatterplot(data=sats, x="Critical Reading", y="Math", hue="Rate Code");

In [None]:
# First ten rows labelled 'low'.
sats[sats['Rate Code'] == 'low'].head(10)

In [None]:
# All rows labelled 'high'.
sats[sats['Rate Code'] == 'high']

<br><br><br>

---
<center>Return to Slides</center>

---

<br><br><br>

## Prediction Lines 

Let's build an intuition about the relationship between the slope of the nearest neighbor line and the correlation coefficient.

We will again use the heights data. 

In [None]:
families = pd.read_csv('data/family_heights.csv')
# Midpoint of the father and mother columns for every row.
parent_avgs = (families["father"] + families["mother"]) / 2
# One row per child, ordered by the parents' average height.
heights = pd.DataFrame({
    'Parent Average': parent_avgs,
    'Child': families['child'],
}).sort_values('Parent Average')
heights.head(10)

We will build a slightly more robust Nearest Neighbor predictor. 

In [None]:
def nn_heights(parent_average, window=0.5):
    """Nearest-neighbor prediction of a child's height.

    Averages the Child column of the rows of `heights` whose Parent Average
    lies in [parent_average - window, parent_average + window).

    Returns:
        The mean child height of those rows, or np.nan (a special floating
        point "not a number" value) when no row falls inside the window.
    """
    parent_col = heights['Parent Average']
    in_window = (parent_col >= parent_average - window) & (parent_col < parent_average + window)
    nearby_children = heights[in_window]["Child"]
    if len(nearby_children) > 0:
        return np.mean(nearby_children)
    # No data near this parent height, so there is nothing to average.
    return np.nan

Make predictions at many different parent heights, not just the heights in the dataset. 

In [None]:
# Evenly spaced parent heights from 61 up to (but not including) 74, in steps of 0.2.
test_heights = pd.DataFrame({"Parent Average": np.arange(61,74,0.2)})
# One nearest-neighbor prediction per test height (default window).
test_heights["NN Prediction"] = test_heights["Parent Average"].apply(nn_heights)

In [None]:
# Plot it, using plotly: the raw heights as points, with the nearest-neighbor
# predictions at the test heights added as a second trace.
fig = px.scatter(heights, x="Parent Average", y="Child", height=600)
fig.add_scatter(x=test_heights["Parent Average"], 
                y=test_heights["NN Prediction"], name="NN Prediction")

It will be easier to start in standard units. 

In [None]:
# Transform the heights data to standard units
su_heights = pd.DataFrame({'Parent Average': standard_units(heights['Parent Average']), 
                           'Child': standard_units(heights['Child'])})

# Transform the predictions to standard units using the SAME mean and SD as the
# heights data.  standard_units relies on np.std (population SD, ddof=0) while
# pandas' Series.std() defaults to ddof=1, so ddof=0 is requested explicitly to
# put the points and the prediction curve on exactly the same scale.
parent_mean = heights['Parent Average'].mean()
parent_sd = heights['Parent Average'].std(ddof=0)
child_mean = heights['Child'].mean()
child_sd = heights['Child'].std(ddof=0)
su_test_heights = pd.DataFrame({
    'Parent Average': (test_heights['Parent Average'] - parent_mean) / parent_sd,
    'NN Prediction': (test_heights['NN Prediction'] - child_mean) / child_sd,
})

# Plot it 
fig = px.scatter(su_heights, x="Parent Average", y="Child", height=600)
fig.add_scatter(x=su_test_heights["Parent Average"], 
                y=su_test_heights["NN Prediction"], name="NN Prediction")    


Computing the correlation we get: 

In [None]:
# Correlation between the parents' average height and the child's height.
correlation(heights, "Parent Average", "Child")

What happens if we draw a line with that slope: 

In [None]:
# Correlation computed on the standard-unit columns.
r = correlation(su_heights, "Parent Average", "Child")
fig = px.scatter(su_heights, x="Parent Average", y="Child", height=600)
fig.add_scatter(x=su_test_heights["Parent Average"], 
                y=su_test_heights["NN Prediction"], 
                name="NN Prediction")
# Line through the origin with slope r, drawn for x from -3 up to (not including) 4.
fig.add_scatter(x=np.arange(-3,4,0.1), y= r * np.arange(-3,4,0.1), 
                name=f"Line(y={np.round(r,4)} x)")

### The Relationship between Correlations and NN Predictions

Here we examine the relationship between the nearest neighbor prediction "line" and the correlation for several synthetic datasets.


In [None]:
def make_correlated_data(r, n=500):
    """Generate a table with columns x and y with a correlation of approximately r.

    x and an independent noise term are each n draws from a standard normal;
    y is the mix r * x + sqrt(1 - r**2) * noise.
    """
    x = np.random.normal(0, 1, n)
    noise = np.random.normal(0, 1, n)
    # Weighting x by r and the independent noise by sqrt(1 - r^2) is the
    # standard construction for a pair of correlated Gaussians.
    noise_weight = np.sqrt(1 - r**2)
    y = r * x + noise_weight * noise
    return pd.DataFrame({"x": x, "y": y})

In [None]:
#  You don't need to understand all the parts of this function.
def make_correlation_and_line_plot(r):
    """ 
    Generates a plot of synthetic data with a correlation coefficient of
    approximately r, along with the nearest neighbor predictions,
    a line with the slope r and intercept 0, and the dotted line y = x.

    Args:
        r: correlation passed to make_correlated_data; also the slope of
           the solid reference line.

    Returns:
        The plotly figure.
    """
    # Make synthetic data (sorted by x, presumably so the line traces run left to right)
    example = make_correlated_data(r).sort_values("x")
    
    # Compute nearest neighbor predictions
    def nn_prediction_example(x_val):
        """ Predicts y-value for x based on the example table """
        in_window = (example['x'] >= x_val - 0.25) & (example['x'] < x_val + 0.25)
        neighbors = example.loc[in_window, "y"]
        if len(neighbors) == 0:
            return np.nan
        return np.mean(neighbors)
    # Only the x column is needed, so apply over that Series rather than over whole rows.
    example["NN Prediction"] = example["x"].apply(nn_prediction_example)
    
    # Generate Plots.
    x = example["x"]
    fig = px.scatter(example, x="x", y="y", height=600)
    fig.add_scatter(x=x, y=example["NN Prediction"], 
                    name="NN Prediction", line_color="red")
    fig.add_scatter(x=x, y= r * x, name=f"Line(y={r} x)")
    fig.add_scatter(x=x, y=x, line_color="gray", line_dash="dot", name="Line(y=x)")
    return fig

#### Correlation of 0.90

In [None]:
# Exercise placeholder: replace ... with the correlation named in this section's heading (0.90).
make_correlation_and_line_plot(...)

#### Correlation of 0.60

In [None]:
# Exercise placeholder: replace ... with the correlation named in this section's heading (0.60).
make_correlation_and_line_plot(...)

#### Correlation of 0.20

In [None]:
# Exercise placeholder: replace ... with the correlation named in this section's heading (0.20).
make_correlation_and_line_plot(...)

#### Correlation of 0

In [None]:
# Exercise placeholder: replace ... with the correlation named in this section's heading (0).
make_correlation_and_line_plot(...)

#### Correlation of -0.6

In [None]:
# Exercise placeholder: replace ... with the correlation named in this section's heading (-0.6).
make_correlation_and_line_plot(...)

<br><br><br>

---

<center>Return to Slides</center>

---

<br><br><br>

# Defining the linear regression line

In standard units we developed a simple equation for the regression line:

\begin{align}
\text{SU}(y_\text{predicted}) = r * \text{SU}(x_\text{new})
\end{align}

where $r$ is the correlation coefficient and $\text{SU}$ is the standard units:

\begin{align}
\text{SU}(y_\text{predicted}) & = \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} \\
\text{SU}(x_\text{new}) &= \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}
\end{align}



Here we use $x_\text{new}$ to indicate a new $x$ value for which we want to make a prediction  $y_\text{predicted}$.

We would like to express this line in the original units of the data.  We can do that by substituting the definition of standard units:

\begin{align}
\frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} = r *  \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}
\end{align}

While this equation does describe a line, it would look a little nicer in the form:

\begin{align}
y_\text{predicted} = \text{slope} * x_\text{new}  + \text{intercept}
\end{align}

Let's do some algebra to get that equation:
$$
\require{color}
\definecolor{comment}{RGB}{200,100,50}
\begin{align}
\frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} &= r *  \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}\\
\frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} &= r * \frac{1}{\text{Stdev}(x)} x_\text{new} - r * \frac{1}{\text{Stdev}(x)}\text{Mean}(x)  & \color{comment} \text{Expanding the right side}\\
y_\text{predicted} - \text{Mean}(y) &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)} x_\text{new} - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x) &  \color{comment} \text{Multiplying by $\text{Stdev}(y)$}\\
y_\text{predicted} &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)} x_\text{new} + \text{Mean}(y) - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x) &  \color{comment} \text{Adding $\text{Mean}(y)$}\\
y_\text{predicted} &= \left(r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\right) x_\text{new} + \left(\text{Mean}(y) - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x)\right) &  \color{comment} \text{Rearranging Terms}
\end{align}
$$

This means we can define the slope and intercept as:
\begin{align}
\text{slope} &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\\
\text{intercept} & = \text{Mean}(y) - \text{slope} * \text{Mean}(x)
\end{align}

<br><br><br>

## Implementing Linear Regression

Using the above equations implement the slope and intercept functions. 

### Slope

In [None]:
def slope(df, x, y):
    """Computes the slope of the regression line

    Exercise skeleton: the `...` placeholders are meant to be filled in so
    that the function returns r * Stdev(y) / Stdev(x), where r is the
    correlation between columns x and y of df.
    """
    r = ...  # TODO: correlation between columns x and y of df
    
    return ...  # TODO: r times the ratio Stdev(y) / Stdev(x)

<br><details><br>
    
```python
def slope(t, x, y):
    """Computes the slope of the regression line"""
    r = correlation(t, x, y)
    y_sd = np.std(t[y])
    x_sd = np.std(t[x])
    return r * y_sd / x_sd
```

<br></details><br>

### Intercept

In [None]:
def intercept(df, x, y):
    """Computes the intercept of the regression line

    Exercise skeleton: the `...` placeholders are meant to be filled in so
    that the function returns Mean(y) - slope * Mean(x) for columns x and y
    of df.
    """
    x_mean = ...  # TODO: mean of column x
    
    return ...  # TODO: Mean(y) - slope * Mean(x)

<br><details><br>
    
```python
def intercept(t, x, y):
    """Computes the intercept of the regression line"""
    x_mean = np.mean(t[x])
    y_mean = np.mean(t[y])
    return y_mean - slope(t, x, y)*x_mean
```

<br></details><br>

Testing it out

In [None]:
# Synthetic data with correlation of approximately 0.5, then the slope fitted to it.
example = make_correlated_data(0.5)
slope(example, 'x', 'y')

<br><br>
Computing the slope and intercept for the heights dataset:

In [None]:
# Exercise placeholders: replace each ... with the table and the two column
# names (x column first, then y column) for the heights data.
heights_slope = slope(...)
heights_intercept = intercept(...)
[heights_slope, heights_intercept]

<br><details><br>
    
```python
heights_slope = slope(heights, 'Parent Average', 'Child')
heights_intercept = intercept(heights, 'Parent Average', 'Child')
[heights_slope, heights_intercept]
```

<br></details><br>

Adding the regression predictions: 

In [None]:
# Exercise placeholder: replace ... with the line evaluated at every parent
# average, i.e. slope * x + intercept.
heights["Regression Prediction"] = ...
heights.head(10)

<br><details><br>
    
```python
heights["Regression Prediction"] = heights_slope * heights["Parent Average"] + heights_intercept
heights.head(10)
```

<br></details><br>

In [None]:
# Data points, the nearest-neighbor predictions, and the regression predictions together.
fig = px.scatter(heights, x="Parent Average", y="Child", height=600)
fig.add_scatter(x=test_heights["Parent Average"], 
                y=test_heights["NN Prediction"], name="NN Prediction")
# Legend entry spelling out the fitted line, rounded to two decimals.
line_name = f"y = {np.round(heights_slope,2)} x + {np.round(heights_intercept,2)}"
fig.add_scatter(x=heights["Parent Average"], 
                y=heights["Regression Prediction"],
                name=line_name)

### Predict Linear 

We can define a function to predict the y-value given a DataFrame and two columns, `x` and `y`. 

In [None]:
def predict_linear(df, x, y):
    """Return a Series of the regression estimates at all the x values.

    The line is fitted to columns x and y of df using slope and intercept,
    then evaluated at every value of df[x].
    """
    fitted_slope = slope(df, x, y)
    fitted_intercept = intercept(df, x, y)
    return fitted_slope * df[x] + fitted_intercept

<br><br>

---


# Making Predictions with Linear Regression


We can now compute predictions, but how good are they?  How do we know that we have a good linear fit? To study this we will consider a new dataset.

In [None]:
# Load the district demographics table and preview the first five rows.
demographics = pd.read_csv('data/district_demographics2016.csv')
demographics.head(5)

In [None]:
# Median Income against College%, with points colored by the State column.
px.scatter(demographics, 
           x="College%", 
           y="Median Income",
           color="State")

In [None]:
# Correlation between the College% and Median Income columns.
correlation(demographics, 'College%', 'Median Income')

**Discussion Question:** Any concerns about the correlation computation being done here?


<br><br>

### Making Predictions

Here we will try to predict the income for each district as a function of the percent of college educated people.

In [None]:
# Fit the regression line with College% as x and Median Income as y.
demo_slope = slope(demographics, 'College%', 'Median Income')
demo_intercept = intercept(demographics, 'College%', 'Median Income')
print("Slope:", demo_slope)
print("Intercept:", demo_intercept)

Make the actual predictions.

In [None]:
# Store the fitted line's value at every row's College% as a new column.
demographics["Linear Prediction"] =  predict_linear(demographics, 'College%', 'Median Income')
demographics.head(5)

Visualizing the predictions:

In [None]:
# Data points plus the predictions evaluated at the observed College% values.
fig = px.scatter(demographics, x="College%", y="Median Income", height=400)
fig.add_scatter(x=demographics["College%"], 
                y=demographics["Linear Prediction"], 
                name="Linear Prediction")

In [None]:
fig = px.scatter(demographics, x="College%", y="Median Income", height=400)
# Draw the line on a regular grid of x values (0 through 74) instead of at the data points.
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest, 
                y=demo_slope * xtest + demo_intercept,
                name = f"{np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig

<br><br><br>

---

<center> Return to Slides </center>

---

<br><br><br>

## Computing the Error

The error is the difference between the actual and predicted value:

$$
\text{error} = y - y_\text{predicted}
$$

In a future lecture, we will refer to this error as the **residual**.

In [None]:
# Actual values and the line's predictions at the same rows.
y = demographics['Median Income']
predicted = predict_linear(demographics, 'College%', 'Median Income')

# error = actual - predicted, so a positive error means the line predicted too low.
errors = y - predicted

In [None]:
# Keep the signed errors alongside the data.
demographics['Error'] =  errors
demographics.head(5)

<br><br>
What are the districts with the largest error values?

In [None]:
# Rank rows by the size of the error regardless of sign; show the five largest.
demographics["Abs Error"] = np.abs(demographics["Error"])
demographics.sort_values("Abs Error", ascending=False).head(5)

What would a large error suggest?

<br><br><br>

### Visualizing the Errors

In [None]:
# Data points; hover_name takes the label from the District column.
fig = px.scatter(demographics, x="College%", y="Median Income", hover_name="District")
# The fitted line on a regular grid of x values (0 through 74).
xtest = np.arange(0, 75, 1)
fig.add_scatter(x=xtest, 
                y=demo_slope * xtest + demo_intercept,
                name = f"{np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
# Error segments, using y and predicted from the cell that computed the errors.
# For each row the y values are the triple (actual, predicted, NaN) and the x
# value is repeated three times, giving one vertical segment per row; the NaN
# separates consecutive segments.
fig.add_scatter(x=demographics["College%"].repeat(3), 
                y=np.ravel(np.vstack([y, predicted, np.nan * predicted]).T),
                marker_color="gray", line_width=0.75, name="Errors")
fig

In [None]:
# Histogram of the signed errors: 50 bins, normalized to a density.
demographics.hist('Error', bins=50, density=True);

<br><br><br>

---
## Summarizing the Overall Error

What is the average error? 

In [None]:
# Exercise placeholder: compute the average of the signed errors.
...

Mean Absolute Error 

In [None]:
# Exercise placeholder: compute the mean of the absolute errors.
...

Mean Squared Error (MSE)

In [None]:
# Exercise placeholder: compute the mean of the squared errors.
...

Root Mean Squared Error (RMSE)

In [None]:
# Exercise placeholder: compute the square root of the mean squared error.
...

<br><br>

#### Discussion Question
Assuming $y$ is income in dollars, what are the units of:
1. Mean Absolute Error
2. Mean Squared Error
3. Root Mean Squared Error

<br><br>

<br>

---

## Error as Function of our Model (Line)

In [None]:
def demographics_rmse(slope, intercept):
    """Intended to return the root mean squared error of a candidate line.

    Exercise skeleton: the `...` placeholders are to be filled in so that
    `predicted` holds the line's value for every row of demographics and
    `rmse` is the root mean squared error of `errors`.

    Args:
        slope: slope of the candidate line (shadows the slope function here).
        intercept: intercept of the candidate line (shadows the intercept function here).
    """
    predicted = ...  # TODO: slope * College% + intercept for every row
    actual = demographics["Median Income"]
    # Sign convention error = actual - predicted, matching how the errors
    # are computed elsewhere in this notebook.
    errors = actual - predicted
    rmse = ...  # TODO: square root of the mean of the squared errors
    return rmse

The value of our error function for the slope and intercept we derived in the last lecture is:

In [None]:
# RMSE of the regression line fitted above.
demographics_rmse(demo_slope, demo_intercept)

What if we used a different slope and intercept value:

In [None]:
def visualize_demographics_rmse(slope, intercept):
    """Plot the demographics data, a candidate line and its error segments.

    Args:
        slope: slope of the candidate line.
        intercept: intercept of the candidate line.

    Returns:
        A plotly figure whose title reports the line's RMSE, rounded to two decimals.
    """
    college = demographics["College%"]
    actual = demographics["Median Income"]
    predicted = slope * college + intercept

    fig = px.scatter(demographics, x="College%", y="Median Income")

    # The candidate line on a regular grid of x values (0 through 74).
    xtest = np.arange(0, 75, 1)
    line_label = f"{np.round(slope, 2)} x + {np.round(intercept)}"
    fig.add_scatter(x=xtest, y=slope * xtest + intercept, name=line_label)

    # One vertical segment per row: (actual, predicted, NaN) triples with the
    # x value repeated three times; the NaN separates consecutive segments.
    segment_y = np.ravel(np.vstack([actual, predicted, np.nan * predicted]).T)
    fig.add_scatter(x=college.repeat(3), y=segment_y,
                    marker_color="gray", line_width=0.75, name="Errors")

    rmse = demographics_rmse(slope, intercept)
    fig.update_layout(title=f"RMSE = {np.round(rmse, 2)}")
    return fig

In [None]:
# The fitted regression line and its errors.
visualize_demographics_rmse(demo_slope, demo_intercept)

In [None]:
# A deliberately different line: slope raised by 1000, intercept lowered by 50000.
visualize_demographics_rmse(demo_slope+1000, demo_intercept - 50000)

<br><br><br>

---

### Varying the Slope and Intercept and Plotting the RMSE

In [None]:
# Candidate slopes from 20 below to 19 above the fitted slope, in steps of 1.
alt_slopes = demo_slope + np.arange(-20, 20)
# RMSE of each candidate slope, keeping the fitted intercept fixed.
rmses = np.array([demographics_rmse(new_slope, demo_intercept)
                  for new_slope in alt_slopes])

variations = pd.DataFrame({"Slope": alt_slopes, "RMSE": rmses})
variations.head(5)

In [None]:
# RMSE for every candidate slope, with the fitted slope highlighted by a larger marker.
fig = px.scatter(variations, x="Slope", y="RMSE")
fig.add_scatter(x=[demo_slope], y=[demographics_rmse(demo_slope, demo_intercept)], marker_size=10, 
                name="Best Slope")

What if we tried to change the intercept value while using the best slope so far?

In [None]:
# Candidate intercepts from 2000 below to 1900 above the fitted intercept, in steps of 100.
alt_intercepts = demo_intercept + np.arange(-2000, 2000, 100)
# RMSE of each candidate intercept, keeping the fitted slope fixed.
rmses = np.array([demographics_rmse(demo_slope, new_intercept)
                  for new_intercept in alt_intercepts])

variations = pd.DataFrame({"Intercept": alt_intercepts, "RMSE": rmses})
fig = px.scatter(variations, x="Intercept", y="RMSE")
# Highlight the fitted intercept with a larger marker.
fig.add_scatter(x=[demo_intercept], y=[demographics_rmse(demo_slope, demo_intercept)], 
                marker_size=10, name="Best Intercept")

What if we tried changing both the slope and the intercept at the same time?

In [None]:
alt_slopes = demo_slope + np.arange(-100, 100, 1)
alt_intercepts = demo_intercept + np.arange(-1000, 1000, 10)
# Evaluate the RMSE on the full slope x intercept grid (200 x 200 = 40,000
# combinations).  The rows are collected in a list and turned into a
# DataFrame once; growing a DataFrame one row at a time with .loc is far
# slower for this many rows.
grid_rows = [
    (new_slope, new_intercept, demographics_rmse(new_slope, new_intercept))
    for new_slope in alt_slopes
    for new_intercept in alt_intercepts
]
variations = pd.DataFrame(grid_rows, columns=["Slope", "Intercept", "RMSE"])

# Contour of RMSE over the grid, with the fitted (slope, intercept) marked in red.
go.Figure(data=[
    go.Contour(x=variations["Slope"], y=variations["Intercept"], z=variations["RMSE"]), 
    go.Scatter(x=[demo_slope], y=[demo_intercept], marker_color="red")
],
layout=dict(width = 800,height=600, xaxis_title="Slope", yaxis_title="Intercept"))

<br><br><br>

---

<center> Return to Slides </center>

---

<br><br><br>

## Numerical Optimization

If our goal is just to find the parameters of our line that minimize some kind of error, we can use numerical optimization tools.  Suppose we wanted to minimize the function:

$$
f(x) = \left(x - 2\right)^2 + 3
$$

In [None]:
def f(x):
    """Parabola (x - 2)^2 + 3."""
    offset = x - 2
    return offset ** 2 + 3

In [None]:
# Evaluate f on a grid from 1 up to (not including) 3 and draw the curve.
x = np.arange(1, 3, 0.1)
y = f(x)
px.line(x=x, y=y)

In [None]:
# from scipy import optimize
# import functools
# import math

# You don't need to understand the details.
# This function is a thin wrapper around scipy.optimize.minimize.

def minimize(f, start=None, smooth=False, log=None, array=False, **vargs):
    """Minimize `f` numerically and return the minimizing argument(s).

    Args:
        f: function to minimize. Called as f(*values) unless array=True,
            in which case it is called with the single array of values.
        start: starting value(s). When omitted, zeros are used, one per
            positional parameter of f (taken from f.__code__.co_argcount).
            A value without __len__ is wrapped in a one-element list.
        smooth: when false and no 'method' is given in vargs, the 'Powell'
            method is requested.
        log: optional callable invoked with the scipy result object.
        array: pass the values to f as one array instead of unpacking them.
        **vargs: forwarded to scipy.optimize.minimize.

    Returns:
        A scalar when there is a single starting value, otherwise the
        array result.x.
    """
    if start is None:
        assert not array, "Please pass starting values explicitly when array=True"
        n_params = f.__code__.co_argcount
        assert n_params > 0, "Please pass starting values explicitly for variadic functions"
        start = [0] * n_params
    elif not hasattr(start, '__len__'):
        # A bare scalar start becomes a one-element list.
        start = [start]

    if array:
        objective = f
    else:
        # scipy hands the objective one array; unpack it into f's arguments.
        @functools.wraps(f)
        def objective(args):
            return f(*args)

    if not smooth:
        vargs.setdefault('method', 'Powell')
    result = optimize.minimize(objective, start, **vargs)
    if log is not None:
        log(result)
    return result.x.item(0) if len(start) == 1 else result.x

# Minimize once and reuse the result instead of re-running the optimizer for each line.
f_x_min = minimize(f)
print("x_min =", f_x_min)
print("f(x_min) =", f(f_x_min))

In [None]:
# Minimize once; the result supplies both coordinates of the marker.
f_x_min = minimize(f)
fig = px.line(x=x, y=y)
fig.add_scatter(x=[f_x_min], y=[f(f_x_min)],
                name="Minimum", marker_color="red", marker_size=10)

Minimize works for even more complex functions.

$$
f(x) = 2 * \sin(\pi x) + x^3 + x^4 + \sin(10x)
$$

In [None]:
def complicated_function(x):
    """Evaluate 2*sin(pi*x) + x^3 + x^4 + sin(10*x)."""
    slow_wave = 2 * np.sin(x * np.pi)
    fast_wave = np.sin(x * 10)
    # Summed in the same left-to-right order as the formula.
    return slow_wave + x ** 3 + x ** 4 + fast_wave

In [None]:
# Evaluate the function on a fine grid from -1.5 up to (not including) 1.5 and draw it.
x = np.arange(-1.5, 1.5, 0.01)
y2 = complicated_function(x)
px.line(x=x, y=y2)

We can still use minimize to find the minimum:

In [None]:
# With no start given, the search begins at 0 (one zero per argument of the function).
x_min = minimize(complicated_function)
print("x_min =", x_min)
print("f(x_min) =", complicated_function(x_min))

In [None]:
# The curve again, with the point returned by minimize marked in red.
fig = px.line(x=x, y=y2)
fig.add_scatter(x=[x_min],
                y=[complicated_function(x_min)],
                name="Minimum", marker_color="red", marker_size=10)

We can even minimize multidimensional functions:

$$
\texttt{surface_function(a,b)} = -\frac{\cos\left(\pi \sqrt{(a+0.5)^2 + b^2}\right)}{(a+0.5)^2 + b^2 + 1}
$$

In [None]:
def surface_function(a, b):
    """Evaluate -cos(pi * d) / (d^2 + 1), where d is the distance from (a, b) to (-0.5, 0)."""
    radius = np.sqrt((a + 0.5) ** 2 + b ** 2)
    numerator = -np.cos(np.pi * radius)
    denominator = radius ** 2 + 1
    return numerator / denominator

In [None]:
# Two-argument function, so minimize starts at (0, 0) and returns an array of two values.
a_min, b_min = minimize(surface_function)
[a_min, b_min]

In [None]:
xs = np.arange(-1.5, 1.5, 0.01)
ys = np.arange(-1.5, 1.5, 0.01)
# meshgrid's default 'xy' indexing returns arrays of shape (len(ys), len(xs)),
# which is the layout go.Surface expects: z[i][j] is the height at (xs[j], ys[i]).
# surface_function works element-wise, so the 2-D grids can be passed directly.
# Distinct names keep the x and y arrays from the earlier plotting cells intact.
grid_a, grid_b = np.meshgrid(xs, ys)
zs = surface_function(grid_a, grid_b)
go.Figure(data=[
    go.Surface(x = xs, y = ys,
               z=zs),
    # Mark the minimum found by minimize.
    go.Scatter3d(x=[a_min], y=[b_min], z=[surface_function(a_min, b_min)])
    ], 
    layout=dict(height=1000, 
                scene_xaxis_title="a", scene_yaxis_title="b", 
                scene_zaxis_title="surface"))

<br><br><br>

---

<center> Return to Slides </center>

---

<br><br><br>

## Minimizing RMSE 

We can use minimize to find the slope and intercept that minimize root mean squared error in our predictions:

In [None]:
# No start given: both arguments (slope, intercept) start at 0; the result is an array of two values.
minimize(demographics_rmse)

How does this compare to the slope and intercept we derived earlier?

In [None]:
# The slope and intercept obtained from the correlation-based formulas, for comparison.
[demo_slope, demo_intercept]

What happens if we minimize the mean squared error instead of the root mean squared error?

In [None]:
def demographics_mse(slope, intercept):
    """Exercise skeleton: intended to return the mean squared error of the line.

    The line slope * College% + intercept is evaluated for every row of
    demographics; the `...` placeholder is to be replaced by the mean of the
    squared differences between y and estimate.
    """
    x = demographics['College%']
    y = demographics['Median Income']
    estimate = slope*x + intercept
    return ...  # TODO: mean of the squared differences between y and estimate

In [None]:
# Same search as for the RMSE, but minimizing the mean squared error.
minimize(demographics_mse)

What about if we minimize the absolute error?

In [None]:
def demographics_mae(any_slope, any_intercept):
    """Exercise skeleton: intended to return the mean absolute error of the line.

    The line any_slope * College% + any_intercept is evaluated for every row
    of demographics; the `...` placeholder is to be replaced by the mean of
    the absolute differences between y and estimate.
    """
    x = demographics['College%']
    y = demographics['Median Income']
    estimate = any_slope*x + any_intercept
    return ...  # TODO: mean of the absolute differences between y and estimate

In [None]:
# Same search again, now minimizing the mean absolute error.
minimize(demographics_mae)

This is different! 

In [None]:
# Line that minimizes the mean absolute error.
mae_slope, mae_intercept = minimize(demographics_mae)
fig = px.scatter(demographics, x="College%", y="Median Income", color="State")
# Both lines are drawn on the same grid of x values (0 up to, not including, 75).
xtest = np.arange(0, 75, 0.1)
fig.add_scatter(x=xtest, 
                y=demo_slope * xtest + demo_intercept,
                name = f"Least Squares: {np.round(demo_slope, 2)} x + {np.round(demo_intercept)}")
fig.add_scatter(x=xtest, 
                y=mae_slope * xtest + mae_intercept,
                name = f"MAE: {np.round(mae_slope, 2)} x + {np.round(mae_intercept)}")
fig

<br><br><br>

---

<center> Return to Slides </center>

---

<br><br><br>