# Minimize Errors

In [None]:
import numpy as np
from datascience import *
import matplotlib.pyplot as plt
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [None]:
# Load the data set from a CSV file (path is relative to this notebook).
faithful = Table.read_table("../Lab08/faithful-new.csv")
# Scatter plot of the 'wait' column against the 'duration' column.
faithful.scatter('duration','wait')

In [None]:
def standard_units(xyz):
    """Return ``xyz`` rescaled to standard units.

    Every value is expressed as its deviation from the mean of ``xyz``,
    divided by the standard deviation of ``xyz`` (``np.std`` defaults,
    i.e. the population standard deviation).
    """
    center = np.mean(xyz)
    spread = np.std(xyz)
    deviations = xyz - center
    return deviations / spread
def correlation(t, label_x, label_y):
    """Return the correlation coefficient of two columns of table ``t``.

    Both columns are converted to standard units; the result is the mean
    of their element-wise products.
    """
    x_su = standard_units(t.column(label_x))
    y_su = standard_units(t.column(label_y))
    return np.mean(x_su * y_su)
    
# Regression
def slope(t, label_x, label_y):
    """Return the regression-line slope for predicting ``label_y`` from ``label_x``.

    Computed as r * SD(y) / SD(x), where r is the correlation between the
    two columns of table ``t``.
    """
    r = correlation(t, label_x, label_y)
    y_sd = np.std(t.column(label_y))
    x_sd = np.std(t.column(label_x))
    return r * y_sd / x_sd
def intercept(t, label_x, label_y):
    """Return the regression-line intercept for predicting ``label_y`` from ``label_x``.

    Computed as mean(y) - slope * mean(x) over the two columns of table ``t``.
    """
    y_mean = np.mean(t.column(label_y))
    fitted_slope = slope(t, label_x, label_y)
    x_mean = np.mean(t.column(label_x))
    return y_mean - fitted_slope * x_mean

In [None]:
# Correlation coefficient between the 'duration' and 'wait' columns.
correlation(faithful,"duration","wait")

## Fitting a Line: Method 1 -- Converting to standard units

In [None]:
# Draw the data, then overlay the regression line computed from the
# correlation-based slope() and intercept() helpers.
faithful.scatter('duration','wait')
slp = slope(faithful, 'duration','wait')
inter = intercept(faithful,'duration','wait')
print(f"Slope: {slp:.2f}    Intercept: {inter:.2f}")
# Mark the intercept, i.e. the fitted value at x = 0.
plt.scatter(0,inter)
# Two endpoints are enough to draw a straight line: x = 0 and x = 7.
xs = [0, 7]
ys = [inter, slp * 7 + inter]
plt.plot(xs,ys, color='blue')
# Save the figure to disk before displaying it.
plt.savefig('faithful_regress.png')
plt.show()

## Fitting a Line: Method 2 -- Minimizing the residuals

In [None]:
def mse_c(any_slope, any_intercept):
    """Return the mean squared error of a candidate line on the ``faithful`` table.

    The line ``any_slope * duration + any_intercept`` is used to predict
    the 'wait' column. As a side effect, the root mean squared error is
    printed on every call.
    """
    durations = faithful.column('duration')
    waits = faithful.column('wait')
    residuals = waits - (any_slope * durations + any_intercept)
    mean_sq_error = np.mean(residuals ** 2)
    print("Root mean squared error:", mean_sq_error ** 0.5)
    return mean_sq_error

In [None]:
# Error of an arbitrary candidate line: slope 5, intercept 5.
mse_c(5,5)

In [None]:
# Error of a second candidate line: slope 10.73, intercept 33.47.
mse_c(10.73,33.47)

In [None]:
# Search numerically for the (slope, intercept) pair that minimizes mse_c.
# NOTE(review): mse_c prints on every call, so this cell may emit one line
# per evaluation made by the optimizer.
m_slope, m_intercept = minimize(mse_c)

In [None]:
# Draw the data, then overlay the line found by minimizing the mean
# squared error (m_slope, m_intercept) rather than the Method 1 values.
faithful.scatter('duration','wait')
# Mark the intercept, i.e. the fitted value at x = 0.
plt.scatter(0,m_intercept)
# Two endpoints are enough to draw a straight line: x = 0 and x = 7.
xs = [0, 7]
ys = [m_intercept, m_slope * 7 + m_intercept]
plt.plot(xs,ys, color='green')
# Save the figure to disk before displaying it.
plt.savefig('faithful_regress_mse.png')
plt.show()

## Fitting a Line: Method 3 -- Using Numpy's Polyfit

In [None]:
# Pull the two columns out of the table as arrays.
x = faithful.column("duration")
y = faithful.column("wait")
# Degree-1 polynomial fit; coefficients come back highest power first,
# so the slope precedes the intercept.
m, b = np.polyfit(x, y, 1)
print(f"The slope is: {m:.2f}, and the intercept is: {b:.2f}")

## Fitting a Line: Method 4 -- Using the Scipy module stats

In [None]:
from scipy.stats import linregress

# Unpack into lr_* names so the slope() and intercept() helper functions
# defined earlier in this notebook are not overwritten by numbers.
lr_slope, lr_intercept, r, p, stderr = linregress(x, y)
# Report the values linregress produced (not the polyfit results m and b).
print(f"The slope is: {lr_slope:.2f}, and the intercept is: {lr_intercept:.2f}")

## Fitting a Line: Method 5 -- Using the statsmodels module

In [None]:
import statsmodels.api as sm

# Add a constant column to x so the fitted model includes an intercept term.
X = sm.add_constant(x)
# Ordinary least squares with y as the response and X as the predictors.
model = sm.OLS(y, X).fit()
# Print the full regression summary of the fitted model.
print(model.summary())

## Summary
Linear regression is such a common task that there are many Python packages that can be used. The correct choice depends on your use case and preferences. **The key is that you understand what is happening under the hood when you fit a line to data.**