# Linear Regression

## Lesson 8: Programming the best fit slope

In [3]:
import numpy as np

In [4]:
# Toy dataset: five (x, y) sample points stored as float64 arrays.
xs = np.arange(1, 6, dtype=np.float64)
ys = np.asarray([5, 4, 6, 5, 6], dtype=np.float64)

In [8]:
def best_fit_slope(xs, ys):
    """Return the slope of the least-squares regression line through (xs, ys).

    Uses the closed form

        m = (mean(x) * mean(y) - mean(x * y)) / (mean(x)**2 - mean(x**2))

    xs, ys -- equal-length sequences or arrays of numbers.

    If every x is identical the denominator is zero (up to rounding), the
    slope is undefined and the returned value is meaningless.
    """
    # np.asarray lets plain lists work with the element-wise products below.
    # np.mean is spelled out because a bare `mean` is not provided by the
    # notebook's `import numpy as np`.
    xs = np.asarray(xs, dtype=np.float64)
    ys = np.asarray(ys, dtype=np.float64)
    x_mean = np.mean(xs)
    y_mean = np.mean(ys)
    m = (x_mean * y_mean - np.mean(xs * ys)) / (x_mean ** 2 - np.mean(xs ** 2))
    return m

# m is the slope of our regression line
m = best_fit_slope(xs, ys)
# Function-call form of print: valid on Python 3, same output on Python 2.
print(m)

0.3


## Lesson 9: Programming the best fit line

In [9]:
def best_fit_slope_and_intercept(xs, ys):
    """Return (m, b) for the least-squares regression line y = m*x + b.

    m = (mean(x) * mean(y) - mean(x * y)) / (mean(x)**2 - mean(x**2))
    b = mean(y) - m * mean(x)

    xs, ys -- equal-length sequences or arrays of numbers.

    If every x is identical the denominator of m is zero (up to rounding),
    the line is undefined and the returned values are meaningless.
    """
    # np.asarray lets plain lists work with the element-wise products below.
    # np.mean is spelled out because a bare `mean` is not provided by the
    # notebook's `import numpy as np`.
    xs = np.asarray(xs, dtype=np.float64)
    ys = np.asarray(ys, dtype=np.float64)
    x_mean = np.mean(xs)
    y_mean = np.mean(ys)
    m = (x_mean * y_mean - np.mean(xs * ys)) / (x_mean ** 2 - np.mean(xs ** 2))
    # The fitted line always passes through the point (mean(x), mean(y)).
    b = y_mean - m * x_mean
    return m, b

# m is the slope of our regression line
# b is its intercept on the Y-axis
m, b = best_fit_slope_and_intercept(xs, ys)
# Format explicitly so the output is "<m> <b>" on both Python 2 and Python 3
# (print(m, b) would print a tuple on Python 2).
print("{} {}".format(m, b))

0.3 4.3


In [12]:
# Predicted y for every x, using the fitted line y = m*x + b.
regression_line = [(m * x) + b for x in xs]

# Function-call form of print: valid on Python 3, same output on Python 2.
print(regression_line)

[4.6000000000000023, 4.9000000000000012, 5.2000000000000002, 5.4999999999999991, 5.799999999999998]


In [13]:
import matplotlib.pyplot as plt
from matplotlib import style
# Use the 'ggplot' look for every figure drawn from here on.
style.use('ggplot')

# Raw data as a scatter plot, with the fitted line drawn on the same axes.
fig, ax = plt.subplots()
ax.scatter(xs, ys, color='#003F72')
ax.plot(xs, regression_line)
plt.show()

## Lesson 11: Programming R Squared

In [14]:
def squared_error(ys_orig, ys_line):
    """Return the summed squared error between a line and the data points.

    ys_orig -- observed y values (sequence or array).
    ys_line -- y values of the line at the same positions.

    Both arguments are converted to float arrays first, so plain Python
    lists (including empty ones) are accepted in either position.
    """
    residuals = (np.asarray(ys_line, dtype=np.float64)
                 - np.asarray(ys_orig, dtype=np.float64))
    # axis=0 mirrors the built-in sum(): 1-D inputs reduce to a single number.
    return np.sum(residuals * residuals, axis=0)

def coefficient_of_determination(ys_orig, ys_line):
    """Return R squared, a.k.a. the 'coefficient of determination'.

    R^2 = 1 - SE(ys_line) / SE(mean line)

    where SE is squared_error() and the "mean line" is the mean of ys_orig
    repeated for every point. The closer the result is to 1, the less error
    the line has relative to simply predicting the mean.

    ys_orig -- observed y values (sequence or array).
    ys_line -- y values of the fitted line at the same positions.

    If all values in ys_orig are identical, the mean-line error is zero and
    the ratio (and therefore the result) is undefined.
    """
    ys_orig = np.asarray(ys_orig, dtype=np.float64)
    ys_line = np.asarray(ys_line, dtype=np.float64)
    # Compute the mean once and broadcast it, instead of once per data point.
    y_mean_line = np.full_like(ys_orig, np.mean(ys_orig))
    squared_error_regr = squared_error(ys_orig, ys_line)
    squared_error_y_mean = squared_error(ys_orig, y_mean_line)
    return 1 - (squared_error_regr / squared_error_y_mean)

# How well the fitted line explains the data (1.0 would be a perfect fit).
r_squared = coefficient_of_determination(ys, regression_line)
# Function-call form of print: valid on Python 3, same output on Python 2.
print(r_squared)

0.321428571429


## Lesson 12: Programming a test

In [17]:
import random

def create_dataset(hm, variance, step=2, correlation=None):
    """Generate a random test dataset.

    hm          -- 'how many' data points to generate.
    variance    -- size of the random noise: each y is offset from the
                   current base value by an integer drawn with
                   random.randrange(-variance, variance). 0 means no noise.
    step        -- how much the base value changes after each point when a
                   correlation is requested.
    correlation -- 'pos' (base value rises by step after each point),
                   'neg' (base value falls by step); None or any other
                   value leaves the base value constant.

    Returns (xs, ys) as float64 arrays, where xs is 0, 1, ..., hm - 1.
    """
    val = 1
    ys = []
    for _ in range(hm):
        # randrange(0, 0) is an empty range and raises ValueError, so zero
        # variance is handled explicitly as "no noise".
        noise = random.randrange(-variance, variance) if variance else 0
        ys.append(val + noise)
        if correlation == 'pos':
            val += step
        elif correlation == 'neg':
            val -= step

    xs = np.arange(len(ys), dtype=np.float64)
    return xs, np.array(ys, dtype=np.float64)

In [19]:
# Fix the seed so the generated dataset, and therefore r_squared and the
# plot, are the same on every run. Change or remove it to experiment.
random.seed(42)
xs, ys = create_dataset(40, 40, 2, correlation='pos')

# Same fit / score / plot steps as in the earlier lessons.
m, b = best_fit_slope_and_intercept(xs, ys)
regression_line = [(m * x) + b for x in xs]
r_squared = coefficient_of_determination(ys, regression_line)
print(r_squared)

plt.scatter(xs, ys, color='#003F72', label='data')
plt.plot(xs, regression_line, label='regression line')
plt.legend(loc=4)  # loc=4 places the legend in the lower-right corner
plt.show()

0.516136203344


In [None]:
# Closing note for the reader. This is a bare string literal rather than a
# comment, so Jupyter echoes it as the cell's output when the cell is run.
# To try it, call create_dataset with a smaller `variance` argument and
# re-run the fit / r_squared steps.
""" we can now experiment with the test.  
Creating a dataset with lower variance SHOULD result in an r_squared closer to 1."""