In [None]:
# This is just a preamble that sets a bunch of options up.
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Cosmetic defaults for every figure below: ggplot theme on a wide 15x5 canvas.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (15, 5)})

In [None]:
# Heights (inches) and weights (pounds) for a small sample of children,
# stored as one [height, weight] pair per row.
height_and_weight = [
    [42.8, 40.0],
    [63.5, 93.5],
    [37.5, 35.5],
    [39.5, 30.0],
    [45.5, 52.0],
    [38.5, 17.0],
    [43.0, 38.5],
    [22.5,  8.5],
    [37.0, 33.0],
    [23.5,  9.5],
    [33.0, 21.0],
    [58.0, 79.0],
]

# Wrap the pairs in a two-column frame: 'height' and 'weight'.
df = pd.DataFrame(data=height_and_weight, columns=['height', 'weight'])

# Label the axes, then draw every child as a single dot.
plt.xlabel('Height (in)')
plt.ylabel('Weight (lb)')
plt.plot(df.height, df.weight, '.', markersize=12)

In [None]:
from sklearn.linear_model import LinearRegression

# scikit-learn wants a 2-D feature matrix and a 1-D target vector.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0, so select the
# column with a list (which keeps the result 2-D) and take its underlying array instead.
X = df[['height']].values
Y = np.array(df['weight'])

# Fit the model to our data
model = LinearRegression()
model.fit(X, Y)

# Same scatter plot as before
plt.xlabel('Height (in)')
plt.ylabel('Weight (lb)')
plt.plot(df['height'], df['weight'], '.', markersize=12)

# Line showing what our model predicts for heights in the range [20, 65]
plt.plot([20, 65], model.predict([[20], [65]]))

# But what exactly is this line?

In [None]:
plt.xlim(20, 65)

# Scatter plot and linear regression line.
plt.xlabel('Height (in)')
plt.ylabel('Weight (lb)')
plt.plot(df['height'], df['weight'], '.', markersize=12)
plt.plot([20, 65], model.predict([[20], [65]]))

# 1) It passes through the means of the heights and weights
plt.scatter([df['height'].mean()], [df['weight'].mean()], s=100)

# 2) It minimizes the sum of squares of the errors (the difference of the predicted and actual values):
# predict() returns an array, so predict every height in a single call and pair each scalar
# prediction with its observation. This avoids mixing a plain number with a one-element
# array in the y-coordinates of a segment, and avoids one predict() call per point.
pred_weights = model.predict(df[['height']].values)
for height, weight, pred_weight in zip(df['height'], df['weight'], pred_weights):
    # Vertical red segment from the observed weight to the predicted weight.
    plt.plot([height, height], [weight, pred_weight], color='red')

In [None]:
# Let's make some predictions!
# predict() returns a one-element array for a single query; take element [0] so that '%.3f'
# formats a plain number instead of relying on implicit array-to-scalar conversion
# (deprecated since NumPy 1.25).
# Note that 22 in lies just below the smallest height in the data (22.5 in).

for query_height in (50, 22):
    print('Height = %d, predicted weight = %.3f' % (query_height, model.predict([[query_height]])[0]))

"In mathematics, **extrapolation** is the process of estimating, beyond the original observation range, the value of a variable on the basis of its relationship with another variable." - https://en.wikipedia.org/wiki/Extrapolation

### Side note #1: Why do we minimize the square of the errors, and not simply the sum of the errors?

In [None]:
# However, take a look at this example.
# Four corner points of a square and a horizontal candidate line through y = 1.
points = [[0, 0], [0, 2], [2, 0], [2, 2]]
xs, ys = zip(*points)
plt.plot(xs, ys, '.', markersize=12)
plt.plot([-0.5, 2.5], [1, 1], '-')

# Dashed vertical segment from each point to the line, i.e. its error.
for x, y in points:
    plt.plot([x, x], [y, 1], '--', color='red')

# Pad the view by half a unit on every side.
plt.xlim(-0.5, 2.5)
plt.ylim(-0.5, 2.5)

In [None]:
import math

try:
    # Since IPython 4.0 the notebook widgets live in the standalone ipywidgets package;
    # IPython.html.widgets is only a deprecated shim.
    import ipywidgets as widgets
except ImportError:
    # Fallback for old installs that only provide the original import path.
    from IPython.html import widgets

points = [[0, 0], [0, 2], [2, 0], [2, 2]]


def graph(i):
    """Plot `points` together with the horizontal line y = i and annotate the errors.

    For every point a dashed vertical segment is drawn to the line and labelled with
    the absolute error and the squared error; both totals are written in the
    bottom-left corner of the plot.

    Parameters
    ----------
    i : float
        y-coordinate of the horizontal line.
    """
    plt.plot([x for x, y in points], [y for x, y in points],
             '.', markersize=12)
    plt.plot([-0.5, 2.5], [i, i], '-')

    errorSum = 0
    errorSquareSum = 0

    for x, y in points:
        plt.plot([x, x], [y, i], '--', color='red')
        error = math.fabs(y - i)
        # Place the two labels just right of the segment, around its midpoint.
        plt.text(x + 0.02, (y + i) / 2 - 0.1, 'error=%.1f' % error, size='x-large')
        plt.text(x + 0.02, (y + i) / 2 + 0.1, 'error^2=%.1f' % (error * error), size='x-large')
        errorSum += error
        errorSquareSum += error * error

    plt.text(-0.49, -0.45, 'sum of error = %.2f, sum of error^2 = %.2f' % (errorSum, errorSquareSum), size='x-large')

    plt.xlim(-0.5, 2.5)
    plt.ylim(-0.5, 2.5)


# This should be executed in a Jupyter notebook!
# The slider moves the line between y = 0 and y = 2 and redraws the plot.
widgets.interact(graph, i=(0.0, 2.0));

In [None]:
# The sum of absolute errors in this case is 4 for every line between y = 0 and y = 2, no matter which
# y-coordinate in that range we choose.
# However, the sum of squares of errors varies between 4 (line at y = 1) and 8 (line at y = 0 or y = 2).
# Intuitively, using the error squared penalizes points that are further away from the line much more.

### Side note #2: Anscombe's quartet

In [None]:
# A word of caution before moving on: linear regression is only a tool, and it has to be used appropriately.
# The four famous data sets below are known as Anscombe's quartet; each is a list of [x, y] pairs.

anscombe = [
    [[10., 8.04], [8., 6.95], [13., 7.58], [9., 8.81], [11., 8.33], [14., 9.96],
                  [6., 7.24], [4., 4.26], [12., 10.84], [7., 4.82], [5., 5.68]],
    [[10., 9.14], [8., 8.14], [13., 8.74], [9., 8.77], [11., 9.26], [14., 8.10],
                  [6., 6.13], [4., 3.10], [12., 9.13], [7., 7.26], [5., 4.74]],
    [[10., 7.46], [8., 6.77], [13., 12.74], [9., 7.11], [11., 7.81], [14., 8.84],
                  [6., 6.08], [4., 5.39], [12., 8.15], [7., 6.42], [5., 5.73]],
    [[8., 6.58], [8., 5.76], [8., 7.71], [8., 8.84], [8., 8.47], [8., 7.04],
                 [8., 5.25], [19., 12.50], [8., 5.56], [8., 7.91], [8., 6.89]]
]

# One panel per data set in a 2x2 grid, all sharing the same axis limits.
for i, dataset in enumerate(anscombe):
    xs, ys = zip(*dataset)
    plt.subplot(2, 2, i + 1)
    plt.plot(xs, ys, '.', markersize=12)
    plt.xlim(2, 20)
    plt.ylim(2, 14)

In [None]:
# What's so interesting about the 4 data sets?

# One list per statistic; each list receives one value per data set.
properties = {}
properties['mean_x'] = []
properties['mean_y'] = []
properties['var_x'] = []
properties['var_y'] = []
properties['coef'] = []
properties['intercept'] = []
for i in range(4):
    dataframe = pd.DataFrame(anscombe[i], columns=['x', 'y'])
    properties['mean_x'].append(dataframe['x'].mean())
    properties['mean_y'].append(dataframe['y'].mean())
    properties['var_x'].append(dataframe['x'].var())
    properties['var_y'].append(dataframe['y'].var())

    print(dataframe.corr())

    # NOTE: this rebinds the notebook-level `model` fitted on the height/weight data.
    model = LinearRegression()
    # DataFrame.as_matrix() was removed in pandas 1.0; a list selection keeps the features 2-D.
    model.fit(dataframe[['x']].values, np.array(dataframe['y']))
    # coef_ holds one entry per feature; with a single feature, take the scalar slope.
    slope = model.coef_[0]
    intercept = model.intercept_
    properties['coef'].append(slope)
    # Record the fitted intercept (the slope was previously stored here by mistake).
    properties['intercept'].append(intercept)

    plt.subplot(2, 2, i+1)
    plt.plot([x for x, y in anscombe[i]], [y for x, y in anscombe[i]], '.', markersize=12)
    # Regression line evaluated at the two ends of the visible x-range.
    plt.plot([2, 20], [2 * slope + intercept, 20 * slope + intercept])
    plt.xlim(2, 20)
    plt.ylim(2, 14)
    plt.title('dataset %d' % i)

# Summary table: one row per data set, one column per statistic.
pd.DataFrame(properties)

In [None]:
# Visually, the four data sets look very different, and the data in the 2 graphs on the right are clearly not
# linearly related. However, all of them have nearly the same basic statistical properties (mean, variance,
# correlation), and linear regression finds nearly identical lines for all of them.