<a href="https://colab.research.google.com/github/harperd/machine-learning/blob/master/notebooks/simple-linear-regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple Linear Regression

Suppose you are the CEO of a restaurant franchise and are considering different cities for opening a new outlet. The chain already has trucks in various cities and you have data for profits and populations from the cities. You need to figure out what the expected profit of a new food truck might be given only the population of the city that it would be placed in.

In [5]:
# NumPy adds support for large, multi-dimensional arrays and matrices, along with a large collection 
# of high-level mathematical functions to operate on these arrays.
import numpy as np

# Matplotlib is a plotting library for the Python programming language and its numerical mathematics 
# extension NumPy. It provides an object-oriented API for embedding plots into applications using 
# general-purpose GUI toolkits like Tkinter, wxPython, Qt, or GTK+.
import matplotlib.pyplot as plt

# Allow saving our graphs in the notebook
%matplotlib inline

!pip install plotly
!pip install plotly --upgrade

import plotly
plotly.tools.set_credentials_file(username='rharper74@gmail.com', api_key='KzwdDa6shFfpQaAwSITY')

# Pandas is a software library for data manipulation and analysis. In particular, it offers data 
# structures and operations for manipulating numerical tables and time series.
import pandas as pd

Traceback (most recent call last):



KeyboardInterrupt: ignored

In [0]:
df_raw = pd.read_csv(
    'https://raw.githubusercontent.com/harperd/machine-learning/master/data/ex1data1.csv',
    header = None,
    names=[ 'Population', 'Profit' ])
%time print(f'{len(df_raw.index)} rows read.')

In [0]:
# Preview the first few rows to confirm the columns were parsed as expected.
df_raw.head()

In [0]:
# Summary statistics (count, mean, spread, quartiles) for each numeric column.
df_raw.describe()

In [0]:
# Scatter the raw observations to check that profit grows roughly linearly with population.
df_raw.plot.scatter(x='Population', y='Profit', figsize=(12, 8))

In [0]:
# The hypothesis is evaluated as a matrix product, so theta0 needs a matching
# feature whose value is always 1. Place that 'Bias' column first in the frame.
# The membership test keeps the cell safe to re-run: the column is only
# inserted when it is not already there.
if 'Bias' not in df_raw.columns:
  df_raw.insert(loc=0, column='Bias', value=1)

df_raw.head()

In [0]:
# Split the frame into the feature matrix X (independent variables) and the
# target y (dependent variable). X is every column except the last - Bias and
# Population once the bias column has been added - and y is the last column.
cols = df_raw.shape[1]

# np.matrix is used so that `*` means matrix multiplication in the
# cost and gradient calculations that follow.
X = np.matrix(df_raw.iloc[:, :cols - 1].values)
y = np.matrix(df_raw.iloc[:, cols - 1:].values)

# Start the search from theta0 = theta1 = 0, stored as a 1 x 2 row vector.
theta = np.matrix(np.array([0, 0]))

# Sanity check of the shapes: X and y share a row count, theta is 1 x 2.
X.shape, theta.shape, y.shape

$\large J( \theta _{0} ,\ \theta _{1}) =\frac{1}{2m}\sum\limits ^{m}_{i=1}\left( h_{\theta }\left( x^{( i)}\right) -y^{( i)}\right)^{2}$

In [0]:
 def compute_cost(X, y, theta):
   """Return the squared-error cost J(theta) = 1/(2m) * sum((h(x) - y)^2).

   X, y and theta are expected to be np.matrix objects so that `*` is a
   matrix product: X holds one row per example, theta is a row vector with
   one entry per column of X, and y is a column of targets.
   """
   # h(x) - y: how far each prediction (X times theta transposed) is from
   # its target value.
   residuals = (X * theta.T) - y

   # m is the number of training examples.
   sample_count = len(y)

   # Sum of squared residuals, scaled by 1 / (2m).
   return np.sum(np.square(residuals)) / (2 * sample_count)

In [0]:
# Baseline: the cost of the starting theta values, before any training has happened.
print(f'Cost with a theta0 of {theta[0,0]} and theta1 of {theta[0,1]} is {compute_cost(X, y, theta)}')

*repeat until convergence {* 

​	$temp0 := \theta_{0}-\alpha\frac{1}{m}\sum\limits ^{m}_{i=1}\left( h_{\theta }\left( x^{(i)}\right) -y^{( i)}\right)$

​	$temp1 := \theta_{1}-\alpha\frac{1}{m}\sum\limits ^{m}_{i=1}\left( h_{\theta }\left( x^{(i)}\right) -y^{( i)}\right)\cdot x^{(i)}$

​	$\theta_{0} := temp0$

​	$\theta_{1} := temp1$

*}* 

In [0]:
def gradient_descent(X, y, theta, alpha, max_iterations=None, tolerance=0.0):
    """Fit linear-regression parameters with batch gradient descent.

    Every step updates all parameters simultaneously:

        theta := theta - alpha * (1/m) * (X * theta.T - y).T * X

    which, for a bias column plus one feature, is exactly the temp0/temp1
    update rule, and also works unchanged for any number of feature columns.

    Args:
        X: np.matrix with one row per training example and one column per
            parameter (including the bias column of ones).
        y: np.matrix column of target values, one row per training example.
        theta: np.matrix row vector of starting parameters. It is copied,
            never modified.
        alpha: learning rate (step size).
        max_iterations: optional upper bound on the number of recorded steps.
            None (the default) keeps iterating until convergence.
        tolerance: the run has converged once the cost changes by at most
            this amount between two consecutive steps. The default of 0.0
            means "the cost stopped changing at all".

    Returns:
        A tuple (theta_history, cost_history, iterations). The two lists have
        `iterations` entries each; entry k holds the parameters (a distinct
        1 x n matrix) and the cost after step k + 1. The final step that only
        confirms convergence is not recorded. If the run diverges, whatever
        was recorded up to that point is returned.

    Raises:
        ValueError: if y contains no training examples.
    """
    m = len(y)
    if m == 0:
        raise ValueError('gradient_descent requires at least one training example.')

    # Work on a private float copy: the caller's matrix is left untouched and
    # an integer-typed starting theta cannot truncate the updates.
    theta = np.matrix(theta, dtype=float)
    cost_history = []
    theta_history = []
    iterations = 0

    while max_iterations is None or iterations < max_iterations:
        # h(x) - y for the current parameters.
        error = X * theta.T - y

        # (1/m) * sum(error * x_j) for every column j at once.
        gradient = (error.T * X) / m

        # This builds a NEW matrix on every step. Re-using one buffer would
        # make every entry of theta_history point at the same object, so the
        # recorded path would collapse to the final value.
        theta = theta - alpha * gradient

        cost = compute_cost(X, y, theta)

        # A diverging run overflows to inf and can then turn into NaN; NaN
        # never compares equal to anything, so test for both explicitly.
        if not np.isfinite(cost):
            print('ERROR: The learning rate, alpha, is too large.')
            print()
            break

        # Converged: the cost no longer changes (within tolerance).
        if cost_history and abs(cost_history[-1] - cost) <= tolerance:
            break

        iterations += 1
        # Recorded so the caller can plot cost and parameters per iteration.
        cost_history.append(cost)
        theta_history.append(theta)

    return theta_history, cost_history, iterations

In [0]:
# Run gradent descent and calculate the optimal theta values with the
# minimal cost which will draw a line that directly interstects our data.
thetas, costs, iterations = gradient_descent(X, y, theta, alpha = .024)

optimal_theta0 = thetas[iterations - 1][0, 0]
optimal_theta1 = thetas[iterations - 1][0, 1]

print(f'Optimal theta values: {optimal_theta0} and {optimal_theta1}')
print(f'Cost................: {costs[iterations - 1]}')
print(f'Iterations..........: {iterations:,}')
print('')
%time

In [0]:
# Axes3D is not referenced by name below.
# NOTE(review): older Matplotlib releases need this import to make
# projection='3d' available - confirm before removing it.
from mpl_toolkits.mplot3d import Axes3D

# Path taken by gradient descent: one (theta0, theta1, cost) point per
# recorded iteration. Each history entry is a 1 x 2 matrix.
theta0_path = [step[0, 0] for step in thetas]
theta1_path = [step[0, 1] for step in thetas]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot(theta0_path, theta1_path, costs)

ax.set_xlabel('Theta0')
ax.set_ylabel('Theta1')
ax.set_zlabel('Cost')

plt.show()

In [0]:
# Evaluate the fitted line at 100 evenly spaced population values spanning
# the range seen in the training data.
x = np.linspace(df_raw['Population'].min(), df_raw['Population'].max(), num=100)
f = optimal_theta0 + optimal_theta1 * x

fig, ax = plt.subplots(figsize=(12, 8))

# Fitted line in red, drawn together with the observations it was fitted to.
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(df_raw['Population'], df_raw['Profit'], label='Training Data')
ax.legend(loc='upper left')
ax.set(xlabel='Population', ylabel='Profit')
ax.set_title('Predicted Profit vs. Population Size')

In [0]:
# Learning curve: the cost recorded after each iteration. It should fall
# steadily and flatten out as gradient descent converges.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(np.arange(iterations), costs, color='r')
ax.set(xlabel='Iterations', ylabel='Cost')
ax.set_title('Error vs. Training Epoch')