<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Only do this once per VM, otherwise you'll get multiple clones and nested directories
# !git clone https://github.com/jamestheengineer/data-science-from-scratch-Python.git
#%cd data-science-from-scratch-Python/
#!pip install import-ipynb

In [0]:
# Gradient descent
import import_ipynb
from Chapter_4 import Vector, dot

In [0]:
# A lot of times, we'll want to minimize the following type of function
def sum_of_squares(v: Vector) -> float:
  """Computes the sum of squared elements in v.

  The value is obtained by passing v as both arguments to dot.
  """
  total = dot(v, v)
  return total

In [0]:
from typing import Callable

# The derivative of f at x is the limit of the difference quotient below as h approaches zero
def difference_quotient(f: Callable[[float], float],
                        x: float,
                        h: float) -> float:
    """Return the slope of the secant line of f between x and x + h.

    For small h this approximates the derivative of f at x.
    h must be non-zero, since the result is divided by it.
    """
    rise = f(x + h) - f(x)
    return rise / h


In [0]:
# For many functions, you can calculate the derivative explicitly
def square(x: float) -> float:
  """Return x multiplied by itself."""
  product = x * x
  return product

def derivative(x: float) -> float:
  """Return 2 * x, the exact derivative of x * x evaluated at x."""
  doubled = 2 * x
  return doubled

In [0]:
# But a lot of the time, you'll have to estimate derivatives
# Let's see how they compare

xs = range(-10, 11)
actuals = list(map(derivative, xs))
estimates = [difference_quotient(square, point, h=0.001) for point in xs]

# Overlay both series; the two sets of markers should land on top of each other
import matplotlib.pyplot as plt

plt.title("Actual Derivatives vs. Estimates")
plt.plot(xs, actuals, 'rx', label='Actual')        # exact values as red x markers
plt.plot(xs, estimates, 'b+', label='Estimate')    # estimates as blue + markers
plt.legend(loc=9)                                  # location code 9 is 'upper center'
plt.show()

In [0]:
# Partial derivatives are needed whenever f takes more than one variable
def partial_difference_quotient(f: Callable[[Vector], float],
                                v: Vector,
                                i: int,
                                h: float) -> float:
    """Returns the i-th partial difference quotient of f at v.

    A shifted copy of v is built in which only the element at index i is
    increased by h; v itself is left untouched. If no element has index i,
    the copy equals v and the numerator is f(v) - f(v).
    """
    shifted = []
    for j, v_j in enumerate(v):
        bump = h if j == i else 0     # only coordinate i moves
        shifted.append(v_j + bump)

    change = f(shifted) - f(v)
    return change / h

In [0]:
def estimate_gradient(f: Callable[[Vector], float],
                      v: Vector,
                      h: float = 0.0001):
  """Estimate the gradient of f at v.

  Returns a list with one partial difference quotient (step h) for each
  index of v, in index order.
  """
  gradient = []
  for i in range(len(v)):
    gradient.append(partial_difference_quotient(f, v, i, h))
  return gradient

In [0]:
# One drawback of using difference quotients is that it's computationally expensive

# Usually, you just pick a random starting point and take tiny steps in the opposite direction of the gradient until the gradient is very small
import random
from Chapter_4 import distance, add, scalar_multiply

def gradient_step(v: Vector, gradient: Vector, step_size: float) -> Vector:
  """Moves 'step_size' in the 'gradient' direction from 'v'.

  v and gradient must have the same length. A negative step_size moves
  against the gradient.
  """
  assert len(v) == len(gradient)
  return add(v, scalar_multiply(step_size, gradient))

def sum_of_squares_gradient(v: Vector) -> Vector:
  """Return a new list holding each element of v multiplied by 2."""
  doubled = []
  for component in v:
    doubled.append(2 * component)
  return doubled

# pick a random starting point
v = [random.uniform(-10, 10) for _ in range(3)]

for epoch in range(1000):
  # evaluate the gradient at the current point, then step against it
  grad = sum_of_squares_gradient(v)
  v = gradient_step(v, grad, step_size=-0.01)
  print(epoch, v)

# after 1000 steps v is expected to sit within 0.001 of the origin
assert distance(v, [0, 0, 0]) < 0.001

In [0]:
# To fit models, we usually compute a loss function to see how close we are to matching the original data

# 100 (x, y) samples lying exactly on the line y = 20 * x + 5, for x = -50 .. 49
inputs = [(x_i, 20 * x_i + 5) for x_i in range(-50, 50)]

def linear_gradient(x: float, y: float, theta: Vector) -> Vector:
  """Return the gradient of the squared error of a linear model at one point.

  The model predicts slope * x + intercept, where theta = [slope, intercept].
  The quantity being minimized is (predicted - y) ** 2; its partial
  derivatives are 2 * error * x with respect to the slope and 2 * error
  with respect to the intercept.

  Args:
    x: the input value of the data point.
    y: the observed output of the data point.
    theta: two-element sequence [slope, intercept].

  Returns:
    [d/d_slope, d/d_intercept] of the squared error at (x, y).

  Raises:
    ValueError: if theta does not unpack into exactly two values.
  """
  slope, intercept = theta
  predicted = slope * x + intercept     # The prediction of the model
  error = predicted - y                 # error is (predicted - actual)
  # Only the derivatives of the squared error are needed, never its value,
  # so the squared error itself is deliberately not computed here.
  return [2 * error * x, 2 * error]

In [0]:
from Chapter_4 import vector_mean

# Start with random values for slope and intercept
theta = [random.uniform(-1, 1) for _ in range(2)]

learning_rate = 0.001

for epoch in range(5000):
  # Average the per-point gradients over the whole data set
  grad = vector_mean([linear_gradient(x_i, y_i, theta) for x_i, y_i in inputs])
  # Step against the averaged gradient
  theta = gradient_step(theta, grad, step_size=-learning_rate)
  print(epoch, theta)

# The data was generated from y = 20 * x + 5, so the fit should recover that
slope, intercept = theta
assert 19.9 < slope < 20.1, "slope should be about 20"
assert 4.9 < intercept < 5.1, "intercept should be about 5"


In [0]:
# Minibatch gradient descent for large data sets
from typing import TypeVar, List, Iterator

T = TypeVar('T') # this allows us to type "generic" functions

def minibatches(dataset: List[T],
                batch_size: int,
                shuffle: bool = True) -> Iterator[List[T]]:
  """Generates 'batch_size' - sized minibatches from the dataset.

  The dataset is cut into consecutive slices starting at indexes
  0, batch_size, 2 * batch_size, ...; the last slice may be shorter.
  When shuffle is true the order in which the slices are yielded is
  randomized, but the elements inside each slice keep their original order.

  Args:
    dataset: the sequence to split; it is only sliced, never modified.
    batch_size: maximum number of elements per batch, must be >= 1.
    shuffle: whether to yield the batches in random order.

  Yields:
    Slices of dataset, each holding at most batch_size elements.

  Raises:
    ValueError: if batch_size is less than 1. Because this is a generator,
      the error surfaces when iteration starts, not when it is called.
  """
  if batch_size < 1:
    # A negative step would make range() empty and silently yield nothing
    raise ValueError(
        "batch_size must be a positive integer, got {!r}".format(batch_size))

  # start indexes 0, batch_size, 2 * batch_size, ...
  batch_starts = list(range(0, len(dataset), batch_size))

  if shuffle:
    random.shuffle(batch_starts) # shuffle the batches

  for start in batch_starts:
    yield dataset[start:start + batch_size]
    

In [0]:
# Now we can solve our problem using minibatches
theta = [random.uniform(-1, 1) for _ in range(2)]

for epoch in range(1000):
  for batch in minibatches(inputs, batch_size=20):
    # Average the gradients over this batch only, then step against them
    grad = vector_mean([linear_gradient(x_i, y_i, theta) for x_i, y_i in batch])
    theta = gradient_step(theta, grad, step_size=-learning_rate)
  print(epoch, theta)

# The fit should recover the line the data was generated from
slope, intercept = theta
assert 19.9 < slope < 20.1, "slope should be about 20"
assert 4.9 < intercept < 5.1, "intercept should be about 5"

In [0]:
# Another variation is stochastic gradient descent, in which you take gradient steps one example at a time
theta = [random.uniform(-1, 1) for _ in range(2)]

for epoch in range(100):
  for x, y in inputs:
    # Update theta immediately from the gradient at this single point
    grad = linear_gradient(x, y, theta)
    theta = gradient_step(theta, grad, step_size=-learning_rate)
  print(epoch, theta)

# The fit should recover the line the data was generated from
slope, intercept = theta
assert 19.9 < slope < 20.1, "slope should be about 20"
assert 4.9 < intercept < 5.1, "intercept should be about 5"