# Gradient Descent

Optimizing functions algorithmically

In [1]:
# Awful hack to import past chapter modules
import sys
sys.path.insert(0, "../")

from linalg import Vector, dot, distance, add, scalar_multiply, add, vector_mean
from typing import Callable
import random
import numpy as np

In [2]:
def sum_of_squares(xs: Vector) -> float:
    """
    Return the sum of the square of each element in xs
    """
    # this is equivalent to x dot x
    return dot(x, x)

Consider a function (e.g. a loss function) which reduces a vector to a meaningful float. The main idea of gradient descent is to algorithmically find the inputs that minimize this reducing function

### Terms:
- **Gradient**: The vector of partial deriviates for a vector relative to a function. E.g. if `y = sum_of_squares(xs)`, the gradirent is `dy/dxs` or `[dy/dx_0, dy/dx_1, ... dy/dx_n]`

#### Estimating the Gradient

In [3]:
def difference_quotient(f: Callable[[float], float], x: float, h: float) -> float:
    return (f(x + h) - f(x)) / h


This is the definition of a gradient for a single variable x and function f(x). We can estimate the gradient by just choosing a very small h (e.g. 10**-6). We can also do this for partial-derivatives in a vector calculus setting for f(xs):

In [4]:
def partial_diff_quotient(f: Callable[[Vector], float], xs: Vector, i: int, h: float) -> float:
    w = [x_j + (h if i == j else 0) for j, x_j in enumerate(xs)]  # single out and add h to just the ith element of xs
    return (f(w) - f(xs)) / h  # reflects only the change we made to the ith variable

In [5]:
def estimate_gradient(f: Callable[[Vector], float], xs: Vector, h: float = 10**-4) -> Vector:
    """
    Estimate the gradient of f with respect to xs by computing partial diff quotients element-wise
    """
    # note this is expensive and why auto-grad libraries mathematically compute most derivatives
    return [partial_diff_quotient(f, xs, i, h) for i in range(len(xs))]

### Using the Gradient

For sum of squares, it is obvious the minimum overall is achieved by a vector of zeros. In some cases though, the minimum(s) may not be obvious, so we'll use it as an example to evaluatate our algorithm

In [6]:
def gradient_step(xs: Vector, gradient: Vector, step_size: float) -> Vector:
    """
    Moves `step_size` along the gradient of f w.r.t. xs, returning a input
    """
    assert len(xs) == len(gradient)
    update = scalar_multiply(step_size, gradient)
    return add(xs, update)

def sum_of_squares_gradient(xs: Vector) -> Vector:
    """
    We know the partial-derivative for a sum of squares is just 2*`the_term`
    """
    return [2*x for x in xs]    

In [7]:
# Now lets verify with an experiment
xs = [random.uniform(-10, 10) for i in range(10)]

for i in range(1, 100000):
    grad = sum_of_squares_gradient(xs)
    xs = gradient_step(xs, grad, -1 * (10**-4))

assert(distance(xs, np.zeros(10)) < 10**-6) # we should have gotten very close to zero
print(xs)

[7.31227121845904e-09, 1.7551306520604136e-08, -1.0897493692015273e-09, 1.1797208695386382e-08, -1.7806089301868516e-08, 6.852900385318273e-09, 1.3483110056719228e-08, -1.8385160717955794e-08, 1.345429991064904e-08, -5.285101662485176e-09]


In [8]:
# x ranges from (-50, 49), y = 20 * x + 5: we'll use gradient descent to fit parameters to this, 
# as if we had no idea y = 20*x + 5
data = [(x, 0*(x**2) + 20*x + 5) for x in range(-10, 10)]

In [9]:
# we think f(x) is a polynomial, and want to compute a gradient using coefficients ws
def linear_gradient_mse(x: float, y: float, ws: Vector) ->  float:
    predicted = sum([w * (x**i) for i, w in enumerate(ws)]) # our weights are coefficients to the polynomial
    target = y 
    error = predicted - target  
    grad = [2 * error * (x**i) for i in range(len(ws))]
    return grad    

In [10]:
weights_linear = [random.uniform(-1, 1) for i in range(2)]
lr = 10**-3
for epoch in range(5000):
    # compute mean of gradients
    mean_grad = vector_mean([linear_gradient_mse(x, y, weights_linear) for x, y in data])
    weights_linear = gradient_step(weights_linear, mean_grad, -1 * lr)
    if epoch % 500 == 0 or epoch == 4999:
        print(epoch, weights_linear)  # second near 20, first near 5

0 [0.7493938552161804, 0.6937188669052676]
500 [3.316192137232547, 19.97410139204759]
1000 [4.376394724111463, 19.990408342356385]
1500 [4.7690451810357635, 19.99644768952365]
2000 [4.9144649180095525, 19.998684386975718]
2500 [4.968321725071917, 19.999512757220636]
3000 [4.9882678185480485, 19.999819547601263]
3500 [4.9956549375894355, 19.999933168700306]
4000 [4.998390787985251, 19.99997524874898]
4500 [4.999404021608042, 19.999990833270793]
4999 [4.999778837990579, 19.99999659831249]


In [11]:
data = [(x, -4*(x**2) + 20*x + 5) for x in range(-10, 10)]
# now lets attempt to fit a cubic to a linear: the 2nd and 3rd degree terms (last 2) should near zero over time
weights_cubic = [random.uniform(-1, 1) for i in range(3)]
print(weights_cubic)
lr = 2 * 10**-4 # slow down for visibility
for epoch in range(25000):
    # compute mean of gradients
    mean_grad = vector_mean([linear_gradient_mse(x, y, weights_cubic) for x, y in data])
    # print('mean_grad', mean_grad)
    weights_cubic = gradient_step(weights_cubic, mean_grad, -1 * lr)
    if epoch % 1000 == 0 or epoch == 9999:
        print(epoch, weights_cubic)  # second near 20, first near 5

[0.07331434705382489, -0.4999972588541248, 0.31423464852320215]
0 [0.01337427757302162, -0.13999793974560476, -3.5386010332412927]
1000 [0.992637191534093, 20.040746189590127, -3.932959358462227]
2000 [1.6456469105089766, 20.034145208260902, -3.9438828302760314]
3000 [2.1922467312514566, 20.028581165383205, -3.953027256663449]
4000 [2.6497768100593726, 20.0239237964485, -3.960681577005555]
5000 [3.032751273406366, 20.020025356868246, -3.9670886076273266]
6000 [3.3533191363063506, 20.016762177297554, -3.972451597357781]
7000 [3.621649702856828, 20.014030740605715, -3.9769406751454257]
8000 [3.8462551648452723, 20.011744398024785, -3.98069824702163]
9000 [4.03426063214471, 20.009830620409904, -3.983843513615964]
9999 [4.1914863599916545, 20.0082301612171, -3.9864738457901825]
10000 [4.19163016101349, 20.008228697412985, -3.9864762515316574]
11000 [4.323355950545671, 20.006887811581688, -3.988679978536936]
12000 [4.433616709109311, 20.005765426288487, -3.990524602980882]
13000 [4.52591021

In [12]:
data = [(x, -4*(x**2) + 20*x + 5) for x in range(-3, 3)]
# now lets attempt to fit a cubic to a linear: the 2nd and 3rd degree terms (last 2) should near zero over time
weights_cubic = [random.uniform(-1, 1) for i in range(4)]
print(weights_cubic)
lr = 2 * 10**-4 # slow down for visibility
for epoch in range(35000):
    # compute mean of gradients
    mean_grad = vector_mean([linear_gradient_mse(x, y, weights_cubic) for x, y in data])
    # print('mean_grad', mean_grad)
    weights_cubic = gradient_step(weights_cubic, mean_grad, -1 * lr)
    if epoch % 1000 == 0 or epoch == 9999:
        print(epoch, weights_cubic)  # second near 20, first near 5

[0.24218050105883449, 0.6864097736224155, 0.15349626994888754, 0.43045351049843306]
0 [0.23573463185676402, 0.7140982403814589, 0.0998882543738665, 0.6125962956745288]
1000 [0.026636344775258985, 4.151548924877041, -0.7515507028172141, 2.8922332093047247]
2000 [0.0023853063785252222, 6.653999846505032, -1.045966468776184, 2.4720447736105986]
3000 [0.09116266027772561, 8.723577261820047, -1.3281294183234296, 2.1168724476220713]
4000 [0.2606657208215198, 10.441496723966226, -1.5873257891946697, 1.8179936995179908]
5000 [0.4840705451444456, 11.87185770906582, -1.8241879876001483, 1.5657780623730186]
6000 [0.7410168451761746, 13.066472481212925, -2.0399977961217, 1.352258626722784]
7000 [1.016276546294831, 14.067339274310239, -2.2361397059946264, 1.1709196858364859]
8000 [1.2986098357975548, 14.908560847358638, -2.4140264788952535, 1.01642424868523]
9000 [1.5798604138674563, 15.617879194648687, -2.575059720903787, 0.8843904558159383]
9999 [1.8539747870108259, 16.21735415778043, -2.72046444

## fin
I spent way too much time just getting this to work and while the it can technically fit things correctly for arbitrary polynomials, the settings are extremely sensitive to both learning rate and batch size, causing major issues with overflow. Not worth digging too far into that, but I made changes to `vector_mean` that might help. Skipping **SGD** and **Batch-SGD** because they follow from these.