In [15]:
import numpy as np
from random import randrange
import torch

In [16]:
def grad_check_sparse(f, x, analytic_grad, num_checks=1, h=1e-5):
    """
    Spot-check an analytic gradient against a centered finite difference.

    For each of `num_checks` randomly chosen coordinates of `x`, the partial
    derivative of `f` is estimated as (f(x + h) - f(x - h)) / (2 * h) and
    printed next to the analytic value together with their relative error.

    Args:
        f: callable taking `x` and returning the scalar objective value.
        x: array the objective is evaluated at. It is perturbed in place
            during a check and restored afterwards.
        analytic_grad: array indexable with the same indices as `x`,
            holding the gradient to verify.
        num_checks: number of random coordinates to test.
        h: finite-difference step size.

    Returns:
        None. The comparison is only printed.
    """

    for _ in range(num_checks):
        # One random index per dimension selects a single element of x.
        ix = tuple(randrange(m) for m in x.shape)

        oldval = x[ix]
        try:
            x[ix] = oldval + h # increment by h
            fxph = f(x) # evaluate f(x + h)
            x[ix] = oldval - h # decrement by h
            fxmh = f(x) # evaluate f(x - h)
        finally:
            # Restore the element even if f raises, so the caller's array
            # is never left perturbed.
            x[ix] = oldval

        grad_numerical = (fxph - fxmh) / (2 * h)
        grad_analytic = analytic_grad[ix]

        # Both gradients being exactly zero is a perfect match; report 0
        # instead of the nan (or ZeroDivisionError) that 0/0 would produce.
        denom = abs(grad_numerical) + abs(grad_analytic)
        if denom != 0:
            rel_error = abs(grad_numerical - grad_analytic) / denom
        else:
            rel_error = 0.0
        print('numerical: %f analytic: %f, relative error: %e'
              %(grad_numerical, grad_analytic, rel_error))

In [17]:
x = np.array([5.])

# The objective must return a scalar: the checker formats the finite
# difference with '%f', which should receive a number rather than a
# 1-element array. The lambda parameter is named `v` so it does not shadow
# the global `x`.
f = lambda v: np.sum(v**2)
grad = 2*x  # analytic gradient of sum(x**2)

# grad_check_sparse prints its report and has no return statement, so
# grad_numerical is bound to None here.
grad_numerical = grad_check_sparse(f, x, grad, 10)

numerical: 10.000000 analytic: 10.000000, relative error: 1.892886e-11
numerical: 10.000000 analytic: 10.000000, relative error: 1.892886e-11
numerical: 10.000000 analytic: 10.000000, relative error: 1.892886e-11
numerical: 10.000000 analytic: 10.000000, relative error: 1.892886e-11
numerical: 10.000000 analytic: 10.000000, relative error: 1.892886e-11
numerical: 10.000000 analytic: 10.000000, relative error: 1.892886e-11
numerical: 10.000000 analytic: 10.000000, relative error: 1.892886e-11
numerical: 10.000000 analytic: 10.000000, relative error: 1.892886e-11
numerical: 10.000000 analytic: 10.000000, relative error: 1.892886e-11
numerical: 10.000000 analytic: 10.000000, relative error: 1.892886e-11


In [18]:
def softmax_loss_i(h, y_i):
  """
  Softmax cross-entropy loss and its gradient for a single example.

  Args:
    h: 1-D array of class scores (logits).
    y_i: integer index of the correct class.

  Returns:
    (loss, grad): the scalar loss -h[y_i] + log(sum(exp(h))) and an array
    shaped like `h` holding d(loss)/d(h), i.e. softmax(h) with 1
    subtracted at position y_i.
  """
  # Softmax is invariant to adding a constant to every score, so subtract
  # the maximum first; this keeps np.exp from overflowing for large scores
  # (which would otherwise yield an inf loss and a nan gradient).
  shifted = np.asarray(h) - np.max(h)

  exp_h = np.exp(shifted)
  sum_exp_h = np.sum(exp_h)
  loss = -shifted[y_i] + np.log(sum_exp_h)

  # Gradient of the log-sum-exp term is the softmax probabilities ...
  grad = exp_h / sum_exp_h

  # ... and the -h[y_i] term contributes -1 at the correct class.
  grad[y_i] -= 1

  return loss, grad

In [19]:
# Single-example input: a vector of three scores and the target index into it.
h = np.array([1., 2., 3.])
y = 2

# Last expression of the cell: displays the (loss, grad) tuple.
softmax_loss_i(h, y)

(0.40760596444438013, array([ 0.09003057,  0.24472847, -0.33475904]))

In [20]:
import torch.nn.functional as F


# Cross-check the NumPy result against PyTorch's autograd on the same scores.
h_t = torch.tensor(h, requires_grad=True)
ls = F.log_softmax(h_t, dim=-1)

# Present the single example to nll_loss as a batch of one: (1, C)
# log-probabilities against a length-1 target tensor.
loss = F.nll_loss(ls.expand(1, -1), torch.tensor([y]))
print(loss)
loss.backward()
h_t.grad

tensor(0.4076, dtype=torch.float64, grad_fn=<NllLossBackward>)


tensor([ 0.0900,  0.2447, -0.3348], dtype=torch.float64)

In [21]:
h = np.array([1., 2., 3.])
y = 2

loss, grad = softmax_loss_i(h,y)

# Evaluate the loss at the argument the checker passes in, not at the global
# `h`; the old closure only worked because the checker perturbs that very
# same array in place.
f = lambda w: softmax_loss_i(w, y)[0]
grad_numerical = grad_check_sparse(f, h, grad, 15)

numerical: 0.244728 analytic: 0.244728, relative error: 2.238249e-11
numerical: -0.334759 analytic: -0.334759, relative error: 1.041890e-11
numerical: 0.244728 analytic: 0.244728, relative error: 2.238249e-11
numerical: -0.334759 analytic: -0.334759, relative error: 1.041890e-11
numerical: 0.244728 analytic: 0.244728, relative error: 2.238249e-11
numerical: 0.244728 analytic: 0.244728, relative error: 2.238249e-11
numerical: 0.244728 analytic: 0.244728, relative error: 2.238249e-11
numerical: 0.090031 analytic: 0.090031, relative error: 1.012151e-10
numerical: 0.090031 analytic: 0.090031, relative error: 1.012151e-10
numerical: -0.334759 analytic: -0.334759, relative error: 1.041890e-11
numerical: 0.090031 analytic: 0.090031, relative error: 1.012151e-10
numerical: 0.090031 analytic: 0.090031, relative error: 1.012151e-10
numerical: 0.090031 analytic: 0.090031, relative error: 1.012151e-10
numerical: 0.244728 analytic: 0.244728, relative error: 2.238249e-11
numerical: 0.090031 analytic

In [22]:
h = np.array([1., 2., 3.])
y = 0

loss, grad = softmax_loss_i(h,y)

# Evaluate the loss at the argument the checker passes in, not at the global
# `h`; the old closure only worked because the checker perturbs that very
# same array in place.
f = lambda w: softmax_loss_i(w, y)[0]
grad_numerical = grad_check_sparse(f, h, grad, 15)

numerical: -0.909969 analytic: -0.909969, relative error: 1.361362e-11
numerical: -0.909969 analytic: -0.909969, relative error: 1.361362e-11
numerical: 0.244728 analytic: 0.244728, relative error: 2.238249e-11
numerical: 0.665241 analytic: 0.665241, relative error: 1.016688e-11
numerical: -0.909969 analytic: -0.909969, relative error: 1.361362e-11
numerical: -0.909969 analytic: -0.909969, relative error: 1.361362e-11
numerical: -0.909969 analytic: -0.909969, relative error: 1.361362e-11
numerical: 0.244728 analytic: 0.244728, relative error: 2.238249e-11
numerical: 0.665241 analytic: 0.665241, relative error: 1.016688e-11
numerical: 0.665241 analytic: 0.665241, relative error: 1.016688e-11
numerical: 0.244728 analytic: 0.244728, relative error: 2.238249e-11
numerical: -0.909969 analytic: -0.909969, relative error: 1.361362e-11
numerical: 0.665241 analytic: 0.665241, relative error: 1.016688e-11
numerical: 0.244728 analytic: 0.244728, relative error: 2.238249e-11
numerical: 0.244728 an

In [23]:
def softmax_loss(h, y):
  """
  Mean softmax cross-entropy loss over a batch, and its gradient.

  Args:
    h: 2-D array of scores, one row per example and one column per class.
    y: 1-D integer array with the correct class index for each row of `h`.

  Returns:
    (loss, grad): the scalar mean loss over the rows and an array shaped
    like `h` holding d(loss)/d(h) (already divided by the number of rows).
  """
  h = np.asarray(h)
  n = len(h)
  idx = np.arange(n)

  # Subtract each row's maximum before exponentiating: softmax is unchanged
  # by a per-row shift, and this keeps np.exp from overflowing.
  shifted = h - np.max(h, axis=1, keepdims=True)

  exp_h = np.exp(shifted)
  sum_exp_h = np.sum(exp_h, axis=1, keepdims=True)

  # Drop the keepdims column before combining with shifted[idx, y]: an
  # (N,) vector plus an (N, 1) column would broadcast to an N x N matrix.
  per_example = np.log(sum_exp_h[:, 0]) - shifted[idx, y]
  loss = per_example.mean()

  # Softmax probabilities, minus 1 at each row's correct class, averaged
  # over the batch to match the mean in the loss.
  grad = exp_h / sum_exp_h
  grad[idx, y] -= 1

  grad /= n
  return loss, grad

In [24]:
# Batch input for softmax_loss: two rows of three scores each, with one
# target column index per row in `y`.
h = np.array([[1., 2., 3.],
              [1., -2., 5.]])
y = np.array([0, 2])

In [25]:
# Alternative input with a single row (batch of one); uncomment to replace
# the two-row `h` / `y` defined in the previous cell.
# h = np.array([[1., 2., 3.]])
# y = np.array([0])

In [26]:
# Displays the (mean loss, gradient) tuple for the batch defined above.
softmax_loss(h, y)

(1.2133254861618188,
 array([[-4.54984713e-01,  1.22364236e-01,  3.32620478e-01],
        [ 8.98505903e-03,  4.47339748e-04, -9.43239878e-03]]))

In [27]:
import torch.nn.functional as F

In [28]:
# Cross-check the batched NumPy loss/gradient against PyTorch's autograd.
h_t = torch.tensor(h, requires_grad=True)
ls = F.log_softmax(h_t, dim=1)
# nll_loss needs int64 class indices; NumPy's default integer dtype is
# platform-dependent, so cast explicitly instead of inheriting it from `y`.
loss = F.nll_loss(ls, torch.tensor(y, dtype=torch.long))
print(loss)
loss.backward()
h_t.grad

tensor(1.2133, dtype=torch.float64, grad_fn=<NllLossBackward>)


tensor([[-4.5498e-01,  1.2236e-01,  3.3262e-01],
        [ 8.9851e-03,  4.4734e-04, -9.4324e-03]], dtype=torch.float64)

In [32]:
# 2x3 tensor created with requires_grad=True, so autograd records
# operations performed on it.
W = torch.tensor([[1., 2., 4.],
                  [3., 4., 5.]], requires_grad=True)

tensor([[1., 2., 4.],
        [3., 4., 5.]])