In [1]:
import torch
import tqdm.notebook as tqdm

In [2]:
# Parameters that we want to optimize over

# Wrapping a Tensor in torch.nn.Parameter marks it as one that we want to optimize
X1 = torch.nn.Parameter(torch.randn(2))
X2 = torch.nn.Parameter(torch.randn(5))

# Show both parameters, one after the other
print(X1, X2, sep="\n")

Parameter containing:
tensor([-0.9771,  1.4426], requires_grad=True)
Parameter containing:
tensor([ 0.7405, -1.5294, -0.4147, -0.7225,  2.3669], requires_grad=True)


In [3]:
# ^^ note the "requires_grad=True" with the tensor - this means that
# it will accumulate gradients from automatic differentiation, which we can
# use with gradient descent

In [4]:
# Random target matrix: one row per entry of X1, one column per entry of X2
mat = torch.randn(X1.shape[-1], X2.shape[-1])

# Here's a simple toy objective function
# We want to find the optimal rank-1 approximation of mat
# i.e.
# argmin_{X1, X2} || mat - X1 @ X2^T ||^2_F
#
def objective_function(x, S):
    """Squared Frobenius error of the rank-1 approximation x @ S^T to `mat`.

    x: left factor; its last dimension must match the number of rows of `mat`.
    S: right factor; its last dimension must match the number of columns of `mat`.
    Returns a scalar tensor that can be backpropagated through.
    """
    # x.unsqueeze(-1) is a column and S.unsqueeze(-2) is a row, so their
    # matrix product is the outer product of x and S.
    # Use the arguments (not the globals X1/X2) so the function evaluates
    # whatever values it is actually given.
    return (mat - x.unsqueeze(-1) @ S.unsqueeze(-2)).norm().square()

# ^^^ This objective function is non-convex, so there's no guarantee that
# gradient-based optimization will find the global minimum
# However, gradient-based optimization is the standard approach these days

In [5]:
# X1 and X2 still hold their random initial values, so the objective shouldn't be that good yet
initial_loss = objective_function(X1, X2)
print(initial_loss)

tensor(18.1362, grad_fn=<PowBackward0>)


In [6]:
# Now here's an optimization loop

optimizer = torch.optim.Adam(params=[X1, X2], lr=0.1)
# ^^ Adam is a gradient-based optimizer that adapts the step size of each parameter
# using running averages of the gradient and of its square
# lr = learning rate. 0.1 is a good starting point, though you may want to play with this hyperparameter

#############
# The gradient descent loop
#############
num_iter = 400  # You may need more iterations than this
iterator = tqdm.tqdm(range(num_iter))

for _ in iterator:
    optimizer.zero_grad()  # This line clears whatever is stored in X1.grad and X2.grad
        # Doing this first (rather than after the step) means gradients left over from
        # an earlier backward() call elsewhere in the notebook can't leak into the update
    loss = objective_function(X1, X2)  # Using the current values of X1 and X2, compute the obj. function
        # Should be a scalar
    loss.backward()  # This line runs backpropagation to compute d loss / dX1 and d loss / dX2
        # If you peek at X1.grad and X2.grad at this line, you will see tensors that correspond
        # to these derivatives
    optimizer.step()  # This line performs the gradient update, using whatever is stored in
        # X1.grad and X2.grad
        # X1 and X2 will now be updated

    # Let's see the loss going down over time
    iterator.set_postfix(loss=loss.item())

  0%|          | 0/400 [00:00<?, ?it/s]

In [7]:
# The optimizer has moved X1 and X2 away from their initial values
print(X1, X2, sep=" ")

Parameter containing:
tensor([-0.4659,  0.9794], requires_grad=True) Parameter containing:
tensor([ 0.2173, -0.9189,  1.4809,  0.5069,  1.9396], requires_grad=True)


In [8]:
# Re-evaluate the objective at the optimized X1 and X2 to see how good it is now
final_loss = objective_function(X1, X2)
print(final_loss)

tensor(1.9370, grad_fn=<PowBackward0>)


In [9]:
# ^^^ better!