In [13]:
import tensorflow_probability as tfp
import numpy as np
import functools
import tensorflow as tf
import contextlib
import time

In [14]:
def make_val_and_grad_fn(value_fn):
    """Turn a value-only objective into a value-and-gradient function.

    Args:
        value_fn: Callable taking a single argument ``x`` and returning the
            objective value at ``x``.

    Returns:
        A callable ``val_and_grad(x)`` that returns the result of
        ``tfp.math.value_and_gradient(value_fn, x)``. The wrapper carries
        over the metadata (name, docstring, ...) of ``value_fn``.
    """

    def val_and_grad(x):
        return tfp.math.value_and_gradient(value_fn, x)

    # Equivalent to decorating `val_and_grad` with `functools.wraps(value_fn)`.
    return functools.update_wrapper(val_and_grad, value_fn)


@contextlib.contextmanager
def timed_execution():
    """Context manager that prints how long its ``with`` body took to run.

    On normal exit from the body, a line of the form
    ``Evaluation took: <seconds> seconds`` is printed to stdout. If the body
    raises, the exception propagates and nothing is printed.
    """
    # `perf_counter` is monotonic and uses the highest-resolution clock
    # available, so short intervals are measured reliably and the result
    # cannot go negative when the system clock is adjusted (unlike `time.time`).
    t0 = time.perf_counter()
    yield
    dt = time.perf_counter() - t0
    print("Evaluation took: %f seconds" % dt)


def np_value(tensor):
    """Get numpy value out of possibly nested tuple of tensors.

    Args:
        tensor: Either an object exposing a ``.numpy()`` method, or a tuple
            (plain tuple or namedtuple, arbitrarily nested) of such objects.

    Returns:
        The result of ``tensor.numpy()`` for a leaf; for a tuple, a tuple of
        the same type whose elements have been converted recursively.
    """
    if isinstance(tensor, tuple):
        values = [np_value(t) for t in tensor]
        if hasattr(tensor, "_fields"):
            # namedtuple constructors take one positional argument per field.
            return type(tensor)(*values)
        # Plain tuples (and other tuple subclasses) take a single iterable;
        # unpacking here would raise TypeError for two or more elements.
        return type(tensor)(values)
    return tensor.numpy()


def run(optimizer):
    """Run an optimizer and measure its evaluation time.

    The optimizer is called once untimed as a warm-up, then called again
    inside `timed_execution`; the conversion of the result with `np_value`
    is part of the timed section.

    Args:
        optimizer: Zero-argument callable returning the optimization result.

    Returns:
        The result of the timed call, converted with `np_value`.
    """
    optimizer()  # Untimed warm-up call; its result is discarded.
    with timed_execution():
        timed_result = optimizer()
        converted = np_value(timed_result)
    return converted

In [15]:
# Seed NumPy's global random number generator so that the random draws in
# this notebook are reproducible.
np.random.seed(12345)

# The objective must be supplied as a function that takes a single
# (Tensor) argument and returns a tuple: first the value of the objective
# at the supplied point, then the gradient at that point. The value must
# be a scalar and the gradient must have the same shape as the supplied
# argument.

# The `make_val_and_grad_fn` decorator helps transform a function that
# returns only the objective value into one that returns both the value
# and the gradient. It also works for both eager and graph mode.

dim = 10
minimum = np.ones(dim)
scales = np.exp(np.random.randn(dim))


@make_val_and_grad_fn
def quadratic(x):
    """Squared offset of `x` from `minimum`, weighted by `scales` and summed
    over the last axis."""
    squared_offset = (x - minimum) ** 2
    return tf.reduce_sum(scales * squared_offset, axis=-1)


# The minimization routine also requires you to supply an initial
# starting point for the search. For this example we choose a random
# starting point.
start = np.random.randn(dim)

# Finally, an optional argument called tolerance lets you choose the
# stopping point of the search. The tolerance specifies the maximum
# (supremum) norm of the gradient vector at which the algorithm terminates.
# If you don't have a specific need for higher or lower accuracy, leaving
# this parameter unspecified (and hence using the default value of 1e-8)
# should be good enough.
tolerance = 1e-10


@tf.function
def quadratic_with_lbfgs():
    """Minimize `quadratic` with L-BFGS, starting from `start` and stopping
    at gradient tolerance `tolerance`."""
    initial_position = tf.constant(start)
    return tfp.optimizer.lbfgs_minimize(
        quadratic,
        initial_position=initial_position,
        tolerance=tolerance,
    )


results = run(quadratic_with_lbfgs)

# The optimization results carry several pieces of information; the most
# important fields are 'converged' and 'position'.
# 'converged' is a boolean scalar that indicates whether the norm of the
# gradient at the final point was within tolerance.
# 'position' is the location of the minimum that was found. Always check
# that 'converged' is True before relying on 'position'.

print("L-BFGS Results")
print(f"Converged: {results.converged!s}")
print(f"Location of the minimum: {results.position!s}")
print(f"Number of iterations: {results.num_iterations!s}")

Evaluation took: 0.009760 seconds
L-BFGS Results
Converged: True
Location of the minimum: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Number of iterations: 10


In [17]:
# Evaluate the objective and its gradient at a fresh random point. The point
# is sized with `dim` instead of a literal 10 so that it always matches the
# shapes of `minimum` and `scales` used inside `quadratic`.
loss, grads = quadratic(np.random.randn(dim))