## Experiment setting

* **Inference**: Moment matching
* **Transition model**: SVGP reinitialized during each episode and trained based on all real experience collected so far
* **Transition model optimizer**: LBFGS
* **Policy**: kernel regressor trained with VI, initialized with real experience and trained based on virtual experience
* **Policy optimizer**: Adam
* **State encoding**: N/A

In [2]:
import numpy as np
import logging
import os

from gpflow.likelihoods import Gaussian
from gpflow.optimizers import Scipy
from gpflow.config import default_float

import gpflow_pilco
from gpflow_pilco.envs import CartPole
from gpflow_pilco.models.priors import PilcoPenaltySNR
from gpflow_pilco.utils.optimizers import GradientDescent
from gpflow_pilco.components import GaussianObjective

from mbrlax.policy import GPPolicy
from mbrlax.transition_model import GPTransitionModel
from mbrlax.models import GPModelSpec
from mbrlax.agents import PilcoAgent
from mbrlax.harness import ExperimentHarness
from mbrlax.inference_strategy import MomentMatchingStrategy
from mbrlax.utils import MomentsInitialStateModel

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PiecewiseConstantDecay
from tensorflow_probability.python.distributions import MultivariateNormalTriL
from tensorflow_probability.python import bijectors

## Initialize experiment

In [3]:
# Floating-point dtype taken from the GPflow configuration; reused below for
# the tensors of the initial state distribution.
dtype = default_float()
# The current working directory is the base of the log file path.
root_dir = os.getcwd()
# Cart-pole environment (time_per_step is presumably in seconds -- TODO confirm).
cartpole_env = CartPole(time_per_step=0.1)

## Initialize dependencies

### Inference strategy

In [4]:
# A single strategy instance, passed to both the transition model and the policy.
inference_strategy = MomentMatchingStrategy()

### Transition model

In [5]:
# Specification of the GP behind the transition model: an SVGP with 32
# inducing points, a Gaussian likelihood and a PilcoPenaltySNR prior.
transition_model_spec = GPModelSpec(
    type=gpflow_pilco.models.SVGP,
    num_inducing=32,
    likelihood=Gaussian(),
    prior=PilcoPenaltySNR(threshold=1e5, power=30),
    mean_function="default",
    model_uncertainty=True,
)

# The model is fitted with GPflow's Scipy optimizer (LBFGS according to the
# experiment notes) and, with reinitialize=True, is reinitialized during each
# episode as described in those notes.
transition_model = GPTransitionModel(
    gp_model_spec=transition_model_spec,
    inference_strategy=inference_strategy,
    optimizer=Scipy(),
    reinitialize=True,
)

### Objective function

In [6]:
# Pole height read from the environment; it parameterizes the precision matrix.
height = cartpole_env.pole.height

# 5x5 precision matrix, scaled by 16. Only the first three rows/columns are
# non-zero; the last two dimensions do not contribute to the objective.
precis = 16 * tf.convert_to_tensor(
    [
        [height ** 2, 0, -height, 0, 0],
        [0, height ** 2, 0, 0, 0],
        [-height, 0, 1, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
    ],
    dtype=default_float(),
)

# NOTE(review): target has 4 entries while precis is 5x5 -- confirm that
# GaussianObjective expects (or broadcasts) this combination.
target = tf.zeros([4], dtype=default_float())

objective = GaussianObjective(target=target, precis=precis)

### Policy

In [7]:
# --- Policy optimizer: Adam with a piecewise-constant learning rate ---
initial_learning_rate = 0.01
step_limit = 5000
global_clipnorm = 1.0

# Three learning-rate stages, each a factor of 0.1 smaller than the previous.
values = tuple(initial_learning_rate * 0.1 ** k for k in range(3))
# Stage boundaries split the step budget into equal parts (integer division).
bounds = tuple(step_limit * k // len(values) for k in range(1, len(values)))
schedule = PiecewiseConstantDecay(boundaries=bounds, values=values)
adam = Adam(learning_rate=schedule, global_clipnorm=global_clipnorm)
policy_optimizer = GradientDescent(optimizer=adam, step_limit=step_limit)

# --- Inverse link function ---
# Chain composes its bijectors from the last list entry to the first:
# NormalCDF, then a shift by -0.5, then a scale by 20 - 1e-5.
invlink = bijectors.Chain(
    bijectors=[
        bijectors.Scale(scale=tf.cast(x=20 - 1e-5, dtype=default_float())),
        bijectors.Shift(shift=tf.cast(x=-0.5, dtype=default_float())),
        bijectors.NormalCDF(),
    ]
)

In [8]:
# Specification of the GP behind the policy: an SVGP with 32 inducing points
# and a Gaussian likelihood, no prior, model uncertainty switched off, and the
# inverse link function applied via invlink.
policy_model_spec = GPModelSpec(
    type=gpflow_pilco.models.SVGP,
    num_inducing=32,
    likelihood=Gaussian(),
    prior=None,
    mean_function="default",
    model_uncertainty=False,
    invlink=invlink,
)

# The policy acts in the environment's action space and is trained against
# the objective with the gradient-descent optimizer.
policy = GPPolicy(
    action_space=cartpole_env.action_space,
    gp_model_spec=policy_model_spec,
    objective=objective,
    optimizer=policy_optimizer,
    inference_strategy=inference_strategy,
)


### Initial state distribution

In [9]:
# Mean of the initial state; the second entry is pi (presumably the pole
# angle -- TODO confirm against the CartPole state layout).
state_loc = tf.convert_to_tensor(value=(0.0, np.pi, 0.0, 0.0), dtype=dtype)
# Diagonal lower-triangular scale factor with 0.1 on every diagonal entry.
state_scale = tf.linalg.diag(tf.zeros([4], dtype=dtype) + 0.1)

initial_state_distribution = MultivariateNormalTriL(
    loc=state_loc,
    scale_tril=state_scale,
)
initial_state_model = MomentsInitialStateModel(initial_state_distribution)

## Run the experiment

In [10]:
# Explicit rewards are not used: the policy is evaluated through its
# objective, so the reward model is a stub that always returns None.
pilco_agent = PilcoAgent(
    transition_model=transition_model,
    reward_model=lambda x: None,
    initial_state_model=initial_state_model,
    policy=policy,
)

In [13]:
# Named logger for this experiment; basicConfig installs the root handler
# with a timestamped "<time> <level>:<name>:<message>" format.
logger = logging.getLogger("mm_gradient_descent")
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)-4s %(levelname)s:%(name)s:%(message)s',
)

# Build the log path portably and make sure the directory exists, so that
# opening the file does not fail on a fresh checkout.
log_dir = os.path.join(root_dir, "logs")
os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, "mm_gradient_descent.txt")

# The context manager guarantees the log file is flushed and closed even if
# the run raises. The file is opened in append mode, so earlier runs are kept.
# NOTE(review): assumes the harness only writes to logging_file during run().
with open(log_path, "a") as logging_file:
    harness = ExperimentHarness(
        logger=logger,
        logging_file=logging_file,
        agent=pilco_agent,
        env=cartpole_env,
        max_train_episodes=10,
        max_eval_episodes=0,
    )
    harness.run()

NameError: name 'Driver' is not defined