### Setup

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
import time
from tqdm.auto import tqdm, trange
import control
import notebook_setup
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import torch
import torch.nn as nn
from commonml.helpers.logs import get_tensorboard_scalar_frame

from systems.base import SystemEnv
from systems.plotting import (
    plot_env_response,
    multiple_response_plots
)
from rl import learn_rl, transform_rl_policy, evaluate_rl
from xform import get_transforms
from systems.simple import SimpleEnv
from systems.springmass import SpringMassEnv
from systems.pendulum import PendulumEnv
from systems.cartpole import CartpoleEnv
from systems.lunarlander import LanderEnv
from mpcontrol import evaluate_mpc, learn_mpc, MPCAgent
from lqcontrol import evaluate_lqr, learn_lqr, LQRAgent

# Control

# Policy transformations

## System specification

In [None]:
# --- Active configuration: SimpleEnv --------------------------------------
# Keyword arguments forwarded to SimpleEnv.
sys_kwargs = dict(a=-0.1, b=1)
# Keyword arguments forwarded to every learn_rl(...) call in this notebook.
learn_kwargs = dict(steps=200_000, seed=0, learning_rate=5e-3,
                    n_steps=4000, batch_size=200, n_epochs=10,
                    gamma=0.)
# Cost matrices handed to control.lqr; q is also given to the environment.
q, r = np.asarray([[1]]), np.asarray([[0.00001]])
# Matrices that left-multiply the system's A and B to build the target task.
xformA = np.asarray([[0.5]])
xformB = np.asarray([[-0.5]])
# Initial state used by the response plots.
x0 = np.asarray([-2.5])


def make_env():
    """Return a freshly constructed source-task environment."""
    return SimpleEnv(**sys_kwargs, q=q, seed=0)


def make_xform_env():
    """Return a fresh environment whose A and B are premultiplied by xformA/xformB."""
    env = make_env()
    env.system.A = xformA @ env.system.A
    env.system.B = xformB @ env.system.B
    return env


env = make_env()
# `create_simple` is not imported anywhere in this notebook, so calling it
# raised NameError on a fresh kernel. The later cells rebind `sys` (to
# `env.system` or its linearization) before reading it, so take it from the
# environment here as well.
sys = env.system

# sys_kwargs = dict(k=4, m=0.2, df=0.01)
# learn_kwargs = dict(steps=100_000, seed=0, learning_rate=2e-3,
#                     n_steps=2048, batch_size=64, n_epochs=10,
#                     gamma=0.)
# q, r = np.asarray([[1,0], [0,1]]), np.asarray([[0.00001]])
# angA, angB = np.pi/4, np.pi
# scalarA, scalarB = 0.8, 0.5
# xformA = np.asarray([[np.cos(angA), -np.sin(angA)],
#                     [np.sin(angA), np.cos(angA)]]).T \
#         @ (np.eye(2) * scalarA)
# xformB = np.asarray([[np.cos(angB), -np.sin(angB)],
#                     [np.sin(angB), np.cos(angB)]]).T \
#         @ (np.eye(2) * scalarB)
# x0 = np.asarray([-0.5, 0])
# make_env = lambda: SpringMassEnv(**sys_kwargs, q=q, seed=0)
# def make_xform_env():
#     env = make_env()
#     env.system.A = xformA @ env.system.A
#     env.system.B = xformB @ env.system.B
#     return env
# env = make_env()
# sys = create_spring(**sys_kwargs)

# sys_kwargs = dict(m=0.1, l=1, g=10., df=0.02)
# learn_kwargs = dict(steps=200_000, seed=0, learning_rate=1e-3,
#                     n_steps=2048, batch_size=128, n_epochs=10,
#                     gamma=0.99)
# q, r = np.asarray([[10,0], [0,1e-1]]), np.asarray([[0.00001]])
# angA, angB = np.pi/4, np.pi
# scalarA, scalarB = 0.8, 0.5
# xformA = np.asarray([[np.cos(angA), -np.sin(angA)],
#                     [np.sin(angA), np.cos(angA)]]).T \
#         @ (np.eye(2) * scalarA)
# xformB = np.asarray([[np.cos(angB), -np.sin(angB)],
#                     [np.sin(angB), np.cos(angB)]]).T \
#         @ (np.eye(2) * scalarB)
# x0 = np.asarray([-0.5, 0])
# make_env = lambda: PendulumEnv(**sys_kwargs, q=q, seed=0)
# def make_xform_env():
#     env = PendulumEnv(**sys_kwargs, q=q, seed=0,
#                       xformA=xformA, xformB=xformB)
#     return env
# env = make_env()
# sys = create_pendulum(**sys_kwargs)

# sys_kwargs = dict(mc=0.5, mp=0.1, l=1, g=10, df=0.01)
# learn_kwargs = dict(steps=400_000, seed=0, learning_rate=2e-3,
#                     n_steps=2048, batch_size=64, n_epochs=10,
#                     gamma=0.99)
# q = np.asarray([[1,0,0,0], [0,0.1,0,0],[0,0,1e-5,0],[0,0,0,1e-1]])
# r = np.asarray([[0.00001]])
# xformA = np.diagflat(np.random.RandomState(seed=0).randn(4))
# xformB = np.diagflat(np.random.RandomState(seed=1).randn(4))
# x0 = np.asarray([-np.pi/45, 0, 0, 0])
# make_env = lambda: CartpoleEnv(**sys_kwargs, q=q, seed=0)
# def make_xform_env():
#     env = CartpoleEnv(**sys_kwargs, q=q, seed=0,
#                       xformA=xformA, xformB=xformB)
#     return env
# env = make_env()
# sys = create_cartpole(**sys_kwargs)

# sys_kwargs = dict(vp=vp, sp=sp)
# learn_kwargs = dict(steps=200_000, seed=0, learning_rate=2e-4, n_steps=2000,
#                    gamma=0.8)
# q = np.diagflat([1,1,1,0.1,0.1,0.1,1,1,1,0.1,0.1,0.1])
# r = np.eye(4) * 1e-4
# xformA = np.random.RandomState(seed=0).randn(12,12)
# xformB = np.diagflat(np.random.RandomState(seed=1).randn(4))
# x0 = np.zeros(12, dtype=np.float32)
# make_env = lambda: MultirotorEnv(**sys_kwargs, seed=0)
# def make_xform_env():
#     env = MultirotorEnv(**sys_kwargs, q=q, seed=0,
#                       xformA=xformA, xformB=xformB)
#     return env
# env = make_env()
# sys = create_multirotor(**sys_kwargs)

## Classical Approaches

### Evaluation w/ LQR

In [None]:
# Source (env) and target (env_) environments for the LQR comparison.
env = make_env()
env_ = make_xform_env()
if not isinstance(env.system, control.LinearIOSystem):
    # LQR design needs a linear model: linearize both systems about zero
    # state and zero input, and tell the user that this happened.
    warnings.warn(('System is not a linear system in StateSpace form; '
                   'linearizing it about the origin for LQR design.'))
    sys = env.system.linearize(
            np.zeros(env.system.nstates), np.zeros(env.system.ninputs))
    sys_xform = env_.system.linearize(
            np.zeros(env_.system.nstates), np.zeros(env_.system.ninputs))
else:
    sys = env.system
    sys_xform = env_.system

# LQR gain designed on the source system, and the resulting closed loop
k_og, *_ = control.lqr(sys, q, r)
sys_opt = sys.feedback(k_og)
sys_opt.name = 'sys_opt'

# transformed system but with old control law
sys_xform_old = sys_xform.feedback(k_og)
sys_xform_old.name = 'sys_xform_old'

# LQR gain designed directly on the transformed system
k, *_ = control.lqr(sys_xform, q, r)
sys_xform_opt = sys_xform.feedback(k)
sys_xform_opt.name = 'sys_xform_opt'

# Source gain adapted to the transformed system via policy_transform
# NOTE(review): `policy_transform` is not imported in the setup cell, so this
# line fails on a fresh kernel -- confirm which module provides it.
feedback = policy_transform(sys, xformA, xformB, k_og)
sys_xform_opt2 = sys_xform.feedback(feedback)
sys_xform_opt2.name = 'sys_xform_opt2'

In [None]:
# print(evaluate_lqr(k_og, make_env()))
# Score each gain on its own fresh target environment, in the order:
# source gain, gain designed on the target, transformed source gain.
for lqr_gain in (k_og, k, feedback):
    print(evaluate_lqr(lqr_gain, make_xform_env()))

In [None]:
# Alternating (title, plotting callable) entries, one pair per panel.
lqr_panels = [
    'old policy on old system',
    lambda: plot_env_response(make_env(), x0, k_og),
    'old policy on new system',
    lambda: plot_env_response(make_xform_env(), x0, k_og),
    'Optimal LQR on new system',
    lambda: plot_env_response(make_xform_env(), x0, k),
    'old policy transformation on new system',
    lambda: plot_env_response(make_xform_env(), x0, feedback),
]
multiple_response_plots(lqr_panels)

### Evaluation w/ MPC

In [None]:
# Source (env) and target (env_) environments for the MPC comparison.
env = make_env()
env_ = make_xform_env()
if not isinstance(env.system, control.LinearIOSystem):
    # Work with linear models: linearize both systems about zero state and
    # zero input, and tell the user that this happened.
    warnings.warn(('System is not a linear system in StateSpace form; '
                   'linearizing it about the origin.'))
    sys = env.system.linearize(
            np.zeros(env.system.nstates), np.zeros(env.system.ninputs))
    sys_xform = env_.system.linearize(
            np.zeros(env_.system.nstates), np.zeros(env_.system.ninputs))
else:
    sys = env.system
    sys_xform = env_.system
# Without a control law, policy_transform is unpacked into a state and an
# action transformation.
# NOTE(review): `policy_transform` is not imported in the setup cell, so this
# line fails on a fresh kernel -- confirm which module provides it.
state_xform, action_xform = policy_transform(sys, xformA, xformB)

In [None]:
# Display the target-task system (linearized if needed) chosen in the previous cell.
sys_xform

In [None]:
# print(evaluate_mpc(make_env(), horizon=5, model_env=make_env()))
# Each entry lazily builds the model-related keyword arguments for one run, so
# the model environment is still created after the target environment.
mpc_model_setups = (
    # source model driving the target system
    lambda: dict(model_env=make_env()),
    # target model driving the target system
    lambda: dict(model_env=make_xform_env()),
    # source model plus state/action transformations
    lambda: dict(model_env=make_env(),
                 state_xform=state_xform, action_xform=action_xform),
)
for build_model_kwargs in mpc_model_setups:
    print(evaluate_mpc(make_xform_env(), horizon=10, **build_model_kwargs()))

## Data-driven

In [None]:
# experiment config

# Whether the knowledge of the source system is known,
# or approximated from sampled experiences
data_driven_source = True
# Whether to assume that the system transformations are known
# and not approximate
accurate_xfer = False
# The factor by which action bounds are relaxed to fully allow the
# transformed policy to interact with environment
# NOTE(review): the description above talks about a factor, while the value is
# None and the inline remark says "True by default" -- confirm the intent.
constrained_actions = None  # True by default, not implemented yet
buffer_episodes = 5

# Tensorboard run name: environment class name plus a suffix selected by the
# (data_driven_source, accurate_xfer) pair; the remaining pair gets no suffix.
run_suffixes = {
    (True, False): 'StochasticAll',
    (True, True): 'StochasticSource',
    (False, False): 'StochasticXfer',
}
name = env.__class__.__name__ + run_suffixes.get(
    (bool(data_driven_source), bool(accurate_xfer)), '')

In [None]:
# Train the RL policy on the source environment; logs go under "<name>/Source".
source_log_dir = f'{name}/Source'
agent = learn_rl(make_env(), tensorboard_log=source_log_dir, **learn_kwargs)

In [None]:
# Response of the trained source policy on a fresh source environment, starting from x0.
plot_env_response(make_env(), x0, agent)

In [None]:
env = make_env()
env_ = make_xform_env()
# The original call passed `info.F_A, info.F_B`, but `info` is first bound by
# this very statement, so the cell raised NameError on a fresh kernel and only
# worked with a leftover `info` from an earlier run. Use the `accurate_xfer`
# switch from the experiment config instead.
# NOTE(review): assumes these two positional arguments are the A/B
# transformations and that None makes get_transforms estimate them --
# confirm against the get_transforms signature.
known_xformA = xformA if accurate_xfer else None
known_xformB = xformB if accurate_xfer else None
state_xform, action_xform, info = get_transforms(agent, env, env_,
                                                 buffer_episodes, 'episodes',
                                                 known_xformA, known_xformB,
                                                 data_driven_source)

In [None]:
# Fine-tune the source policy on the target environment, starting from the
# parameters of `agent`; logs go under "<name>/Tuned".
tuned_log_dir = f'{name}/Tuned'
agent_new = learn_rl(
    make_xform_env(),
    reuse_parameters_of=agent,
    tensorboard_log=tuned_log_dir,
    **learn_kwargs
)

In [None]:
# Append a transformation to the source policy: build `agent_xform` from
# `agent` and the current state_xform / action_xform, with no further training.
agent_xform = transform_rl_policy(agent, state_xform, action_xform)

In [None]:
# Fine-tune the transformed policy on the target environment with
# learnable_transformation switched off; logs go under "<name>/XformedTuned".
frozen_xform_options = dict(
    reuse_parameters_of=agent_xform,
    learnable_transformation=False,
    tensorboard_log=f'{name}/XformedTuned',
)
agent_xform_tuned = learn_rl(make_xform_env(),
                             **frozen_xform_options, **learn_kwargs)
# Show the transformations held by the tuned policy.
frozen_policy = agent_xform_tuned.policy
print('state_xform', frozen_policy.state_xform)
print('action_xform', frozen_policy.action_xform)

In [None]:
# Fine-tune the transformed policy on the target environment with
# learnable_transformation switched on; logs go under "<name>/XformedTunedAll".
learnable_xform_options = dict(
    reuse_parameters_of=agent_xform,
    learnable_transformation=True,
    tensorboard_log=f'{name}/XformedTunedAll',
)
agent_xform_tuned_all = learn_rl(make_xform_env(),
                                 **learnable_xform_options, **learn_kwargs)
# Show the raw values of the transformations held by the tuned policy.
learnable_policy = agent_xform_tuned_all.policy
print('state_xform', learnable_policy.state_xform.data)
print('action_xform', learnable_policy.action_xform.data)

In [None]:
# (headline, policy, environment factory) for every evaluated variant; each
# policy is scored on a freshly built environment over 10 episodes.
rl_evaluations = [
    ('Source policy on source task', agent, make_env),
    ('Reusing source policy', agent, make_xform_env),
    ('Tuning source policy', agent_new, make_xform_env),
    ('Transforming source policy', agent_xform, make_xform_env),
    ('Tuning transformed policy, except for transformations',
     agent_xform_tuned, make_xform_env),
    ('Tuning transformed policy, including transformations',
     agent_xform_tuned_all, make_xform_env),
]
for headline, policy, build_env in rl_evaluations:
    print(headline)
    print(evaluate_rl(policy, build_env(), n_eval_episodes=10))

### Plots

In [None]:
# Alternating (title, plotting callable) entries, one pair per panel; only the
# first panel keeps its legend.
rl_panels = [
#     r'$\pi_s$ on $P_s$ ',
#     lambda: plot_env_response(make_env(), x0, agent),
    r'$\pi_s$ on $P_t$ ',
    lambda: plot_env_response(make_xform_env(), x0, agent),
    r'$\pi_s^*$ on $P_t$ ',
    lambda: plot_env_response(make_xform_env(), x0, agent_new,
                              legend=False),
    r'$\pi_t$ on $P_t$ ',
    lambda: plot_env_response(make_xform_env(), x0, agent_xform,
                              legend=False),
    r'$\pi_t^-$ on $P_t$ ',
    lambda: plot_env_response(make_xform_env(), x0, agent_xform_tuned,
                              legend=False),
    r'$\pi_t^*$ on $P_t$ ',
    lambda: plot_env_response(make_xform_env(), x0, agent_xform_tuned_all,
                              legend=False),
]
multiple_response_plots(rl_panels, figsize=(6,8))

In [None]:
# remember to specify up-to-date directory
# name = env.__class__.__name__
df = get_tensorboard_scalar_frame('tensorboard/%s/Source_1' % name)
dft = get_tensorboard_scalar_frame('tensorboard/%s/Tuned_1' % name)
dfxt = get_tensorboard_scalar_frame('tensorboard/%s/XformedTuned_1' % name)
dfxta = get_tensorboard_scalar_frame('tensorboard/%s/XformedTunedAll_1' % name)

%matplotlib inline
last_tstep = df.index[-1]
plt.figure(figsize=(6,2))
for i, (frame, label) in enumerate([
    (df, '$\pi_s$ on $P_s$'),
    (dft, '$\pi_s^*$ on $P_t$'),
    (dfxt, '$\pi_t^-$ on $P_t$'),
    (dfxta, '$\pi_t^+$ on $P_t$')
]):
    if i > 0:
        frame.index = frame.index + last_tstep
    plt.plot(frame['rollout', 'ep_rew_mean'], label=label)
if name.startswith('Simp'):
    plt.legend()
plt.ylabel('Mean episodic reward')
plt.xlabel('Learning time steps')
plt.setp(plt.xticks()[1], rotation=15)
plt.grid(True, 'both')