In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension; mode 2 lets edits to imported modules be
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import inspect
import sys

# Locate the project relative to the notebook's working directory.
# The previous approach asked `inspect` for the file of the current frame; inside
# a notebook that is a synthetic per-cell filename (on recent ipykernel versions a
# path in a temporary directory), so the derived parent directory could point
# outside the repository. The kernel's working directory is the reliable anchor.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)

# Make the project's `src` directory importable. Guarded so that re-running this
# cell does not keep stacking duplicate entries onto sys.path.
if parentdir + '/src' not in sys.path:
    sys.path.insert(1, parentdir + '/src')
import torch
import torch.nn.functional as F
from torchvision import transforms
import numpy as np
import time
from tqdm import trange
import matplotlib.pyplot as plt
import matplotlib

In [3]:
import gym
import pybullet as p
# NOTE(review): `stage.envs` is never referenced by name in the visible cells, so
# this import is presumably needed only for its side effects — confirm.
import stage.envs
from stage.tasks.twolink.reaching import TwoLinkReaching
from stage.utils.nn import use_gpu
# NOTE(review): presumably switches torch over to the GPU for the tensors created
# below; called before any tensor is built — confirm against stage.utils.nn.
use_gpu()

In [4]:
# Directory prefix (with trailing slash) under which training data is stored.
savepath = ''.join((parentdir, '/data/twolink/'))

In [5]:
# Build the reaching task with rendering disabled.
task = TwoLinkReaching(render=False)

# Copy the task's dimensions, time steps and position limits into notebook-level
# names. (Open question from the original author: should a separate class wrap
# these parameters?)
nq = task.nq
nv = task.nv
nu = task.nu
nx = task.nx
dt_control = task.dt_control
dt_env = task.dt_env
q_lb = task.q_lb
q_ub = task.q_ub

# Velocity limits of -100 / +100, shaped like the position limits.
# NOTE(review): this assumes the velocity vector has the same size as q — confirm.
v_lb = -100 * torch.ones_like(q_lb)
v_ub = 100 * torch.ones_like(q_ub)

In [6]:
# Setup action parameterization

from stage.controllers.actor import Actor
from stage.controllers.trivial import Identity
from stage.controllers.pd import PD


# Disabled alternative: Identity controller with the environment's own action bounds.
# na = 2
# action_ub = torch.Tensor(task.env.action_space.high)
# action_lb = torch.Tensor(task.env.action_space.low)
# actor = Actor(na, Identity(nq, nv, nu), action_lb, action_ub)

# PD parameterization: the action bounds are nq gain bounds (0..50) followed by
# the position limits.
gain_ub = 50 * torch.ones(nq)
gain_lb = 0. * torch.ones(nq)
action_ub = torch.cat((gain_ub, q_ub))
action_lb = torch.cat((gain_lb, q_lb))

# Derive the action dimension from the bounds instead of hard-coding 4, so the
# two can never disagree if the task's dimensions change.
na = action_ub.shape[0]
actor = Actor(na, PD(nq, nv, nu), action_lb, action_ub)

# The task's cost object is given the same actor.
task.cost.actor = actor

In [7]:
# Setup model learning

from stage.dynamics.probabilistic_ensemble import ProbabilisticEnsemble, DefaultDx

# Hyper-parameters for the ensemble and its training.
ensemble_size, nn_epochs, batch_size = 5, 10, 64

dynamics = ProbabilisticEnsemble(
    nq, nv, na, dt_control,
    DefaultDx,
    ensemble_size,
    learning_rate=0.001,
)

# State bounds are the position limits followed by the velocity limits.
dynamics.state_lb = torch.cat((q_lb, v_lb))
dynamics.state_ub = torch.cat((q_ub, v_ub))

In [8]:
# Setup controller

from stage.controllers.tsmpc import TSMPC

# Planner settings.
plan_horizon, n_particles, pop_size = 30, 10, 400

# The particle count must be a whole multiple of the ensemble size.
assert n_particles % ensemble_size == 0

controller = TSMPC(
    dynamics, task.cost, actor,
    plan_horizon, n_particles, pop_size,
)

In [9]:
# Setup learner

from stage.learners.learn_and_control_model import LearnAndControlModel

# Bundle the task, the learned dynamics and the controller into one learner.
learner = LearnAndControlModel(
    task,
    dynamics,
    controller,
)

In [None]:
# Create the output directory before the (long) training run so the final save
# cannot fail on a missing folder after all iterations have completed.
# NOTE(review): save_training_data may already create it — harmless either way.
os.makedirs(savepath, exist_ok=True)

controller.regularize(1)
# Run 50 learning iterations with per-iteration logging; the return value is
# not used in this notebook.
_ = learner.learn(50, verbose=True)
learner.save_training_data(savepath+'data_pd_reg')

Network training: 100%|██████████| 10/10 [00:00<00:00, 40.82epoch(s)/s, Training loss MSE=1.3777943]


Iteration:  0
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.004621183872222901
obs. reward:  -236.51744
act. reward:  -1.7556142


Network training: 100%|██████████| 10/10 [00:00<00:00, 14.09epoch(s)/s, Training loss MSE=0.29596186]


Iteration:  1
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.19952522039413453
obs. reward:  -201.50882
act. reward:  -0.51097184


Network training: 100%|██████████| 10/10 [00:01<00:00,  9.83epoch(s)/s, Training loss MSE=0.17012823]


Iteration:  2
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20103683471679687
obs. reward:  -185.651
act. reward:  -0.3431901


Network training: 100%|██████████| 10/10 [00:01<00:00,  7.68epoch(s)/s, Training loss MSE=0.13699481]


Iteration:  3
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20146214008331298
obs. reward:  -189.37177
act. reward:  -0.07916472


Network training: 100%|██████████| 10/10 [00:01<00:00,  6.37epoch(s)/s, Training loss MSE=0.07583066]


Iteration:  4
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20157475709915162
obs. reward:  -52.93712
act. reward:  -0.26388386


Network training: 100%|██████████| 10/10 [00:01<00:00,  5.31epoch(s)/s, Training loss MSE=0.22095731]


Iteration:  5
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20215813636779786
obs. reward:  -741.3115
act. reward:  -1.9593898


Network training: 100%|██████████| 10/10 [00:02<00:00,  4.85epoch(s)/s, Training loss MSE=0.1401268]


Iteration:  6
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20255730628967286
obs. reward:  -60.736763
act. reward:  -0.57165956


Network training: 100%|██████████| 10/10 [00:02<00:00,  4.22epoch(s)/s, Training loss MSE=0.16511297]


Iteration:  7
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20350538015365602
obs. reward:  -68.14297
act. reward:  -2.186529


Network training: 100%|██████████| 10/10 [00:02<00:00,  3.45epoch(s)/s, Training loss MSE=0.15133101]


Iteration:  8
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20351219892501832
obs. reward:  -44.503937
act. reward:  -0.8429198


Network training: 100%|██████████| 10/10 [00:03<00:00,  3.19epoch(s)/s, Training loss MSE=0.14950696]


Iteration:  9
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.2041699457168579
obs. reward:  -43.501266
act. reward:  -0.9734469


Network training: 100%|██████████| 10/10 [00:03<00:00,  3.03epoch(s)/s, Training loss MSE=0.13670002]


Iteration:  10
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.2033783721923828
obs. reward:  -37.434044
act. reward:  -0.61772126


Network training: 100%|██████████| 10/10 [00:03<00:00,  2.67epoch(s)/s, Training loss MSE=0.11917182]


Iteration:  11
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20320673465728759
obs. reward:  -33.230015
act. reward:  -0.7916581


Network training: 100%|██████████| 10/10 [00:03<00:00,  2.71epoch(s)/s, Training loss MSE=0.10668345]


Iteration:  12
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20349219799041748
obs. reward:  -33.316372
act. reward:  -0.81428385


Network training: 100%|██████████| 10/10 [00:04<00:00,  2.36epoch(s)/s, Training loss MSE=0.10591865]


Iteration:  13
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20343077898025513
obs. reward:  -32.22158
act. reward:  -0.8652014


Network training: 100%|██████████| 10/10 [00:04<00:00,  2.33epoch(s)/s, Training loss MSE=0.10163815]


Iteration:  14
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20420576095581056
obs. reward:  -31.567673
act. reward:  -0.9699277


Network training: 100%|██████████| 10/10 [00:04<00:00,  2.05epoch(s)/s, Training loss MSE=0.094385654]


Iteration:  15
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20413365364074706
obs. reward:  -32.744102
act. reward:  -0.88687325


Network training: 100%|██████████| 10/10 [00:05<00:00,  1.85epoch(s)/s, Training loss MSE=0.0914144] 


Iteration:  16
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.2044636583328247
obs. reward:  -30.8816
act. reward:  -1.0232216


Network training: 100%|██████████| 10/10 [00:05<00:00,  1.79epoch(s)/s, Training loss MSE=0.085623495]


Iteration:  17
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.2037623143196106
obs. reward:  -36.339718
act. reward:  -0.72220355


Network training: 100%|██████████| 10/10 [00:05<00:00,  1.71epoch(s)/s, Training loss MSE=0.076477304]


Iteration:  18
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20416337490081787
obs. reward:  -36.789764
act. reward:  -0.70429754


Network training: 100%|██████████| 10/10 [00:06<00:00,  1.64epoch(s)/s, Training loss MSE=0.066132106]


Iteration:  19
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20380664348602295
obs. reward:  -30.981985
act. reward:  -1.023


Network training: 100%|██████████| 10/10 [00:06<00:00,  1.54epoch(s)/s, Training loss MSE=0.08139923]


Iteration:  20
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20393330097198487
obs. reward:  -31.66814
act. reward:  -0.9821745


Network training: 100%|██████████| 10/10 [00:06<00:00,  1.50epoch(s)/s, Training loss MSE=0.08328985]


Iteration:  21
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.2036115550994873
obs. reward:  -33.895695
act. reward:  -0.8365367


Network training: 100%|██████████| 10/10 [00:07<00:00,  1.42epoch(s)/s, Training loss MSE=0.065896295]


Iteration:  22
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20352036952972413
obs. reward:  -35.359886
act. reward:  -0.7944973


Network training: 100%|██████████| 10/10 [00:06<00:00,  1.47epoch(s)/s, Training loss MSE=0.058860738]


Iteration:  23
Initial state:  tensor([1.5708, 0.0000, 0.0000, 0.0000])
avg. decision time:  0.20358419179916382
obs. reward:  -31.648674
act. reward:  -1.0393066


Network training:  80%|████████  | 8/10 [00:06<00:01,  1.29epoch(s)/s, Training loss MSE=0.05351327] 

In [None]:
# Roll out the controller on the task once.
controller.regularize(1)
traj, log = task.perform(task.goal, controller)

# Slice the trajectory: the first nx columns of the first/last rows are taken as
# the initial/final observation, and columns nx .. nx+na as the action sequence.
initial_obs, final_obs = traj[0, :nx], traj[-1, :nx]
act_seq = traj[:, nx:nx + na]

In [None]:
# Replay the executed actions through the learned model with n_sample samples.
n_sample = 50
traj_pred = dynamics.unroll(initial_obs, act_seq, n_sample)

# L2 distance between each sample's last predicted state and the observed final
# state; report its mean and standard deviation over the samples.
predicted_err = (traj_pred[-1, :, :] - final_obs.expand(n_sample, -1)).norm(p=2, dim=1)
print(predicted_err.mean())
print(predicted_err.std())

In [None]:
task_horizon = task.task_horizon

# Mean and standard deviation of the predicted trajectories over axis 1,
# converted to numpy for plotting.
traj_pred_mean = torch.mean(traj_pred, dim=1).detach().cpu().numpy()
traj_pred_std = torch.std(traj_pred, dim=1).detach().cpu().numpy()

# `traj` is rebound from a tensor to an ndarray here; the guard keeps the cell
# re-runnable (a second run would otherwise call .detach() on a numpy array).
if isinstance(traj, torch.Tensor):
    traj = traj.detach().cpu().numpy()

# Tile the desired state once per time step so it can be drawn as a reference line.
desired = task.cost.desired.repeat((task_horizon, 1)).detach().cpu().numpy()

In [None]:
# Global matplotlib font settings for the figures below.
font = dict(family='serif', size=22)
matplotlib.rc('font', **font)

# Compact numpy printing: 3 decimals, wide lines, no scientific notation.
np.set_printoptions(precision=3, linewidth=200, suppress=True)

In [None]:
# traj_pred_mean = np.load('traj_pred_mean.npy')
# traj_pred_std = np.load('traj_pred_std.npy')
# traj = np.load('traj.npy')

In [None]:
# Plot, for each of the first nq trajectory columns, the executed trajectory,
# the model's predicted mean with a +/- one standard deviation band, and the
# desired value.
d = nq
dt = dt_control

# squeeze=False always returns a 2-D array of Axes, so indexing also works when
# d == 1 (plt.subplots would otherwise return a bare Axes object).
fig, ax = plt.subplots(d, figsize=(10, d * 6), squeeze=False)
ax = ax[:, 0]

# Build the time axis from an integer range: np.arange with a float step can
# produce one element too many or too few because of rounding.
t = np.arange(task_horizon) * dt

# The band limits do not depend on the subplot index, so compute them once.
lb = traj_pred_mean - traj_pred_std
ub = traj_pred_mean + traj_pred_std

for i in range(d):
    ax[i].plot(t, traj[:, i], lw=4, color='orange', label='actual')
    ax[i].plot(t, traj_pred_mean[:, i], lw=4, color='b', label='predicted mean')
    ax[i].plot(t, desired[:, i], lw=2, color='k', ls='-.', label='desired')
    ax[i].fill_between(t, lb[:, i], ub[:, i], facecolor='blue',
                       alpha=0.2)
    ax[i].grid()
    ax[i].set_ylim([-3.2, 3.2])
    # Legend sits above each subplot so it does not cover the curves.
    ax[i].legend(loc='upper center', bbox_to_anchor=(0.5, 1.3),
                 ncol=3, fancybox=True, shadow=True)
# fig.savefig('prediction_with_reg_150steps' + '.png', bbox_inches='tight')

In [None]:
# import numpy as np
# from stage.tasks.twolink.reaching import TwoLinkReaching
# from stage.utils.nn import use_gpu
# use_gpu()
# savepath = parentdir + '/data/twolink/'
# data_train = np.load(savepath+'data_pd_reg.npy')
# task = TwoLinkReaching(render=True)
# task.visualize_training_data(data_train, 0)