### Build an RL environment for a `Non-prehensile task` in a table-top scene and train it with `Deep Latent Policy Gradient`

In [14]:
import mujoco,torch
import numpy as np
import matplotlib.pyplot as plt
from mujoco_parser import MuJoCoParserClass
from util import r2rpy, sample_xyzs
from np_env import NonPrehensileMarkovDecisionProcessClass
np.set_printoptions(precision=2,suppress=True,linewidth=100)
plt.rc('xtick',labelsize=6); plt.rc('ytick',labelsize=6)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
print ("Torch version:[%s]"%(torch.__version__))
print ("MuJoCo version:[%s]"%(mujoco.__version__))

Torch version:[1.13.1+cu116]
MuJoCo version:[2.3.4]


### Parse `UR5e`

In [15]:
# Load the UR5e scene and wrap it in the non-prehensile MDP
xml_path = '../asset/ur5e/scene_ur5e_rg2_obj.xml'
env = MuJoCoParserClass(
    name         = 'UR5e',
    rel_xml_path = xml_path,
    VERBOSE      = False,
)
mdp = NonPrehensileMarkovDecisionProcessClass(
    env,
    HZ                = 50,
    history_total_sec = 1.0,
    history_intv_sec  = 0.1,
    VERBOSE           = True,
)

# Every body whose name starts with "obj_" is treated as a movable object
obj_names = list(filter(
    lambda name: name is not None and name.startswith("obj_"),
    env.body_names))
n_obj = len(obj_names)

# Sample one position per object (at least `min_dist` apart) and one rainbow color per object
xyzs = sample_xyzs(
    n_sample = n_obj,
    x_range  = [0.72,0.95],
    y_range  = [-0.38,0.38],
    z_range  = [0.81,0.81],
    min_dist = 0.2,
)
colors = np.array(list(map(plt.cm.gist_rainbow, np.linspace(0,1,n_obj))))
for obj_idx,obj_name in enumerate(obj_names):
    obj_body = env.model.body(obj_name)
    # The first three entries of the object's joint qpos0 receive the sampled position
    env.model.joint(obj_body.jntadr[0]).qpos0[:3] = xyzs[obj_idx,:]
    # Recolor the object's first geom
    env.model.geom(obj_body.geomadr[0]).rgba = colors[obj_idx]

# Move tables and robot base
body_positions = {
    'base_table'         : [0,0,0],
    'front_object_table' : [1.05,0,0],
    'side_object_table'  : [0,-0.85,0],
    'base'               : [0,0,0.8],
}
for body_label,body_pos in body_positions.items():
    env.model.body(body_label).pos = np.array(body_pos)
print ("Ready.")

[UR5e] Instantiated
   [info] dt:[0.0200] HZ:[50], env-HZ:[500], mujoco_nstep:[10], state_dim:[26], o_dim:[260], a_dim:[7]
   [history] total_sec:[1.00]sec, n:[50], intv_sec:[0.10]sec, intv_tick:[5]
   [history] ticks:[ 0  5 10 15 20 25 30 35 40 45]
Ready.


In [16]:
# Clamp every actuator to a symmetric range: column 0 is the lower bound, column 1 the upper bound
max_torque = 2
torque_bounds = (-max_torque, +max_torque)
for bound_col,bound_val in enumerate(torque_bounds):
    mdp.env.ctrl_ranges[:,bound_col] = bound_val
print ("mdp.env.ctrl_ranges:\n",mdp.env.ctrl_ranges)

mdp.env.ctrl_ranges:
 [[-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]]


### Instantiate `DLPG` model

In [17]:
import numpy as np 
import random as rd 
import math 
import torch 
import torch.nn as nn
import matplotlib.pyplot as plt
import json
import wandb
import sys 
# from pathlib import Path
# BASEDIR = str(Path(__file__).parent)
# sys.path.append(BASEDIR)
sys.path.append('..')

from model.dlpg.dlpg import DeepLatentPolicyGradient
from model.dlpg.buffer import BufferClass
from model.utils import torch2np, np2torch, kernel_se, kernel_levse

In [18]:
# Load the pre-collected buffer: the file is parsed line by line, one JSON record per line.
# The `with` block closes the file handle deterministically (the bare `open(...)` inside a
# comprehension left it open), and blank lines are skipped instead of crashing `json.loads`.
with open('./json/np_buffer_v1.json', 'r') as buffer_file:
    training_data = [json.loads(line) for line in buffer_file if line.strip()]
print("Total: {}".format(len(training_data)))


Total: 122500


In [19]:
print("Sample-Method:{}".format('uniform'))
# Seed every RNG used in this notebook (numpy, `random`, torch CPU/CUDA) and pin cuDNN
# to deterministic kernels so a run can be repeated.
random_seed = 42
np.random.seed(random_seed)
rd.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the first GPU when available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Device", device)

# Dimensions shared by the replay buffer and the model; defined once so the two cannot drift apart
xdim = 9   # size of the stored `x` vector
cdim = 72  # size of the condition vector `c`

# Pre-fill the buffer with the records loaded from disk.
# NOTE(review): buffer_limit equals the number of pre-loaded records, so the buffer is already
# at its limit before training adds samples -- confirm how BufferClass handles that.
buffer = BufferClass(xdim=xdim, cdim=cdim, buffer_limit=len(training_data), device=device)
for data in training_data:
    buffer.store(x=np.array(data["x"]).reshape(-1), c=data["c"], reward=data["reward"])

DLPG = DeepLatentPolicyGradient(xdim     = xdim,
                                cdim     = cdim,
                                zdim     = 2,
                                hdims    = [128],
                                actv_enc = nn.LeakyReLU(),
                                actv_dec = None,#nn.LeakyReLU(), 
                                actv_out = nn.Tanh(), 
                                actv_q   = nn.Softplus(),
                                device   = device)
# DLPG.cvae.load_state_dict(torch.load(weight_path))
# Only the CVAE parameters are optimized
optimizer = torch.optim.Adam(params=DLPG.cvae.parameters(),lr=0.001,betas=(0.9,0.99),eps=1e-4)

print("Model Instance.")

Sample-Method:uniform
Device cuda:0
Model Instance.


In [20]:
# --- Training schedule ---
max_epochs    = 100        # number of rollout epochs in the training loop
update_every  = 1          # run the update phase every `update_every` epochs
MAXITER       = 1000       # gradient steps per update phase
batch_size    = 128        # samples drawn from the buffer per gradient step
sample_method = 'uniform'  # sampling scheme passed to `buffer.sample_batch`

# --- Exploration ---
epsgrdy  = 0.1             # decay constant of the exploration probability 1/10**(epoch/epsgrdy)
n_sample = 10              # number of random trajectories generated per exploration epoch

# --- Evaluation / logging ---
eval_epochs = 10           # evaluation rollouts after each update phase
RENDER      = True         # forwarded to `env.execute_arm`
WANDB       = False        # log losses and rewards to Weights & Biases when True
runname     = 'none'       # sub-folder of "weights" that receives the checkpoints

# Optional W&B setup (enable together with WANDB = True):
# wandb.init(project="dlpg", entity="dlpg")
# wandb.config.max_epochs = max_epochs
# wandb.config.batch_size = batch_size


In [23]:
def random_pose(env, obj_names, x_range=[0.72,0.95],y_range=[-0.38,0.38],z_range=[0.81,0.81],min_dist=0.2):
    """
        Assign a freshly sampled position and a rainbow color to every named object.

        For each name in `obj_names`, the first three entries of the body's joint `qpos0`
        are overwritten with a sampled xyz and the body's first geom is recolored.

        Parameters
            env       : environment wrapper exposing the MuJoCo `model`
            obj_names : names of the bodies to re-spawn
            x_range, y_range, z_range : [min, max] sampling interval per axis
            min_dist  : spacing constraint forwarded to `sample_xyzs`

        Returns
            The sampled positions as returned by `sample_xyzs` (row i belongs to obj_names[i]).
    """
    num_objects = len(obj_names)
    # One position per object
    positions = sample_xyzs(n_sample=num_objects,
                            x_range=x_range,
                            y_range=y_range,
                            z_range=z_range,
                            min_dist=min_dist)
    # One color per object, evenly spread over the gist_rainbow colormap
    palette = np.array(list(map(plt.cm.gist_rainbow, np.linspace(0,1,num_objects))))
    for k,name in enumerate(obj_names):
        body = env.model.body(name)
        env.model.joint(body.jntadr[0]).qpos0[:3] = positions[k,:]
        env.model.geom(body.geomadr[0]).rgba = palette[k] # color

    return positions

In [25]:
# Train
for epoch in range(max_epochs):
    print("[Epoch: {}]".format(epoch+1))
    """ ROLLOUT """
    env.reset()
    # Ranodm spawn
    goal_position  = np.random.uniform(-0.1, 0.1) 
    # random_obs = env.random_pose(goal_position=goal_position)
    random_obs = random_pose(env, obj_names, x_range=[0.72,0.95],y_range=[-0.38,0.38],z_range=[0.81,0.81],min_dist=0.2)
    
    # Get a conditional vector
    c_np           = np.array(random_obs).reshape(-1)
    c_np           = np.append(c_np, goal_position) # [(x,y) x n + 1 (y axis of goal posiiton)]
    c              = np2torch(c_np, device=device).reshape(1,-1)
    # Epsgrdy
    EXPLORE        = 1/10**(epoch/epsgrdy)   
    EXPLOIT        = np.random.rand() > EXPLORE 
    # Exploit [Posterior sampling]
    if EXPLOIT:
        z                              = torch.randn(size=(1, DLPG.zdim)).to(device)
        traj, t_test, normed_x_anchor  = DLPG.exploit(z=z, c=c, goal_position=goal_position)
        normed_x_anchor                = normed_x_anchor.reshape(-1)
        x_anchor                       = DLPG.scale_up(normed_x_anchor) # Store it into the buffer 
        # Solve IK 
        q_trajs  = []
        for _, (x, y) in enumerate(zip(t_test, traj)):
            # Make a trajectory by adding q values in every step.
            q = env.solve_ik(P_EE_des=np.array([x, y, 0.9], dtype=object), 
                                R_EE_des=np.array([-math.pi, 0, math.pi], dtype=object))
            q_trajs.append(q)        
            # Target position
            goal = np.array([x, y, 0.9], dtype=object)
            # Interpolation 
            interpoled_q_traj = DLPG.grp.interpolation(x_anchor=q_trajs, num_interpol=5)    
            # Render 
            collision = env.execute_arm(q_des_lst    = interpoled_q_traj, 
                                        gripper_mode = "open", 
                                        goal         = goal, 
                                        obs_pose_lst = random_obs, 
                                        RENDER       = RENDER)
            reward = DLPG.get_reward(collision, normed_x_anchor, random_obs, goal_position) 
            buffer.store(x=x_anchor.reshape(-1), c=c, reward=reward)        
    # Explore [Prior sampling]
    else: 
        # trajs, t_test = DLPG.random_explore(n_sample=n_sample, goal_position=goal_position)
        trajs, t_test = DLPG.random_explore(n_sample=n_sample)
        # Solve IK 
        for traj in trajs:
            q_trajs  = []
            normed_x_anchor = traj[5:7] # Fix index, To sample a point around a position of the target object.
            x_anchor        = DLPG.scale_up(normed_x_anchor) # Store it into the buffer 
            for _, (x, y) in enumerate(zip(t_test, traj)):
                # Make a trajectory by adding q values in every step.
                q = env.solve_ik(P_EE_des=np.array([x, y, 0.9], dtype=object), 
                                    R_EE_des=np.array([-math.pi, 0, math.pi], dtype=object))
                q_trajs.append(q)
            # Target position
            goal = np.array([x, y, 0.9], dtype=object)
            # Interpolation 
            interpoled_q_traj = DLPG.grp.interpolation(x_anchor=q_trajs, num_interpol=5)    
            # Render 
            collision = env.execute_arm(q_des_lst    = interpoled_q_traj, 
                                        gripper_mode = "open", 
                                        goal         = goal, 
                                        obs_pose_lst = random_obs, 
                                        RENDER       = RENDER)
            reward = DLPG.get_reward(collision, normed_x_anchor, random_obs, goal_position) 
            buffer.store(x=x_anchor.reshape(-1), c=c_np, reward=reward)
    """ UPDATE """
    loss_recon_sum=0;loss_kl_sum=0;n_batch_sum=0
    if (epoch+1)%update_every==0 and (epoch+1)>99:
        for it in range(MAXITER):
            if it >= 30: beta = 10 # Heuristic 
            else:        beta = 0.0
            batch = buffer.sample_batch(sample_method=sample_method, batch_size=batch_size)
            x_batch, c_batch, reward_batch = batch["x"], batch["c"], batch["reward"]
            total_loss_out,loss_info = DLPG.cvae.loss_total(x               = x_batch, 
                                                            c               = c_batch, 
                                                            q               = reward_batch, 
                                                            LOSS_TYPE       = 'L1+L2',
                                                            recon_loss_gain = 1,
                                                            beta            = beta,
                                                            STOCHASTICITY   = True)
            optimizer.zero_grad()
            total_loss_out.backward()
            optimizer.step()
            n_batch        = x_batch.shape[0]
            loss_recon_sum = loss_recon_sum + n_batch*loss_info['loss_recon_out']
            loss_kl_sum    = loss_kl_sum + n_batch*loss_info['loss_kl_out']
            n_batch_sum    = n_batch_sum + n_batch
        # Average loss during train
        loss_recon_avg, loss_kl_avg = (loss_recon_sum/n_batch_sum),(loss_kl_sum/n_batch_sum)
        # Print
        print ("[%d/%d] DLPG updated. Total loss:[%.3f] (recon:[%.3f] kl:[%.3f])"%
            (epoch+1,max_epochs,loss_recon_avg+loss_kl_avg,loss_recon_avg,loss_kl_avg))
        if WANDB:
            wandb.log({"Total loss":loss_recon_avg+loss_kl_avg,
                        "recon_loss":loss_recon_avg,
                        "kl_loss":loss_kl_avg}, step=epoch+1)   
        # Save weights         
        torch.save(DLPG.cvae.state_dict(),"weights"+"/"+str(runname)+"/{}steps.pth".format(epoch+1))    
        """ EVALUATE """
        eval_reward=0
        plt.figure(figsize=(6,9))
        with torch.no_grad():
            for it in range(eval_epochs):
                env.reset()
                # Random spawn 
                goal_position  = np.random.uniform(-0.1, 0.1) 
                random_obs = env.random_pose(goal_position=goal_position)
                # Get a conditional vector
                c_np           = np.array(random_obs).reshape(-1)
                c_np           = np.append(c_np, goal_position)  
                c              = np2torch(c_np, device=device).reshape(1,-1)
                z              = torch.randn(size=(1, DLPG.zdim)).to(device)
                traj, t_test, normed_x_anchor = DLPG.exploit(z=z, 
                                                                c=c, 
                                                                goal_position=goal_position)   
                normed_x_anchor = normed_x_anchor.reshape(-1)
                # Solve IK 
                q_trajs  = []
                for _, (x, y) in enumerate(zip(t_test, traj)):
                    # Make a trajectory by adding q values in every step.
                    q = env.solve_ik(P_EE_des=np.array([x, y, 0.9], dtype=object), 
                                        R_EE_des=np.array([-math.pi, 0, math.pi], dtype=object))
                    q_trajs.append(q)
                # Target position
                goal = np.array([x, y, 0.9], dtype=object)
                # Interpolation 
                interpoled_q_traj = DLPG.grp.interpolation(x_anchor=q_trajs, num_interpol=10)    
                # Render 
                collision = env.execute_arm(q_des_lst    = interpoled_q_traj, 
                                            gripper_mode = "open", 
                                            goal         = goal, 
                                            obs_pose_lst = random_obs, 
                                            RENDER       = RENDER)
                reward    = DLPG.get_reward(collision, normed_x_anchor, random_obs, goal_position)  
                eval_reward+=reward
                # Demonstration for exploitation samples
                plt.ylim(-0.45, 0.45)
                plt.xlim(0.5,0.9)
                plt.title("Exploit samples[Epoch{}]".format(epoch), fontsize=20)
                plt.xlabel("X axis", fontsize=15)
                plt.ylabel("Y axis", fontsize=15)
                plt.scatter(0.65, normed_x_anchor[0], s=50) 
                plt.scatter(0.8,  normed_x_anchor[1], s=50)
            plt.savefig("data/exploit_samples{}.png".format(epoch+1))
            # Print
            if WANDB: wandb.log({"Reward":eval_reward/eval_epochs}, step=epoch+1)   
            print("[Evaluate Reward]:{}".format(eval_reward/eval_epochs))
    print("Done.")

[Epoch: 1]
