In [1]:
!pip install mujoco mujoco-python-viewer pyvirtualdisplay opencv-python mediapy
!sudo apt-get install xvfb
!pip install xvfbwrapper

Collecting mujoco
  Downloading mujoco-2.3.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mujoco-python-viewer
  Downloading mujoco_python_viewer-0.1.3-py3-none-any.whl (10 kB)
Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Collecting mediapy
  Downloading mediapy-1.1.9-py3-none-any.whl (25 kB)
Collecting glfw (from mujoco)
  Downloading glfw-2.6.2-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (208 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.2/208.2 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython->mediapy)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
Installing

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/RL/code') # "RL" 폴더 이름 수정할 것

Mounted at /content/drive


### Make `Snapbot` walk using `SAC`

In [3]:
import mujoco,torch,os
import numpy as np
import matplotlib.pyplot as plt
from mujoco_parser import MuJoCoParserClass
from snapbot_env import SnapbotMarkovDecisionProcessClass
from sac import ReplayBufferClass,ActorClass,CriticClass,get_target
np.set_printoptions(precision=2,suppress=True,linewidth=100)
plt.rc('xtick',labelsize=6); plt.rc('ytick',labelsize=6)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
print ("MuJoCo version:[%s]"%(mujoco.__version__))

MuJoCo version:[2.3.7]


### Initialize `Snapbot` environment

In [4]:
xml_path = '../asset/snapbot/scene_snapbot.xml'
env = MuJoCoParserClass(name='Snapbot',rel_xml_path=xml_path,VERBOSE=False)
mdp = SnapbotMarkovDecisionProcessClass(env,HZ=50,history_total_sec=0.2,history_intv_sec=0.1,VERBOSE=True)

[Snapbot] Instantiated
   [info] dt:[0.0200] HZ:[50], env-HZ:[500], mujoco_nstep:[10], state_dim:[35], o_dim:[70], a_dim:[8]
   [history] total_sec:[0.20]sec, n:[10], intv_sec:[0.10]sec, intv_tick:[5]
   [history] ticks:[0 5]


### `SAC` hyperparameters

In [5]:
n_episode         = 1000 # number of total episodes (rollouts)
max_epi_sec       = 5.0 # maximum episode length in second
max_epi_tick      = int(max_epi_sec*mdp.HZ) # maximum episode length in tick
n_warmup_epi      = 10 # number of warm-up episodes
buffer_limit      = 50000
buffer_warmup     = buffer_limit // 5
init_alpha        = 0.1
max_torque        = 2.0
# Update
lr_actor          = 0.0002
lr_alpha          = 0.0000 # 0.0003
lr_critic         = 0.0001
n_update_per_tick = 1 # number of updates per tick
batch_size        = 256
gamma             = 0.99
tau               = 0.005
# Debug
print_every       = 20
eval_every        = 50
RENDER_EVAL       = False
save_every        = 50
print ("n_episode:[%d], max_epi_sec:[%.2f], max_epi_tick:[%d]"%
       (n_episode,max_epi_sec,max_epi_tick))
print ("n_warmup_epi:[%d], buffer_limit:[%.d], buffer_warmup:[%d]"%
       (n_warmup_epi,buffer_limit,buffer_warmup))

n_episode:[1000], max_epi_sec:[5.00], max_epi_tick:[250]
n_warmup_epi:[10], buffer_limit:[50000], buffer_warmup:[10000]


### Initialize networks

In [6]:
device = 'cpu' # cpu / mps / cuda
replay_buffer = ReplayBufferClass(buffer_limit, device=device)
actor = ActorClass(
    obs_dim=mdp.o_dim,h_dims=[256,256],out_dim=mdp.a_dim,max_out=max_torque,
    init_alpha=init_alpha,lr_actor=lr_actor,lr_alpha=lr_alpha,device=device).to(device)
critic_one = CriticClass(
    obs_dim=mdp.o_dim,a_dim=mdp.a_dim,h_dims=[256,256],out_dim=1,
    lr_critic=lr_critic, device=device).to(device)
critic_two = CriticClass(
    obs_dim=mdp.o_dim,a_dim=mdp.a_dim,h_dims=[256,256],out_dim=1,
    lr_critic=lr_critic, device=device).to(device)
critic_one_trgt = CriticClass(
    obs_dim=mdp.o_dim,a_dim=mdp.a_dim,h_dims=[256,256],out_dim=1,
    lr_critic=lr_critic, device=device).to(device)
critic_two_trgt = CriticClass(
    obs_dim=mdp.o_dim,a_dim=mdp.a_dim,h_dims=[256,256],out_dim=1,
    lr_critic=lr_critic, device=device).to(device)
print ("Ready.")

Ready.


### Helper functions

In [7]:
def np2torch(x_np,device): return torch.tensor(x_np,dtype=torch.float32,device=device)
def torch2np(x_torch): return x_torch.detach().cpu().numpy()
print ("Ready.")

Ready.


### Modify torque ranges

In [8]:
mdp.env.ctrl_ranges[:,0] = -max_torque
mdp.env.ctrl_ranges[:,1] = +max_torque
print ("mdp.env.ctrl_ranges:\n",mdp.env.ctrl_ranges)

mdp.env.ctrl_ranges:
 [[-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]
 [-2.  2.]]


### Train

In [9]:
REMOVE_EXISTING_PTH = False
SAVE_CURRENT_PTH    = False

In [12]:
print ("Start training.")
for epi_idx in range(n_episode): # for each episode
    zere_to_one = epi_idx/n_episode
    one_to_zero = 1-zere_to_one
    # Reset MDP
    s = mdp.reset()
    # Loop
    USE_RANDOM_POLICY = (np.random.rand()<(0.1*one_to_zero)) or (epi_idx < n_warmup_epi)
    reward_total,reward_forward = 0.0,0.0
    for tick in range(max_epi_tick): # for each tick in an episode
        if USE_RANDOM_POLICY:
            a_np = mdp.sample_action()
        else:
            a,log_prob = actor(np2torch(s,device=device))
            a_np = torch2np(a)
        # Step
        s_prime,reward,done,info = mdp.step(a_np,max_time=max_epi_sec)
        replay_buffer.put((s,a_np,reward,s_prime,done))
        reward_total += reward
        reward_forward += info['r_forward']
        s = s_prime
        if done is True: break # terminate condition

        # Replay buffer
        if replay_buffer.size() > buffer_warmup:
             for _ in range(n_update_per_tick):
                mini_batch = replay_buffer.sample(batch_size)
                # Update critics
                td_target = get_target(actor,critic_one_trgt,critic_two_trgt,
                                       gamma=gamma,mini_batch=mini_batch,device=device)
                critic_one.train(td_target,mini_batch)
                critic_two.train(td_target,mini_batch)
                # Update actor
                actor.train(critic_one,critic_two,target_entropy=-mdp.a_dim,mini_batch=mini_batch)
                # Soft update of critics
                critic_one.soft_update(tau=tau,net_target=critic_one_trgt)
                critic_two.soft_update(tau=tau,net_target=critic_two_trgt)

    # Compute x_diff
    x_diff = mdp.env.get_p_body('torso')[0]

    # Print
    if (epi_idx%print_every)==0 or (epi_idx==(n_episode-1)):
        epi_tick = tick
        print ("[%d/%d][%.1f%%] reward:[%.3f] x_diff:[%.3f] epi_len:[%d/%d] buffer_size:[%d] alpha:[%.2f]"%
               (epi_idx,n_episode,100.0*(epi_idx/n_episode),reward_total,x_diff,epi_tick,max_epi_tick,
                replay_buffer.size(),actor.log_alpha.exp()))

    # Evaluation
    if (epi_idx%eval_every)==0 or (epi_idx==(n_episode-1)):
        if RENDER_EVAL: mdp.init_viewer()
        s = mdp.reset()
        reward_total = 0.0
        for tick in range(max_epi_tick):
            a,_ = actor(np2torch(s,device=device),SAMPLE_ACTION=False)
            s_prime,reward,done,info = mdp.step(torch2np(a),max_time=max_epi_sec)
            reward_total += reward
            if RENDER_EVAL and ((tick%5) == 0):
                mdp.render(TRACK_TORSO=True,PLOT_WORLD_COORD=True,PLOT_TORSO_COORD=True,
                           PLOT_SENSOR=True,PLOT_CONTACT=True,PLOT_TIME=True)
            s = s_prime
            if RENDER_EVAL:
                if not mdp.is_viewer_alive(): break
        if RENDER_EVAL: mdp.close_viewer()
        x_diff = mdp.env.get_p_body('torso')[0]
        print ("  [Eval] reward:[%.3f] x_diff:[%.3f] epi_len:[%d/%d]"%
               (reward_total,x_diff,tick,max_epi_tick))

    # Save network
    if (epi_idx%save_every)==0 or (epi_idx==(n_episode-1)):
        pth_path = '../result/weights/sac_%s/episode_%d.pth'%(mdp.name.lower(),epi_idx)
        dir_path = os.path.dirname(pth_path)
        if not os.path.exists(dir_path): os.makedirs(dir_path)
        if epi_idx == 0: # remove all existing files (if epi_idx is 0)
            files = os.listdir(path=dir_path)
            if REMOVE_EXISTING_PTH:
                print ("  [Save] Remove existing [%d] pth files."%(len(files)))
                for file in files: os.remove(os.path.join(dir_path,file))
        if SAVE_CURRENT_PTH:
            torch.save(actor.state_dict(),pth_path)
            print ("  [Save] [%s] saved."%(pth_path))

print ("Done.")

Start training.
[0/1000][0.0%] reward:[-37.223] x_diff:[0.082] epi_len:[249/250] buffer_size:[850] alpha:[0.10]
  [Eval] reward:[2.938] x_diff:[0.037] epi_len:[249/250]
[20/1000][2.0%] reward:[3.678] x_diff:[0.058] epi_len:[249/250] buffer_size:[5850] alpha:[0.10]


KeyboardInterrupt: ignored