In [1]:
import argparse
import os
import sys
import pickle
import time
sys.path.append(os.getcwd())

from khrylib.utils import *
from khrylib.rl.core.policy_gaussian import PolicyGaussian
from khrylib.rl.core.critic import Value
from khrylib.rl.agents import AgentPPO
from khrylib.models.mlp import MLP
from torch.utils.tensorboard import SummaryWriter
from motion_imitation.envs.humanoid_im import HumanoidEnv
from motion_imitation.envs.humanoid_ia import HumanoidImpAwareEnv
from motion_imitation.utils.config import Config
from motion_imitation.reward_function import reward_func
from tqdm import tqdm





In [2]:
class Args:
    """Plain attribute container holding the run options used by this notebook."""

    # Experiment selection: config id and the test flag handed to Config.
    cfg = "0202"
    test = False

    # Iteration bookkeeping: starting iteration, iteration cap, save interval.
    iter = 0
    max_iter_num = 100
    save_model_interval = 1

    # Hardware / sampling resources.
    gpu_index = 0
    num_threads = 1

    # Rendering switches.
    render = False
    show_noise = False

    # When True the HumanoidImpAwareEnv variant is used instead of HumanoidEnv.
    imp_aware = False

# Instantiate the run options; rendering forces a single sampling thread.
args = Args()
if args.render:
    args.num_threads = 1

# Load the experiment config. Directories are created only when we are neither
# rendering nor starting from a later iteration.
cfg = Config(
    args.cfg,
    args.test,
    create_dirs=(not args.render) and not (args.iter > 0),
)
if args.imp_aware:
    change_config_path_via_args(cfg, args.cfg, '_ia')
if args.save_model_interval is not None:
    cfg.save_model_interval = int(args.save_model_interval)

# Numerical precision: every tensor defaults to float64.
dtype = torch.float64
torch.set_default_dtype(dtype)

# Pick the compute device, binding the requested GPU when CUDA is present.
if torch.cuda.is_available():
    device = torch.device('cuda', index=args.gpu_index)
    torch.cuda.set_device(args.gpu_index)
else:
    device = torch.device('cpu')

# Seed NumPy and PyTorch from the config.
np.random.seed(cfg.seed)
torch.manual_seed(cfg.seed)

# TensorBoard writer is skipped while rendering; the text logger only gets a
# file handle when not rendering.
if args.render:
    tb_logger = None
else:
    tb_logger = SummaryWriter(cfg.tb_dir)
logger = create_logger(
    os.path.join(cfg.log_dir, 'log.txt'),
    file_handle=not args.render,
)

"""environment"""
# Build the environment variant selected by args.imp_aware and seed it.
if args.imp_aware:
    env = HumanoidImpAwareEnv(cfg)
else:
    env = HumanoidEnv(cfg)
env.seed(cfg.seed)

actuators = env.model.actuator_names
# First dimension of the observation / action spaces, used to size the networks.
state_dim, action_dim = (
    env.observation_space.shape[0],
    env.action_space.shape[0],
)
# Per-state filter over vectors of length state_dim, constructed with clip=5
# (presumably a running observation normalizer — confirm in khrylib).
running_state = ZFilter((state_dim,), clip=5)

"""define actor and critic"""
# Actor: PolicyGaussian wrapped around an MLP; critic: Value wrapped around its own MLP.
policy_net = PolicyGaussian(
    MLP(state_dim, cfg.policy_hsize, cfg.policy_htype),
    action_dim,
    log_std=cfg.log_std,
    fix_std=cfg.fix_std,
)
value_net = Value(MLP(state_dim, cfg.value_hsize, cfg.value_htype))

# Restore weights and the running-state filter when a starting iteration is given.
if args.iter > 0:
    cp_path = '%s/iter_%04d.p' % (cfg.model_dir, args.iter)
    logger.info('loading model from checkpoint: %s' % cp_path)
    # Context manager guarantees the file handle is closed, even if unpickling
    # raises (the previous bare open() left it to the garbage collector).
    # NOTE(review): pickle.load can execute arbitrary code — only load
    # checkpoints written by a trusted run.
    with open(cp_path, "rb") as cp_file:
        model_cp = pickle.load(cp_file)
    policy_net.load_state_dict(model_cp['policy_dict'])
    value_net.load_state_dict(model_cp['value_dict'])
    running_state = model_cp['running_state']
to_device(device, policy_net, value_net)


# Policy optimizer: SGD with momentum unless the config asks for 'Adam'.
if cfg.policy_optimizer != 'Adam':
    optimizer_policy = torch.optim.SGD(
        policy_net.parameters(),
        lr=cfg.policy_lr,
        momentum=cfg.policy_momentum,
        weight_decay=cfg.policy_weightdecay,
    )
else:
    optimizer_policy = torch.optim.Adam(
        policy_net.parameters(),
        lr=cfg.policy_lr,
        weight_decay=cfg.policy_weightdecay,
    )

# Value optimizer: same selection rule, driven by the value_* config entries.
if cfg.value_optimizer != 'Adam':
    optimizer_value = torch.optim.SGD(
        value_net.parameters(),
        lr=cfg.value_lr,
        momentum=cfg.value_momentum,
        weight_decay=cfg.value_weightdecay,
    )
else:
    optimizer_value = torch.optim.Adam(
        value_net.parameters(),
        lr=cfg.value_lr,
        weight_decay=cfg.value_weightdecay,
    )


In [None]:
# Only the last expression of a cell is rendered, so a bare name on an earlier
# line is silently discarded. Show the optimizer and both networks together.
optimizer_policy, policy_net, value_net

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0003
    maximize: False
    weight_decay: 0.0
)

In [3]:
# Reward function selected by the config's reward id.
expert_reward = reward_func[cfg.reward_id]

# Create the PPO agent.
# - mean_action is enabled only when rendering with show_noise off.
# - use_mini_batch is enabled when mini_batch_size is smaller than min_batch_size.
# - policy_grad_clip: Module.parameters() returns a single-use generator. The
#   agent keeps this (params, max_norm) pair for the whole run, so the
#   parameters are materialised into a list that can be iterated on every
#   update instead of being exhausted after the first pass.
agent = AgentPPO(env=env, dtype=dtype, device=device, running_state=running_state,
                 custom_reward=expert_reward, mean_action=args.render and not args.show_noise,
                 render=args.render, num_threads=args.num_threads,
                 policy_net=policy_net, value_net=value_net,
                 optimizer_policy=optimizer_policy, optimizer_value=optimizer_value, opt_num_epochs=cfg.num_optim_epoch,
                 gamma=cfg.gamma, tau=cfg.tau, clip_epsilon=cfg.clip_epsilon,
                 policy_grad_clip=[(list(policy_net.parameters()), 40)], end_reward=cfg.end_reward,
                 use_mini_batch=cfg.mini_batch_size < cfg.min_batch_size, mini_batch_size=cfg.mini_batch_size)


def pre_iter_update(i_iter):
    """Refresh the iteration-dependent settings before sampling.

    Asks ``cfg`` to recompute its adaptive values for ``i_iter`` and then pushes
    them to the agent's noise rate, the policy optimizer's learning rate and,
    when ``cfg.fix_std`` is set, the policy's ``action_log_std``.

    Args:
        i_iter: index of the iteration the adaptive values are computed for.

    Returns:
        None.
    """
    cfg.update_adaptive_params(i_iter)
    agent.set_noise_rate(cfg.adp_noise_rate)
    set_optimizer_lr(optimizer_policy, cfg.adp_policy_lr)
    if not cfg.fix_std:
        return
    # Fixed std: overwrite the log-std tensor in place with the adaptive value.
    policy_net.action_log_std.fill_(cfg.adp_log_std)



In [16]:
# Quick check of the sampling path: refresh the adaptive settings for the
# starting iteration, then request a sample with argument 10. The recorded
# output shows the call returns a (TrajBatch, LoggerRL) pair.
pre_iter_update(args.iter)
agent.sample(10)

(<khrylib.rl.core.trajbatch.TrajBatch at 0x7f6ab9279d60>,
 <khrylib.rl.core.logger_rl.LoggerRL at 0x7f6ab9279820>)

In [None]:

# One manually unrolled pass of the training loop, which would otherwise be:
# for i_iter in tqdm(range(args.iter, args.max_iter_num)):
# Start at args.iter rather than a hard-coded 0 so that a run resumed from a
# checkpoint uses the adaptive settings of the iteration it resumes at.
i_iter = args.iter

# Generate multiple trajectories that reach the minimum batch_size.
pre_iter_update(i_iter)
batch, log = agent.sample(cfg.min_batch_size)
if cfg.end_reward:
    # End reward set to avg_c_reward * gamma / (1 - gamma).
    agent.env.end_reward = log.avg_c_reward * cfg.gamma / (1 - cfg.gamma)




In [None]:

"""update networks"""
# Time a single parameter update. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals, so the result is not
# distorted by system clock adjustments the way time.time() can be.
t0 = time.perf_counter()
agent.update_params(batch)
t1 = time.perf_counter()
# Elapsed seconds for the update.
print(t1-t0)

In [8]:
# Shapes of the sampled batch fields, displayed together as one tuple.
(
    batch.actions.shape,
    batch.rewards.shape,
    batch.masks.shape,
    batch.states.shape,
    batch.exps.shape,
)
# batch.rewards

((50015, 38), (50015,), (50015,), (50015, 76), (50015,))