In [21]:
import time, sys, os
import numpy as np
sys.path.insert(0, os.path.abspath('..'))
from DMControl.DMControlEnv import DMControlEnv
%load_ext autoreload
%autoreload 2


def env_test(domain_name, task_name):
    """Smoke-test one DMControlEnv task and print consistency diagnostics.

    The function builds ``DMControlEnv(domain_name, task_name, dt=0.03)``,
    prints its dimensions and action bounds, performs a single step with an
    all-ones action, and then rolls out that same constant action several
    times to compare the resulting trajectories:

    * two independent ``env.step`` rollouts, each started with ``env.reset()``;
    * the ``env.step`` rollout against an ``env.virtual_step`` rollout started
      from the reset state;
    * the tail of the ``env.step`` rollout against an ``env.virtual_step``
      rollout started from the recorded mid-episode state.

    For each comparison the maximum per-step Euclidean state distance and the
    maximum absolute reward difference are printed. Nothing is asserted and
    nothing is returned; the results are meant to be read from the output.

    Args:
        domain_name: Domain name forwarded to ``DMControlEnv``.
        task_name: Task name forwarded to ``DMControlEnv``.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    print('domain_name:', domain_name)
    print('task_name:', task_name)

    env = DMControlEnv(domain_name, task_name, dt=0.03)

    print('state_dim:', env.state_dim)
    print('action_dim:', env.action_dim)
    print('action_min:', env.action_min)
    print('action_max:', env.action_max)
    print('step_limit:', env.env._step_limit)

    state = env.reset()
    print('initial_state:', state)

    # Constant action reused for the single step and for every rollout below.
    fixed_action = np.ones(env.action_dim)
    next_state, reward, done, _ = env.step(fixed_action)
    print('next_state:', next_state)
    print('reward:', reward)
    print('done:', done)
    # Physics time read after exactly one step since reset, reported as the
    # effective dt.
    print('real dt:', env.env.physics.data.time)

    def get_session(state, episode_n, action, step_type='step'):
        """Roll out ``action`` for up to ``episode_n`` steps.

        Args:
            state: State the rollout starts from; it is the first entry of the
                returned state list and the input of the first virtual step.
            episode_n: Maximum number of steps to take.
            action: Action applied at every step.
            step_type: ``'step'`` to call ``env.step(action)`` or
                ``'virtual_step'`` to call ``env.virtual_step(state, action)``.

        Returns:
            ``(states, rewards)`` where ``states`` holds the start state plus
            one state per step taken and ``rewards`` one reward per step.

        Raises:
            ValueError: If ``step_type`` is not one of the supported values.
        """
        if step_type not in ('step', 'virtual_step'):
            # Fail with a clear message instead of an UnboundLocalError on
            # ``reward`` inside the loop.
            raise ValueError('unknown step_type: {!r}'.format(step_type))

        states, rewards = [state], []
        for _ in range(episode_n):
            if step_type == 'step':
                state, reward, done, _ = env.step(action)
            else:
                state, reward, done, _ = env.virtual_step(state, action)
            states.append(state)
            rewards.append(reward)
            if done:
                break
        return states, rewards

    def report_difference(suffix, ref_states, ref_rewards, other_states, other_rewards):
        """Print the max state distance and max reward gap of two rollouts."""
        state_diff = np.max(np.linalg.norm(np.array(ref_states) - np.array(other_states), axis=1))
        print('state difference ' + suffix + ':', state_diff)
        reward_diff = np.max(np.abs(np.array(ref_rewards) - np.array(other_rewards)))
        print('reward difference ' + suffix + ':', reward_diff)

    episode_n = int(env.inner_step_limit / env.inner_step_n) - 1
    print('episode_n:', episode_n)

    # Reference rollout with the real step function.
    initial_state = env.reset()
    states, rewards = get_session(initial_state, episode_n,
                                  fixed_action, step_type='step')

    print('total_reward:', sum(rewards))

    # Second real rollout: compares two runs of env.step after a reset.
    initial_state = env.reset()
    new_states, new_rewards = get_session(initial_state, episode_n,
                                          fixed_action, step_type='step')
    report_difference('in two attempt', states, rewards, new_states, new_rewards)

    # Virtual rollout from the reset state, compared with the reference.
    initial_state = env.reset()
    virt_states, virt_rewards = get_session(initial_state, episode_n,
                                            fixed_action, step_type='virtual_step')
    report_difference('with virual step', states, rewards, virt_states, virt_rewards)

    # Virtual rollout restarted from a recorded mid-episode state, compared
    # with the matching tail of the reference rollout.
    mid_episode = int(episode_n / 2)
    mid_virt_states, mid_virt_rewards = get_session(states[mid_episode], episode_n - mid_episode,
                                                    fixed_action, step_type='virtual_step')
    report_difference('with virual step from the middle',
                      states[mid_episode:], rewards[mid_episode:],
                      mid_virt_states, mid_virt_rewards)

    print('time:', time.perf_counter() - start_time)
    print('\n')
    

# Run the environment check on a single domain/task pair.
domain_name = 'cheetah'
task_name = 'run'
env_test(domain_name, task_name)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
domain_name: cheetah
task_name: run
state_dim: 18
action_dim: 6
action_min: [-1. -1. -1. -1. -1. -1.]
action_max: [1. 1. 1. 1. 1. 1.]
step_limit: 1000.0
initial_state: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
next_state: [ 0.01312239 -0.00935882 -0.04109939  0.09649481  0.14012104  0.1683507
  0.02070408  0.16647046  0.11370937  0.54693767 -0.4472241  -1.88019192
  4.69290717  6.17567871  7.56102531 -0.13626821  8.17694774  5.54283914]
reward: 0.003025796887300758
done: False
real dt: 0.03
episode_n: 332
total_reward: 0.05969411266716018
state difference in two attempt: 0.0
reward difference in two attempt: 0.0
state difference with virual step: 277.0788251132033
reward difference with virual step: 3.0
state difference with virual step from the middle: 56.49291572989734
reward difference with virual step from the middle: 3.0
time: 1.4091212749481201




In [22]:
from dm_control import suite

# Length of the longest domain name among the benchmarking (domain, task) pairs.
max_len = max(len(name) for name, _task in suite.BENCHMARKING)

# Run the environment check on every benchmarking pair in turn.
for domain, task in suite.BENCHMARKING:
    env_test(domain, task)

domain_name: acrobot
task_name: swingup
state_dim: 4
action_dim: 1
action_min: [-1.]
action_max: [1.]
step_limit: 1000.0
initial_state: [0. 0. 0. 0.]
next_state: [-0.00353233  0.01124544 -0.23549702  0.74971488]
reward: 3.0
done: False
real dt: 0.03
episode_n: 332
total_reward: 126.44781464280857
state difference in two attempt: 0.0
reward difference in two attempt: 0.0
state difference with virual step: 0.0
reward difference with virual step: 0.0
state difference with virual step from the middle: 0.0
reward difference with virual step from the middle: 0.0
time: 0.8120222091674805


domain_name: acrobot
task_name: swingup_sparse
state_dim: 4
action_dim: 1
action_min: [-1.]
action_max: [1.]
step_limit: 1000.0
initial_state: [0. 0. 0. 0.]
next_state: [-0.00353233  0.01124544 -0.23549702  0.74971488]
reward: 3.0
done: False
real dt: 0.03
episode_n: 332
total_reward: 29.0
state difference in two attempt: 0.0
reward difference in two attempt: 0.0
state difference with virual step: 0.0
rewar