In [5]:
import time, sys, os
import numpy as np
sys.path.insert(0, os.path.abspath('..'))
from DMControlEnv.DMControlEnvWithPhysics import DMControlEnvWithPhysics
%load_ext autoreload
%autoreload 2


def env_test(domain_name, task_name):
    """Exercise a ``DMControlEnvWithPhysics`` environment and print a report.

    The report covers, in order: the environment's dimensions and action
    bounds, the result of one step from a fresh reset, the summed reward of a
    200-step rollout under a constant all-ones action, and the largest
    state/reward discrepancies between that rollout and (a) a second identical
    rollout, (b) a 200-step chain of ``env.virtual_step`` calls started from a
    reset state, and (c) a 100-step ``env.virtual_step`` chain started from
    the rollout's state at index 100. Elapsed wall-clock time is printed last.

    Args:
        domain_name: Domain identifier passed to ``DMControlEnvWithPhysics``.
        task_name: Task identifier passed to ``DMControlEnvWithPhysics``.

    Returns:
        None. All results are written to standard output.
    """
    t_begin = time.time()
    horizon = 200   # number of steps in each full rollout
    midpoint = 100  # rollout index from which the partial virtual chain starts

    print('domain_name:', domain_name)
    print('task_name:', task_name)

    env = DMControlEnvWithPhysics(domain_name, task_name)

    for attr_name in ('state_dim', 'action_dim', 'action_min', 'action_max'):
        print(f'{attr_name}:', getattr(env, attr_name))

    print('initial_state:', env.reset())

    # The same all-ones action is applied at every step of every rollout.
    fixed_action = np.ones(env.action_dim)
    stepped_state, step_reward, step_done, _ = env.step(fixed_action)
    print('next_state:', stepped_state)
    print('reward:', step_reward)
    print('done:', step_done)
    print('dt:', env.env.physics.data.time)

    def real_rollout(start_state, n_steps, action):
        """Call ``env.step`` n_steps times; return (states incl. start, rewards)."""
        path, gains = [start_state], []
        for _ in range(n_steps):
            # The done flag and the fourth return value are not inspected.
            observed, gain, _done, _extra = env.step(action)
            path.append(observed)
            gains.append(gain)
        return path, gains

    def virtual_rollout(start_state, n_steps, action):
        """Chain ``env.virtual_step`` n_steps times, feeding each output state back in."""
        current = start_state
        path, gains = [current], []
        for _ in range(n_steps):
            current, gain, _done, _extra = env.virtual_step(current, action)
            path.append(current)
            gains.append(gain)
        return path, gains

    def largest_state_gap(path_a, path_b):
        """Maximum per-step Euclidean distance between two state sequences."""
        deltas = np.array(path_a) - np.array(path_b)
        return np.max(np.linalg.norm(deltas, axis=1))

    def largest_reward_gap(gains_a, gains_b):
        """Maximum absolute per-step difference between two reward sequences."""
        return np.max(np.abs(np.array(gains_a) - np.array(gains_b)))

    # Reference rollout.
    states, rewards = real_rollout(env.reset(), horizon, fixed_action)
    print('total_reward:', sum(rewards))

    # Second rollout after another reset, compared against the reference.
    new_states, new_rewards = real_rollout(env.reset(), horizon, fixed_action)
    print('state difference in two attempt:', largest_state_gap(states, new_states))
    print('reward difference in two attempt:', largest_reward_gap(rewards, new_rewards))

    # Virtual-step chain from a reset state, compared against the reference.
    virt_states, virt_rewards = virtual_rollout(env.reset(), horizon, fixed_action)
    print('state difference with virual step:', largest_state_gap(states, virt_states))
    print('reward difference with virual step:', largest_reward_gap(rewards, virt_rewards))

    # Virtual-step chain started from the middle of the reference rollout.
    mid_virt_states, mid_virt_rewards = virtual_rollout(
        states[midpoint], horizon - midpoint, fixed_action
    )
    print('state difference with virual step from the middle:',
          largest_state_gap(states[midpoint:], mid_virt_states))
    print('reward difference with virual step from the middle:',
          largest_reward_gap(rewards[midpoint:], mid_virt_rewards))
    print('time:', time.time() - t_begin)
    print('\n')
    

# Single-environment check: run env_test on the pendulum swing-up task.
domain_name, task_name = 'pendulum', 'swingup'
env_test(domain_name, task_name)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
domain_name: pendulum
task_name: swingup
state_dim: 2
action_dim: 1
action_min: [-1.]
action_max: [1.]
initial_state: [0. 0.]
next_state: [0.00158103 0.07905138]
reward: 1.0
done: False
dt: 0.02
total_reward: 23.0
state difference in two attempt: 0.0
reward difference in two attempt: 0.0
state difference with virual step: 0.0
reward difference with virual step: 0.0
state difference with virual step from the middle: 0.0
reward difference with virual step from the middle: 0.0
time: 1.6696059703826904




In [4]:
from dm_control import suite

# NOTE(review): this cell calls env_test, which is defined in the cell above, yet its
# recorded execution count (4) is lower than that cell's (5) — re-run top to bottom to confirm.
# Length of the longest domain name in suite.BENCHMARKING; not used by the loop below.
max_len = max(len(d) for d, _ in suite.BENCHMARKING)
# Run env_test once for every (domain, task) pair in suite.BENCHMARKING.
for domain, task in suite.BENCHMARKING:
    env_test(domain, task)

domain_name: acrobot
task_name: swingup
state_dim: 4
action_dim: 1
action_min: [-1.]
action_max: [1.]
initial_state: [0. 0. 0. 0.]
next_state: [-0.00039307  0.00125134 -0.07855389  0.25007942]
reward: 1.0
done: False
dt: 0.01
total_reward: 49.90536711616123
state difference in two attempt: 0.0
reward difference in two attempt: 0.0
state difference with virual step: 0.0
reward difference with virual step: 0.0
state difference with virual step from the middle: 0.0
reward difference with virual step from the middle: 0.0
time: 1.8524985313415527


domain_name: acrobot
task_name: swingup_sparse
state_dim: 4
action_dim: 1
action_min: [-1.]
action_max: [1.]
initial_state: [0. 0. 0. 0.]
next_state: [-0.00039307  0.00125134 -0.07855389  0.25007942]
reward: 1.0
done: False
dt: 0.01
total_reward: 20.0
state difference in two attempt: 0.0
reward difference in two attempt: 0.0
state difference with virual step: 0.0
reward difference with virual step: 0.0
state difference with virual step from the m