In [40]:
import time, sys, os
import numpy as np
sys.path.insert(0, os.path.abspath('..'))
from Gym.GymEnv import GymEnv
%load_ext autoreload
%autoreload 2


def env_test(domain_name):
    """Smoke-test a ``GymEnv`` domain for repeatability and ``virtual_step`` consistency.

    Prints the environment's dimensions and action bounds, takes one probe step,
    then rolls out the same constant all-ones action several times and prints the
    largest discrepancy in states and rewards between:

    * two real rollouts, each started from ``env.reset()``;
    * the first real rollout and a ``virtual_step`` rollout from a reset state;
    * the tail of the first real rollout and a ``virtual_step`` rollout started
      from its middle state.

    Args:
        domain_name: Environment identifier passed to ``GymEnv``.

    Returns:
        None. All results are printed.
    """
    start_time = time.time()

    print('domain_name:', domain_name)

    env = GymEnv(domain_name, dt=0.03)

    print('state_dim:', env.state_dim)
    print('action_dim:', env.action_dim)
    print('action_min:', env.action_min)
    print('action_max:', env.action_max)

    state = env.reset()

    fixed_action = np.ones(env.action_dim)
    # Probe step: only its reward is reported; the environment is reset again
    # before every rollout below.
    _, reward, _, _ = env.step(fixed_action)
    print('real dt:', env.env.data.time)
    print('reward:', reward)
    print('frame_skip:', env.env.env.frame_skip)
    print('max_episode_steps', env.max_episode_steps)

    def get_session(state, episode_n, action, step_type='step'):
        """Roll out ``action`` for up to ``episode_n`` steps, stopping early on ``done``.

        Args:
            state: Starting state; it becomes the first entry of the returned states.
            episode_n: Maximum number of steps to take.
            action: Action applied at every step.
            step_type: ``'step'`` to advance the environment itself, or
                ``'virtual_step'`` to call ``env.virtual_step`` on the current state.

        Returns:
            ``(states, rewards)``: the visited states (including the starting
            one) and the reward of each step taken.

        Raises:
            ValueError: If ``step_type`` is not one of the two supported values.
        """
        if step_type == 'step':
            def advance(current_state):
                return env.step(action)
        elif step_type == 'virtual_step':
            def advance(current_state):
                return env.virtual_step(current_state, action)
        else:
            # Without this guard an unknown mode fails later with an opaque
            # UnboundLocalError on `reward` / `done`.
            raise ValueError(
                "step_type must be 'step' or 'virtual_step', got %r" % (step_type,))

        states, rewards = [state], []
        for _ in range(episode_n):
            state, reward, done, _ = advance(state)
            states.append(state)
            rewards.append(reward)
            if done:
                break
        return states, rewards

    def report_diff(label, ref_states, ref_rewards, other_states, other_rewards):
        """Print the largest per-step state distance and reward gap of two rollouts."""
        state_diff = np.max(np.linalg.norm(
            np.array(ref_states) - np.array(other_states), axis=1))
        print('state difference %s:' % label, state_diff)
        reward_diff = np.max(np.abs(np.array(ref_rewards) - np.array(other_rewards)))
        print('reward difference %s:' % label, reward_diff)

    episode_n = env.max_episode_steps - 1
    print('episode_n:', episode_n)

    # Two real rollouts from reset.
    initial_state = env.reset()
    print('initial state difference:',
          np.linalg.norm(np.array(initial_state) - np.array(state)))
    states, rewards = get_session(initial_state, episode_n,
                                  fixed_action, step_type='step')

    initial_state = env.reset()
    new_states, new_rewards = get_session(initial_state, episode_n,
                                          fixed_action, step_type='step')
    report_diff('in two attempt', states, rewards, new_states, new_rewards)

    # Real rollout versus a virtual rollout from a reset state.
    initial_state = env.reset()
    virt_states, virt_rewards = get_session(initial_state, episode_n,
                                            fixed_action, step_type='virtual_step')
    report_diff('with virual step', states, rewards, virt_states, virt_rewards)

    # Tail of the real rollout versus a virtual rollout resumed from its middle state.
    mid_episode = len(states) // 2
    mid_virt_states, mid_virt_rewards = get_session(states[mid_episode],
                                                    episode_n - mid_episode,
                                                    fixed_action,
                                                    step_type='virtual_step')
    report_diff('with virual step from the middle',
                states[mid_episode:], rewards[mid_episode:],
                mid_virt_states, mid_virt_rewards)

    print('time:', time.time() - start_time)
    print('\n')
    

# Run the smoke test on a single domain first.
domain_name = 'HalfCheetah-v3'
env_test(domain_name)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
domain_name: HalfCheetah-v3
state_dim: 17
action_dim: 6
action_min: [-1. -1. -1. -1. -1. -1.]
action_max: [1. 1. 1. 1. 1. 1.]
real dt: 0.03
reward: 0.11267473480315826
frame_skip: 3
max_episode_steps 1000
episode_n: 999
initial state difference: 0.0
state difference in two attempt: 0.0
reward difference in two attempt: 0.0
state difference with virual step: 1.2104698750217736e-13
reward difference with virual step: 2.7755575615628914e-15
state difference with virual step from the middle: 2.009841808917283e-14
reward difference with virual step from the middle: 2.7755575615628914e-15
time: 0.537001371383667




In [41]:
# Run the same smoke test on every listed domain, one after another.
# NOTE(review): the recorded Ant-v3 output shows reward differences of ~1.7
# against virtual_step while the state differences stay around 1e-13 -- the
# virtual reward looks inconsistent with the real one there; confirm.
for domain_name in (
    'Ant-v3',
    'HalfCheetah-v3',
    'Hopper-v3',
    'Humanoid-v3',
    'HumanoidStandup-v2',
    'InvertedDoublePendulum-v2',
    'InvertedPendulum-v2',
    'Reacher-v2',
    'Swimmer-v3',
    'Walker2d-v3',
):
    env_test(domain_name)

domain_name: Ant-v3
state_dim: 111
action_dim: 8
action_min: [-1. -1. -1. -1. -1. -1. -1. -1.]
action_max: [1. 1. 1. 1. 1. 1. 1. 1.]
real dt: 0.03
reward: -2.9979921613949783
frame_skip: 3
max_episode_steps 1000
episode_n: 999
initial state difference: 0.0
state difference in two attempt: 0.0
reward difference in two attempt: 0.0
state difference with virual step: 4.4515916823129694e-13
reward difference with virual step: 1.6969911525951282
state difference with virual step from the middle: 2.6501101057926703e-15
reward difference with virual step from the middle: 1.6882391951417497
time: 1.681488037109375


domain_name: HalfCheetah-v3
state_dim: 17
action_dim: 6
action_min: [-1. -1. -1. -1. -1. -1.]
action_max: [1. 1. 1. 1. 1. 1.]
real dt: 0.03
reward: 0.11267473480315826
frame_skip: 3
max_episode_steps 1000
episode_n: 999
initial state difference: 0.0
state difference in two attempt: 0.0
reward difference in two attempt: 0.0
state difference with virual step: 1.2104698750217736e-13
r