# Unity ML-Agents: common code for reinforcement learning (RL)

In [1]:
import mlagents
from mlagents_envs.registry import default_registry
from mlagents_envs.environment import ActionTuple, UnityEnvironment as UE

# Launch the Unity build. The raw string keeps the Windows path separator
# literal: in a normal string "\s" is an invalid escape sequence, which newer
# Python versions warn about. The resulting path is unchanged.
env = UE(file_name=r"stage0_160523\stage0_copy", seed=1, side_channels=[])
# After opening a new env, always call env.reset() first.
env.reset()

If the environment window that opens is blank or shows "(Not Responding)", don't worry: you can right-click and open another window manually, or simply ignore it for now.

If you open a new window while the code is running, you will have two rl_env windows: the unresponsive one and the working one.
When you run your action loop, the agent may be moving in either of them, so use Alt+Tab to check which window is actually updating.

In [2]:
# env.behavior_specs is indexed by behavior name; this env is driven through
# the first (index 0) behavior it reports.
behavior_names = list(env.behavior_specs)
behavior_name = behavior_names[0]
print(f"Name of the environment behavior: {behavior_name}")

Name of the environment behavior: stage0?team=0


In [3]:
# Fetch the spec of the selected behavior and summarise its observations:
# how many there are, and the full spec (shape, type, name) of each one.
behavior_spec = env.behavior_specs[behavior_name]
observation_specs = behavior_spec.observation_specs
print(f"Number of observations: {len(observation_specs)}")
print(f"Observation vector shape: {observation_specs}")

Number of observations: 1
Observation vector shape: [ObservationSpec(shape=(128, 128, 3), dimension_property=(<DimensionProperty.TRANSLATIONAL_EQUIVARIANCE: 2>, <DimensionProperty.TRANSLATIONAL_EQUIVARIANCE: 2>, <DimensionProperty.NONE: 1>), observation_type=<ObservationType.DEFAULT: 0>, name='CameraSensor')]


In [5]:
# Report which kind of actions the behavior expects.
# Our current env uses discrete actions, hence the action spec is discrete.
action_spec = behavior_spec.action_spec

if action_spec.is_continuous():
    print("action is continuous")

if action_spec.is_discrete():
    print("action is discrete")

action is discrete


In [6]:
# env.get_steps() gives a summary of the current state of the env for this behavior:
#   - decision_steps: the agents that are still requesting a decision
#   - terminal_steps: the agents that have reached a terminal stage, i.e. ended the episode
decision_steps, terminal_steps = env.get_steps(behavior_name)

# At the start of the env, [0] is the expected value: the episode has not ended,
# so the agent with id 0 is still requesting a decision.
requesting_ids = list(decision_steps)
print(f"decision_steps (this means how many agents are requesting for decision, 0 refers to first agent at index 0): {requesting_ids}")

# A terminal stage means the end of the game was reached (e.g. fell off the platform,
# hit the target, ... depending on your env). At the start of the game, [] is the
# expected value since nothing has started moving yet.
terminated_ids = list(terminal_steps)
print(f"terminal steps (this refers to which agents have already reached terminal stage which means already ended): {terminated_ids}")

decision_steps (this means how many agents are requesting for decision, 0 refers to first agent at index 0): [0]
terminal steps (this refers to which agents have already reached terminal stage which means already ended): []


In [8]:
action_space = env.behavior_specs[behavior_name].action_spec

print(f"Action space is: {action_space}")
# The action space is (2, 2, 2, 2): each branch can either do its action or do nothing.
# There are 4 branches since the agent can do 4 actions:
# move forward, move backward, rotate right, rotate left.

# The spec prints as "Continuous: <int>, Discrete: <tuple>", i.e. the continuous
# part is a plain count. Calling len() on it (as action_space[0]) raises a
# TypeError, so read the size attributes instead of indexing the spec.
if action_space.is_discrete():
    action_size = action_space.discrete_size      # number of discrete branches
else:
    action_size = action_space.continuous_size    # number of continuous actions
print(f"Action size is: {action_size}")
# Action size is 4 since move forward / backward / rotate right / left are 4 different actions.

Action space is: Continuous: 0, Discrete: (2, 2, 2, 2)
Action size is: 4


In [9]:
# To get the agent's input (in our case the RGB camera image), index into
# decision_steps.obs. The [0] selects the first (and here only) observation,
# i.e. the 'CameraSensor' listed in the observation specs.
camera_obs = decision_steps.obs[0]
# Trailing dimensions are height = width = 128 with 3 RGB channels; the leading
# dimension is presumably one entry per agent requesting a decision.
camera_obs.shape

(1, 128, 128, 3)

In [23]:
# Let Unity's action spec sample a random action for every agent that is
# currently requesting a decision.
num_requesting = len(decision_steps)
action = behavior_spec.action_spec.random_action(num_requesting)
# Example of a random action: 4 values, corresponding to the 4 possible actions.
(action.discrete, type(action.discrete))

(array([[0, 0, 0, 1]]), numpy.ndarray)

In [18]:
# Actions are handed to the env wrapped in a Unity ActionTuple. When a model
# produces the action (instead of random_action), its output has to be placed
# into an ActionTuple like this one first.
action_tuple = ActionTuple()
action_tuple

<mlagents_envs.base_env.ActionTuple at 0x24bad8f1080>

In [24]:
# Store the discrete part of the sampled action in the tuple.
# The array has shape (num_agents, num_actions).
discrete_actions = action.discrete
action_tuple.add_discrete(discrete_actions)
action_tuple.discrete

array([[0, 0, 0, 1]])

In [None]:
# Queue the ActionTuple built above as the action for this behavior ...
env.set_actions(behavior_name, action_tuple)

# ... and then move the env forward by 1 step.
env.step()

# sample code to move agent and run episodes using random actions

You also need to download the sample zip file.

In [29]:
# -----------------
# Best-effort cleanup: close an env that might not have been closed before.
# `env` may be undefined (fresh kernel) or already closed; both cases are
# ignored. Only `Exception` is caught so that KeyboardInterrupt / SystemExit
# are not silently swallowed as they were by a bare `except:`.
try:
    env.close()
except Exception:
    pass
# -----------------

from mlagents_envs.registry import default_registry
from mlagents_envs.environment import ActionTuple, UnityEnvironment as UE
import matplotlib.pyplot as plt
import os
%matplotlib inline

# Launch the Unity build. The raw string keeps the Windows path separator
# literal: in a normal string "\s" is an invalid escape sequence, which newer
# Python versions warn about. The resulting path is unchanged.
env = UE(file_name=r"stage0_160523\stage0_copy", seed=1, side_channels=[])
print("environment created.")
# Always reset a freshly opened env before using it.
env.reset()

environment created.


In [30]:
# Run a few episodes with random actions and report the total reward that the
# first agent seen in each episode collects.
behavior_name = list(env.behavior_specs)[0]
behavior_spec = env.behavior_specs[behavior_name]

num_episodes = 10
try:
    for episode in range(num_episodes):
        env.reset()
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        tracked_agent = -1  # -1 indicates not yet tracking
        done = False  # For the tracked_agent
        episode_rewards = 0  # For the tracked_agent
        while not done:
            # Track the first agent we see if not tracking
            # Note : len(decision_steps) = [number of agents that requested a decision]
            if tracked_agent == -1 and len(decision_steps) >= 1:
                tracked_agent = decision_steps.agent_id[0]

            # Generate an action for all agents
            action = behavior_spec.action_spec.random_action(len(decision_steps))
            # Set the actions
            env.set_actions(behavior_name, action)
            # Move the simulation forward
            env.step()

            # Get the new simulation results
            decision_steps, terminal_steps = env.get_steps(behavior_name)
            if tracked_agent in decision_steps:  # The agent requested a decision
                episode_rewards += decision_steps[tracked_agent].reward
            if tracked_agent in terminal_steps:  # The agent terminated its episode
                episode_rewards += terminal_steps[tracked_agent].reward
                # The agent reached a terminal state, so the episode ends:
                # set done to break out of the loop.
                done = True

        print(f"Total rewards for episode {episode+1}/{num_episodes}: {episode_rewards}")
finally:
    # Always shut the Unity player down, even if the loop raises or the cell is
    # interrupted; otherwise the env process stays open.
    env.close()

Total rewards for episode 1/10: -19.0
Total rewards for episode 2/10: -38.0
Total rewards for episode 3/10: -3.0
Total rewards for episode 4/10: 0.0
Total rewards for episode 5/10: -53.0
Total rewards for episode 6/10: -4.0
Total rewards for episode 7/10: -19.0
Total rewards for episode 8/10: -1.0
Total rewards for episode 9/10: -6.0
Total rewards for episode 10/10: -22.0
