In [None]:
# Laurent LEQUIEVRE
# Research Engineer, CNRS (France)
# Institut Pascal UMR6602
# laurent.lequievre@uca.fr

# CartPole : A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track.

<img src="cartpole.png">

# The 'step' function returns four values :

- observation (object): 
an environment-specific object representing your observation of the environment. 
For example, pixel data from a camera, joint angles and joint velocities of a robot, or the board state in a board game.

- reward (float): 
amount of reward achieved by the previous action. 
The scale varies between environments, but the goal is always to increase your total reward.

- done (boolean): 
whether it’s time to reset the environment again. 
Most (but not all) tasks are divided up into well-defined episodes, and done being True indicates the episode has terminated. 
(For example, perhaps the pole tipped too far, or you lost your last life.)

- info (dict): 
diagnostic information useful for debugging. 
It can sometimes be useful for learning (for example, it might contain the raw probabilities behind the environment’s last state change). 
However, official evaluations of your agent are not allowed to use this for learning.


Example for CartPole : array([-0.00842369, -0.17244261, -0.0034994 ,  0.24360119]), 1.0, False, {}
    
    observation = array([x position of cart, x velocity of cart, angular position of pole, angular velocity of pole])
    reward = 1.0
    done = False
    info = {}
    
# The 'reset' function returns :

- observation (object): 
an environment-specific object representing your observation of the environment. 
For example, pixel data from a camera, joint angles and joint velocities of a robot, or the board state in a board game.


# About 'done' :

- The environment returns done=True in three cases: 200 timesteps have elapsed (episode success), 
the pole has fallen over (the angular position of the pole has exceeded +- 12 degrees), 
or the cart has left the simulation space (the cart position has exceeded +- 2.4); in the last two cases the episode has failed.

In [2]:
import gym

In [3]:
# Create the CartPole environment.
# 'CartPole-v0' ends an episode after at most 200 steps; 'CartPole-v1' allows 500 steps.
env = gym.make("CartPole-v0")

# Reset the environment to a fresh initial state. Because this is the last
# expression of the cell, the returned initial observation is displayed below.
env.reset()

array([ 0.03751017, -0.01408691,  0.01957232, -0.04018988])

# Environment comes with an action_space and an observation_space :

These attributes are of type Space, and they describe the format of valid actions and observations.

The 'Discrete' space allows a fixed range of non-negative integers.
- Discrete(2) -> in this case valid actions are either 0 or 1.
- Discrete(8) -> Set with 8 elements {0, 1, 2, ..., 7}

The 'Box' space represents an n-dimensional box.
- Box(4,) -> valid observations will be an array of 4 numbers (in this example, the box is one-dimensional).

In [20]:
# About action and observation space

# action_space
print("action space = {}".format(env.action_space)) # Discrete(2)
print("nb actions = {}".format(env.action_space.n)) # 2

# Draw a few random valid actions.
# The loop variable is deliberately not named `_`: in IPython/Jupyter, `_` holds
# the last cell output, and assigning to it shadows that convenience variable.
for sample_index in range(3):
    # sample() returns a random element of the space (here: 0 or 1)
    print("a sample of action = {}".format(env.action_space.sample()))

# observation_space
# observation = x position of cart, x velocity of cart, angular position of pole, angular velocity of pole
observation_space = env.observation_space
print("observation space = {}".format(observation_space))

# We can also check the Box's bounds
print("observation high = {}".format(observation_space.high))
print("observation low = {}".format(observation_space.low))

# In details: print the (high, low) bounds of each observation component.
# The labels are listed in the same order as the components of the observation array,
# so the position of a label is also the index into `high` / `low`.
observation_labels = (
    "x cart pos",
    "x cart velocity",
    "Angular pole pos",
    "Angular pole velocity",
)
for component_index, label in enumerate(observation_labels):
    print("{} high = {}".format(label, observation_space.high[component_index]))
    print("{} low = {}".format(label, observation_space.low[component_index]))


action space = Discrete(2)
nb actions = 2
a sample of action = 1
a sample of action = 1
a sample of action = 0
observation space = Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
observation high = [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
observation low = [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
x cart pos high = 4.800000190734863
x cart pos low = -4.800000190734863
x cart velocity high = 3.4028234663852886e+38
x cart velocity low = -3.4028234663852886e+38
Angular pole pos high = 0.41887903213500977
Angular pole pos low = -0.41887903213500977
Angular pole velocity high = 3.4028234663852886e+38
Angular pole velocity low = -3.4028234663852886e+38


In [None]:
# A simple loop to play with the CartPole environment using random actions.

# Start from a fresh episode so that re-running this cell does not continue
# from whatever state a previous run left the environment in.
env.reset()
try:
    for step_index in range(100):
        env.render() # will open a GUI window and show you the cartpole.
        action = env.action_space.sample() # pick a random valid action (0 or 1)
        # 'step' returns four values (observation, reward, done, info);
        # only 'done' is used here.
        observation, reward, done, info = env.step(action) # apply the random action to the environment
        if done:
            # The episode is over (pole fell over, cart left the track, or step limit
            # reached). Calling 'step' again without a reset gives undefined results,
            # so start a new episode before continuing.
            env.reset()
finally:
    # Always close the environment (and its render window), even if the loop
    # is interrupted or raises.
    env.close()