In [None]:
# Laurent LEQUIEVRE
# Research Engineer, CNRS (France)
# Institut Pascal UMR6602
# laurent.lequievre@uca.fr

Pendulum-v0
https://github.com/openai/gym/wiki/Pendulum-v0
    
The problem  is to  keep a frictionless pendulum standing up.

Action :
Box(1) -> Joint Effort between -2.0 and 2.0

Starting state :
Random angle from −π to π, and random velocity between -1 and 1

Reward :
Reward = -(theta^2 + 0.1\*theta_dt^2 + 0.001\*action^2)

Reward is based on the angle of the pendulum (1), the angular velocity (2) of the pendulum, 
and the force applied (3).
Agents get increased reward for keeping the pendulum (1) upright, (2) still, and (3) using little force.

Theta is normalized between -pi and pi

Therefore, the lowest reward is -(pi^2 + 0.1\*8^2 + 0.001\*2^2) = -16.2736044, 
and the highest reward is 0. 
In essence, the goal is to remain at zero angle (vertical), 
with the least rotational velocity, and the least effort.


Episode Termination :
There is no specified termination.

In [1]:
import gym
import numpy as np

In [10]:
env = gym.make('Pendulum-v0')
env.reset()

array([ 0.02603071, -0.99966114,  0.14493233])

In [11]:
print("action space = {}".format(env.action_space))
# Box(-2.0, 2.0, (1,), float32)
# 1 float value between -2.0 and 2.0
# Action = between -2.0 and 2.0

print("observation space = {}".format(env.observation_space))
# Box(-8.0, 8.0, (3,), float32)
# 3 float values between -8.0 and 8.0
# Observation/State = [cos(theta), sin(theta), theta dot]
# index 0 -> cos(theta) [-1.0, 1.0] 
# index 1 -> sin(theta) [-1.0, 1.0]
# index 2 -> theta dot [-8.0, 8.0]
# The angles(thetas) are passed through the sin() and cos() function so that the observations are in the range [-1,1]. 
# This fixed range of [-1,1] helps in stabilising the training in the neural networks

observation = env.reset()
print("observation = {}".format(observation))
print("internal state = {}".format(env.state)) # env state is a numpy array [theta, theta_dot]
print("theta={}, cos(theta)={}, sin(theta)={}, theta dot={}".format(env.state[0],np.cos(env.state[0]), np.sin(env.state[0]), env.state[1]))

action space = Box(-2.0, 2.0, (1,), float32)
observation space = Box(-8.0, 8.0, (3,), float32)
observation = [-0.97323528 -0.22981098 -0.91775318]
internal state = [-2.9097092  -0.91775318]
theta=-2.9097091957090253, cos(theta)=-0.9732352821977132, sin(theta)=-0.22981097773069434, theta dot=-0.9177531794044946


In [23]:
from tqdm import tqdm

for i in range(3):
    # Randomly sample an element of this space => Joint Effort between -2.0 and 2.0
    an_action = env.action_space.sample()
    print("------------------------------------------------")
    print("a sample of action = {}".format(an_action))
    
    # launch this action to the environnement
    observation = env.step(env.action_space.sample())
    # observation = (state = [cos(theta), sin(theta), theta dot] , reward, done, info)
    
    print("[{}] observation = {}".format(i,observation))
    print("[{}] reward = {}".format(i,observation[1]))
    print("[{}] internal state = {}".format(i,env.state)) # env state is a numpy array [theta, theta_dot]
    print("[{}] theta={}, cos(theta)={}, sin(theta)={}, theta dot={}".format(i,env.state[0],np.cos(env.state[0]), np.sin(env.state[0]), env.state[1]))
    
    #env.render()

#env.close()

------------------------------------------------
a sample of action = [-1.0639498]
[0] observation = (array([-0.99326451, -0.11586893,  0.56330775]), -9.361322651388539, False, {})
[0] reward = -9.361322651388539
[0] internal state = [-3.02546288  0.56330775]
[0] theta=-3.0254628764432385, cos(theta)=-0.9932645121729079, sin(theta)=-0.11586892964861335, theta dot=0.5633077549624914
------------------------------------------------
a sample of action = [0.7401112]
[1] observation = (array([-0.9898852 , -0.14187071,  0.52442409]), -9.18525965634952, False, {})
[1] reward = -9.18525965634952
[1] internal state = [-2.99924167  0.52442409]
[1] theta=-2.9992416721648345, cos(theta)=-0.9898851967274155, sin(theta)=-0.1418707062783789, theta dot=0.5244240855680763
------------------------------------------------
a sample of action = [0.2110196]
[2] observation = (array([-0.98758351, -0.1570949 ,  0.30794708]), -9.023491171561112, False, {})
[2] reward = -9.023491171561112
[2] internal state = [

In [None]:
# When we deal with high-dimensional state space or action spaces 
# we have to introduce complex and non-linear function approximators 
# such as deep neural networks

# DDPG is for dealing with continuous, hence high-dimensional, action spaces in a Reinforcement Learning framework.
# DDPG = Deep Deterministic Policy Gradient
# https://spinningup.openai.com/en/latest/algorithms/ddpg.html
# https://github.com/openai/spinningup.git


In [13]:
# https://blog.paperspace.com/physics-control-tasks-with-deep-reinforcement-learning/
# https://github.com/antocapp/paperspace-ddpg-tutorial/blob/master/ddpg-pendulum-250.ipynb