# Solving the OpenAI Gym CartPole environment using DQNAgent

#### First, install the keras-rl package. It is not preinstalled in the container because it requires a slightly older version of Keras.

In [2]:
!pip install keras-rl

# These packages are needed to show the display:
# !apt-get install xvfb
# !apt-get install python-opengl



In [None]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory


# Gym environment id; it is also embedded in the name of the saved weights file below.
ENV_NAME = 'CartPole-v0'
# A single seed shared by NumPy and the environment.
SEED = 123


# Get the environment, seed it, and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
nb_actions = env.action_space.n

# Next, we build a very simple model: the input is flattened, passed through three
# 16-unit ReLU layers, and mapped to one linear output per action.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so wrapping it in print() would
# add a stray "None" line after the table.
model.summary()

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! Visualization is switched off until we figure out
# how to export the display correctly; visualizing would also slow down training quite a lot.
# You can always safely abort the training prematurely using Ctrl + C.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes (again without visualization).
dqn.test(env, nb_episodes=5, visualize=False)

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________



    23/50000: episode: 1, duration: 1.024s, episode steps: 23, steps per second: 22, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.609 [0.000, 1.000], mean observation: -0.082 [-2.106, 1.171], loss: 0.480381, mean_absolute_error: 0.504366, mean_q: 0.034857
    33/50000: episode: 2, duration: 0.096s, episode steps: 10, steps per second: 104, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.157 [-1.129, 1.948], loss: 0.424381, mean_absolute_error: 0.512619, mean_q: 0.116559




    69/50000: episode: 3, duration: 0.295s, episode steps: 36, steps per second: 122, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.134 [-0.535, 0.891], loss: 0.262810, mean_absolute_error: 0.556174, mean_q: 0.438657
    86/50000: episode: 4, duration: 0.140s, episode steps: 17, steps per second: 121, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: 0.081 [-0.803, 1.185], loss: 0.107777, mean_absolute_error: 0.621345, mean_q: 0.861255
   130/50000: episode: 5, duration: 0.371s, episode steps: 44, steps per second: 118, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.523 [0.000, 1.000], mean observation: -0.153 [-2.216, 0.852], loss: 0.041801, mean_absolute_error: 0.715026, mean_q: 1.303339
   144/50000: episode: 6, duration: 0.123s, episode steps: 14, steps per second: 114, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000],

   655/50000: episode: 32, duration: 0.135s, episode steps: 15, steps per second: 111, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.079 [-1.027, 1.661], loss: 0.212586, mean_absolute_error: 2.861350, mean_q: 5.446199
   665/50000: episode: 33, duration: 0.088s, episode steps: 10, steps per second: 113, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: -0.125 [-2.206, 1.394], loss: 0.192192, mean_absolute_error: 2.875637, mean_q: 5.469489
   693/50000: episode: 34, duration: 0.237s, episode steps: 28, steps per second: 118, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.072 [-0.939, 1.798], loss: 0.195467, mean_absolute_error: 2.959061, mean_q: 5.694083
   703/50000: episode: 35, duration: 0.097s, episode steps: 10, steps per second: 103, episode reward: 10.000, mean reward: 1.000 [1.000, 1.0

  2239/50000: episode: 61, duration: 1.087s, episode steps: 129, steps per second: 119, episode reward: 129.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: -0.275 [-1.859, 1.166], loss: 0.847046, mean_absolute_error: 9.295400, mean_q: 18.827444
  2332/50000: episode: 62, duration: 0.785s, episode steps: 93, steps per second: 118, episode reward: 93.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: -0.269 [-1.302, 0.828], loss: 0.871044, mean_absolute_error: 9.728604, mean_q: 19.685436
  2532/50000: episode: 63, duration: 1.744s, episode steps: 200, steps per second: 115, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.182 [-1.795, 1.201], loss: 0.973263, mean_absolute_error: 10.409055, mean_q: 21.050152
  2689/50000: episode: 64, duration: 1.311s, episode steps: 157, steps per second: 120, episode reward: 157.000, mean reward: 1.000