# Solving CartPole OpenAI environment using DQNAgent

#### First, install the keras-rl package. It is not preinstalled in the container because it requires a slightly older version of Keras.

In [1]:
!pip install keras-rl

Collecting keras-rl
  Downloading keras-rl-0.3.1.tar.gz
Collecting keras<2.0.7,>=1.0.7 (from keras-rl)
  Downloading Keras-2.0.6.tar.gz (228kB)
[K    100% |################################| 235kB 2.5MB/s ta 0:00:01
[?25hCollecting theano (from keras<2.0.7,>=1.0.7->keras-rl)
  Downloading Theano-0.9.0.tar.gz (3.1MB)
[K    100% |################################| 3.1MB 408kB/s eta 0:00:01
Building wheels for collected packages: keras-rl, keras, theano
  Running setup.py bdist_wheel for keras-rl ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/8b/3f/0e/d0dbbcddddf6d14b412935b2286098872de5464123fdaeb7d9
  Running setup.py bdist_wheel for keras ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/c2/80/ba/2beab8c2131e2dcc391ee8a2f55e648af66348115c245e0839
  Running setup.py bdist_wheel for theano ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/d5/5b/93/433299b86e3e9b25f0f600e4e4ebf18e38eb7534ea518eba13
Successfully built keras-rl keras

Standard imports

In [None]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

#### Open environment
Extract the number of actions.
There are two discrete actions: move left and move right.

In [None]:
# Create the CartPole task and seed both NumPy and the environment with the
# same fixed value.
seed = 123
env = gym.make('CartPole-v0')
np.random.seed(seed)
env.seed(seed)

# Number of discrete actions the environment accepts.
nb_actions = env.action_space.n

#### Build a simple NN

In [None]:
# Small fully connected network: three hidden layers of 16 ReLU units and a
# linear output with one unit per action.
# The input carries a leading axis of length 1 in front of the observation
# shape; Flatten collapses it into a flat vector.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the cell output.
model.summary()

#### Configure and compile our agent. 
You can use every built-in Keras optimizer and even the metrics!


In [None]:
# Replay memory and action-selection policy handed to the agent.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()

# Assemble the DQN agent around the network defined earlier in the notebook.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=nb_actions,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)

# Compile with Adam and report mean absolute error as an additional metric.
optimizer = Adam(lr=1e-3)
dqn.compile(optimizer, metrics=['mae'])

#### Actual learning 
Visualization is OFF until we figure out how to export the display correctly.

In [None]:
# Train for 50,000 environment steps without rendering; verbose=2 logs one
# line per finished episode.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, save the final weights. The file name is built from
# the environment's registered id: no ENV_NAME variable is defined anywhere in
# this notebook, so referencing it here would raise NameError after training.
dqn.save_weights('dqn_{}_weights.h5f'.format(env.spec.id), overwrite=True)

#### Evaluate our algorithm for 5 episodes.

In [2]:
# Run the trained agent for five evaluation episodes with rendering disabled.
dqn.test(
    env,
    nb_episodes=5,
    visualize=False,
)

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________



    23/50000: episode: 1, duration: 0.888s, episode steps: 23, steps per second: 26, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.609 [0.000, 1.000], mean observation: -0.082 [-2.106, 1.171], loss: 0.480381, mean_absolute_error: 0.504366, mean_q: 0.034857
    33/50000: episode: 2, duration: 0.075s, episode steps: 10, steps per second: 133, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.157 [-1.129, 1.948], loss: 0.424156, mean_absolute_error: 0.511362, mean_q: 0.114185




    69/50000: episode: 3, duration: 0.275s, episode steps: 36, steps per second: 131, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.134 [-0.535, 0.891], loss: 0.266964, mean_absolute_error: 0.553619, mean_q: 0.425473
    82/50000: episode: 4, duration: 0.113s, episode steps: 13, steps per second: 115, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.385 [0.000, 1.000], mean observation: 0.085 [-1.000, 1.519], loss: 0.114552, mean_absolute_error: 0.623867, mean_q: 0.845792
    97/50000: episode: 5, duration: 0.133s, episode steps: 15, steps per second: 113, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.087 [-1.870, 1.028], loss: 0.077943, mean_absolute_error: 0.693653, mean_q: 1.107926
   112/50000: episode: 6, duration: 0.130s, episode steps: 15, steps per second: 115, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000],

   625/50000: episode: 33, duration: 0.116s, episode steps: 12, steps per second: 104, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.167 [0.000, 1.000], mean observation: 0.115 [-1.544, 2.522], loss: 0.272199, mean_absolute_error: 2.751459, mean_q: 5.245492
   650/50000: episode: 34, duration: 0.212s, episode steps: 25, steps per second: 118, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.069 [-1.131, 2.025], loss: 0.202475, mean_absolute_error: 2.793996, mean_q: 5.315974
   663/50000: episode: 35, duration: 0.114s, episode steps: 13, steps per second: 114, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.769 [0.000, 1.000], mean observation: -0.102 [-2.371, 1.540], loss: 0.270679, mean_absolute_error: 2.822477, mean_q: 5.306091
   699/50000: episode: 36, duration: 0.288s, episode steps: 36, steps per second: 125, episode reward: 36.000, mean reward: 1.000 [1.000, 1.0

  3128/50000: episode: 62, duration: 1.656s, episode steps: 200, steps per second: 121, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.286 [-1.959, 1.190], loss: 1.078630, mean_absolute_error: 12.793987, mean_q: 26.051723
  3328/50000: episode: 63, duration: 1.645s, episode steps: 200, steps per second: 122, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.297 [-1.974, 1.000], loss: 1.116100, mean_absolute_error: 13.697876, mean_q: 27.912025
  3509/50000: episode: 64, duration: 1.493s, episode steps: 181, steps per second: 121, episode reward: 181.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.385 [-2.427, 1.048], loss: 1.461792, mean_absolute_error: 14.460250, mean_q: 29.414192
  3709/50000: episode: 65, duration: 1.603s, episode steps: 200, steps per second: 125, episode reward: 200.000, mean reward: 1

  8892/50000: episode: 91, duration: 1.595s, episode steps: 200, steps per second: 125, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.460 [0.000, 1.000], mean observation: -0.342 [-2.910, 1.059], loss: 4.266553, mean_absolute_error: 31.698809, mean_q: 64.039505
  9092/50000: episode: 92, duration: 1.602s, episode steps: 200, steps per second: 125, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.256 [-1.816, 0.983], loss: 3.675431, mean_absolute_error: 31.975489, mean_q: 64.714554
  9292/50000: episode: 93, duration: 1.599s, episode steps: 200, steps per second: 125, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.230 [-1.568, 1.051], loss: 4.738186, mean_absolute_error: 32.398472, mean_q: 65.401604
  9492/50000: episode: 94, duration: 1.618s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1

 28978/50000: episode: 192, duration: 1.618s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.126 [-1.221, 1.154], loss: 5.862950, mean_absolute_error: 44.906303, mean_q: 90.220940
 29178/50000: episode: 193, duration: 1.624s, episode steps: 200, steps per second: 123, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.131 [-1.466, 1.322], loss: 9.148564, mean_absolute_error: 44.982044, mean_q: 90.212784
 29378/50000: episode: 194, duration: 1.612s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.102 [-1.330, 1.309], loss: 13.605254, mean_absolute_error: 45.165493, mean_q: 90.328964
 29578/50000: episode: 195, duration: 1.604s, episode steps: 200, steps per second: 125, episode reward: 200.000, mean reward:

 34181/50000: episode: 221, duration: 0.775s, episode steps: 104, steps per second: 134, episode reward: 104.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.089 [-1.538, 1.785], loss: 9.704447, mean_absolute_error: 43.207153, mean_q: 86.547768
 34381/50000: episode: 222, duration: 1.613s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.067 [-1.764, 1.521], loss: 4.814377, mean_absolute_error: 42.531528, mean_q: 85.288162
 34581/50000: episode: 223, duration: 1.668s, episode steps: 200, steps per second: 120, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.040 [-1.723, 1.575], loss: 7.417551, mean_absolute_error: 42.578915, mean_q: 85.215775
 34781/50000: episode: 224, duration: 1.591s, episode steps: 200, steps per second: 126, episode reward: 200.000, mean reward: 

 39981/50000: episode: 250, duration: 1.614s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.025 [-1.244, 1.253], loss: 7.819655, mean_absolute_error: 42.755814, mean_q: 85.880684
 40102/50000: episode: 251, duration: 0.927s, episode steps: 121, steps per second: 131, episode reward: 121.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.496 [0.000, 1.000], mean observation: 0.017 [-1.534, 1.435], loss: 7.563911, mean_absolute_error: 42.735138, mean_q: 85.934830
 40302/50000: episode: 252, duration: 1.597s, episode steps: 200, steps per second: 125, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.020 [-1.639, 1.715], loss: 11.181011, mean_absolute_error: 42.948872, mean_q: 86.051338
 40502/50000: episode: 253, duration: 1.609s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward:

 45702/50000: episode: 279, duration: 1.607s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.004 [-1.584, 1.317], loss: 17.094990, mean_absolute_error: 43.585098, mean_q: 87.273415
 45902/50000: episode: 280, duration: 1.666s, episode steps: 200, steps per second: 120, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.004 [-1.133, 1.261], loss: 15.279386, mean_absolute_error: 43.371414, mean_q: 87.185440
 46102/50000: episode: 281, duration: 1.578s, episode steps: 200, steps per second: 127, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.007 [-1.240, 1.478], loss: 9.960597, mean_absolute_error: 43.395546, mean_q: 87.231369
 46292/50000: episode: 282, duration: 1.531s, episode steps: 190, steps per second: 124, episode reward: 190.000, mean rewar

<keras.callbacks.History at 0x7f01f31210b8>