In [1]:
import gym

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent

In [3]:
env_name = 'CartPole-v1'
env = gym.make(env_name)

# Smoke-test the environment with a purely random policy before training.
env.reset()  # reset the environment to the initial state
for _ in range(200):  # play for at most 200 steps in total
    env.render(mode="human")  # render the current game state on your screen
    random_action = env.action_space.sample()  # choose a random action
    # Element 2 of the step result is the episode-finished flag.
    episode_over = env.step(random_action)[2]
    if episode_over:
        # Stepping an environment whose episode has ended is undefined
        # behaviour in gym, so start a fresh episode instead.
        env.reset()
env.close()  # close the environment



In [4]:
# Number of actions the environment exposes; reused below as the width of
# the network's output layer and as the agent's action count.
nb_actions = env.action_space.n

In [5]:
# Shape tuple of a single observation returned by the environment.
nb_obs = env.observation_space.shape

In [6]:
# Display the observation shape as a sanity check before building the network.
nb_obs

(4,)

In [7]:
# Q-network: takes one (windowed) observation and outputs one value per action.
model = Sequential([
    # The leading (1,) corresponds to window_length=1 of the replay memory
    # configured further down; Flatten collapses it into a flat feature vector.
    Flatten(input_shape=(1,) + env.observation_space.shape),

    Dense(16),
    Activation('relu'),

    Dense(32),
    Activation('relu'),

    # One linear (unbounded) output per action.
    Dense(nb_actions),
    Activation('linear'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 4)                 0         
                                                                 
 dense (Dense)               (None, 16)                80        
                                                                 
 activation (Activation)     (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                544       
                                                                 
 activation_1 (Activation)   (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 2)                 66        
                                                                 
 activation_2 (Activation)   (None, 2)                 0

In [8]:
from rl.memory import SequentialMemory

In [9]:
# Experience replay buffer for the agent. window_length=1 matches the
# leading (1,) in the model's input shape.
memory = SequentialMemory(
    window_length=1,
    limit=20000,
)

In [10]:
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy

In [11]:
# Epsilon-greedy exploration with `eps` annealed linearly between value_max
# and value_min over nb_steps; value_test is the epsilon used when testing.
# NOTE(review): nb_steps here (20000) is larger than the 10000 training steps
# requested in the fit cell, so eps presumably never reaches value_min -
# confirm this is intended.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    nb_steps=20000,
    value_max=1.0,
    value_min=0.1,
    value_test=0.05,
)

In [12]:
# Assemble the DQN agent from the network, replay memory and exploration
# policy defined in the cells above.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=nb_actions,
    nb_steps_warmup=10,
    target_model_update=100,
)


In [13]:
# NOTE(review): `tf` is not referenced in any visible cell; the import is
# kept in case later cells rely on it.
import tensorflow as tf

# Create the optimizer once and pass that same instance to the agent.
# Previously `optimizer` was built but never used, while a second identical
# Adam instance was constructed inline for compile().
optimizer = Adam(learning_rate=0.001)
dqn.compile(optimizer=optimizer, metrics=['mae'])

In [14]:
# Train the agent.
# The returned History object is kept in a variable so the cell does not end
# on a bare object repr and the training metrics stay available afterwards.
history = dqn.fit(env, nb_steps=10000, visualize=True, verbose=2)

Training for 10000 steps ...


  updates=self.state_updates,


   20/10000: episode: 1, duration: 0.865s, episode steps:  20, steps per second:  23, episode reward: 20.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.400 [0.000, 1.000],  loss: 0.574686, mae: 0.600634, mean_q: 0.195825, mean_eps: 0.999325




   36/10000: episode: 2, duration: 0.294s, episode steps:  16, steps per second:  54, episode reward: 16.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.625 [0.000, 1.000],  loss: 0.467207, mae: 0.601295, mean_q: 0.374379, mean_eps: 0.998762
   74/10000: episode: 3, duration: 0.636s, episode steps:  38, steps per second:  60, episode reward: 38.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.553 [0.000, 1.000],  loss: 0.261169, mae: 0.609071, mean_q: 0.690888, mean_eps: 0.997548
   91/10000: episode: 4, duration: 0.335s, episode steps:  17, steps per second:  51, episode reward: 17.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.353 [0.000, 1.000],  loss: 0.098596, mae: 0.664253, mean_q: 1.070842, mean_eps: 0.996310
  132/10000: episode: 5, duration: 0.724s, episode steps:  41, steps per second:  57, episode reward: 41.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.610 [0.000, 1.000],  loss: 0.288722, mae: 0.985243, mean_q: 1.415618, mean_eps: 0

<keras.callbacks.History at 0x15977582820>

In [15]:
# Evaluate the trained agent for 5 rendered episodes.
try:
    test_history = dqn.test(env, nb_episodes=5, visualize=True, verbose=2)
finally:
    # Always release the render window, even if evaluation is interrupted.
    env.close()

Testing for 10000 episodes ...
Episode 1: reward: 404.000, steps: 404
Episode 2: reward: 210.000, steps: 210
Episode 3: reward: 236.000, steps: 236
Episode 4: reward: 206.000, steps: 206
Episode 5: reward: 234.000, steps: 234
Episode 6: reward: 190.000, steps: 190
Episode 7: reward: 202.000, steps: 202
Episode 8: reward: 219.000, steps: 219
Episode 9: reward: 269.000, steps: 269
Episode 10: reward: 285.000, steps: 285
Episode 11: reward: 306.000, steps: 306
Episode 12: reward: 264.000, steps: 264
Episode 13: reward: 255.000, steps: 255
Episode 14: reward: 241.000, steps: 241
Episode 15: reward: 250.000, steps: 250
Episode 16: reward: 204.000, steps: 204
Episode 17: reward: 189.000, steps: 189
Episode 18: reward: 271.000, steps: 271
Episode 19: reward: 251.000, steps: 251
Episode 20: reward: 210.000, steps: 210
Episode 21: reward: 324.000, steps: 324
Episode 22: reward: 208.000, steps: 208
Episode 23: reward: 208.000, steps: 208
Episode 24: reward: 236.000, steps: 236
Episode 25: reward

KeyboardInterrupt: 