In [5]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [6]:
ENV_NAME = 'CartPole-v0'
SEED = 123  # single source of truth for every RNG seeded in this cell

# Create the CartPole environment, then seed NumPy's global RNG and the
# environment with the same value to make runs more repeatable.
# NOTE(review): the Keras/TensorFlow backend is not seeded here, so network
# weight initialisation may still differ between runs.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Number of discrete actions the environment exposes.
nb_actions = env.action_space.n

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [7]:
# Small fully connected network mapping an observation to one output per action.
model = Sequential()
# The input carries a leading axis of size 1 in front of the observation shape
# (presumably the agent memory's window_length -- confirm); Flatten collapses
# it into a flat feature vector.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
# Linear output layer: one unbounded value per available action.
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the cell output.
model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
# Epsilon-greedy exploration (library-default epsilon) and the experience
# memory the agent samples from; window_length=1 means single-observation states.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)

# NOTE(review): in keras-rl a target_model_update below 1 is presumably treated
# as a soft-update rate rather than a step interval -- confirm in the DQNAgent docs.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for
# show, but this slows down training quite a lot.
# Bind the returned History object instead of leaving the call as the cell's
# last expression: this keeps the opaque "<keras.callbacks.History at 0x...>"
# repr out of the output and keeps the training record available afterwards.
train_history = dqn.fit(env, nb_steps=5000, visualize=True, verbose=2)

Training for 5000 steps ...




   15/5000: episode: 1, duration: 0.737s, episode steps: 15, steps per second: 20, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.092 [-0.625, 1.231], loss: 0.352846, mean_absolute_error: 4.238831, mean_q: 7.771027




   38/5000: episode: 2, duration: 0.380s, episode steps: 23, steps per second: 61, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: 0.069 [-0.551, 0.981], loss: 1.132134, mean_absolute_error: 4.413056, mean_q: 8.058371
  133/5000: episode: 3, duration: 1.584s, episode steps: 95, steps per second: 60, episode reward: 95.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.011 [-0.949, 0.513], loss: 0.755383, mean_absolute_error: 4.632724, mean_q: 8.877714
  158/5000: episode: 4, duration: 0.414s, episode steps: 25, steps per second: 60, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.088 [-0.796, 0.195], loss: 0.724562, mean_absolute_error: 4.910100, mean_q: 9.561622
  177/5000: episode: 5, duration: 0.319s, episode steps: 19, steps per second: 60, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean actio

 1007/5000: episode: 31, duration: 0.716s, episode steps: 43, steps per second: 60, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.046 [-0.378, 0.781], loss: 1.486288, mean_absolute_error: 7.026457, mean_q: 13.753178
 1046/5000: episode: 32, duration: 0.650s, episode steps: 39, steps per second: 60, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: 0.065 [-0.346, 0.752], loss: 1.789668, mean_absolute_error: 7.177750, mean_q: 13.965510
 1082/5000: episode: 33, duration: 0.602s, episode steps: 36, steps per second: 60, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.043 [-0.426, 0.779], loss: 1.421510, mean_absolute_error: 7.276426, mean_q: 14.243847
 1107/5000: episode: 34, duration: 0.413s, episode steps: 25, steps per second: 61, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean 

<keras.callbacks.History at 0x7fcfbd6739b0>

In [9]:
# Evaluate the agent for five rendered episodes. Binding the result keeps the
# returned History object's repr out of the cell output and makes the
# evaluation record available to later cells.
# NOTE(review): the saved outputs label this cell In [9] and the training cell
# In [10], so the recorded rewards appear to predate the recorded training
# run -- restart the kernel and run all cells in order before trusting them.
test_history = dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 13.000, steps: 13
Episode 2: reward: 14.000, steps: 14
Episode 3: reward: 14.000, steps: 14
Episode 4: reward: 14.000, steps: 14
Episode 5: reward: 18.000, steps: 18


<keras.callbacks.History at 0x7fcff54ac668>

In [11]:
# NOTE(review): this import fails in the recorded run with
# "ModuleNotFoundError: No module named 'utilities'". `utilities` is presumably
# a project-local helper module -- confirm it is on the kernel's import path
# (or remove the cell). Neither Displayable nor flip is used by the cells
# visible above; if the import is needed, it belongs with the other imports in
# the first cell.
from utilities import Displayable, flip

ModuleNotFoundError: No module named 'utilities'