In [1]:
from PIL import Image
import numpy as np
import gym

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from tensorflow.keras.optimizers import Adam

In [3]:
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

In [5]:
# Instantiate the Gym environment registered under the id "BreakoutDeterministic-v4".
env = gym.make("BreakoutDeterministic-v4")

In [6]:
# Number of discrete actions the environment exposes; it sizes the network's
# final Dense layer and is handed to the DQN agent.
nb_actions = env.action_space.n

In [7]:
# Show the action count as the cell output.
nb_actions

4

In [8]:
# Geometry of the network input.
WINDOW_LENGTH = 4      # leading dimension of the model's input_shape and the replay memory's window_length
IMG_SHAPE = (84, 84)   # size every observation is resized to by the image processor

In [9]:
class ImageProcessor(Processor):
    """Pre-processing hooks that adapt raw environment data for the DQN.

    Overrides three `Processor` hooks: observations are shrunk to
    grayscale `IMG_SHAPE` frames, state batches are rescaled to floats in
    [0, 1], and rewards are clipped to [-1, 1].
    """

    def process_observation(self, observation):
        """Resize a raw frame to `IMG_SHAPE` and convert it to grayscale.

        Args:
            observation: array accepted by `PIL.Image.fromarray`
                (presumably an RGB frame from the environment — confirm).

        Returns:
            A `uint8` NumPy array of shape `IMG_SHAPE` (rows, cols).
        """
        # IMG_SHAPE is consumed as (rows, cols) when the model's input_shape is
        # built, whereas PIL's resize expects (width, height). Pass the size in
        # PIL order so non-square shapes do not come out transposed; for the
        # square 84x84 default the result is unchanged.
        target_rows, target_cols = IMG_SHAPE
        img = Image.fromarray(observation)
        img = img.resize((target_cols, target_rows))
        img = img.convert("L")
        img = np.array(img)

        # Frames stay uint8 here; the float conversion happens per batch in
        # process_state_batch.
        return img.astype("uint8")

    def process_state_batch(self, batch):
        """Convert a batch of stored frames to `float32` scaled by 1/255.

        Args:
            batch: NumPy array of frames (uint8 values, as produced by
                `process_observation`).

        Returns:
            A `float32` array of the same shape with values divided by 255.
        """
        process_batch = batch.astype("float32") / 255.0
        return process_batch

    def process_reward(self, reward):
        """Clip the reward to the interval [-1, 1]."""
        return np.clip(reward, -1.0, 1.0)

In [10]:
# Network input: WINDOW_LENGTH stacked frames, each of size IMG_SHAPE.
input_shape = (WINDOW_LENGTH,) + (IMG_SHAPE[0], IMG_SHAPE[1])
input_shape

(4, 84, 84)

In [11]:
# Empty Sequential container; the layers are appended in the following cell.
model = Sequential()

In [12]:
# Move the frame axis last: (frames, rows, cols) -> (rows, cols, frames).
model.add(Permute((2, 3, 1), input_shape=input_shape))

# Three convolution + ReLU stages, described as (filters, kernel size, stride).
# Each Activation stays a separate layer so the layer sequence is unchanged.
for n_filters, kernel, stride in ((32, 8, 4), (64, 4, 2), (64, 3, 1)):
    model.add(
        Convolution2D(
            n_filters,
            (kernel, kernel),
            strides=(stride, stride),
            kernel_initializer="he_normal",
        )
    )
    model.add(Activation("relu"))

# Fully connected head: 512 hidden units, then one linear output per action.
model.add(Flatten())
model.add(Dense(512))
model.add(Activation("relu"))
model.add(Dense(nb_actions))
model.add(Activation("linear"))

In [13]:
# Print each layer with its output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute (Permute)            (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d (Conv2D)              (None, 20, 20, 32)        8224      
_________________________________________________________________
activation (Activation)      (None, 20, 20, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
activation_1 (Activation)    (None, 9, 9, 64)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
activation_2 (Activation)    (None, 7, 7, 64)          0

In [14]:
# Replay memory capped at 1,000,000 entries. window_length is set to
# WINDOW_LENGTH, the same value used as the leading dimension of input_shape.
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)

In [15]:
# Pre-processing hooks (resize/grayscale, batch scaling, reward clipping) handed to the agent.
processor = ImageProcessor()

In [22]:
# Define the checkpoint path here so this cell runs on a fresh kernel executed
# top to bottom; otherwise the name is only assigned in a cell further down
# the notebook and this line raises NameError. The value matches that cell.
checkpoint_filename = "DQN_CHECKPOINT.h5f"

# Callback that checkpoints the model to `checkpoint_filename` at an interval
# of 100000 (presumably counted in training steps — confirm against keras-rl).
checkpoint_callback = ModelIntervalCheckpoint(checkpoint_filename,
                                              interval=100000
                                             )

In [24]:
# Restore previously saved weights into the network before the agent is built.
# The path is relative to the working directory; the "900000" in the file name
# is presumably the training step at which it was saved — confirm.
model.load_weights("weights/dqn_BreakoutDeterministic-v4_weights_900000.h5f")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1b808eb91d0>

In [25]:
# Epsilon-greedy exploration wrapped in a linear annealing schedule acting on
# the policy's "eps" attribute: bounds 0.2 (max) and 0.1 (min) over 1000000
# steps. value_test=0.05 is presumably the epsilon used outside training — confirm.
base_policy = EpsGreedyQPolicy()
policy = LinearAnnealedPolicy(
    base_policy,
    attr="eps",
    value_max=0.2,
    value_min=0.1,
    value_test=0.05,
    nb_steps=1000000,
)

In [26]:
# Training agent: ties the network, exploration policy, replay memory and
# image processor together. The hyper-parameters are collected in one mapping
# so they can be read at a glance.
agent_config = dict(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    nb_steps_warmup=50000,       # presumably steps gathered before learning starts — confirm
    gamma=0.99,
    target_model_update=10000,
    delta_clip=1,
)
dqn = DQNAgent(**agent_config)

In [27]:
# Compile the agent with an Adam optimizer (learning rate 0.00025) and track
# mean absolute error as a metric.
dqn.compile(Adam(learning_rate=0.00025), metrics=["mae"])

In [21]:
# Output paths, relative to the working directory.
checkpoint_filename = "DQN_CHECKPOINT.h5f"   # target of the interval checkpoint callback
weights_filename = "DQN_BO.h5f"              # NOTE(review): not referenced by any other visible cell

In [28]:
# Short training run without rendering, logging every 500 steps.
# NOTE(review): nb_steps=1000 is smaller than both the agent's
# nb_steps_warmup (50000) and the checkpoint interval (100000), so this run
# likely performs no learning updates and writes no checkpoint — confirm intent.
dqn.fit(
    env,
    nb_steps=1000,
    callbacks=[checkpoint_callback],
    log_interval=500,
    visualize=False,
)

Training for 1000 steps ...
Interval 1 (0 steps performed)




1 episodes - episode_reward: 4.000 [4.000, 4.000] - ale.lives: 3.384

Interval 2 (500 steps performed)
done, took 15.370 seconds


<keras.callbacks.History at 0x1b8092fbe48>

In [29]:
# Evaluate the agent for one episode with rendering enabled.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, i.e.
# the run was stopped manually; re-run or clear the output before sharing.
dqn.test(env, nb_episodes=1, visualize=True)

Testing for 1 episodes ...


KeyboardInterrupt: 

In [30]:
# Swap in a different saved weight file (path relative to the working
# directory) before the evaluation agent is rebuilt below.
model.load_weights("weights/dqn_BreakoutDeterministic-v4_weights_1200000.h5f")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1b81ef6c748>

In [31]:
# Replace the annealed policy with a plain epsilon-greedy policy constructed
# with 0.1 (presumably a fixed epsilon — confirm against keras-rl).
policy = EpsGreedyQPolicy(0.1)

In [32]:
# Rebuild the agent around the reloaded network and the new policy. Only the
# required components are passed, so every other setting takes the library default.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
)

# Same optimizer settings and metric as the training agent.
optimizer = Adam(learning_rate=0.00025)
dqn.compile(optimizer, metrics=["mae"])

In [34]:
# Evaluate the rebuilt agent for one rendered episode; the episode reward and
# step count are printed as the cell output.
dqn.test(env, nb_episodes=1, visualize=True)

Testing for 1 episodes ...
Episode 1: reward: 40.000, steps: 1513


<keras.callbacks.History at 0x1b948ab6c88>