In [1]:

# coding: utf-8

# In[9]:


from __future__ import division
import argparse

from PIL import Image
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import Adam
import keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint


# Size that every raw frame is resized to in AtariProcessor.process_observation.
INPUT_SHAPE = (84, 84)
# Number of consecutive frames stacked into one network input; also the replay memory window length.
WINDOW_LENGTH = 4


class AtariProcessor(Processor):
    """Pre-processing hooks applied to observations, state batches and rewards.

    Frames are shrunk to ``INPUT_SHAPE`` grayscale ``uint8`` images, batches are
    rescaled to ``float32`` in [0, 1] and rewards are clipped to [-1, 1].
    """

    def process_observation(self, observation):
        """Convert one raw frame into an ``INPUT_SHAPE`` grayscale ``uint8`` array.

        ``observation`` must be a 3-dimensional array (height, width, channel).
        """
        assert observation.ndim == 3  # (height, width, channel)
        frame = Image.fromarray(observation)
        frame = frame.resize(INPUT_SHAPE)
        frame = frame.convert('L')  # 'L' is PIL's 8-bit grayscale mode
        gray = np.array(frame)
        assert gray.shape == INPUT_SHAPE
        # Keep the stored frame as uint8: it saves storage in the experience memory.
        return gray.astype('uint8')

    def process_state_batch(self, batch):
        """Return ``batch`` as ``float32`` divided by 255.

        The scaling could live in ``process_observation`` instead, but then a
        ``float32`` array would have to be stored, which is 4x more memory
        intensive than ``uint8``. This matters if we store 1M observations.
        """
        return batch.astype('float32') / 255.

    def process_reward(self, reward):
        """Clip ``reward`` into the interval [-1, 1]."""
        lower, upper = -1., 1.
        return np.clip(reward, lower, upper)

# Gym id of the game to train on; it is also used to build the output file names.
env_name = 'MsPacman-v0'
weights = None  # not referenced by any cell visible in this notebook

# Create the environment, seed numpy and the environment, then read the number of actions.
env = gym.make(env_name)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
# One network input is a stack of WINDOW_LENGTH frames: (WINDOW_LENGTH,) + INPUT_SHAPE.
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
dim_ordering = K.image_dim_ordering()  # query the backend once instead of once per branch
model = Sequential()
if dim_ordering == 'tf':
    # Move the frame axis last so the stacked frames act as channels:
    # (frames, width, height) -> (width, height, channels)
    model.add(Permute((2, 3, 1), input_shape=input_shape))
elif dim_ordering == 'th':
    # The frame axis is already first, (channels, width, height); this identity
    # permutation only serves to declare the input shape.
    model.add(Permute((1, 2, 3), input_shape=input_shape))
else:
    raise RuntimeError('Unknown image_dim_ordering.')

# Three convolution + ReLU stages, followed by a 512-unit hidden layer.
model.add(Convolution2D(32, 8, 8, subsample=(4, 4)))
model.add(Activation('relu'))
model.add(Convolution2D(64, 4, 4, subsample=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, 3, 3, subsample=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))

# Linear output layer with one unit per action.
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()

# Replay memory (up to 1M entries, windows of WINDOW_LENGTH frames) and the
# processor that pre-processes observations, state batches and rewards.
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is
# selected with probability eps. eps is annealed linearly from 1.0 down to 0.1 over 1M steps,
# so the agent explores the environment at first (high eps) and then gradually sticks to what
# it knows (low eps). A dedicated eps of 0.05 is used during testing, so the agent still
# performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000
)

# The trade-off between exploration and exploitation is difficult and an on-going research
# topic. If you want, you can experiment with the parameters or use a different policy.
# Another popular one is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer
# and even the metrics!
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    nb_steps_warmup=50000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.
)
optimizer = Adam(lr=.00025)
dqn.compile(optimizer, metrics=['mae'])


# In[10]:


# Okay, now it's time to learn something! Notice that you can use the built-in Keras callbacks!
# NOTE(review): no interrupt handler is visible in this cell; aborting training prematurely
# presumably relies on `dqn.fit` handling KeyboardInterrupt itself - confirm against keras-rl.
nb_train_steps = 150000
# The checkpoint interval must not exceed nb_train_steps: an interval larger than the number
# of training steps (previously 250000 vs. 150000) is never reached, so no intermediate
# weights would ever be written.
checkpoint_interval = 50000

# Output files, all named after the environment.
weights_filename = 'dqn_{}_weights.h5f'.format(env_name)
checkpoint_weights_filename = 'dqn_' + env_name + '_weights_{step}.h5f'
log_filename = 'dqn_{}_log.json'.format(env_name)

callbacks = [
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=checkpoint_interval),
    FileLogger(log_filename, interval=100),
]
dqn.fit(env, callbacks=callbacks, nb_steps=nb_train_steps, log_interval=10000)

# After training is done, we save the final weights one more time.
dqn.save_weights(weights_filename, overwrite=True)

# Finally, evaluate our algorithm for 10 episodes.
dqn.test(env, nb_episodes=10, visualize=False)


# In[ ]:


# Reload the final weights (same file name the training cell saves to) and run the agent
# for 1000 test episodes with visualize=True. Requires `dqn`, `env` and `env_name` from the
# earlier cells.
weights_filename = 'dqn_{}_weights.h5f'.format(env_name)
dqn.load_weights(weights_filename)
dqn.test(env, nb_episodes=1000, visualize=True)


# In[ ]:


Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_1 (Permute)          (None, 84, 84, 4)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 20, 20, 32)        8224      
_________________________________________________________________
activation_1 (Activation)    (None, 20, 20, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
activation_2 (Activation)    (None, 9, 9, 64)          0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
activation_3 (Activation)    (None, 7, 7, 64)          0         
__________

Episode 62: reward: 69.000, steps: 783
Episode 63: reward: 48.000, steps: 1065
Episode 64: reward: 32.000, steps: 835
Episode 65: reward: 46.000, steps: 785
Episode 66: reward: 35.000, steps: 753
Episode 67: reward: 25.000, steps: 580
Episode 68: reward: 46.000, steps: 1242
Episode 69: reward: 24.000, steps: 593
Episode 70: reward: 42.000, steps: 706
Episode 71: reward: 55.000, steps: 963
Episode 72: reward: 37.000, steps: 641
Episode 73: reward: 25.000, steps: 715
Episode 74: reward: 32.000, steps: 639
Episode 75: reward: 35.000, steps: 676
Episode 76: reward: 52.000, steps: 888
Episode 77: reward: 29.000, steps: 593
Episode 78: reward: 31.000, steps: 662
Episode 79: reward: 43.000, steps: 1049
Episode 80: reward: 48.000, steps: 873
Episode 81: reward: 33.000, steps: 837
Episode 82: reward: 27.000, steps: 651
Episode 83: reward: 29.000, steps: 612
Episode 84: reward: 48.000, steps: 954
Episode 85: reward: 31.000, steps: 525
Episode 86: reward: 33.000, steps: 551
Episode 87: reward: 31

Episode 268: reward: 41.000, steps: 1007
Episode 269: reward: 27.000, steps: 541
Episode 270: reward: 53.000, steps: 863
Episode 271: reward: 33.000, steps: 619
Episode 272: reward: 34.000, steps: 597
Episode 273: reward: 32.000, steps: 690
Episode 274: reward: 55.000, steps: 1036
Episode 275: reward: 27.000, steps: 750
Episode 276: reward: 24.000, steps: 687
Episode 277: reward: 45.000, steps: 857
Episode 278: reward: 43.000, steps: 1071
Episode 279: reward: 35.000, steps: 755
Episode 280: reward: 70.000, steps: 1637
Episode 281: reward: 40.000, steps: 735
Episode 282: reward: 29.000, steps: 720
Episode 283: reward: 44.000, steps: 984
Episode 284: reward: 71.000, steps: 1504
Episode 285: reward: 22.000, steps: 659
Episode 286: reward: 32.000, steps: 687
Episode 287: reward: 38.000, steps: 893
Episode 288: reward: 34.000, steps: 791
Episode 289: reward: 36.000, steps: 976
Episode 290: reward: 38.000, steps: 580
Episode 291: reward: 28.000, steps: 752
Episode 292: reward: 31.000, steps:

Episode 473: reward: 26.000, steps: 527
Episode 474: reward: 46.000, steps: 724
Episode 475: reward: 36.000, steps: 704
Episode 476: reward: 38.000, steps: 732
Episode 477: reward: 64.000, steps: 1317
Episode 478: reward: 43.000, steps: 638
Episode 479: reward: 36.000, steps: 899
Episode 480: reward: 45.000, steps: 1118
Episode 481: reward: 63.000, steps: 1008
Episode 482: reward: 68.000, steps: 974
Episode 483: reward: 31.000, steps: 685
Episode 484: reward: 37.000, steps: 729
Episode 485: reward: 23.000, steps: 629
Episode 486: reward: 63.000, steps: 888
Episode 487: reward: 28.000, steps: 658
Episode 488: reward: 31.000, steps: 734
Episode 489: reward: 48.000, steps: 1107
Episode 490: reward: 46.000, steps: 958
Episode 491: reward: 47.000, steps: 1035
Episode 492: reward: 54.000, steps: 693
Episode 493: reward: 28.000, steps: 566
Episode 494: reward: 33.000, steps: 900
Episode 495: reward: 28.000, steps: 672
Episode 496: reward: 33.000, steps: 987
Episode 497: reward: 59.000, steps:

Episode 677: reward: 23.000, steps: 618
Episode 678: reward: 37.000, steps: 952
Episode 679: reward: 46.000, steps: 670
Episode 680: reward: 42.000, steps: 644
Episode 681: reward: 40.000, steps: 589
Episode 682: reward: 28.000, steps: 804
Episode 683: reward: 28.000, steps: 1024
Episode 684: reward: 45.000, steps: 807
Episode 685: reward: 23.000, steps: 630
Episode 686: reward: 57.000, steps: 829
Episode 687: reward: 28.000, steps: 968
Episode 688: reward: 18.000, steps: 685
Episode 689: reward: 44.000, steps: 877
Episode 690: reward: 52.000, steps: 705
Episode 691: reward: 39.000, steps: 1048
Episode 692: reward: 49.000, steps: 781
Episode 693: reward: 37.000, steps: 1160
Episode 694: reward: 83.000, steps: 1169
Episode 695: reward: 33.000, steps: 750
Episode 696: reward: 54.000, steps: 1084
Episode 697: reward: 48.000, steps: 845
Episode 698: reward: 31.000, steps: 612
Episode 699: reward: 53.000, steps: 1497
Episode 700: reward: 25.000, steps: 666
Episode 701: reward: 47.000, steps

KeyboardInterrupt: 