<div class="alert alert-primary alert-info">

# DQN
- ## MsPacman

</div>

---

In [1]:
import tensorflow as tf
import numpy as np

import dataclasses
import pickle
import datetime

import gym
import baselines.common.atari_wrappers as baselines

import logging
import warnings
warnings.filterwarnings('ignore')

---

In [2]:
# Report the versions of the core libraries this notebook was run against.
for library in (gym, tf):
    print(f'{library.__name__}: {library.__version__}')

gym: 0.15.7
tensorflow: 2.10.0


In [3]:
# Experiment-wide constants: the Atari environment id and the RNG seed.
ENV_NAME = 'MsPacmanNoFrameskip-v4'
SEED = 22

# make_atari enables NoopResetEnv and MaxAndSkipEnv
env = baselines.make_atari(ENV_NAME)
env.seed(SEED)

# Seed the global random generators (Python, NumPy and TensorFlow) via Keras.
tf.keras.utils.set_random_seed(SEED)

In [4]:
# Apply the baselines `wrap_deepmind` wrappers (episodic life, reward clipping,
# frame stacking, pixel scaling) on top of the base Atari environment.
# NOTE: this rebinds `env`, so re-running this cell without first re-running
# the cell that creates `env` wraps the already-wrapped environment again.
env = baselines.wrap_deepmind(env,
                            episode_life=True,
                            clip_rewards=True,
                            frame_stack=True,
                            scale=True)

In [5]:
def additionalImageCropping(obs):
    """Return a 3-D array copy of `obs` with the first 10 entries along axis 0 removed.

    `obs` is converted with `np.array`, so the input itself is never modified.
    """
    frames = np.array(obs)
    return frames[10:, :, :]

In [6]:
# Send INFO-level progress messages to '<ENV_NAME>.log' (append mode) with an
# HH:MM:SS timestamp. The format string gets its own name so that the built-in
# `format()` is not shadowed for the rest of the notebook.
LOG_FORMAT = "%(asctime)s: %(message)s"
logging.basicConfig(format=LOG_FORMAT, filename=f'{ENV_NAME}.log', filemode='a', level=logging.INFO, datefmt="%H:%M:%S")

---

### Linear Decay

In [7]:
@dataclasses.dataclass
class LinearDecayFactor:
    """Linearly decaying epsilon schedule.

    Calling the instance with a step count returns
    `epsilon_max - decay_factor * step`, floored at `epsilon_min`.

    Attributes:
        max_steps: Number of steps over which epsilon goes from `epsilon_max`
            down to `epsilon_min`. Must be positive.
        epsilon_max: Value returned at step 0.
        epsilon_min: Lower bound of the schedule.
        decay_factor: Per-step decrement, derived from the three fields above.

    Raises:
        ValueError: If `max_steps` is not positive.
    """

    max_steps: int = 50_000  #1_000_000
    epsilon_max: float = 1.0
    epsilon_min: float = 0.1

    # Derived per instance in __post_init__ so that constructor arguments are
    # honoured (a plain class attribute would be frozen to the defaults).
    decay_factor: float = dataclasses.field(init=False, repr=False, compare=False)

    def __post_init__(self) -> None:
        if self.max_steps <= 0:
            raise ValueError(f'max_steps must be positive, got {self.max_steps}')
        self.decay_factor = (self.epsilon_max - self.epsilon_min) / self.max_steps

    def __call__(self, current_step_count: int) -> float:
        """Return epsilon for `current_step_count`, never below `epsilon_min`."""
        # The un-clamped value is kept on the instance, as before.
        self.epsilon_decay = self.epsilon_max - self.decay_factor * current_step_count
        return max(self.epsilon_min, self.epsilon_decay)

---

### Replay Memories

In [8]:
class ReplayExperiences:
    """Fixed-capacity circular buffer of (action, obs, next_obs, reward, done) transitions.

    New transitions overwrite the oldest ones once `capacity` entries have been
    stored. `sample()` draws `batch_size` transitions uniformly at random from
    the entries stored so far.

    Args:
        capacity: Maximum number of transitions kept. Must be positive.
        batch_size: Number of transitions returned by `sample()`. Must be positive.

    Raises:
        ValueError: If `capacity` or `batch_size` is not positive.
    """

    def __init__(self, capacity=20_000, batch_size=32):
        if capacity <= 0:
            raise ValueError(f'capacity must be positive, got {capacity}')
        if batch_size <= 0:
            raise ValueError(f'batch_size must be positive, got {batch_size}')

        self.capacity = capacity  #100_000
        self.batch_size = batch_size
        self.idx = 0              # next slot to write
        self.num_experiences = 0  # number of valid entries, at most capacity

        self.actions = np.empty(self.capacity, dtype=np.int32)
        # Observations are stored by reference in object arrays; whatever the
        # caller passes in is kept as-is and only converted when sampled.
        self.obss = np.empty(self.capacity, dtype=object)
        self.next_obss = np.empty(self.capacity, dtype=object)
        self.rewards = np.empty(self.capacity, dtype=np.float32)
        self.dones = np.empty(self.capacity, dtype=np.float32)

    def append(self, action, obs, next_obs, reward, done):
        """Store one transition, overwriting the oldest entry when the buffer is full."""
        self.actions[self.idx] = action
        self.obss[self.idx] = obs
        self.next_obss[self.idx] = next_obs
        self.rewards[self.idx] = reward
        self.dones[self.idx] = done
        self.idx = (self.idx + 1) % self.capacity
        self.num_experiences = min(self.capacity, self.num_experiences + 1)

    def sample(self, sampling_with_replacement=False):
        """Draw `batch_size` stored transitions uniformly at random.

        Args:
            sampling_with_replacement: Whether the same transition may appear
                more than once in the batch.

        Returns:
            Tuple `(actions, obss, next_obss, rewards, dones)` of arrays whose
            first dimension is `batch_size`.

        Raises:
            ValueError: If the buffer is empty, or holds fewer than
                `batch_size` transitions when sampling without replacement.
        """
        if self.num_experiences == 0:
            raise ValueError('Cannot sample from an empty replay buffer')
        if not sampling_with_replacement and self.num_experiences < self.batch_size:
            raise ValueError(
                f'Cannot sample {self.batch_size} transitions without replacement '
                f'from a buffer holding only {self.num_experiences}'
            )

        indices = np.random.choice(self.num_experiences, size=self.batch_size, replace=sampling_with_replacement)

        # Scalar columns can be gathered directly with fancy indexing.
        actions = self.actions[indices]
        rewards = self.rewards[indices]
        dones = self.dones[indices]
        # Observation columns hold objects, so each one is materialised and
        # the results are stacked into a single batch array.
        obss = np.array([self.obss[i] for i in indices])
        next_obss = np.array([self.next_obss[i] for i in indices])
        return actions, obss, next_obss, rewards, dones

---

### DoubleDQN

In [9]:
def make_doubledqn(learning_rate, action_space_dim):
    """Build and compile the convolutional Q-network.

    The network takes an (84, 84, 4) input, applies three ReLU convolutions and
    a 512-unit ReLU dense layer, and ends in a linear layer with one output per
    action.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        action_space_dim: Number of outputs of the final dense layer.

    Returns:
        A `tf.keras.Sequential` model named 'double_dqn', compiled with Adam
        (clipnorm=1.0) and the Huber loss.
    """

    def scaled_init():
        # Each layer gets its own initializer instance.
        return tf.keras.initializers.VarianceScaling(scale=2.0)

    network = tf.keras.Sequential(name='double_dqn')
    network.add(tf.keras.Input(shape=(84, 84, 4)))

    # (filters, kernel size, stride) for each convolution, in order.
    for n_filters, kernel, stride in ((32, 8, 4), (64, 4, 2), (64, 3, 1)):
        network.add(tf.keras.layers.Conv2D(
            filters=n_filters,
            kernel_size=[kernel, kernel],
            strides=[stride, stride],
            activation='relu',
            kernel_initializer=scaled_init(),
        ))

    network.add(tf.keras.layers.Flatten())
    network.add(tf.keras.layers.Dense(units=512, activation='relu', kernel_initializer=scaled_init()))
    network.add(tf.keras.layers.Dense(units=action_space_dim, kernel_initializer=scaled_init()))

    adam = tf.keras.optimizers.Adam(learning_rate, clipnorm=1.0)
    network.compile(optimizer=adam, loss=tf.keras.losses.Huber())

    return network

---

### DoubleDuelDQN

In [10]:
def make_doubledueldqn(learning_rate, action_space_dim):
    """Build and compile a Q-network with separate state-value and advantage heads.

    A shared convolutional trunk feeds two dense heads: a single state value
    and one raw advantage per action. The advantages are shifted by their
    per-sample maximum (so the best action has advantage 0) and added to the
    state value to give the Q-values.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        action_space_dim: Number of actions, i.e. number of Q-value outputs.

    Returns:
        A `tf.keras.Model` named 'double_duel_dqn', compiled with Adam
        (clipnorm=1.0) and the Huber loss.
    """

    inputs = tf.keras.Input(shape=(84, 84, 4), name='Input_layer')

    # Shared feature extractor.
    layer1 = tf.keras.layers.Conv2D(filters=32, kernel_size=[8, 8], strides=[4, 4], activation='relu')(inputs)
    layer2 = tf.keras.layers.Conv2D(filters=64, kernel_size=[4, 4], strides=[2, 2], activation='relu')(layer1)
    layer3 = tf.keras.layers.Conv2D(filters=64, kernel_size=[3, 3], strides=[1, 1], activation='relu')(layer2)
    layer4 = tf.keras.layers.Flatten()(layer3)
    layer5 = tf.keras.layers.Dense(units=512, activation='relu')(layer4)

    # A third dense head that was never connected to the model output used to
    # be created here; it was dead code and has been removed.

    state_values = tf.keras.layers.Dense(1)(layer5)
    raw_advantages = tf.keras.layers.Dense(action_space_dim)(layer5)
    # Shift advantages so the maximum over actions is zero for every sample.
    advantages = raw_advantages - tf.reduce_max(raw_advantages, axis=1, keepdims=True)
    Q_values = state_values + advantages

    dqn = tf.keras.Model(inputs=inputs, outputs=[Q_values], name='double_duel_dqn')

    optimizer = tf.keras.optimizers.Adam(learning_rate, clipnorm=1.0)
    dqn.compile(optimizer=optimizer, loss=tf.keras.losses.Huber())

    return dqn

---

### Train

In [11]:
def train(env, model_name, retrain_model=False):
    """Train a Q-network on `env` with epsilon-greedy exploration and replay.

    Targets are computed Double-DQN style: the main network picks the best
    next action and the target network supplies that action's value. The main
    network is updated every 4 frames from a random replay batch; the target
    network is synchronised with the main network every 5000 frames.

    Files written (all relative to the working directory):
      * `{model_name}_main_{reward}.h5` / `{model_name}_target_{reward}.h5`
        whenever an episode beats the best reward so far.
      * `{model_name}_main.h5`, `{model_name}_target.h5`, `memories.pkl`,
        `current_frame.pkl`, `episode_count.pkl`, `best_reward.pkl` every
        300000 episodes (resume checkpoint).
      * `{model_name}_main.h5` / `{model_name}_target.h5` when training ends.

    Args:
        env: Environment exposing `reset()`, `step(action)` and
            `action_space.n`.
        model_name: Prefix for the saved model files.
        retrain_model: When True, resume from the checkpoint files listed
            above instead of starting from scratch.
    """

    discount_factor_gamma = 0.99

    update_main_model_freq = 4
    update_target_model_freq = 5000  #10_000

    num_actions = env.action_space.n
    # The loss object is loop-invariant, so build it once rather than on every update.
    huber_loss = tf.keras.losses.Huber()

    if not retrain_model:
        current_frame, episode_count, best_episode_reward = 0, 0, 0
        memories = ReplayExperiences()
        main_dqn = make_doubledqn(learning_rate=0.0001, action_space_dim=num_actions)
        target_dqn = make_doubledqn(learning_rate=0.0001, action_space_dim=num_actions)
        target_dqn.set_weights(main_dqn.get_weights())
    else:
        # SECURITY: pickle.load can execute arbitrary code. Only resume from
        # checkpoint files that this notebook wrote itself.
        with open('memories.pkl', 'rb') as replay, \
             open('current_frame.pkl', 'rb') as decay, \
             open('episode_count.pkl', 'rb') as episode, \
             open('best_reward.pkl', 'rb') as best_run:
                memories = pickle.load(replay)
                current_frame = pickle.load(decay)
                episode_count = pickle.load(episode)
                best_episode_reward = pickle.load(best_run)
                logging.info(f'Loading current_frame: {current_frame}')
                logging.info(f'Loading episode_count: {episode_count}')
                logging.info(f'Loading best_episode_reward: {best_episode_reward}')
                logging.info(f'Memories buffer length: {memories.num_experiences}')

        main_dqn = tf.keras.models.load_model(f'{model_name}_main.h5')
        target_dqn = tf.keras.models.load_model(f'{model_name}_target.h5')

    max_frames = 5 * 1e7
    max_episode_length = 18_000

    epsilon_decay = LinearDecayFactor()

    # Defined up front so the final log line still works when the loop below
    # never runs (e.g. resuming a run that already reached max_frames).
    episode_reward = 0

    while current_frame < max_frames:

        obs = env.reset()
        episode_reward = 0

        for _ in range(max_episode_length):

            current_frame += 1

            # Epsilon-greedy action selection.
            if epsilon_decay(current_frame) > np.random.uniform():
                action = np.random.choice(num_actions)
            else:
                q_values = main_dqn(obs[np.newaxis], training=False)
                action = np.argmax(q_values)

            next_obs, reward, done, _info = env.step(action)

            memories.append(action, obs, next_obs, reward, done)
            episode_reward += reward
            obs = next_obs

            # Only train once the buffer can supply a full batch; sample()
            # draws without replacement and needs at least batch_size entries.
            if current_frame % update_main_model_freq == 0 and memories.num_experiences >= memories.batch_size:

                sampled_actions, sampled_obs, sampled_next_obs, sampled_rewards, sampled_dones = memories.sample()

                # Main network chooses the next action, target network scores it.
                # Direct model calls avoid the per-call overhead of predict()
                # on a single small batch.
                best_next_actions = main_dqn(sampled_next_obs, training=False).numpy().argmax(axis=1)
                next_q_all_target = target_dqn(sampled_next_obs, training=False).numpy()
                next_q_values_target = next_q_all_target[np.arange(len(best_next_actions)), best_next_actions]

                # No bootstrapping from transitions flagged as done.
                next_targeted_q_values = sampled_rewards + discount_factor_gamma * next_q_values_target * (1 - sampled_dones)
                next_targeted_q_values = next_targeted_q_values.reshape(-1, 1)

                current_action_mask = tf.one_hot(sampled_actions, num_actions)

                with tf.GradientTape() as tape:
                    current_q_values = main_dqn(sampled_obs)
                    # Keep only the Q-value of the action that was actually taken.
                    current_q_values = tf.reduce_sum(current_q_values * current_action_mask, axis=1, keepdims=True)
                    loss = huber_loss(next_targeted_q_values, current_q_values)

                grads = tape.gradient(loss, main_dqn.trainable_variables)
                main_dqn.optimizer.apply_gradients(zip(grads, main_dqn.trainable_variables))

            if current_frame % update_target_model_freq == 0 and current_frame > 2000:
                target_dqn.set_weights(main_dqn.get_weights())

            if done:
                break

        episode_count += 1
        logging.info(f'Episode: {episode_count}, Reward: {episode_reward}, Frame#: {current_frame}')

        if episode_reward > best_episode_reward:
            best_episode_reward = episode_reward
            main_dqn.save(f'{model_name}_main_{episode_reward}.h5')
            target_dqn.save(f'{model_name}_target_{episode_reward}.h5')
            logging.info(f'Saving main and target models at episode: {episode_count}, Reward: {episode_reward}')

        # Periodic resume checkpoint: models plus everything needed to continue.
        if episode_count % 300000 == 0:
            logging.info(f'Dumping model at episode_count: {episode_count}, current_frame: {current_frame}')
            main_dqn.save(f'{model_name}_main.h5')
            target_dqn.save(f'{model_name}_target.h5')
            with open('memories.pkl', 'wb') as replay, \
                 open('current_frame.pkl', 'wb') as decay, \
                 open('episode_count.pkl', 'wb') as episode, \
                 open('best_reward.pkl', 'wb') as best_run:
                    pickle.dump(memories, replay)
                    pickle.dump(current_frame, decay)
                    pickle.dump(episode_count, episode)
                    pickle.dump(best_episode_reward, best_run)

    main_dqn.save(f'{model_name}_main.h5')
    target_dqn.save(f'{model_name}_target.h5')
    logging.info(f'Saving completed main and target models at episode: {episode_count}, Reward: {episode_reward}')

In [12]:
# Start a fresh training run (retrain_model keeps its default of False).
# ENV_NAME is passed as model_name, so it becomes the prefix of the saved .h5 files.
if __name__ == '__main__':
    
    train(env, ENV_NAME)

---

### Evaluation

In [13]:
def evaluate(env, eval_model, output_dir, num_of_evals=10):
    """Run `eval_model` greedily on `env` and record the episodes.

    The environment is wrapped in `gym.wrappers.Monitor`, which writes its
    recordings to `output_dir` (existing contents are overwritten because
    `force=True`). Each step picks the action with the highest predicted value.

    Args:
        env: Environment to evaluate on.
        eval_model: Model whose `predict` returns one value per action.
        output_dir: Directory handed to the Monitor wrapper.
        num_of_evals: Number of episodes to run.

    Returns:
        List with the total reward of each evaluation episode.
    """

    env = gym.wrappers.Monitor(env, output_dir, force=True)

    episode_rewards = []

    for episode_idx in range(num_of_evals):

        obs = env.reset()

        total_reward, step_count = 0, 0

        while True:

            step_count += 1

            env.render()

            # Add a leading batch dimension before querying the model.
            obs_t = tf.convert_to_tensor(obs)
            obs_t = tf.expand_dims(obs_t, 0)

            # Use the model passed in, not a notebook-level global.
            action_values = eval_model.predict(obs_t, verbose=0)
            action = tf.argmax(action_values[0]).numpy()

            obs, reward, done, info = env.step(action)

            total_reward += reward

            if done:
                break

        episode_rewards.append(total_reward)
        logging.info(f'Evaluation episode: {episode_idx + 1}, Reward: {total_reward}, Steps: {step_count}')

    return episode_rewards

In [14]:
# Load the '<ENV_NAME>_main.h5' checkpoint (the file name train() uses for its
# final main-network save) and evaluate it, recording into a directory stamped
# with today's date (DDMMYYYY).
if __name__ == '__main__':
    
    model_name = f'{ENV_NAME}_main.h5'
    trained_model = tf.keras.models.load_model(model_name)
    
    output_dir = f'./Evaluation_{datetime.datetime.now().strftime("%d%m%Y")}'
    evaluate(env, trained_model, output_dir)

    # Release the environment once evaluation is finished.
    env.close()

---

### Starting out $\ldots$

<img src='utils/pacman_starting_out.gif' height='420' width='320'/>

---

### On the way $\ldots$

<img src='utils/pacman_on_the_way.gif' height='420' width='320'/>

---

### Still on the way $\ldots$

<img src='utils/pacman_still_on_way1.gif' height='420' width='320'/>

<img src='utils/pacman_still_on_way2.gif' height='420' width='320'/>

<img src='utils/pacman_still_on_way3.gif' height='420' width='320'/>

<img src='utils/pacman_still_on_way4.gif' height='420' width='320'/>

<img src='utils/pacman_still_on_way5.gif' height='420' width='320'/>