# Prepare

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the "Save training log" cell later moves
# the .npy/.h5 artifacts into /content/drive/MyDrive/.
drive.mount('/content/drive')

In [None]:
!pip install gym[atari]
!pip install autorom[accept-rom-license]

In [None]:
import gym

import numpy as np
from collections import deque

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import time
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython import display


# Report the TensorFlow version and the GPU devices TensorFlow can see.
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

## Wrappers

In [8]:
class ConcatObs(gym.Wrapper):
    """Repeat each action for up to ``k`` frames and return the last ``k`` frames stacked.

    The observation returned by ``reset``/``step`` is ``np.array`` of a deque of
    the ``k`` most recent raw frames, so its leading axis has length ``k``.
    An episode is reported as done as soon as ``info['lives']`` drops below 4.

    Args:
        env: environment to wrap; its ``step`` must return the 4-tuple
            ``(obs, reward, done, info)`` with a ``'lives'`` entry in ``info``.
        k: number of frames to stack, and the maximum number of times an
            action is repeated per ``step`` call.
    """

    def __init__(self, env, k=4):
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=((k,) + shp), dtype=env.observation_space.dtype)

    def reset(self, **kwargs):
        """Reset the wrapped env and fill the frame stack with the first observation.

        Keyword arguments are forwarded to the wrapped environment so that calls
        such as ``env.reset(seed=...)`` made through outer wrappers keep working
        (this matches ``FireResetEnv.reset``).
        """
        ob = self.env.reset(**kwargs)
        for _ in range(self.k):
            self.frames.append(ob)

        return self._get_ob()

    def step(self, action):
        """Apply ``action`` for up to ``k`` frames.

        Returns:
            ``(stacked_frames, summed_reward, done, info)`` where ``info`` is the
            one returned by the last underlying step.
        """
        total_reward = 0.0
        done = None
        for _ in range(self.k):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            self.frames.append(obs)

            # Only one life counts per episode: the env's own `done` flag is
            # replaced by "a life has been lost" (fewer than 4 lives left).
            done = bool(info['lives'] < 4)
            if done:
                break
        return self._get_ob(), total_reward, done, info

    def _get_ob(self):
        """Return the current frame stack as a single array."""
        return np.array(self.frames)

In [9]:
# A bunch of wrappers to get us started, please use these
class ObservationWrapper(gym.ObservationWrapper):
    """Crop, resize and re-layout a stack of frames for the Q-network.

    ``observation`` indexes its input as ``obs[:, 2:-9, 8:, :]``, so the input
    must be 4-D; in this notebook it is the frame stack produced by ``ConcatObs``
    (assumed layout: frames, height, width, channels).

    Args:
        env: environment to wrap.
        GRAYSCALE: if true, convert frames with ``tf.image.rgb_to_grayscale``.
        NORMALIZE: if true, divide pixel values by 255.
    """

    def __init__(self, env, GRAYSCALE=False, NORMALIZE=False):
        self.GRAYSCALE = GRAYSCALE
        self.NORMALIZE = NORMALIZE
        super().__init__(env)

    def observation(self, obs):
        """Return the processed observation with shape ``(84, 84, frames * channels)``."""
        # Normalise observation by 255
        if self.NORMALIZE:
            obs = obs / 255.0

        if self.GRAYSCALE:
            obs = tf.image.rgb_to_grayscale(obs)

        # Drop the top 2 rows, the bottom 9 rows and the left 8 columns.
        image = obs[:, 2:-9, 8:, :]
        image = tf.image.resize(image, [84, 84])
        # Move the frame axis next to the channel axis and merge the two.
        # With one channel this yields the same (84, 84, frames) tensor as
        # before; with RGB frames it no longer fails on the reshape and yields
        # (84, 84, frames * 3).
        image = tf.transpose(image, perm=[1, 2, 0, 3])
        return tf.reshape(image, [84, 84, -1])

class RewardWrapper(gym.RewardWrapper):
    """Pass-through reward wrapper.

    Clipping the reward to [0, 1] with ``np.clip`` is intentionally left
    disabled; every reward is handed back unchanged.
    """

    def reward(self, reward):
        """Return ``reward`` unmodified."""
        return reward
    
class ActionWrapper(gym.ActionWrapper):
    """Pass-through action wrapper: actions reach the wrapped env unchanged."""

    def action(self, action):
        """Return ``action`` unmodified."""
        return action

class FireResetEnv(gym.Wrapper):
    """Take action on reset for environments that are fixed until firing."""

    def __init__(self, env):
        super().__init__(env)
        meanings = env.unwrapped.get_action_meanings()
        assert meanings[1] == 'FIRE'
        assert len(meanings) >= 3

    def reset(self, **kwargs):
        """Reset, then play action 1 (FIRE) followed by action 2.

        If either of those steps ends the episode the environment is reset
        again. The observation of the last step taken is returned.
        """
        self.env.reset(**kwargs)
        for start_action in (1, 2):
            obs, _, done, _ = self.env.step(start_action)
            if done:
                self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Delegate to the wrapped environment unchanged."""
        return self.env.step(ac)

# Environment

In [10]:
# Create the raw River Raid environment; the wrappers are applied in the next cell.
env = gym.make("ALE/Riverraid-v5")

In [11]:
# Use wrappers for the environment, applied from the innermost outwards:
# fire-on-reset -> 4-frame stacking -> action/reward pass-through -> image preprocessing.
env = FireResetEnv(env)
env = ConcatObs(env, k=4)
env = ActionWrapper(env)
env = RewardWrapper(env)
env = ObservationWrapper(env, GRAYSCALE=True, NORMALIZE=True)

# Initial observation; its shape is used as the input shape of the networks below.
obs = env.reset()

# DQN

## Build model

In [12]:
# Width of the dense hidden layer(s) in the Q-networks built below.
hidden_size = 512
# Number of discrete actions; also read as a global by take_action/train_one_episode.
n_action = env.action_space.n

### Original DQN

In [13]:
def create_dqn(input_shape, hidden_size, n_action):
    """Build the plain (non-dueling) DQN.

    Three ReLU convolutions (32@8x8/4, 64@4x4/2, 64@3x3/1) feed a flattened
    feature vector into one ReLU dense layer of ``hidden_size`` units, followed
    by a linear layer with one output per action.

    Args:
        input_shape: shape of a single observation (without the batch axis).
        hidden_size: number of units in the dense hidden layer.
        n_action: number of Q-value outputs.

    Returns:
        A ``keras.Model`` mapping observations to ``n_action`` Q-values.
    """
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))  # (filters, kernel, stride)

    inputs = layers.Input(shape=input_shape)
    features = inputs
    for n_filters, kernel, stride in conv_specs:
        features = layers.Conv2D(n_filters, kernel, strides=stride, activation="relu")(features)
    features = layers.Flatten()(features)

    hidden = layers.Dense(hidden_size, activation='relu')(features)
    q_out = layers.Dense(n_action)(hidden)

    return keras.Model(inputs=inputs, outputs=q_out)

## Dueling DQN

In [14]:
def create_dueling_dqn(input_shape, hidden_size, n_action):
    """Build a dueling DQN: shared conv trunk, separate value and advantage heads.

    The heads are combined per sample as
    ``Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')``.

    Args:
        input_shape: shape of a single observation (without the batch axis).
        hidden_size: number of units in each head's dense hidden layer.
        n_action: number of Q-value outputs.

    Returns:
        A ``keras.Model`` mapping observations to ``n_action`` Q-values.
    """
    inputs = layers.Input(shape=input_shape)
    cnn1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    cnn2 = layers.Conv2D(64, 4, strides=2, activation="relu")(cnn1)
    cnn3 = layers.Conv2D(64, 3, strides=1, activation="relu")(cnn2)
    flatten = layers.Flatten()(cnn3)

    adv_dense1 = keras.layers.Dense(hidden_size, activation='relu')(flatten)
    val_dense1 = keras.layers.Dense(hidden_size, activation='relu')(flatten)

    adv_out = keras.layers.Dense(n_action)(adv_dense1)
    val_out = keras.layers.Dense(1)(val_dense1)

    # The advantage mean must be taken per sample over the action axis.
    # Without `axis=1` the mean is a single scalar over the whole batch, which
    # makes each sample's Q-values depend on the other samples in the batch
    # (and therefore on the batch size).
    adv_mean = tf.reduce_mean(adv_out, axis=1, keepdims=True)
    out = tf.math.add(adv_out, val_out - adv_mean)

    return keras.Model(inputs=inputs, outputs=out)

In [None]:
# Build the dueling network once just to inspect its layer summary.
qmodel = create_dueling_dqn(input_shape=obs.shape, hidden_size=hidden_size, n_action=n_action)
# summary() prints the table itself and returns None, so it is not wrapped in print().
qmodel.summary()

In [None]:
# Build the plain DQN once just to inspect its layer summary.
qmodel = create_dqn(input_shape=obs.shape, hidden_size=hidden_size, n_action=n_action)
# summary() prints the table itself and returns None, so it is not wrapped in print().
qmodel.summary()

# Hyper parameters

In [17]:
# Hyper parameters
max_step = 100000       # upper bound on environment steps per episode
gamma = 0.99            # discount factor applied to the next-state Q-value
decay_rate = 0.995      # epsilon is multiplied by this once per episode in train()
eps = 0.99              # initial exploration rate for epsilon-greedy action selection
eps_threshold = 0.1     # lower bound for epsilon after decay
tau = 5                 # the target network is synced with the online network every `tau` steps
check_point = 100       # window (in episodes) over which train() averages the reward

MEM_SIZE = 10000        # replay buffer capacity, passed as `replay_size` to train()
BATCH_SIZE = 64         # default mini-batch size sampled from the replay buffer

# Train

In [23]:
def take_action(env, q_values, state, eps=0.1, greedy=False):
    """Pick an action epsilon-greedily from a batch of Q-values.

    Args:
        env: unused by this function.
        q_values: Q-values with the action axis at position 1; the action of
            the first row is returned on the greedy path.
        state: unused by this function.
        eps: probability of taking a uniformly random action.
        greedy: if true, always take the arg-max action (no random draw is made).

    Returns:
        The chosen action index. The random branch draws from the global
        ``n_action``.
    """
    if greedy or not (np.random.rand(1) < eps):
        # Exploit: best action of the first (only) row.
        return tf.math.argmax(q_values, axis=1).numpy()[0]
    # Explore: uniformly random action.
    return np.random.randint(n_action)


def train_one_episode(env, qmodel, target_model, trainer, eps, gamma, tau,
                      max_step, is_double=False,
                      replay_buffer=None, batch_size=BATCH_SIZE):
    """Play one episode and apply one gradient update to ``qmodel`` per step.

    Args:
        env: environment whose ``step`` returns ``(obs, reward, done, info)``
            with a ``'lives'`` entry in ``info``.
        qmodel: online Q-network that is trained.
        target_model: network used to evaluate next-state Q-values; it receives
            a copy of ``qmodel``'s weights every ``tau`` steps.
        trainer: optimizer exposing ``apply_gradients``.
        eps: exploration probability forwarded to ``take_action``.
        gamma: discount factor.
        tau: number of steps between target-network syncs.
        max_step: maximum number of steps in the episode.
        is_double: if true, the next action is chosen by ``qmodel`` and
            evaluated by ``target_model``; otherwise ``target_model`` does both.
        replay_buffer: optional buffer with ``add``/``sample``/``__len__``;
            when ``None`` the update uses only the current transition.
        batch_size: mini-batch size sampled from ``replay_buffer``.

    Returns:
        ``(total_reward, runtime, step)``: summed reward, wall-clock seconds,
        and the zero-based index of the last step executed.
    """
    start_time = time.time()
    total_reward = 0
    obs = env.reset()
    step = 0

    for step in range(max_step):
        # Everything up to the loss, including env.step, runs inside the tape.
        with tf.GradientTape() as tape:
            q_values = qmodel(np.array([obs]))
            action = take_action(env,q_values ,obs ,eps)

            obs_new,reward ,done ,info=env.step(action)
            # The env's own done flag is replaced: the episode ends once fewer than 4 lives remain.
            done=True if info['lives']<4 else False

            # Add experience to replay buffer
            if replay_buffer is not None:
                replay_buffer.add(obs=obs,
                                  action=action,
                                  reward=reward,
                                  next_obs=obs_new,
                                  done=done)

            # Sample from replay buffer and update model
            if replay_buffer is not None and len(replay_buffer) > batch_size:
                experiences = replay_buffer.sample(batch_size)
                obses = experiences['obs']
                actions = experiences['action']
                rewards = experiences['reward']
                next_obses = experiences['next_obs']
                dones = experiences['done']

                # Next-state values come from the target network and carry no gradient.
                q_values_next_target=tf.stop_gradient(target_model(next_obses))
                
                if is_double:
                    # Double DQN: the online network selects the next action.
                    q_values_next_online=qmodel(next_obses)
                    actions_next=tf.argmax(q_values_next_online,axis=1)
                else:
                    actions_next=tf.argmax(q_values_next_target,axis=1)

                # Q-value of the action actually taken (one-hot mask, then sum over actions).
                action_q=tf.reduce_sum(qmodel(obses)*tf.one_hot(actions,n_action),axis=1)
                
                action_q_next=tf.reduce_sum(q_values_next_target*tf.one_hot(actions_next,n_action),axis=1)
                
                # Bootstrapped target; the next-state term is zeroed where done is true.
                action_target=rewards+(1-tf.cast(dones ,tf.float32))*gamma*action_q_next
                
                error=action_target-action_q
                # Sum (not mean) of squared TD errors over the batch.
                loss=tf.reduce_sum(tf.square(error))
                
            else: # If no replay buffer or not enough samples yet
                # Same update as above, computed on the single current transition.
                q_values_next_target=tf.stop_gradient(target_model(np.array([obs_new])))

                if is_double:
                    q_values_next_online=qmodel(np.array([obs_new]))
                    action_next=tf.argmax(q_values_next_online,axis=1)
                else:
                    action_next=tf.argmax(q_values_next_target,axis=1)
                    
                action_q=tf.reduce_sum(q_values*tf.one_hot(action,n_action),axis=1)
                
                action_q_next=tf.reduce_sum(q_values_next_target*tf.one_hot(action_next,n_action),axis=1)
                
                action_target=reward+(1-tf.cast(done ,tf.float32))*gamma*action_q_next
                
                error=action_target-action_q
                loss=tf.reduce_sum(tf.square(error))
                    
        grad=tape.gradient(loss,qmodel.trainable_variables)
        trainer.apply_gradients(zip(grad,qmodel.trainable_variables))
        total_reward+=reward
        
        obs=obs_new
        # NOTE(review): the break happens before the sync below, so no target
        # sync is performed on the final step of an episode.
        if done:
            break
        
        # Copy the online weights into the target network every `tau` steps.
        if (step+1)%tau==0:
            target_model.set_weights(qmodel.get_weights())
 
    end_time=time.time()
    runtime=end_time-start_time
    
    # NOTE(review): `step` is the index of the last step, i.e. steps taken minus one.
    return total_reward,runtime ,step

In [24]:
def show_state(env, step=0, info=""):
    """Render the current frame of ``env`` inline, replacing the previous output.

    Args:
        env: environment supporting ``render(mode='rgb_array')`` and ``spec.id``.
        step: step counter shown in the title.
        info: extra text (or any object) appended to the title via ``%s``.
    """
    # Reuse figure number 3 and wipe whatever was drawn on it before.
    plt.figure(3)
    plt.clf()

    frame = env.render(mode='rgb_array')
    plt.imshow(frame)
    caption = "%s | Step: %d %s" % (env.spec.id, step, info)
    plt.title(caption)
    plt.axis('off')

    # Swap the cell output for the freshly drawn figure.
    display.clear_output(wait=True)
    current_figure = plt.gcf()
    display.display(current_figure)

In [25]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of ``(obs, action, reward, next_obs, done)`` transitions.

    Once ``size`` transitions are stored, each new one overwrites the oldest.
    """

    def __init__(self, size):
        self.size = size        # maximum number of stored transitions
        self.buffer = []        # stored transitions, at most `size` of them
        self.next_idx = 0       # slot that the next `add` call writes to

    def add(self, obs, action, reward, next_obs, done):
        """Store one transition, overwriting the oldest one when full."""
        data = (obs, action, reward, next_obs, done)

        if self.next_idx >= len(self.buffer):
            self.buffer.append(data)
        else:
            self.buffer[self.next_idx] = data
        self.next_idx = (self.next_idx + 1) % self.size

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            A dict with keys ``obs``, ``action``, ``reward``, ``next_obs`` and
            ``done``, each a NumPy array whose first axis has length ``batch_size``.

        Raises:
            ValueError: if the buffer is empty.
        """
        if not self.buffer:
            raise ValueError("cannot sample from an empty ReplayBuffer")

        idxs = np.random.randint(0, len(self.buffer), size=batch_size)
        obses, actions, rewards, next_obses, dones = [], [], [], [], []

        for idx in idxs:
            obs, action, reward, next_obs, done = self.buffer[idx]
            # np.asarray copies only when needed. The previous
            # `np.array(x, copy=False)` raises ValueError under NumPy >= 2
            # whenever a copy cannot be avoided (e.g. for plain Python scalars).
            obses.append(np.asarray(obs))
            actions.append(np.asarray(action))
            rewards.append(reward)
            next_obses.append(np.asarray(next_obs))
            dones.append(done)

        return dict(obs=np.array(obses),
                    action=np.array(actions),
                    reward=np.array(rewards),
                    next_obs=np.array(next_obses),
                    done=np.array(dones))

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [26]:
def train(check_point, env, qmodel, target_model, trainer, eps, decay_rate, eps_threshold, gamma, tau, max_step, n_iteration, save_name, is_double=False, replay_size=-1):
    """Train ``qmodel`` for ``n_iteration`` episodes and save the model and logs.

    Epsilon is decayed by ``decay_rate`` (floored at ``eps_threshold``) before
    every episode. Every ``check_point`` episodes the mean reward of the last
    ``check_point`` episodes is recorded and shown on the progress bar. When
    ``replay_size`` is positive a ``ReplayBuffer`` of that capacity is used.

    After the last episode the model is written to ``save_name + '.h5'`` and the
    per-episode rewards, averaged rewards, runtimes and step counts to
    ``save_name + '_reward.npy'``, ``'_avg_reward.npy'``, ``'_runtime.npy'`` and
    ``'_step.npy'``.

    Returns:
        ``(avg_reward_list, reward_list, runtime_list, n_step_list)``.
    """
    reward_list, runtime_list, n_step_list = [], [], []
    avg_reward_list = []

    replay_buffer = ReplayBuffer(replay_size) if replay_size > 0 else None

    p_bar = tqdm(range(n_iteration))
    for i in p_bar:
        eps = max(eps*decay_rate, eps_threshold)
        episode_stats = train_one_episode(env, qmodel, target_model, trainer, eps, gamma, tau, max_step, is_double, replay_buffer)
        reward, runtime, n_step = episode_stats
        reward_list.append(reward)
        runtime_list.append(runtime)
        n_step_list.append(n_step)

        # Only report on checkpoint episodes.
        if (i+1) % check_point != 0:
            continue
        avg_reward = np.mean(reward_list[-check_point:])
        avg_reward_list.append(avg_reward)
        p_bar.set_postfix_str(f"Iteration:{i} current reward:{reward_list[-1]} current n_step:{n_step_list[-1]} last {check_point} average reward:{avg_reward}", refresh=False)

    # Persist the trained network first, then the training logs.
    qmodel.save(save_name+'.h5')
    logs = (
        ('_reward.npy', reward_list),
        ('_avg_reward.npy', avg_reward_list),
        ('_runtime.npy', runtime_list),
        ('_step.npy', n_step_list),
    )
    for suffix, values in logs:
        np.save(save_name+suffix, values)
    return avg_reward_list, reward_list, runtime_list, n_step_list

# Train for DQN, without Dueling network, without Double

In [None]:
# Two independently initialised plain DQNs: the online network first, then the target network.
qmodel, target_model = (
    create_dqn(input_shape=obs.shape, hidden_size=hidden_size, n_action=n_action)
    for _ in range(2)
)
trainer = keras.optimizers.Adam(learning_rate=0.001)

In [None]:
n_iteration = 3000
save_name = "DQN"
# Plain DQN: no double-Q target, no replay buffer (replay_size keeps its default).
avg_reward_list, reward_list, runtime_list, n_step_list = train(
    check_point=check_point,
    env=env,
    qmodel=qmodel,
    target_model=target_model,
    trainer=trainer,
    eps=eps,
    decay_rate=decay_rate,
    eps_threshold=eps_threshold,
    gamma=gamma,
    tau=tau,
    max_step=max_step,
    n_iteration=n_iteration,
    save_name=save_name,
    is_double=False,
)

In [None]:
# Summary statistics over all episodes of the run above.
print("average_reward:{}".format(sum(reward_list)/n_iteration))
print("average_time:{}".format(sum(runtime_list)/n_iteration))
print("max_step:{}".format(max(n_step_list)))

# Reward per episode.
fig, ax = plt.subplots()
ax.plot(reward_list)
ax.set_title('Reward for episodes')
ax.set_ylabel('Reward')
ax.set_xlabel('Episodes')
plt.show()

# Last step index per episode.
fig, ax = plt.subplots()
ax.plot(n_step_list, color='green')
ax.set_title('Run steps for episodes')
ax.set_ylabel('Steps')
ax.set_xlabel('Episodes')
plt.show()

# Train for DQN, with Dueling network, without double

In [None]:
# Two independently initialised dueling DQNs: the online network first, then the target network.
qmodel, target_model = (
    create_dueling_dqn(input_shape=obs.shape, hidden_size=hidden_size, n_action=n_action)
    for _ in range(2)
)
trainer = keras.optimizers.Adam(learning_rate=0.001)

In [None]:
n_iteration = 3000
save_name = "DuelingDQN"
# Dueling architecture, no double-Q target, no replay buffer.
avg_reward_list, reward_list, runtime_list, n_step_list = train(
    check_point=check_point,
    env=env,
    qmodel=qmodel,
    target_model=target_model,
    trainer=trainer,
    eps=eps,
    decay_rate=decay_rate,
    eps_threshold=eps_threshold,
    gamma=gamma,
    tau=tau,
    max_step=max_step,
    n_iteration=n_iteration,
    save_name=save_name,
    is_double=False,
)

In [None]:
# Summary statistics over all episodes of the run above.
print("average_reward:{}".format(sum(reward_list)/n_iteration))
print("average_time:{}".format(sum(runtime_list)/n_iteration))
print("max_step:{}".format(max(n_step_list)))

# Reward per episode.
fig, ax = plt.subplots()
ax.plot(reward_list)
ax.set_title('Reward for episodes')
ax.set_ylabel('Reward')
ax.set_xlabel('Episodes')
plt.show()

# Last step index per episode.
fig, ax = plt.subplots()
ax.plot(n_step_list, color='green')
ax.set_title('Run steps for episodes')
ax.set_ylabel('Steps')
ax.set_xlabel('Episodes')
plt.show()

# Train for DQN, with Dueling Network, with Double

In [None]:
# Two independently initialised dueling DQNs: the online network first, then the target network.
qmodel, target_model = (
    create_dueling_dqn(input_shape=obs.shape, hidden_size=hidden_size, n_action=n_action)
    for _ in range(2)
)
trainer = keras.optimizers.Adam(learning_rate=0.001)

In [None]:
n_iteration = 3000
save_name = "DoubleDuelingDQN"
# Dueling architecture with the double-Q target, no replay buffer.
avg_reward_list, reward_list, runtime_list, n_step_list = train(
    check_point=check_point,
    env=env,
    qmodel=qmodel,
    target_model=target_model,
    trainer=trainer,
    eps=eps,
    decay_rate=decay_rate,
    eps_threshold=eps_threshold,
    gamma=gamma,
    tau=tau,
    max_step=max_step,
    n_iteration=n_iteration,
    save_name=save_name,
    is_double=True,
)

In [None]:
# Summary statistics over all episodes of the run above.
print("average_reward:{}".format(sum(reward_list)/n_iteration))
print("average_time:{}".format(sum(runtime_list)/n_iteration))
print("max_step:{}".format(max(n_step_list)))

# Reward per episode.
fig, ax = plt.subplots()
ax.plot(reward_list)
ax.set_title('Reward for episodes')
ax.set_ylabel('Reward')
ax.set_xlabel('Episodes')
plt.show()

# Last step index per episode.
fig, ax = plt.subplots()
ax.plot(n_step_list, color='green')
ax.set_title('Run steps for episodes')
ax.set_ylabel('Steps')
ax.set_xlabel('Episodes')
plt.show()

# Train for DQN, with Dueling Network, with Double, with replay buffer

In [27]:
# Two independently initialised dueling DQNs: the online network first, then the target network.
qmodel, target_model = (
    create_dueling_dqn(input_shape=obs.shape, hidden_size=hidden_size, n_action=n_action)
    for _ in range(2)
)
trainer = keras.optimizers.Adam(learning_rate=0.001)

In [None]:
n_iteration = 3000
save_name = "ReplayDoubleDuelingDQN"
# This run is the "Dueling + Double + replay buffer" configuration, so the
# double-Q target must be enabled. It was previously passed as is_double=False,
# which trained a non-double agent under the "ReplayDoubleDuelingDQN" name.
avg_reward_list, reward_list, runtime_list, n_step_list = train(check_point, env, qmodel, target_model, trainer, eps, decay_rate, eps_threshold, gamma, tau, max_step, n_iteration, save_name, is_double=True, replay_size=MEM_SIZE)

In [None]:
# Extra copy of the trained network under a second file name.
# NOTE(review): train() already wrote this model to save_name + '.h5'; confirm this copy is still needed.
qmodel.save('DuelingReplay'+ str(3000) + '.h5')

In [None]:
# Summary statistics over all episodes of the run above.
print("average_reward:{}".format(sum(reward_list)/n_iteration))
print("average_time:{}".format(sum(runtime_list)/n_iteration))
print("max_step:{}".format(max(n_step_list)))

# Reward per episode.
fig, ax = plt.subplots()
ax.plot(reward_list)
ax.set_title('Reward for episodes')
ax.set_ylabel('Reward')
ax.set_xlabel('Episodes')
plt.show()

# Last step index per episode.
fig, ax = plt.subplots()
ax.plot(n_step_list, color='green')
ax.set_title('Run steps for episodes')
ax.set_ylabel('Steps')
ax.set_xlabel('Episodes')
plt.show()

# Save training log

In [None]:
import os
import shutil

destination_folder = '/content/drive/MyDrive/'

# Move the training logs first, then the saved models, from the working
# directory to Google Drive. The directory is listed afresh for each extension.
for extension in ('.npy', '.h5'):
    for file in os.listdir():
        if not file.endswith(extension):
            continue
        shutil.move(file, destination_folder)

# Test

In [None]:
from keras.models import load_model
# Trained network to evaluate.
# NOTE(review): 'DDQN_v3.h5' is not produced by any cell in this notebook; confirm the file is present.
model = load_model('DDQN_v3.h5')

# Second, independent copy of the same network. The previous code called
# `create_ddqn`, which is not defined anywhere in this notebook (NameError).
# Loading the file again yields the same architecture and weights without
# having to know which builder produced the checkpoint.
target_model = load_model('DDQN_v3.h5')

In [None]:
# Greedy evaluation rollout: always take the arg-max action, render each step,
# and print the accumulated reward once all lives are lost.
# The TD-error / loss computation that used to run on every step was removed:
# its results were never read, and it cost an extra target-network forward pass
# per frame.
total_reward = 0
obs = env.reset()
for step in range(max_step):
    q_values = model.predict(np.array([obs]))
    action = take_action(env, q_values, obs, greedy=True)

    obs, reward, done, info = env.step(action)

    show_state(env, step, info)

    total_reward += reward

    # `done` already turns true after the first lost life (see ConcatObs), so
    # the rollout only stops when no lives remain.
    if done and info['lives']==0:
        print(total_reward)
        break

In [None]:
# Hard-coded series of 50 average-reward values, plotted in the next cell
# against range(0, 5000, 100) with the y-label "avg rewards in 100 episode".
# NOTE(review): presumably copied from an earlier training run; the runs in this
# notebook use n_iteration = 3000, which would give 30 values. Confirm the source.
avg_rewards_plot = [154.2,
  191.3,
  201.1,
  187.3,
  97.9,
  299.3,
  384.9,
  232.3,
  285.5,
  114.4,
  106.0,
  247.3,
  197.3,
  101.1,
  286.5,
  386.3,
  452.6,
  386.9,
  463.0,
  401.4,
  468.2,
  525.4,
  552.9,
  453.0,
  481.5,
  529.8,
  518.8,
  532.2,
  533.8,
  523.4,
  361.0,
  557.9,
  505.7,
  499.4,
  487.8,
  468.3,
  494.9,
  551.5,
  447.1,
  540.7,
  493.8,
  433.4,
  541.6,
  566.7,
  528.6,
  541.1,
  460.1,
  379.5,
  387.0,
  434.3]

In [None]:
# Plot the averaged rewards against the starting episode of each 100-episode
# window and save the figure next to the notebook.
fig, ax = plt.subplots()
episode_axis = range(0,5000,100)
ax.plot(episode_axis, avg_rewards_plot)
ax.set_xlabel("episode")
ax.set_ylabel("avg rewards in 100 episode")
ax.set_title("Average reward for episodes")
fig.savefig("avg_reward_episode")