<a href="https://colab.research.google.com/github/akj0811/RL_Atari/blob/master/D_DDQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# !apt-get install -y xvfb python-opengl > /dev/null 2>&1
# !pip install gym pyvirtualdisplay > /dev/null 2>&1

from google.colab import drive
drive.mount('/content/drive')

import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import pickle
import random
import cv2 as cv
from google.colab.patches import cv2_imshow

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
class ReplayMemory:
    """Fixed-capacity ring buffer of experiences.

    Once ``max_size`` items have been stored, each new item overwrites
    the oldest one.
    """

    def __init__(self, max_size):
        """Allocate ``max_size`` empty slots and reset the write cursor."""
        self.max_size = max_size
        self.buffer = [None] * max_size
        self.size = 0    # number of slots currently holding an item
        self.index = 0   # slot the next append() will write to

    def append(self, obj):
        """Store ``obj`` at the write cursor, then advance the cursor (wrapping)."""
        slot = self.index
        self.buffer[slot] = obj
        if self.size < self.max_size:
            self.size += 1
        self.index = (slot + 1) % self.max_size

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored items chosen uniformly at random.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored items
                (propagated from ``random.sample``).
        """
        chosen = random.sample(range(self.size), batch_size)
        return list(map(self.buffer.__getitem__, chosen))

In [0]:
class Model:
    """Dueling convolutional Q-network over a stack of 84x84 frames.

    The compiled Keras model is available as ``self.model``; it maps an
    input of shape ``(batch, 84, 84, frame_size)`` to ``num_actions``
    Q-values per sample.
    """

    def __init__(self, frame_size, num_actions):
        """Build and compile the network.

        Args:
            frame_size: number of stacked frames (input channels).
            num_actions: number of discrete actions (output width).
        """
        self.frame_size = frame_size
        self.num_actions = num_actions
        self.model = self.build_network()

    def build_network(self):
        """Construct, compile, store (as ``self.model``) and return the Keras model.

        The head uses the dueling aggregation
        ``Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))``.
        """
        frames = keras.Input(shape=(84, 84, self.frame_size))
        conv1 = keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu')(frames)
        pool1 = keras.layers.MaxPooling2D(pool_size=(2, 2))(conv1)
        conv2 = keras.layers.Conv2D(32, kernel_size=(5, 5), activation='relu')(pool1)
        pool2 = keras.layers.MaxPooling2D(pool_size=(2, 2))(conv2)
        flatten = keras.layers.Flatten()(pool2)

        # NOTE(review): the advantage stream keeps the original 'relu'
        # activation; a linear output is the more common choice - confirm.
        advantage = keras.layers.Dense(self.num_actions, activation='relu')(flatten)

        # Centre the advantages per sample. The previous code computed this
        # term but never connected it to the output, so the network returned
        # V + A without the mean subtraction. keepdims=True keeps the reduced
        # axis so the subtraction broadcasts across the action axis.
        centered_advantage = keras.layers.Lambda(
            lambda x: x - tf.reduce_mean(x, axis=-1, keepdims=True)
        )(advantage)

        state_value = keras.layers.Dense(1)(flatten)
        # (batch, num_actions) + (batch, 1): the value is added to every action.
        output = keras.layers.Add()([centered_advantage, state_value])

        self.model = keras.Model(
            inputs=[frames],
            outputs=[output]
        )
        # NOTE(review): 'accuracy' is kept from the original compile call even
        # though the loss is a regression (MSE) on Q-values.
        self.model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
        return self.model

In [0]:
class Agent:
    """Double-DQN agent with experience replay and a periodically synced target network.

    ``value_model`` is trained on every step; ``target_model`` receives a copy
    of its weights whenever ``num_updates`` reaches the merge threshold.
    """

    def __init__(self, agent_params):
        """Create the agent from a parameter dictionary.

        Args:
            agent_params: dict with the keys
                'discount' (discount factor applied to the bootstrap value),
                'epsilon' (initial exploration probability),
                'num_actions' (number of discrete actions),
                'exp_size' (replay memory capacity),
                'frame_size' (number of stacked frames per state),
                'merge_threshold' (sampled transitions between target syncs),
                'batch_size' (maximum transitions sampled per update).
        """
        self.discount = agent_params['discount']
        self.epsilon = agent_params['epsilon']
        self.num_actions = agent_params['num_actions']
        self.exp_size = agent_params['exp_size']
        self.frame_size = agent_params['frame_size']
        self.merge = agent_params['merge_threshold']
        self.batch_size = agent_params['batch_size']
        self.last_action = None
        self.last_state = None
        self.exp = ReplayMemory(self.exp_size)
        self.frames = self._blank_frames()
        self.target_model = Model(self.frame_size, self.num_actions)
        self.value_model = Model(self.frame_size, self.num_actions)
        self.num_updates = 0   # transitions sampled since the last target sync
        self.num_steps = 0     # actions chosen so far
        self.num_games = 0     # episodes finished so far
        self.num_merge = 0     # target syncs performed so far

    def _blank_frames(self):
        """Return an all-zero frame stack of shape (1, 84, 84, frame_size)."""
        return np.zeros((1, 84, 84, self.frame_size))

    def phi(self, state):
        """Push a new frame onto the frame stack and return the updated stack.

        Args:
            state: array of shape (1, H, W, 1) with pixel values in [0, 255];
                it is scaled by 1/255 before being stored.

        Returns:
            The new stack (also stored in ``self.frames``): the previous
            stack without its oldest channel, followed by the new frame.
        """
        # Drop the OLDEST channel (index 0). The previous slice kept channels
        # [:frame_size - 1], which discarded the newest frame instead, so only
        # the last channel ever changed.
        history = self.frames[:, :, :, 1:]
        self.frames = np.concatenate([history, state / 255.0], axis=-1)
        return self.frames

    def run(self):
        """Sample a batch from replay memory and fit the value network once.

        Targets follow Double DQN: the value network selects the best action
        in the next state and the target network evaluates it. The target
        network is synced once ``num_updates`` reaches the merge threshold.
        """
        sample_count = min(self.batch_size, self.exp.size)
        self.num_updates += sample_count
        batch = self.exp.sample(sample_count)

        x_train = np.concatenate([transition[0] for transition in batch], axis=0)
        x_next = np.concatenate([transition[3] for transition in batch], axis=0)

        y_train = self.value_model.model.predict(x_train)
        # Action selection must use the online network on the NEXT state;
        # previously the argmax was taken over Q(current state).
        next_online = self.value_model.model.predict(x_next)
        next_target = self.target_model.model.predict(x_next)

        for row, (_, last_action, reward, _, terminal) in enumerate(batch):
            if terminal == 0:
                best_next = np.argmax(next_online[row])
                y_train[row][last_action] = (
                    reward + self.discount * next_target[row][best_next]
                )
            else:
                # No bootstrap past the end of an episode.
                y_train[row][last_action] = reward

        self.value_model.model.fit(x_train, y_train, verbose=0, epochs=1)

        if self.num_updates >= self.merge:
            self.merge_model()

    def merge_model(self):
        """Copy the value network's weights into the target network."""
        self.target_model.model.set_weights(self.value_model.model.get_weights())
        self.num_merge += 1
        self.num_updates = 0

    def epsilon_decay(self):
        """Lower epsilon by 0.9/1e6 per call while it is above 0.1."""
        if self.epsilon > 0.1:
            self.epsilon -= 0.9 / 1000000

    def choose_action(self, state):
        """Pick an action epsilon-greedily for ``state`` and decay epsilon.

        Args:
            state: frame stack of shape (1, 84, 84, frame_size).

        Returns:
            Index of the chosen action.
        """
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.num_actions)
        else:
            action = np.argmax(self.value_model.model.predict(state)[0])

        self.num_steps += 1
        self.epsilon_decay()
        return action

    def agent_start(self, env_state):
        """Begin an episode: build the first state and return the first action.

        Args:
            env_state: pre-processed first frame, shape (1, 84, 84, 1).
        """
        # Start from an empty stack so frames from the previous episode do
        # not leak into the first states of this one.
        self.frames = self._blank_frames()
        state = self.phi(env_state)
        self.last_action = self.choose_action(state)
        self.last_state = state
        return self.last_action

    def agent_step(self, reward, env_state):
        """Record the last transition, train once, and return the next action.

        Args:
            reward: reward received for the previous action.
            env_state: pre-processed new frame, shape (1, 84, 84, 1).
        """
        state = self.phi(env_state)
        memory = (self.last_state, self.last_action, reward, state, 0)
        self.exp.append(memory)
        self.run()
        self.last_action = self.choose_action(state)
        self.last_state = state
        return self.last_action

    def agent_end(self, reward):
        """Record the terminal transition, train once, and count the episode.

        Args:
            reward: reward received for the final action.
        """
        # The next state of a terminal transition is a zero placeholder; its
        # channel count follows frame_size rather than a hard-coded 4.
        memory = (self.last_state, self.last_action, reward, self._blank_frames(), 1)
        self.exp.append(memory)
        self.run()
        self.num_games += 1

    def plot(self, states):
        """Return the mean over ``states`` of the maximum predicted Q-value."""
        y = np.mean(np.max(self.value_model.model.predict(states), axis=-1))
        return y
    

In [0]:
def pre_process(image):
    """Turn an RGB frame into a single-channel 84x84 batch of one.

    The frame is converted to grayscale, resized to 84 columns by 110 rows,
    cropped to rows 18..101, and reshaped to ``(1, 84, 84, 1)``.
    """
    gray = cv.cvtColor(image, cv.COLOR_RGB2GRAY)
    resized = cv.resize(gray, (84, 110))
    cropped = resized[18:102, :]
    return cropped.reshape(1, 84, 84, 1)

In [0]:
# Hyper-parameters read by Agent.__init__.
agent_params = {
    'discount' :0.99,             # discount factor for the bootstrap value
    'epsilon' : 1.0,              # initial exploration probability
    'num_actions' : 6,            # number of discrete actions
    'exp_size': 100000,           # replay memory capacity
    'frame_size' : 4,             # stacked frames per state
    'merge_threshold' : 100000,   # sampled transitions between target syncs
    'batch_size' : 32,            # maximum transitions sampled per update
}
agent = Agent(agent_params)

# Training curves appended to by train(): game counts, mean max-Q over the
# evaluation states, and average reward.
episodes, q_value, avg_reward = [], [], []

In [0]:
# Create the gym 'Pong-v0' environment that train() steps through.
env = gym.make('Pong-v0')

In [0]:
# Draw 10000 replay-memory slots (with replacement) and copy the stored
# "last_state" of each into a fixed evaluation set used by Agent.plot().
# NOTE(review): requires agent.exp.size > 0; np.random.choice raises on an
# empty memory, so run this only after the agent has stored transitions.
indices = np.random.choice(agent.exp.size, 10000)
test_states = np.zeros((10000, 84, 84, 4))
for row, slot in enumerate(indices):
    stored_transition = agent.exp.buffer[slot]
    test_states[row, :] = stored_transition[0]

In [0]:
def _save_checkpoint(agent, suffix):
    """Pickle the training curves, evaluation states and agent.

    Files are written to '/content/drive/My Drive/DDQN/<name><suffix>.pkl'
    for the module-level ``episodes``, ``q_value``, ``avg_reward`` and
    ``test_states`` objects, followed by ``agent``.
    """
    artifacts = {
        'episodes': episodes,
        'q_value': q_value,
        'avg_reward': avg_reward,
        'test_states': test_states,
        'agent': agent,
    }
    for name, obj in artifacts.items():
        path = '/content/drive/My Drive/DDQN/{}{}.pkl'.format(name, suffix)
        with open(path, 'wb') as f:
            pickle.dump(obj, f)


def train(agent, env, num_games=20):
    """Play ``num_games`` episodes, training the agent on every step.

    Side effects on module-level state: appends to ``episodes``, ``q_value``
    and ``avg_reward`` every 50 finished games, prints a summary every 100,
    and writes alternating checkpoints (suffix 0 at multiples of 400,
    suffix 1 at other multiples of 200). Plots both curves when done.

    Args:
        agent: object providing agent_start / agent_step / agent_end / plot
            and the num_games / num_steps counters.
        env: gym-style environment with reset() and step().
        num_games: number of episodes to play in this call (default 20, the
            value that used to be hard-coded).
    """
    total = 0            # reward accumulated since the last 50-game record
    running_reward = 0   # reward accumulated since the last 100-game print
    for game in range(1, num_games + 1):
        action = agent.agent_start(pre_process(env.reset()))
        # The action is passed wrapped in a list, as in the original code.
        # NOTE(review): confirm env.step accepts this form.
        observations, reward, done, info = env.step([action])
        total += reward
        running_reward += reward
        while not done:
            action = agent.agent_step(reward, pre_process(observations))
            observations, reward, done, info = env.step([action])
            total += reward
            # Previously only the first reward of each game was added to the
            # printed statistic; accumulate every step like ``total``.
            running_reward += reward

        agent.agent_end(reward)
        if agent.num_games % 50 == 0:
            episodes.append(agent.num_games)
            q_value.append(agent.plot(test_states))
            avg_reward.append(total/50)
            total = 0

        if agent.num_games % 100 == 0:
            print('Games = {}, Steps = {}, Reward = {}'.format(
                game, agent.num_steps, running_reward/100))
            running_reward = 0

        # Alternate between two checkpoint sets so one complete set survives
        # if a write is interrupted.
        if agent.num_games % 400 == 0:
            _save_checkpoint(agent, 0)
        elif agent.num_games % 200 == 0:
            _save_checkpoint(agent, 1)

    plt.plot(episodes, q_value)
    plt.plot(episodes, avg_reward)
    plt.show()

In [0]:
# Run the training loop on the module-level agent and environment.
train(agent, env)

In [0]:
# Restore the suffix-0 checkpoint written by train(): the agent, both
# training curves and the evaluation states.
# NOTE: pickle.load can execute arbitrary code; only load files you wrote.
with open('/content/drive/My Drive/DDQN/agent0.pkl', 'rb') as f:
    agent = pickle.load(f)
with open('/content/drive/My Drive/DDQN/episodes0.pkl', 'rb') as f:
    episodes = pickle.load(f)
with open('/content/drive/My Drive/DDQN/q_value0.pkl', 'rb') as f:
    q_value = pickle.load(f)
with open('/content/drive/My Drive/DDQN/avg_reward0.pkl', 'rb') as f:
    avg_reward = pickle.load(f)
with open('/content/drive/My Drive/DDQN/test_states0.pkl', 'rb') as f:
    test_states = pickle.load(f)

In [0]:
# Plot the recorded mean max-Q and average reward against the game count.
plt.plot(episodes, q_value)
plt.plot(episodes, avg_reward)
plt.show()

In [0]:
# Show how many actions the agent has chosen so far.
print(agent.num_steps)