# Deep Q Network Tutorial

## 1. Environment Preparation
### 1.1 Mount drive and set project path.

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Add the project folder on the mounted drive to the module search path.
# `project_root` is also reused by later cells as the base directory for
# saved models and generated GIFs.
import sys
project_root = '/content/drive/My Drive/DQN_tutorial/'
sys.path.append(project_root)

### 1.2 Download the Atari ROMs.

In [None]:
# Fetch the ROM archive into the current directory (the extraction step below
# expects it at /content/Roms.rar). -nc skips the download when the file is
# already present, so re-running the cell does not create Roms.rar.1.
! wget -nc http://www.atarimania.com/roms/Roms.rar
# -p: succeed even when the directory already exists (cell re-run).
! mkdir -p /content/ROM/
# -o+: overwrite previously extracted files instead of asking.
! unrar e -o+ /content/Roms.rar /content/ROM/
# Register the extracted ROMs with atari_py.
! python -m atari_py.import_roms /content/ROM/

## 2. Pong Game and Wrapper
### 2.1 Test the Pong environment.

In [None]:
import gym
import matplotlib.pyplot as plt

# Create the raw (unwrapped-by-us) Pong environment used throughout the notebook.
env_name = "PongNoFrameskip-v4"
env = gym.make(env_name)

# Print a short summary of the action and observation spaces.
print("environment:", env_name)
print("action space:", env.action_space.n)
print("action:", env.unwrapped.get_action_meanings())
print("observation space:", env.observation_space.shape)

# Take one random step and display the resulting frame.
# NOTE: this cell relies on the gym API in which reset() returns only the
# observation and step() returns a 4-tuple (obs, reward, done, info).
state = env.reset()
action = env.action_space.sample()
state_next, reward, done, info = env.step(action)
plt.figure()
plt.imshow(state_next)

### 2.2 Environment wrapper.

In [None]:
import numpy as np
from PIL import Image

class PongEnvWrapper(gym.Wrapper):
    """Gym wrapper that is meant to turn Pong frames into stacked, preprocessed states.

    The wrapper advertises an observation space of shape
    ``(k, img_size[0], img_size[1])`` with float32 values in ``[0, 1]``.
    Producing observations that actually match that space is the subject of
    Lab-1 to Lab-3; until those are implemented, ``reset`` returns the raw
    frame unchanged and ``step`` raises ``NotImplementedError``.

    Args:
        env: The environment to wrap.
        k: Number of frames in one stacked observation.
        img_size: ``(height, width)`` of each preprocessed frame.
    """

    def __init__(self, env, k, img_size=(84,84)):
        super().__init__(env)
        self.k = k
        self.img_size = img_size
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(k, img_size[0], img_size[1]), dtype=np.float32)

    def _preprocess(self, state, th=0.4):
        """Preprocess one raw frame (currently a pass-through).

        ``th`` is unused until Lab-1 is implemented; presumably it is a
        binarization threshold -- confirm when filling in the lab.
        """
        # TODO(Lab-1): Image processing.
        return state

    def reset(self):
        """Reset the wrapped environment and return the initial state.

        Currently returns the single preprocessed frame; stacking it into the
        advertised ``(k, H, W)`` shape is left to Lab-2.
        """
        state = self.env.reset()
        state = self._preprocess(state)
        # TODO(Lab-2): Construct initial stacked frame.
        return state

    def step(self, action):
        """Advance the environment by one action.

        Once implemented, this must return the 4-tuple
        ``(state_next, reward, done, info)`` that the rest of the notebook
        unpacks from ``env.step``.

        Raises:
            NotImplementedError: Always, until Lab-3 is implemented.
        """
        # TODO(Lab-3): Construct stacked frames.
        # Fail with an explicit message rather than a NameError on the
        # not-yet-defined return values.
        raise NotImplementedError(
            "Lab-3: implement PongEnvWrapper.step() and return "
            "(state_next, reward, done, info)."
        )

In [None]:
# Test Code
# Wrap the raw environment; `env_pong` is the environment used by the
# play/train/evaluate cells further down.
env_pong = PongEnvWrapper(env, k=4, img_size=(84,84))
print("observation space:", env_pong.observation_space.shape)

# Take one random step through the wrapper and show the first entry along
# axis 0 of the returned state as a grayscale image.
state = env_pong.reset()
action = env_pong.action_space.sample()
state_next, reward, done, info = env_pong.step(action)
print(state_next.shape)
plt.imshow(state_next[0], cmap="gray")

## 3. Reinforcement Learning
### 3.1 Convolutional Neural Network

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class QNet(nn.Module):
    """Q-network placeholder for Lab-4.

    Contract expected by ``DeepQNetwork``:

    * it is constructed as ``QNet(input_shape, n_actions)``;
    * ``forward`` receives a float tensor with a leading batch dimension and
      must return a tensor whose dimension 1 indexes the actions (the agent
      takes ``torch.max(output, 1)`` to pick the greedy action).

    Until the lab is implemented, constructing or calling the network raises
    ``NotImplementedError`` so the failure points at the missing exercise.
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()
        # TODO(Lab-4): Q-Network architecture.
        raise NotImplementedError("Lab-4: define the Q-network layers in QNet.__init__().")

    def forward(self, x):
        # TODO(Lab-4): Q-Network forward pass.
        raise NotImplementedError("Lab-4: implement QNet.forward().")

### 3.2 DQN Algorithm

In [None]:
class DeepQNetwork():
    """DQN agent holding an online network, a target network and a replay memory.

    Args:
        n_actions: Number of discrete actions.
        input_shape: Shape of one state, e.g. ``[stack_frames, H, W]``.
        qnet: Network class; instantiated as ``qnet(input_shape, n_actions)``.
        device: Torch device the networks live on.
        learning_rate: Learning rate of the RMSprop optimizer.
        reward_decay: Discount factor, stored as ``self.gamma``.
        replace_target_iter: Stored for use by ``learn`` (target-network
            refresh interval -- the refresh itself is part of Lab-5).
        memory_size: Number of transitions the replay memory can hold.
        batch_size: Stored for use by ``learn``.
    """

    def __init__(
        self,
        n_actions,
        input_shape,
        qnet,
        device,
        learning_rate = 2e-4,
        reward_decay = 0.99,
        replace_target_iter = 1000,
        memory_size = 10000,
        batch_size = 32,
    ):
        # initialize parameters
        self.n_actions = n_actions
        self.input_shape = input_shape
        self.lr = learning_rate
        self.gamma = reward_decay
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.device = device
        self.learn_step_counter = 0
        self.init_memory()

        # Network
        self.qnet_eval = qnet(self.input_shape, self.n_actions).to(self.device)
        self.qnet_target = qnet(self.input_shape, self.n_actions).to(self.device)
        # Start the target network as an exact copy of the online network
        # instead of leaving it with its own, unrelated random weights.
        self.qnet_target.load_state_dict(self.qnet_eval.state_dict())
        self.qnet_target.eval()
        self.optimizer = optim.RMSprop(self.qnet_eval.parameters(), lr=self.lr)

    def choose_action(self, state, epsilon=0):
        """Pick an action epsilon-greedily.

        Args:
            state: One state (no batch dimension); a batch axis is added here.
            epsilon: Probability-like threshold; a uniform sample in ``[0, 1)``
                greater than ``epsilon`` selects the greedy action, otherwise
                a uniformly random action is returned.

        Returns:
            The index of the chosen action.
        """
        if np.random.uniform() > epsilon:   # greedy
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            # Action selection never needs gradients, so skip graph building.
            with torch.no_grad():
                actions_value = self.qnet_eval(state)
            action = torch.max(actions_value, 1)[1].cpu().numpy()[0]
        else:   # random
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        """Run one DQN update step (Lab-5 placeholder).

        Raises:
            NotImplementedError: Always, until Lab-5 is implemented.
        """
        # TODO(Lab-5): DQN core algorithm.
        raise NotImplementedError("Lab-5: implement the DQN update in DeepQNetwork.learn().")

    def init_memory(self):
        """Allocate an empty replay memory and reset the write counter."""
        # States are stored as float32 (half the footprint of NumPy's default
        # float64); the two state buffers dominate the memory usage.
        self.memory = {
            "s": np.zeros((self.memory_size, *self.input_shape), dtype=np.float32),
            "a": np.zeros((self.memory_size, 1)),
            "r": np.zeros((self.memory_size, 1)),
            "s_": np.zeros((self.memory_size, *self.input_shape), dtype=np.float32),
            "done": np.zeros((self.memory_size, 1)),
        }
        # Total number of transitions stored so far (not capped at memory_size).
        self.memory_counter = 0

    def store_transition(self, s, a, r, s_, d):
        """Store one transition ``(s, a, r, s_, d)`` in the replay memory.

        While ``memory_counter <= memory_size`` slots are written in order
        (the last of these writes wraps around to slot 0); afterwards a
        uniformly random slot is overwritten.
        """
        if self.memory_counter <= self.memory_size:
            index = self.memory_counter % self.memory_size
        else:
            index = np.random.randint(self.memory_size)
        self.memory["s"][index] = s
        self.memory["a"][index] = np.array(a).reshape(-1,1)
        self.memory["r"][index] = np.array(r).reshape(-1,1)
        self.memory["s_"][index] = s_
        self.memory["done"][index] = np.array(d).reshape(-1,1)
        self.memory_counter += 1

    def save_load_model(self, op, path="save", fname="qnet.pt"):
        """Save the online network's weights, or load weights into both networks.

        Args:
            op: ``"save"`` or ``"load"``.
            path: Directory of the checkpoint; created on save if missing.
            fname: Checkpoint file name inside ``path``.

        Raises:
            ValueError: If ``op`` is neither ``"save"`` nor ``"load"``.
        """
        import os
        if op not in ("save", "load"):
            raise ValueError("op must be 'save' or 'load', got {!r}".format(op))
        file_path = os.path.join(path, fname)
        if op == "save":
            os.makedirs(path, exist_ok=True)
            torch.save(self.qnet_eval.state_dict(), file_path)
        else:
            # Read the checkpoint once and copy it into both networks.
            state_dict = torch.load(file_path, map_location=self.device)
            self.qnet_eval.load_state_dict(state_dict)
            self.qnet_target.load_state_dict(state_dict)

In [None]:
# Shared hyper-parameters: these are also passed to play() and train() below.
stack_frames = 4
img_size = (84,84)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the agent. The number of actions is read from the raw `env`;
# input_shape is [stack_frames, H, W].
agent = DeepQNetwork(
        n_actions = env.action_space.n,
        input_shape = [stack_frames, *img_size],
        qnet = QNet,
        device = device,
        learning_rate = 2e-4, 
        reward_decay = 0.99,
        replace_target_iter = 1000, 
        memory_size = 10000,
        batch_size = 32,)

# Show the architecture of the online network.
print(agent.qnet_eval)

## 4. Training and testing process.
### 4.1 Play the game.

In [None]:
def play(env, agent, stack_frames, img_size, max_steps=2000):
    """Play one greedy episode and collect frames for a GIF.

    Args:
        env: Environment to play in; it is reset at the start of the call.
        agent: Object providing ``choose_action(state, epsilon)``.
        stack_frames: Not used by this function; kept so the signature
            matches ``train``.
        img_size: Not used by this function; kept so the signature matches
            ``train``.
        max_steps: Step cap; the episode is cut off once the step count
            exceeds this value (defaults to the previous hard-coded 2000).

    Returns:
        A list of PIL images: one built from ``state[0] * 255`` of the initial
        state, followed by one from ``state_next[0] * 255`` for every second
        step.
    """
    # Reset environment.
    state = env.reset()
    img_buffer = [Image.fromarray(state[0]*255)]

    # Initialize information.
    step = 0
    total_reward = 0

    # One episode.
    while True:
        # Select action (epsilon = 0, i.e. no exploration).
        action = agent.choose_action(state, 0)

        # Get next stacked state.
        state_next, reward, done, info = env.step(action)
        # Keep only every second frame to limit the GIF size.
        if step % 2 == 0:
            img_buffer.append(Image.fromarray(state_next[0]*255))

        state = state_next.copy()
        step += 1
        total_reward += reward
        print('\rStep: {:3d} | Reward: {:.3f} / {:.3f}'\
            .format(step, reward, total_reward), end="")

        if done or step > max_steps:
            print()
            break

    return img_buffer

In [None]:
import os
def save_gif(img_buffer, fname, gif_path=os.path.join(project_root, "gif")):
    """Write a list of PIL images to an animated GIF.

    Args:
        img_buffer: Non-empty list of PIL images; the first one is the base
            frame and the rest are appended to it.
        fname: File name of the GIF inside ``gif_path``.
        gif_path: Output directory, created if missing. The default is
            computed once, when this function is defined, from the global
            ``project_root``.
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(gif_path, exist_ok=True)
    img_buffer[0].save(os.path.join(gif_path, fname), save_all=True, append_images=img_buffer[1:], duration=1, loop=0)

In [None]:
# Test Code
# Play one episode with the (still untrained) agent and store it as test.gif
# in the default GIF directory.
img_buffer = play(env_pong, agent, stack_frames, img_size)
save_gif(img_buffer, fname="test.gif")

### 4.2 Epsilon greedy computation.

In [None]:
def epsilon_compute(frame_id, epsilon_max=1, epsilon_min=0.05, epsilon_decay=100000):
    """Return the exploration rate for a given frame index.

    The rate starts at ``epsilon_max`` for ``frame_id == 0`` and decays
    exponentially towards ``epsilon_min`` with time constant
    ``epsilon_decay``. ``frame_id`` may be a scalar or a NumPy array.
    """
    span = epsilon_max - epsilon_min
    decay_factor = np.exp(-frame_id / epsilon_decay)
    return epsilon_min + span * decay_factor

In [None]:
# Test Code: plot the exploration schedule over the first 400000 frames.
frame_ids = np.arange(400000)
epsilons = epsilon_compute(frame_ids)
plt.plot(epsilons)

### 4.3 Training steps.

In [None]:
def train(env, agent, stack_frames, img_size, save_path="save", max_steps=1000000):
    """Training loop skeleton: run episodes until ``max_steps`` is exceeded.

    The action selection (Lab-6) and the model update (Lab-7) are left as
    exercises. The Lab-6 code must bind ``action`` and ``epsilon``, because
    both are used further down; ``loss`` is initialised to 0 per episode and
    is presumably meant to be updated by the Lab-7 code.

    Every 10000 total steps the model is saved and a GIF of one greedy
    episode is generated.

    Args:
        env: Training environment.
        agent: Agent providing ``save_load_model`` (and whatever the lab code
            calls).
        stack_frames: Forwarded to ``play``.
        img_size: Forwarded to ``play``.
        save_path: Directory the model checkpoint is written to.
        max_steps: Training stops at the end of the first episode after the
            total step count exceeds this value.
    """
    total_step = 0
    episode = 0
    while True:
        # Reset environment.
        state = env.reset()

        # Initialize information.
        step = 0
        total_reward = 0
        loss = 0

        # One episode.
        while True:
            # TODO(Lab-6): Select action.

            # Get next stacked state.
            state_next, reward, done, info = env.step(action)

            # TODO(Lab-7): Train RL model.

            state = state_next.copy()
            step += 1
            total_step += 1
            total_reward += reward

            if total_step % 100 == 0 or done:
                print('\rEpisode: {:3d} | Step: {:3d} / {:3d} | Reward: {:.3f} / {:.3f} | Loss: {:.3f} | Epsilon: {:.3f}'\
                    .format(episode, step, total_step, reward, total_reward, loss, epsilon), end="")

            # Set when the checkpoint below has reset and played out `env`.
            env_consumed = False
            if total_step % 10000 == 0:
                print("\nSave Model ...")
                agent.save_load_model(op="save", path=save_path, fname="qnet.pt")
                print("Generate GIF ...")
                img_buffer = play(env, agent, stack_frames, img_size)
                save_gif(img_buffer, "train_" + str(total_step).zfill(6) + ".gif")
                print("Done !!")
                # play() ran a whole episode on this very environment, so the
                # training episode cannot be resumed from `state`: continuing
                # would step a finished env and store a transition that mixes
                # two unrelated episodes. End the episode so env is reset.
                env_consumed = True

            if done or step>2000 or env_consumed:
                episode += 1
                print()
                break

        if total_step > max_steps:
            break

In [None]:
# Train on the wrapped environment for a bit over 400000 steps; model
# checkpoints go to <project_root>/save.
train(env_pong, agent, stack_frames, img_size, save_path=os.path.join(project_root, "save"), max_steps=400000)

### 4.4 Evaluate the trained model.

In [None]:
# Restore the checkpoint written during training into both networks,
# play one greedy episode and store it as eval.gif.
agent.save_load_model(op="load", path=os.path.join(project_root, "save"), fname="qnet.pt")
img_buffer = play(env_pong, agent, stack_frames, img_size)
save_gif(img_buffer, "eval.gif")