# Navigation Implementation Report

---

## 4-1. Import the required libraries

In [None]:
%matplotlib inline
import collections
import copy
import os
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
from unityagents import UnityEnvironment

In [None]:
class DQNAgent:
    """Deep Q-Network agent with a replay memory and a separate target network.

    The "runner" network is the one being optimised and used to pick actions;
    the "target" network is an independent copy that supplies the bootstrap
    values and is synchronised either by a hard copy
    (`update_target_network`) or by a soft/Polyak update
    (`update_target_network_smooth`).

    Args:
        state_size: Size of a single observation vector.
        action_size: Number of discrete actions.
        build_network: A ``torch.nn.Module`` mapping (stacked) states to one
            Q-value per action. It becomes the runner network; the target
            network is a deep copy of it.
        lr: Learning rate for the Adam optimizer.
        discount_factor: Discount applied to the bootstrapped next-state value.
        replay_mem_size: Maximum number of transitions kept in replay memory.
        batch_size: Number of transitions sampled per training step.
        smooth_update_tau: Interpolation factor for the soft target update.
    """

    def __init__(self, state_size, action_size, build_network, lr=0.001, discount_factor=.99,
                 replay_mem_size=2500, batch_size=32, smooth_update_tau=0.001):
        if torch.cuda.is_available():
            # Same global GPU setup as before, but only when a GPU exists.
            torch.set_default_tensor_type('torch.cuda.FloatTensor')  # set default tensor type
            torch.cuda.set_device(0)  # use gpu
            self.device = torch.device('cuda', 0)
        else:
            # Fall back to the CPU instead of crashing on machines without CUDA.
            self.device = torch.device('cpu')

        self.state_size = state_size
        self.action_size = action_size
        self.lr = lr
        self.batch_size = batch_size
        self.tau = smooth_update_tau
        self.discount_factor = discount_factor
        self.replay_memory = collections.deque(maxlen=replay_mem_size)

        self.runner_network = build_network.to(self.device)
        # Module.cuda()/to() return the module itself, so the target network
        # must be an explicit deep copy; otherwise both names alias the same
        # parameters and there is no target network at all.
        self.target_network = copy.deepcopy(self.runner_network).to(self.device)

        self.runner_optimizer = torch.optim.Adam(self.runner_network.parameters(), lr=self.lr)

        self.update_target_network()

    def _to_tensor(self, values, dtype=torch.float32):
        """Convert array-like `values` to a tensor on the agent's device."""
        return torch.tensor(values, dtype=dtype, device=self.device)

    def update_target_network(self):
        """Hard update: copy every runner parameter into the target network."""
        self.target_network.load_state_dict(self.runner_network.state_dict())

    def update_target_network_smooth(self):
        """Soft update: target <- tau * runner + (1 - tau) * target."""
        # refer to https://github.com/udacity/deep-reinforcement-learning/blob/master/dqn/solution/dqn_agent.py
        with torch.no_grad():
            for target, runner in zip(self.target_network.parameters(), self.runner_network.parameters()):
                target.copy_(self.tau * runner + (1 - self.tau) * target)

    def optimize(self, x, y):
        """Take one Adam step on the Huber loss between predictions `x` and targets `y`.

        `x` must still be attached to the runner network's autograd graph,
        otherwise no gradient reaches the network parameters.
        """
        loss_func = torch.nn.SmoothL1Loss()  # Huber loss
        loss = loss_func(x, y)
        self.runner_optimizer.zero_grad()
        loss.backward()
        self.runner_optimizer.step()

    def get_action(self, state):
        """Return the index of the action with the highest runner Q-value for `state`."""
        with torch.no_grad():
            q_values = self.runner_network(self._to_tensor(state))
        return np.argmax(q_values.cpu().numpy())

    def max_q(self, state):
        """Return the largest runner Q-value for `state`."""
        with torch.no_grad():
            q_values = self.runner_network(self._to_tensor(state))
        return np.max(q_values.cpu().numpy())

    def append_replay_memory(self, state, action, reward, next_state, done):
        """Store one transition; the oldest one is dropped when memory is full."""
        self.replay_memory.append([state, action, reward, next_state, done])

    def get_batch(self):
        """Sample `batch_size` transitions uniformly from replay memory.

        Returns:
            ``(state, action, reward, next_state, done)`` where `state` and
            `next_state` are 2-D float arrays (one row per transition) and the
            other three are lists.

        Raises:
            ValueError: If fewer than `batch_size` transitions are stored.
        """
        batch = random.sample(self.replay_memory, self.batch_size)  # sample the replay memory
        states, actions, rewards, next_states, dones = zip(*batch)
        # The row width is taken from the stored states rather than hard-coded,
        # so any frame-stacking depth works.
        return (np.asarray(states, dtype=np.float64), list(actions), list(rewards),
                np.asarray(next_states, dtype=np.float64), list(dones))

    def train(self):
        """Run one DQN update on a sampled mini-batch.

        The target for each sampled transition is
        ``r + discount_factor * max_a' Q_target(s', a')``, with the bootstrap
        term dropped for terminal transitions. Only the Q-value of the action
        actually taken is regressed. Does nothing until replay memory holds at
        least `batch_size` transitions.
        """
        if len(self.replay_memory) < self.batch_size:
            return

        s, a, r, s_n, d = self.get_batch()
        states = self._to_tensor(s)
        next_states = self._to_tensor(s_n)
        actions = self._to_tensor(np.asarray(a, dtype=np.int64), dtype=torch.int64).unsqueeze(1)
        rewards = self._to_tensor(np.asarray(r, dtype=np.float32))
        # Cast through NumPy so Python bools and numpy.bool_ are treated alike.
        not_done = 1.0 - self._to_tensor(np.asarray(d, dtype=np.float32))

        # Targets are constants with respect to the optimisation step.
        with torch.no_grad():
            next_q = self.target_network(next_states).max(dim=1)[0]
            targets = rewards + self.discount_factor * next_q * not_done

        # Keep the prediction attached to the graph so gradients reach the runner network.
        predicted = self.runner_network(states).gather(1, actions).squeeze(1)
        self.optimize(predicted, targets)

    def save_model(self, p):
        """Save the runner network's parameters to path `p`."""
        torch.save(self.runner_network.state_dict(), p)

    def restore_model(self, p):
        """Load runner parameters from path `p` and sync the target network.

        NOTE: torch.load unpickles the file; only load checkpoints you trust.
        """
        # map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
        self.runner_network.load_state_dict(torch.load(p, map_location=self.device))
        self.update_target_network()

### 4-2. Define a network architecture

In [None]:
def network(input_size=37*4, hidden_size=64, output_size=4):
    """Build the fully connected Q-network.

    With the default sizes the architecture is
    Linear(148 -> 64) - ReLU - Linear(64 -> 64) - ReLU - Linear(64 -> 4),
    i.e. two hidden ReLU layers followed by a linear output layer.

    Args:
        input_size: Length of the network input vector (default 37*4).
        hidden_size: Width of both hidden layers.
        output_size: Number of outputs, one per action.

    Returns:
        A ``torch.nn.Sequential`` model.
    """
    return torch.nn.Sequential(
        torch.nn.Linear(input_size, hidden_size),
        torch.nn.ReLU(),
        torch.nn.Linear(hidden_size, hidden_size),
        torch.nn.ReLU(),
        torch.nn.Linear(hidden_size, output_size),
    )

### 4-3. Build the agent

In [None]:
# --- Optimisation -----------------------------------------------------------
LEARNING_RATE = 5e-5          # passed to DQNAgent as `lr`
DISCOUNT_FACTOR = 0.99        # passed to DQNAgent as `discount_factor`
BATCH_SIZE = 64               # transitions sampled per training step
REPLAY_MEMORY_SIZE = 15000    # capacity of the agent's replay memory

# --- Exploration ------------------------------------------------------------
EPSILON_LOWER_BOUND = 0.008   # epsilon stops decaying once it reaches this value
EPSILON_DECAY_RATIO = 0.90    # scales the per-step epsilon decrement in the training cell

# --- Schedule ---------------------------------------------------------------
TOTAL_EPISODES = 4000         # upper bound on the number of training episodes
OBSERVATION_STEP = 100        # environment steps collected before training starts
MODEL_UPDATE_EPISODE = 50     # hard target-update period; only referenced by commented-out code

In [None]:
# Create the training agent: 37-value observations, 4 discrete actions, and
# the hyperparameters defined in the constants cell.
agent = DQNAgent(
    state_size=37,
    action_size=4,
    build_network=network(),
    lr=LEARNING_RATE,
    discount_factor=DISCOUNT_FACTOR,
    replay_mem_size=REPLAY_MEMORY_SIZE,
    batch_size=BATCH_SIZE,
)

### 4-4. Train the agent

In [None]:
# Launch the Unity Banana environment and select its first brain.
env = UnityEnvironment(file_name="Banana_Windows_x86_64\\Banana.exe")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
print('[INFO] Environment initialized.')

scores = []   # total reward of every finished episode (plotted and pickled later)
steps = 0     # environment steps taken across all episodes
epsilon = 1   # exploration probability, decayed linearly per training step
# Per-step epsilon decrement. NOTE(review): the constant 300 looks like an
# assumed number of steps per episode - confirm against the environment.
epsilon_decay = (1 - EPSILON_LOWER_BOUND) / ((TOTAL_EPISODES * 300) * EPSILON_DECAY_RATIO)
avg_reward = collections.deque(maxlen=100)  # scores of the most recent 100 episodes

for e in range(TOTAL_EPISODES):
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    # Rebuild the 4-frame stack from the reset observation at the start of
    # EVERY episode; otherwise the new episode would begin with frames left
    # over from the previous episode's terminal state.
    state_data = np.stack([state, state, state, state]).flatten()
    score = 0
    while True:
        # Epsilon-greedy action selection.
        if np.random.rand() <= epsilon:
            action = np.random.randint(0, 4)  # random move
        else:
            action = agent.get_action(state_data)
        env_info = env.step(int(action))[brain_name]
        # Push the newest observation to the front and drop the oldest 37 values.
        next_state = np.append(env_info.vector_observations[0], state_data[0:-37])
        reward = env_info.rewards[0]
        done = env_info.local_done[0]

        agent.append_replay_memory(state_data, action, reward, next_state, done)

        # Train only after an initial observation phase has filled the memory a little.
        if steps > OBSERVATION_STEP:
            agent.train()
            agent.update_target_network_smooth()
            # Clamp so epsilon never drops below its lower bound.
            epsilon = max(EPSILON_LOWER_BOUND, epsilon - epsilon_decay)
        score += reward
        state_data = next_state
        steps += 1

        if done:
            scores.append(score)
            avg_reward.append(score)
            break

    # Progress report every 40 episodes; the running average is shown from episode 100 on.
    if e % 40 == 0:
        max_q = agent.max_q(state_data)
        headline = '[Step ' + str(steps + 1).zfill(8) + '; Episode ' + str(e).zfill(6) + '] reward: ' + str(score)
        details = "  max Q: " + str(max_q) + "  ε= " + str(epsilon)
        if e >= 100:
            details += "  Avg. score: " + str(np.average(avg_reward))
        print(headline, details)

    # Hard target updates every MODEL_UPDATE_EPISODE episodes are disabled;
    # the per-step soft update above is used instead.

    # Stop once the average score over the last 100 episodes exceeds 13.
    if len(avg_reward) == 100:
        avg = np.average(avg_reward)
        if avg > 13:
            print("[INFO] Training completed. Total episode: " + str(e))
            break

# Make sure the output directory exists so the results of a long run are not lost.
os.makedirs('save', exist_ok=True)
with open('save\\scores.pickle', 'wb') as f:
    pickle.dump(scores, f, pickle.HIGHEST_PROTOCOL)

agent.save_model('save\\model')

In [None]:
# Close the Unity environment that was opened for training.
env.close()

### 4-4-2. Evaluate the model

In [None]:
# Build a fresh agent and load the trained weights into it.
agent = DQNAgent(state_size=37, action_size=4, build_network=network(), lr=LEARNING_RATE,
                discount_factor=DISCOUNT_FACTOR, replay_mem_size=REPLAY_MEMORY_SIZE, batch_size=BATCH_SIZE)
agent.restore_model('save\\model')
env = UnityEnvironment(file_name="Banana_Windows_x86_64\\Banana.exe")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
print('[INFO] Environment initialized.')

steps = 0
avg_reward = collections.deque(maxlen=100)  # Scores from recent 100 episodes
for e in range(100):
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    # The network takes 4 stacked observations (37*4 values), exactly as in
    # training, so the raw 37-value observation cannot be fed to it directly.
    state_data = np.stack([state, state, state, state]).flatten()
    score = 0
    while True:  # until the episode ends
        action = agent.get_action(state_data)  # purely greedy, no exploration
        env_info = env.step(int(action))[brain_name]
        # Newest observation first, oldest 37 values dropped.
        next_state = np.append(env_info.vector_observations[0], state_data[0:-37])
        reward = env_info.rewards[0]
        done = env_info.local_done[0]

        score += reward
        state_data = next_state
        steps += 1

        if done:
            avg_reward.append(score)
            break

    if e % 20 == 0:
        print('Episode ' + str(e).zfill(3) + ' reward: ' + str(score))

print("Avg(100 episode) score: " + str(np.average(avg_reward)))

### 4-5. Visualising the training progress

In [None]:
# Plot the total reward of each training episode and save the figure to disk.
fig, ax = plt.subplots()
episode_index = range(len(scores))
ax.plot(episode_index, scores)
ax.set_xlabel('Episode')
ax.set_ylabel("total score")
ax.set_title("Total Reward")
fig.savefig("save\\reward.png", dpi=200)  # save the plot

## 5. Future work

- I'd like to solve this project with other reinforcement learning methods, such as Dueling DQN, Double DQN, DRQN, and so on.
- I'd like to apply a DQN-based network to a video game environment.