# Deep Q-Learning 

## This assignment is adapted from the University of Illinois
http://slazebni.cs.illinois.edu/fall18/assignment5.html

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

![alt text](https://drive.google.com/uc?id=1b54faj61wVsRJYIU6O98tlTu2XrPO5Cr)

In [12]:
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from skimage.transform import resize
from skimage.color import rgb2gray

# Use the first CUDA GPU when PyTorch reports one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Understanding the environment

In the following cell, we initialise our game of __Breakout__ so you can see what the environment looks like. For further documentation of the environment, refer to https://gym.openai.com/envs.

In [0]:
# Create the Breakout environment that every cell below interacts with.
env = gym.make('BreakoutDeterministic-v4')

In [0]:
def find_max_lifes(env):
    """Return the life count the environment reports right after a reset.

    Resets ``env``, takes a single step with action ``0`` and reads
    ``info['ale.lives']`` from the 4-tuple returned by ``env.step``.
    """
    env.reset()
    step_result = env.step(0)
    _obs, _reward, _done, step_info = step_result
    return step_info['ale.lives']

def check_live(life, cur_life):
    """Return ``True`` when ``life`` is greater than ``cur_life``, else ``False``.

    The training loop passes the previous life count as ``life`` and the count
    from the latest step as ``cur_life``, so ``True`` means a life was lost.
    """
    return True if life > cur_life else False

def get_frame(X):
    """Convert a colour frame into a HEIGHT x WIDTH grayscale ``uint8`` image.

    The frame goes through ``rgb2gray``, is resized with ``mode='reflect'``,
    multiplied by 255 and cast to ``np.uint8``.
    """
    gray = rgb2gray(X)
    shrunk = resize(gray, (HEIGHT, WIDTH), mode='reflect')
    return np.uint8(shrunk * 255)

def get_init_state(history, s):
    """Fill the first ``HISTORY_SIZE`` slots of ``history`` with the frame ``s``.

    Args:
        history: array indexed as ``history[i, :, :]``; modified in place.
        s: raw observation, preprocessed once with ``get_frame``.
    """
    # The preprocessed frame is the same for every slot, so compute it once
    # instead of re-running the grayscale/resize pipeline HISTORY_SIZE times.
    first_frame = get_frame(s)
    for i in range(HISTORY_SIZE):
        history[i, :, :] = first_frame


In [15]:
# Lives the environment reports right after a reset (see find_max_lifes).
number_lives = find_max_lifes(env)
# Shape of a raw observation as declared by the environment.
state_size = env.observation_space.shape
# Number of actions the agent chooses between; the training loop sends
# `action + 1` to the environment.
action_size = 3
# Mean evaluation reward recorded at periodic checkpoints, and the episode
# index each value was recorded at.
rewards, episodes = [], []
print(number_lives)

5


## Mount Google Drive to store / load the model
You need to create a `Temp` directory in your Google Drive.

In [16]:
# Local fallback checkpoint location. Agent.save_model() and the optional
# checkpoint load both read the module-level `model_path`; without this
# definition it only exists inside the disabled block below, so saving at the
# end of training would fail with a NameError.
model_name = 'DQN.ckpt'
model_path = model_name

# To store the checkpoint on Google Drive instead (Colab), remove the triple
# quotes around the block below; it then overrides the fallback above.
"""
from google.colab import drive
drive.mount('/content/drive')

model_name = 'DQN.ckpt'
model_path = F"/content/drive/My Drive/Temp/{model_name}" 
"""

'\nfrom google.colab import drive\ndrive.mount(\'/content/drive\')\n\nmodel_name = \'DQN.ckpt\'\nmodel_path = F"/content/drive/My Drive/Temp/{model_name}" \n'

## Creating a DQN Agent

Here we create a DQN Agent. 

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [0]:
# Upper bound on the number of episodes the training loop runs.
EPISODES = 500000
# Height and width that get_frame resizes every grayscale frame to.
HEIGHT = 84
WIDTH = 84
# Number of consecutive frames that make up one state.
HISTORY_SIZE = 4
# Learning rate handed to the Adam optimizer in Agent.
learning_rate = 0.0001
# Number of most recent episode scores averaged into the evaluation reward.
evaluation_reward_length = 100
# Maximum number of entries the replay memory deque retains.
Memory_capacity = 1000000
# NOTE(review): not referenced anywhere in the visible code - confirm whether
# rendering is still wanted.
render_breakout = True
# Number of samples drawn from the replay memory per training step.
batch_size = 32
# The target network is re-synced with the policy network whenever the global
# frame counter is a multiple of this value (once training has started).
Update_target_network_frequency = 1000
# Training only starts once this many frames have been played.
train_frame = 100000

In [0]:
class DQN(nn.Module):
    """Convolutional Q-network: three conv + batch-norm stages, then two linear layers.

    The first convolution expects 4 input channels. ``forward`` returns one
    value per action for every sample in the batch.
    """

    def __init__(self, action_size):
        """Create the layers; ``action_size`` is the width of the output layer."""
        super().__init__()
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        # An 84x84 input leaves the conv stack as 64 x 7 x 7 = 3136 features.
        self.fc = nn.Linear(3136, 512)
        self.head = nn.Linear(512, action_size)

    def forward(self, x):
        """Run ``x`` through the conv stages, flatten per sample and apply the linear layers."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, norm in stages:
            x = F.relu(norm(conv(x)))
        flat = x.view(x.size(0), -1)
        hidden = F.relu(self.fc(flat))
        return self.head(hidden)


In [0]:
class Agent():
    """DQN agent with an epsilon-greedy policy, replay memory and a target network.

    The policy network is optimised with Adam; the target network supplies the
    bootstrap values for the one-step Q-learning target and is refreshed through
    ``update_target_net``. Relies on the module-level ``device``,
    ``learning_rate``, ``DQN``, ``ReplayMemory`` and, for checkpoints,
    ``model_path``.
    """

    def __init__(self, action_size):
        """Build the replay memory, both networks and the optimizer.

        Args:
            action_size: number of discrete actions the networks score.
        """
        self.load_model = False

        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 1000000
        # Amount subtracted from epsilon on every training step.
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)
        self.target_net = DQN(action_size)
        self.target_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)

        if self.load_model:
            # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
            self.policy_net.load_state_dict(torch.load(model_path, map_location=device))
            self.policy_net.to(device)

        # Sync the target net last so it also picks up weights loaded above.
        self.update_target_net()

    def save_model(self):
        """Write the policy network's ``state_dict`` to ``model_path``."""
        torch.save(self.policy_net.state_dict(), model_path)

    def update_target_net(self):
        """Copy the policy network's current weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def get_action(self, state):
        """Pick an action with the epsilon-greedy policy.

        Args:
            state: NumPy array for a single state, without a batch dimension;
                one is added before the forward pass.

        Returns:
            An int in ``[0, self.action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            # Explore: uniform random action over this agent's own action count.
            return np.random.randint(0, self.action_size)

        # Exploit: no gradients are needed just to choose an action.
        with torch.no_grad():
            q_values = self.policy_net(torch.from_numpy(state).unsqueeze(0).to(device))
        return q_values.max(1)[1].item()

    def train_policy_net(self, frame):
        """Decay epsilon and run one optimisation step on a sampled mini-batch.

        Args:
            frame: global frame counter, forwarded to
                ``ReplayMemory.sample_mini_batch``.
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch = self.memory.sample_mini_batch(frame)
        # Unzip the (frames, action, reward, done) tuples column by column.
        # Building one ndarray from these mixed tuples is rejected by current NumPy.
        histories, actions, rewards, dones = zip(*mini_batch)

        # Frames are stacked along axis 1 of each sample: the first four form
        # the state, everything from index 1 onwards forms the next state.
        history = np.stack(histories, axis=0)
        states = np.float32(history[:, :4, :, :]) / 255.
        next_states = np.float32(history[:, 1:, :, :]) / 255.

        action_idx = torch.from_numpy(np.asarray(actions, dtype=np.int64)).to(device).unsqueeze(1)
        reward_t = torch.from_numpy(np.asarray(rewards, dtype=np.float32)).to(device)
        # 1 for samples that continue, 0 for samples flagged as done.
        not_done = 1.0 - torch.from_numpy(np.asarray(dones, dtype=np.float32)).to(device)

        # Compute Q(s_t, a) - Q of the current state for the action that was taken.
        # unsqueeze/squeeze keep this independent of the batch size.
        curr_q = self.policy_net(torch.from_numpy(states).to(device))
        q_value = curr_q.gather(1, action_idx).squeeze(1)

        # Maximum Q-value of the next state from the target net; no gradient
        # flows into the target network.
        with torch.no_grad():
            next_q = self.target_net(torch.from_numpy(next_states).to(device))
            max_q_values = next_q.max(1)[0]

        # Expected Q value: r + gamma * max_a' Q_target(s', a'), without the
        # bootstrap term for samples flagged as done.
        expected_q_value = reward_t + self.discount_factor * max_q_values * not_done

        # Huber loss
        loss = F.smooth_l1_loss(q_value, expected_q_value)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


In [0]:
class ReplayMemory(object):
    """Bounded FIFO store of per-step records with windowed mini-batch sampling.

    Each record is the tuple ``(history, action, reward, done)`` passed to
    ``push``. At most ``Memory_capacity`` records are kept; the oldest are
    dropped first.
    """

    def __init__(self):
        # deque(maxlen=...) discards the oldest record once the buffer is full.
        self.memory = deque(maxlen=Memory_capacity)

    def push(self, history, action, reward, done):
        """Append one ``(history, action, reward, done)`` record."""
        self.memory.append((history, action, reward, done))

    def sample_mini_batch(self, frame):
        """Draw ``batch_size`` windows of ``HISTORY_SIZE + 1`` consecutive records.

        Args:
            frame: number of frames played so far. Sampling is limited to the
                first ``min(frame, len(self))`` stored records.

        Returns:
            A list of ``(frames, action, reward, done)`` tuples. ``frames``
            stacks the first field of the window's records along axis 0; the
            other three fields come from the record at window index
            ``HISTORY_SIZE - 1``.

        Raises:
            ValueError: from ``random.sample`` when fewer than ``batch_size``
                start positions are available.
        """
        # Never index beyond what is actually stored, even if the caller's
        # frame counter has run ahead of the number of pushes.
        available = min(frame, len(self.memory))

        # Leave room for a full window after every start index.
        sample_range = available - (HISTORY_SIZE + 1)

        # NOTE: indexing into the middle of a deque is O(n), so this is slow
        # for very large buffers.
        mini_batch = []
        for start in random.sample(range(sample_range), batch_size):
            window = [self.memory[start + offset] for offset in range(HISTORY_SIZE + 1)]

            # Stack only the frame field. Converting the whole window of mixed
            # (array, scalar, scalar, scalar) tuples to one ndarray is rejected
            # by current NumPy.
            frames = np.stack([record[0] for record in window], axis=0)

            # NOTE(review): the training loop pushes the frame observed *after*
            # each action, so this pairs the state with the action that
            # produced its last frame - confirm this alignment is intended.
            _, action, reward, done = window[HISTORY_SIZE - 1]
            mini_batch.append((frames, action, reward, done))

        return mini_batch

    def __len__(self):
        """Return the number of records currently stored."""
        return len(self.memory)


In [0]:
# The agent that the training loop below trains.
agent = Agent(action_size)
# Rolling window of the most recent episode scores.
evaluation_reward = deque(maxlen=evaluation_reward_length)
# Total number of environment steps taken across all episodes.
frame = 0
# NOTE(review): neither read nor updated by the visible training loop.
memory_size = 0

### Main Training Loop

In [0]:
# Main training loop: play episodes, store every step in the replay memory and,
# once `train_frame` frames have been played, train the policy net every step.
for e in range(EPISODES):
    done = False
    score = 0

    # Rolling buffer of five frames: slots 0-3 are the current state, slot 4
    # receives the newest frame after each step.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    life = number_lives

    # Seed the state slots with copies of the first preprocessed frame.
    get_init_state(history, state)

    while not done:
        step += 1
        frame += 1

        # Select and perform an action; the state is scaled to [0, 1] floats.
        action = agent.get_action(np.float32(history[:4, :, :]) / 255.)

        # The agent picks from {0, 1, 2}; the environment receives 1..3.
        # NOTE(review): presumably this skips the env's action 0 - confirm
        # against the environment's action meanings.
        next_state, reward, done, info = env.step(action + 1)

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        # A drop in the life count is stored as the transition's terminal flag,
        # even though the episode itself continues until `done`.
        terminal_state = check_live(life, info['ale.lives'])
        terminal_state = 1 if terminal_state else 0

        life = info['ale.lives']
        # The stored reward is clipped to [-1, 1]; `score` below keeps the raw reward.
        r = np.clip(reward, -1, 1)

        # Store the transition in memory 
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
        # Start training only after `train_frame` frames have been collected
        if(frame >= train_frame):
            agent.train_policy_net(frame)
            # Update the target network
            if(frame % Update_target_network_frequency)== 0:
                agent.update_target_net()
        score += reward
        # Slide the window: the newest four frames become the next state.
        history[:4, :, :] = history[1:, :, :]

        
        # Every 50000 frames, log the time and record the current mean
        # evaluation reward together with the episode index.
        if frame % 50000 == 0:
            print('now time : ', datetime.now())
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            #pylab.plot(episodes, rewards, 'b')
            
            
        if done:
            evaluation_reward.append(score)
            # At the end of every episode, print its statistics
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon, "   steps:", step,
                  "    evaluation reward:", np.mean(evaluation_reward))

                
    # If the mean score over the evaluation window (at most the last
    # `evaluation_reward_length` episodes) is bigger than 10, save the model
    # and stop training
    if done and np.mean(evaluation_reward) > 10:
        print("Training Done ...")
        agent.save_model()
        break

episode: 0   score: 2.0   memory length: 200   epsilon: 1.0    steps: 200     evaluation reward: 2.0
episode: 1   score: 3.0   memory length: 463   epsilon: 1.0    steps: 263     evaluation reward: 2.5
episode: 2   score: 0.0   memory length: 593   epsilon: 1.0    steps: 130     evaluation reward: 1.6666666666666667
episode: 3   score: 0.0   memory length: 718   epsilon: 1.0    steps: 125     evaluation reward: 1.25
episode: 4   score: 0.0   memory length: 851   epsilon: 1.0    steps: 133     evaluation reward: 1.0
episode: 5   score: 3.0   memory length: 1128   epsilon: 1.0    steps: 277     evaluation reward: 1.3333333333333333
episode: 6   score: 1.0   memory length: 1284   epsilon: 1.0    steps: 156     evaluation reward: 1.2857142857142858
episode: 7   score: 2.0   memory length: 1509   epsilon: 1.0    steps: 225     evaluation reward: 1.375
episode: 8   score: 0.0   memory length: 1633   epsilon: 1.0    steps: 124     evaluation reward: 1.2222222222222223
episode: 9   score: 1.0 