## Configurations for Colab

In [3]:
import os
import random
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn 
import torch.nn.functional as Func
import torch.optim as optim
import cv2
from collections import deque
from atari_wrappers import make_atari
from atari_wrappers import wrap_deepmind

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# Hyperparameters read by the Buffer and Agent classes defined later in this file.
BUFFER_SIZE = int(1e4)  # replay buffer capacity; Agent.step also waits for this many stored transitions before learning
BATCH_SIZE = 32         # number of transitions drawn per learning step
GAMMA = 0.99            # discount factor applied to the bootstrapped next-state value
TAU = 1              # interpolation weight used by Agent.soft_update; 1 makes the target a full copy of the online weights
LR = 1e-4               # learning rate passed to the RMSprop optimizer
UPDATE_EVERY = 4        # number of Agent.step calls between learning steps
UPDATE_FREN=1000        # number of Agent.learn calls between target-network updates

cuda:0


## Replay buffer


In [0]:
class Buffer:
    """Fixed-capacity FIFO replay buffer of transitions.

    Transitions are stored as ``[state, action, reward, next_state, done]``
    lists in a ``deque``; once ``buffer_size`` entries are held, appending a
    new one silently drops the oldest.
    """

    def __init__(self, buffer_size, batch_size):
        """Create an empty buffer.

        Args:
            buffer_size: maximum number of transitions kept.
            batch_size: number of transitions returned by ``sample``.
        """
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

    def add(self, state, action, reward, next_state, done):
        """Append one transition, evicting the oldest one when full."""
        self.memory.append([state, action, reward, next_state, done])

    def sample(self):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A ``(states, actions, rewards, next_states, dones)`` tuple of
            tensors on the module-level ``device``. States are stacked along
            a new leading batch axis; actions, rewards and dones are stacked
            vertically into one row per transition.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions are stored
                (raised by ``random.sample``).
        """
        experiences = random.sample(self.memory, self.batch_size)
        # add() only ever stores 5-element lists, so the batch can be
        # transposed in one pass instead of re-filtering it per field.
        states, actions, rewards, next_states, dones = zip(*experiences)

        states = torch.from_numpy(np.stack(states)).float().to(device)
        actions = torch.from_numpy(np.vstack(actions)).long().to(device)
        rewards = torch.from_numpy(np.vstack(rewards)).float().to(device)
        next_states = torch.from_numpy(np.stack(next_states)).float().to(device)
        # Booleans go through uint8 first because torch.from_numpy is fed a
        # numeric array here; the result is an int tensor of 0/1 flags.
        dones = torch.from_numpy(np.vstack(dones).astype(np.uint8)).int().to(device)

        return (states, actions, rewards, next_states, dones)

    def len_Buffer(self):
        """Return the number of stored transitions (kept for existing callers; same as ``len(buffer)``)."""
        return len(self.memory)

## Network

In [0]:
class Network(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    The network maps a batch of stacked frames to one value per action.
    """

    def __init__(self, state_size, action_size):
        """Build the layers.

        Args:
            state_size: sequence describing the input batch. Index 1 is the
                number of input channels. When it has at least four entries,
                indices 2 and 3 are treated as the frame height and width and
                the size of the first linear layer is derived from them.
                Shorter sequences fall back to the 64*7*7 size that matches
                84x84 frames.
            action_size: number of outputs (one Q-value per action).
        """
        super().__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(state_size[1], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
            )

        if len(state_size) >= 4:
            # Measure the flattened conv output with a dummy forward pass so
            # frame sizes other than 84x84 work. torch.zeros draws no random
            # numbers, so seeded weight initialisation is unaffected.
            with torch.no_grad():
                probe = torch.zeros(1, *state_size[1:4])
                flat_features = self.conv(probe).view(1, -1).size(1)
        else:
            flat_features = 64 * 7 * 7

        self.fc = nn.Sequential(
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, action_size)
        )

    def forward(self, state):
        """Return a ``(batch, action_size)`` tensor of Q-values for ``state``."""
        # Flatten everything except the batch axis before the linear layers.
        conv_out = self.conv(state).view(state.size()[0], -1)
        return self.fc(conv_out)

In [0]:
class Agent():
    """DQN agent with an online Q-network, a target Q-network and a replay buffer.

    The online network (``qnetwork_original``) is trained; the target network
    (``qnetwork_target``) supplies the bootstrapped next-state values and is
    refreshed from the online network every ``UPDATE_FREN`` learning steps.
    """

    def __init__(self, state_size, action_size):
        """Create the networks, optimizer and replay buffer.

        Args:
            state_size: input description forwarded to ``Network``; this file
                passes ``(32, 4, 84, 84)``.
            action_size: number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_step = 0   # counts step() calls modulo UPDATE_EVERY
        self.update_fren = 0   # counts learn() calls modulo UPDATE_FREN
        self.memory = Buffer(BUFFER_SIZE, BATCH_SIZE)

        self.qnetwork_original = Network(state_size, action_size).to(device)
        self.optimizer = torch.optim.RMSprop(self.qnetwork_original.parameters(), lr=LR)
        self.qnetwork_target = Network(state_size, action_size).to(device)
        # Start the target as an exact copy of the online network. Without
        # this, the first UPDATE_FREN learning steps regress towards values
        # produced by an unrelated random initialisation.
        self.qnetwork_target.load_state_dict(self.qnetwork_original.state_dict())

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every ``UPDATE_EVERY`` calls, learn from a sampled batch.

        Learning is skipped until the buffer holds ``BUFFER_SIZE`` transitions.
        """
        self.memory.add(state, action, reward, next_state, done)

        self.update_step = (self.update_step + 1) % UPDATE_EVERY
        if self.update_step != 0:
            return
        if self.memory.len_Buffer() >= BUFFER_SIZE:
            self.learn(self.memory.sample())

    def act(self, state, eps=0.0):
        """Choose an action epsilon-greedily.

        Args:
            state: numpy array holding a single observation (no batch axis).
            eps: probability of picking a uniformly random action.

        Returns:
            The index of the chosen action.
        """
        # Decide first: the exploratory branch does not need a forward pass.
        # random.random() is still the first Python-RNG draw of the call, so
        # the seeded random sequence is unchanged.
        if random.random() < eps:
            return random.choice(np.arange(self.action_size))

        # The network expects a leading batch axis, so add one of size 1.
        batch = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.qnetwork_original.eval()
        with torch.no_grad():
            action_values = self.qnetwork_original(batch)
        self.qnetwork_original.train()

        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences):
        """Run one gradient step on a batch and periodically refresh the target network.

        Args:
            experiences: ``(states, actions, rewards, next_states, dones)``
                tensors as returned by ``Buffer.sample``.
        """
        states, actions, rewards, next_states, dones = experiences

        # Greatest target-network value per next state, kept as a column so
        # it lines up with rewards/dones. No gradient flows into the target.
        with torch.no_grad():
            q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)

        # (1 - dones) zeroes the bootstrap term for terminal transitions.
        q_targets = rewards + GAMMA * q_targets_next * (1 - dones)

        # Value the online network assigns to the action actually taken.
        q_expected = self.qnetwork_original(states).gather(1, actions)

        loss = Func.mse_loss(q_expected, q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_fren = (self.update_fren + 1) % UPDATE_FREN
        if self.update_fren == 0:
            self.soft_update(self.qnetwork_original, self.qnetwork_target, TAU)

    def soft_update(self, original_model, target_model, tau):
        """Blend the online weights into the target network in place.

        Each target parameter becomes
        ``tau * original + (1 - tau) * target``; ``tau = 1`` is a full copy.

        Args:
            original_model: model the weights are read from (not modified).
            target_model: model whose parameters are overwritten.
            tau: interpolation weight.
        """
        with torch.no_grad():
            for target_param, source_param in zip(target_model.parameters(),
                                                  original_model.parameters()):
                target_param.copy_(tau * source_param + (1.0 - tau) * target_param)
          


In [0]:
def wrap_atari_dqn(env):
    """Return ``env`` wrapped for training.

    Delegates to ``wrap_deepmind`` with frame stacking, scaling, episodic
    life and reward clipping all switched on.
    """
    training_options = dict(
        frame_stack=True,
        scale=True,
        episode_life=True,
        clip_rewards=True,
    )
    return wrap_deepmind(env, **training_options)

In [0]:
def wrap_atari_dqn_eva(env):
    """Return ``env`` wrapped for evaluation during training.

    Delegates to ``wrap_deepmind`` with frame stacking, scaling and episodic
    life switched on, but with reward clipping off so raw scores are reported.
    """
    evaluation_options = dict(
        frame_stack=True,
        scale=True,
        episode_life=True,
        clip_rewards=False,
    )
    return wrap_deepmind(env, **evaluation_options)

In [10]:
# Draw the run seed (an integer from 1 to 100 inclusive) and build the
# wrapped Breakout environment used for training.
seed = random.randint(1, 100)
env = wrap_atari_dqn(make_atari('BreakoutNoFrameskip-v4'))

def seed_torch(seed):
    """Seed PyTorch's RNG and, when cuDNN is enabled, force deterministic kernels.

    Args:
        seed: integer passed to ``torch.manual_seed``.
    """
    torch.manual_seed(seed)
    if not torch.backends.cudnn.enabled:
        return
    # Deterministic algorithms on, autotuner off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed every RNG the run draws from. Python's `random` module is the one
# used for replay sampling and epsilon-greedy exploration in this file, so
# it has to be seeded too for a run to be repeatable from `seed`.
random.seed(seed)
np.random.seed(seed)
seed_torch(seed)
env.seed(seed)

[13, 460929631]

## Training loop and plotting


In [0]:
def _evaluate_agent(games=30, resets_per_game=5, max_steps=40000, eps=0.001):
    """Play evaluation rollouts on a fresh environment and return the mean score per game.

    A new Breakout environment is built with ``wrap_atari_dqn_eva`` and is
    always closed afterwards. Every rollout uses the module-level ``agent``.

    Args:
        games: number of games averaged over.
        resets_per_game: rollouts (reset until ``done``) summed into one game.
            Presumably one per life, since the evaluation wrapper in effect
            during training enables ``episode_life``; confirm against
            ``atari_wrappers``.
        max_steps: step cap for a single rollout.
        eps: exploration rate handed to ``agent.act``.

    Returns:
        Total reward collected divided by ``games``.
    """
    eval_env = wrap_atari_dqn_eva(make_atari('BreakoutNoFrameskip-v4'))
    total_reward = 0
    try:
        for _game in range(games):
            for _rollout in range(resets_per_game):
                eval_state = np.transpose(np.array(eval_env.reset()), (2, 0, 1))
                for _step in range(max_steps):
                    eval_action = agent.act(eval_state, eps)
                    eval_obs, eval_reward, eval_done, _info = eval_env.step(eval_action)
                    eval_state = np.transpose(np.array(eval_obs), (2, 0, 1))
                    total_reward += eval_reward
                    if eval_done:
                        break
    finally:
        eval_env.close()
    return total_reward / games


def NMSL(timesteps=2000000, eps_start=1.0, eps_end=0.1, eps_decay_steps=1000000):
    """Train the module-level ``agent`` on the module-level ``env``.

    Epsilon is annealed linearly from ``eps_start`` to ``eps_end`` over the
    first ``eps_decay_steps`` steps and then held. Every 10000 steps progress
    is printed, the agent is evaluated on a separate environment, and a
    checkpoint is written when the evaluation score reaches 300. The final
    weights are saved to ``dqn_checkpoint_final.pth``.

    Args:
        timesteps: total number of environment steps to run.
        eps_start: initial exploration rate.
        eps_end: final (minimum) exploration rate.
        eps_decay_steps: number of steps over which epsilon is annealed.

    Returns:
        ``(scores, scores_30)``: the score of every finished episode, and the
        mean of the last (up to) 30 scores recorded at each episode end.
    """
    scores = []                        # score of each finished episode
    scores_30 = []                     # running mean over the last 30 episodes
    scores_window = deque(maxlen=30)   # last 30 scores

    eps = eps_start

    # Observations are channel-last here; the network takes channel-first.
    state = np.transpose(np.array(env.reset()), (2, 0, 1))
    score = 0

    for timestep in range(1, timesteps + 1):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        next_state = np.transpose(np.array(next_state), (2, 0, 1))
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward

        eps = max(eps_end, eps_start - (eps_start - eps_end) * timestep / eps_decay_steps)

        if done:
            scores_window.append(score)
            scores.append(score)
            scores_30.append(np.mean(scores_window))
            score = 0
            state = np.transpose(np.array(env.reset()), (2, 0, 1))

        if timestep % 10000 == 0:
            # No episode may have finished yet; report nan instead of raising.
            mean_score = np.mean(scores_window) if scores_window else float('nan')
            best_mean = np.max(scores_30) if scores_30 else float('nan')
            print('timestep {} \tEpsilon now: {:.2f}'.format(timestep, eps))
            print('timestep {} \tAverage Score: {:.2f}'.format(timestep, mean_score))
            print('timestep {} \tThe length of replay buffer now: {}'.format(timestep, agent.memory.len_Buffer()))
            print('timestep {} \tMax score now: {}'.format(timestep, best_mean))

            # Evaluation runs in its own helper with its own variables so it
            # cannot overwrite the training loop's current `state`.
            evaluation_score = _evaluate_agent()
            print('timestep {} \tEvaluation score now: {}'.format(timestep, evaluation_score))
            if evaluation_score >= 300:
                name = 'dqn_checkpoint_{}.pth'.format(evaluation_score)
                torch.save(agent.qnetwork_original.state_dict(), name)

    torch.save(agent.qnetwork_original.state_dict(), 'dqn_checkpoint_final.pth')
    return scores, scores_30


In [0]:
def plot1(episode_rewards):
    """Plot the score of each episode and save the figure as ``episode_rewards.jpg``.

    Args:
        episode_rewards: sequence of per-episode scores, plotted against
            their index.
    """
    figure = plt.figure('fig1')
    episode_index = np.arange(len(episode_rewards))
    plt.plot(episode_index, episode_rewards)
    plt.xlabel('Episode')
    plt.ylabel('Score')
    figure.savefig('episode_rewards.jpg')
def plot2(mean_30ep_rewards):
    """Plot the 30-episode mean scores and save the figure as ``mean_30ep_rewards.jpg``.

    Args:
        mean_30ep_rewards: sequence of averaged scores, plotted against
            their index.
    """
    figure = plt.figure('fig2')
    sample_index = np.arange(len(mean_30ep_rewards))
    plt.plot(sample_index, mean_30ep_rewards)
    plt.xlabel('Per 30 Episode')
    plt.ylabel('Scores')
    figure.savefig('mean_30ep_rewards.jpg')



## Train

In [0]:
# Build the agent. The state size is (batch_size, stacked frames, image
# height, image width); the second argument is the number of actions.
agent = Agent((32, 4, 84, 84), 4)

# Set to False to skip training.
TRAIN = True

if TRAIN:
    scores, scores_30 = NMSL()

    plot1(scores)
    plot2(scores_30)

In [0]:
def wrap_atari_dqn_eva(env):
    """Return ``env`` wrapped for final playback.

    Delegates to ``wrap_deepmind`` with frame stacking and scaling switched
    on, and with both episodic life and reward clipping switched off.
    """
    playback_options = dict(
        frame_stack=True,
        scale=True,
        episode_life=False,
        clip_rewards=False,
    )
    return wrap_deepmind(env, **playback_options)

In [0]:
# Restore trained weights. map_location lets a checkpoint that was saved on
# a GPU load on whatever `device` this runtime selected (including CPU).
agent.qnetwork_original.load_state_dict(
    torch.load('/content/drive/My Drive/dqn/dqn_checkpoint_6708.0.pth', map_location=device)
)

# Playback environment, recorded to the video directory by the Monitor wrapper.
enva = make_atari('BreakoutNoFrameskip-v4')
enva = wrap_atari_dqn_eva(enva)
enva = gym.wrappers.Monitor(enva, '/content/drive/My Drive/dqn/video', force=True)

try:
    for ep in range(5):
        score = 0
        obs = enva.reset()
        state = np.transpose(np.array(obs), (2, 0, 1))  # 4,84,84
        for j in range(100000):
            action = agent.act(state, 0.001)
            next_state, reward, done, _ = enva.step(action)
            next_state = np.transpose(np.array(next_state), (2, 0, 1))
            state = next_state
            score += reward
            if done:
                print(score)
                break
finally:
    # Always close the environment, even if a rollout raises, so the
    # recording wrapper gets the chance to release its resources.
    enva.close()
              