# INM707 Coursework - Advanced Task
# Space Invaders with Asynchronous Advantage Actor-Critic (A3C) - Feedforward
### By: Elisabeta Monica Furdui: 190045971 and Jasveen Kaur: 190020638 

### Importing the required modules
This implementation uses OpenAI Gym and the PyTorch library.

In [1]:
###############
#Code referenced and corrected/edited from: https://github.com/ikostrikov/pytorch-a3c
###############

from __future__ import print_function
import cv2
import gym
import numpy as np
from gym.spaces.box import Box
from gym import wrappers
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import time
from collections import deque
import os
import torch.multiprocessing as mp
import math
import torch.optim as optim

### Preprocessing the environment

In [2]:
# Taken from https://github.com/openai/universe-starter-agent
def create_atari_env(env_id, video=False):
    """Create the Gym environment `env_id` and stack the observation wrappers.

    When `video` is truthy the freshly made environment is first wrapped in
    `wrappers.Monitor`, which writes into the 'test' directory (force=True).
    The result is then wrapped in MyAtariRescale42x42 and MyNormalizedEnv,
    in that order, and returned.
    """
    env = gym.make(env_id)
    env = wrappers.Monitor(env, 'test', force=True) if video else env
    return MyNormalizedEnv(MyAtariRescale42x42(env))


def _process_frame42(frame):
    """Crop, downscale and grey-scale one frame.

    Keeps rows 34..193 and the first 160 columns, resizes to 80x80 and then
    to 42x42, averages over axis 2, and returns the result as float32
    multiplied by 1/255.
    """
    top, side = 34, 160
    cropped = frame[top:top + side, :side]
    # Two-stage resize (essentially mipmapping): going straight to 42x42
    # loses pixels that are not close enough to a target pixel boundary.
    halved = cv2.resize(cropped, (80, 80))
    small = cv2.resize(halved, (42, 42))
    gray = small.mean(2).astype(np.float32)
    gray *= (1.0 / 255.0)
    return gray


class MyAtariRescale42x42(gym.ObservationWrapper):
    """Observation wrapper that passes every frame through `_process_frame42`."""

    def __init__(self, env=None):
        super(MyAtariRescale42x42, self).__init__(env)
        # Declared space is [1, 42, 42] with values in [0.0, 1.0].
        declared_shape = [1, 42, 42]
        self.observation_space = Box(0.0, 1.0, declared_shape, dtype=np.float32)

    def observation(self, observation):
        """Return the processed version of `observation`."""
        processed = _process_frame42(observation)
        return processed


class MyNormalizedEnv(gym.ObservationWrapper):
    """Observation wrapper that standardises frames with running statistics.

    An exponential moving average (decay `alpha`) of each observation's mean
    and standard deviation is kept; both are bias-corrected by
    1 - alpha ** num_steps before being used to normalise the observation.
    """

    def __init__(self, env=None):
        super(MyNormalizedEnv, self).__init__(env)
        self.alpha = 0.9999
        self.num_steps = 0
        self.state_mean = 0
        self.state_std = 0

    def observation(self, observation):
        """Update the running statistics and return the normalised frame.

        The returned array has one extra leading axis of length 1.
        """
        self.num_steps += 1
        decay = self.alpha
        self.state_mean = self.state_mean * decay + observation.mean() * (1 - decay)
        self.state_std = self.state_std * decay + observation.std() * (1 - decay)

        # The averages start at 0, so early estimates are rescaled.
        correction = 1 - pow(decay, self.num_steps)
        unbiased_mean = self.state_mean / correction
        unbiased_std = self.state_std / correction

        normalized = (observation - unbiased_mean) / (unbiased_std + 1e-8)
        return normalized[np.newaxis, ...]


### Shared Optimizer

In [3]:
class SharedRMSprop(optim.RMSprop):
    """RMSprop whose per-parameter state is allocated up front so it can be shared.

    The state tensors ('step', 'square_avg', 'grad_avg', 'momentum_buffer')
    are created for every parameter in `__init__`, so `share_memory()` can
    be called right after construction and `step()` never has to create
    state lazily.

    Args:
        params: iterable of parameters (or parameter groups) to optimise.
        lr: learning rate.
        alpha: decay factor of the running average of squared gradients.
        eps: term added to the denominator.
        weight_decay: L2 penalty coefficient added to the gradient.
        momentum: momentum factor; the momentum buffer is used when > 0.
        centered: if True, subtract the squared running mean of the
            gradient from the running mean of squares before the sqrt.
    """

    def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False):
        super(SharedRMSprop, self).__init__(
            params, lr=lr, alpha=alpha, eps=eps, weight_decay=weight_decay,
            momentum=momentum, centered=centered)

        # Allocate all state eagerly; share_memory() and step() index these
        # keys directly and would raise KeyError if they were missing.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['square_avg'] = torch.zeros_like(p.data)
                state['grad_avg'] = torch.zeros_like(p.data)
                state['momentum_buffer'] = torch.zeros_like(p.data)

    def __setstate__(self, state):
        super(SharedRMSprop, self).__setstate__(state)

    def share_memory(self):
        """Call `share_memory_()` on every state tensor of every parameter."""
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['square_avg'].share_memory_()
                state['step'].share_memory_()
                state['grad_avg'].share_memory_()
                state['momentum_buffer'].share_memory_()

    def step(self, closure=None):
        """Apply one RMSprop update to every parameter that has a gradient.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss; accepted for compatibility with the base
                optimizer interface.

        Returns:
            The value returned by `closure`, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                square_avg = state['square_avg']
                alpha = group['alpha']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # square_avg <- alpha * square_avg + (1 - alpha) * grad^2
                square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)

                if group['centered']:
                    grad_avg = state['grad_avg']
                    grad_avg.mul_(alpha).add_(grad, alpha=1 - alpha)
                    avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).sqrt().add_(group['eps'])
                else:
                    avg = square_avg.sqrt().add_(group['eps'])

                if group['momentum'] > 0:
                    buf = state['momentum_buffer']
                    buf.mul_(group['momentum']).addcdiv_(grad, avg)
                    p.data.add_(buf, alpha=-group['lr'])
                else:
                    p.data.addcdiv_(grad, avg, value=-group['lr'])

        return loss
    
class SharedAdam(optim.Adam):
    """Adam whose per-parameter state is allocated up front so it can be shared.

    The state tensors ('step', 'exp_avg', 'exp_avg_sq') are created for
    every parameter in `__init__`, which lets `share_memory()` be called
    immediately after construction.

    Args:
        params: iterable of parameters (or parameter groups) to optimise.
        lr: learning rate.
        betas: decay factors for the running averages of the gradient and
            of its square.
        eps: term added to the denominator.
        weight_decay: L2 penalty coefficient added to the gradient.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super(SharedAdam, self).__init__(
            params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                # 'step' is a tensor (not an int) so it can be shared too.
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        """Call `share_memory_()` on every state tensor of every parameter."""
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        """Apply one bias-corrected Adam update to every parameter with a gradient.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss; accepted for compatibility with the base
                optimizer interface.

        Returns:
            The value returned by `closure`, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                # Running first and second moments of the gradient.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                step_count = state['step'].item()
                bias_correction1 = 1 - beta1 ** step_count
                bias_correction2 = 1 - beta2 ** step_count
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss


### Initializing the weights and defining the A3C Feedforward network architecture 
Since the agents share the same network, they also share common weights.

In [4]:
# Initializing and setting the variance of a tensor of weights
def normalized_columns_initializer(weights, std=1.0):
    """Return a random tensor shaped like `weights` with rescaled rows.

    Values are drawn from a standard normal distribution, then each slice
    along dimension 1 is scaled so that its L2 norm equals `std`.
    `weights` itself is only used for its size and is not modified.
    """
    noise = torch.randn(weights.size())
    norm_along_dim1 = torch.sqrt(noise.pow(2).sum(1, keepdim=True))
    noise *= std / norm_along_dim1
    return noise

# Initializing Conv and Linear layers: weights uniform in +/- sqrt(6 / (fan_in + fan_out)), biases zero
def weights_init(m):
    """Re-initialise module `m` in place if it is a Conv or Linear layer.

    The layer type is detected from the class name. Weights are drawn
    uniformly from [-b, b] with b = sqrt(6 / (fan_in + fan_out)) and the
    bias is filled with zeros. Any other module is left untouched.
    """
    classname = m.__class__.__name__
    if 'Conv' in classname:
        dims = list(m.weight.data.size())
        # fan_in: product of dims 1..3; fan_out: dims 2..3 times dim 0.
        fan_in = np.prod(dims[1:4])
        fan_out = np.prod(dims[2:4]) * dims[0]
    elif 'Linear' in classname:
        dims = list(m.weight.data.size())
        fan_in = dims[1]
        fan_out = dims[0]
    else:
        return

    w_bound = np.sqrt(6. / (fan_in + fan_out))
    m.weight.data.uniform_(-w_bound, w_bound)
    m.bias.data.fill_(0)

#A3C Feedforward Model
class ActorCritic(torch.nn.Module):
    """Feedforward actor-critic network: four conv layers, one hidden layer, two heads.

    Args:
        num_inputs: number of input channels of the first convolution.
        action_space: object whose `n` attribute gives the number of actions.
    """

    def __init__(self, num_inputs, action_space):
        super(ActorCritic, self).__init__()

        # Four 3x3 convolutions, each with stride 2 and padding 1.
        conv_opts = dict(stride=2, padding=1)
        self.conv1 = nn.Conv2d(num_inputs, 32, 3, **conv_opts)
        self.conv2 = nn.Conv2d(32, 32, 3, **conv_opts)
        self.conv3 = nn.Conv2d(32, 32, 3, **conv_opts)
        self.conv4 = nn.Conv2d(32, 32, 3, **conv_opts)

        self.fc = nn.Linear(32 * 3 * 3, 256)

        # Value head (single output) and policy head (one output per action).
        self.critic_linear = nn.Linear(256, 1)
        self.actor_linear = nn.Linear(256, action_space.n)

        self.apply(weights_init)
        # The heads get their own initialisation: actor first, then critic.
        for head, std in ((self.actor_linear, 0.01), (self.critic_linear, 1.0)):
            head.weight.data = normalized_columns_initializer(head.weight.data, std)

        self.train()

    def forward(self, inputs):
        """Return (value, action scores, hidden features).

        `inputs` is indexed with [0]; only that first element is used.
        """
        features = inputs[0]
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            features = F.relu(conv(features))

        flat = features.view(-1, 32 * 3 * 3)
        hidden = F.relu(self.fc(flat))

        return self.critic_linear(hidden), self.actor_linear(hidden), hidden


### Function to train the Agent

In [5]:
def ensure_shared_grads(model, shared_model):
    """Point the shared model's gradients at the local model's gradients.

    Parameters are paired in iteration order. As soon as a shared parameter
    that already has a gradient is met, the function returns without
    touching that parameter or any later one.
    """
    pairs = zip(model.parameters(), shared_model.parameters())
    for local_param, global_param in pairs:
        if global_param.grad is None:
            global_param._grad = local_param.grad
        else:
            return

def train(rank, params, shared_model, optimizer):
    """Run one A3C training worker; this function loops forever.

    Each iteration copies the shared weights into a local model, collects a
    rollout of at most `params.num_steps` environment steps, computes the
    policy loss and the value loss, and applies the gradients to
    `shared_model` through `optimizer`.

    Args:
        rank: worker index; added to `params.seed` to seed torch and the
            environment.
        params: configuration object; reads `seed`, `env_name`,
            `num_steps`, `max_episode_length`, `gamma` and `tau`.
        shared_model: model whose state dict is loaded at the start of each
            rollout and whose parameters receive the gradients.
        optimizer: optimizer used for `zero_grad()` and `step()`.
    """
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name)
    env.seed(params.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        # Sync the local copy with the shared weights before each rollout.
        model.load_state_dict(shared_model.state_dict())
        if done:
            x = Variable(torch.zeros(1, 256))
        else:
            x = Variable(x.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(params.num_steps):
            # Count environment steps (not rollouts), so that
            # max_episode_length is compared against the number of steps
            # taken in the current episode.
            episode_length += 1
            value, action_values, x = model((Variable(state.unsqueeze(0)), x))
            prob = F.softmax(action_values, dim = 1)
            log_prob = F.log_softmax(action_values, dim = 1)
            entropy = -(log_prob * prob).sum(1)

            entropies.append(entropy)
            # Sample an action from the policy distribution.
            action = prob.multinomial(num_samples = 1).data
            log_prob = log_prob.gather(1, Variable(action))

            values.append(value)
            log_probs.append(log_prob)

            state, reward, done, _ = env.step(action.numpy())
            done = (done or episode_length >= params.max_episode_length)
            # Clip the reward to [-1, 1].
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            rewards.append(reward)
            if done:
                break

        # Bootstrap from the value estimate unless the episode ended.
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), x))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)

        # Walk the rollout backwards, accumulating the discounted return
        # and the generalized advantage estimate.
        for i in reversed(range(len(rewards))):
            R = params.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
            gae = gae * params.gamma * params.tau + TD
            # 0.01 weights the entropy bonus.
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]
        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
        ensure_shared_grads(model, shared_model)
        optimizer.step()


### Function to test the agent's performance

In [6]:
def directory(params):
    """Create the log and checkpoint directories and open the log file.

    Uses `params.model_name` for the directory names and
    `params.env_name` + "." + `params.model_name` as the file stem.

    Returns:
        ((log_file, checkpoint_path), (log_dir, checkpoint_dir)), where
        `log_file` is a handle opened in "w" mode that the caller owns.
    """
    def _make_if_missing(path):
        # Create `path` (including parents) unless it already exists.
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    stem = ".".join((params.env_name, params.model_name))

    # logging
    log_dir = _make_if_missing(os.path.join('logfile', params.model_name))
    f = open(os.path.join(log_dir, stem + ".log"), "w")

    # model saver
    ckpt_dir = _make_if_missing(os.path.join('picklefile', params.model_name))
    ckpt_path = os.path.join(ckpt_dir, stem + ".pkl")

    return (f, ckpt_path), (log_dir, ckpt_dir)

def test(rank, params, shared_model):
    """Evaluate the shared model greedily, one episode after another; loops forever.

    At the start of every episode the shared weights are copied into a
    local model. After each episode a summary line is printed and written
    to the log file, a checkpoint is saved every `params.save_freq`
    episodes, and the function sleeps for 60 seconds.

    Args:
        rank: index added to `params.seed` to seed torch and the environment.
        params: configuration object; reads `seed`, `env_name`,
            `model_name` and `save_freq`.
        shared_model: model whose state dict is loaded before each episode.
    """
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)

    # Only the log handle and the checkpoint directory are used below.
    (f, _), (_, ckpt_dir) = directory(params)

    reward_sum = 0
    done = True
    start_time = time.time()
    episode_length = 0
    episode_i = 0
    while True:
        episode_length += 1
        if done:
            # Pick up the latest shared weights at every episode start.
            model.load_state_dict(shared_model.state_dict())
            with torch.no_grad():
                x = Variable(torch.zeros(1, 256))
        else:
            with torch.no_grad():
                x = Variable(x.data)
        with torch.no_grad():
            _, action_value, x = model((Variable(state.unsqueeze(0)), x))

        # Greedy policy: take the most probable action.
        prob = F.softmax(action_value, dim = 1)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0])
        reward_sum += reward

        if done:
            episode_i += 1
            if episode_i % params.save_freq == 0:
                torch.save(model.state_dict(), os.path.join(ckpt_dir, params.env_name + "." + params.model_name +"." + str(episode_i) + ".pkl"))

            info = "Time {}, episode reward {}, episode length {}".format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length)
            print(info)
            f.write(info + '\n')
            # This loop never exits, so the file is never closed; flush
            # now so the line survives the process being terminated.
            f.flush()
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)


### Defining the parameters used

In [15]:
class Params(object):
    """Plain container for the run configuration."""

    def __init__(self):
        # Optimisation and return computation.
        self.lr = 0.0001                 # learning rate given to the optimizer
        self.gamma = 0.99                # discount factor used in train()
        self.tau = 1.                    # factor multiplied with gamma in the GAE recursion
        self.seed = 1                    # base seed; each worker adds its rank
        # Parallelism and rollout sizes.
        self.num_processes = mp.cpu_count()   # number of training processes
        self.num_steps = 20              # maximum environment steps per rollout
        self.max_episode_length = 10000  # threshold at which train() forces `done`
        # Environment, run mode and checkpointing.
        self.env_name = 'SpaceInvaders-v0'
        self.task = 'eval'               # the entry point handles 'train', 'eval' and 'develop'
        self.save_freq = 20              # test() saves a checkpoint every this many episodes
        self.model_name = 'ActorCritic'  # used in log and checkpoint paths
        self.load_ckpt = 'picklefile/ActorCritic/SpaceInvaders-v0.ActorCritic.5080.pkl'  # loaded when task == 'eval'


### Running all the agents in parallel with 7 processors

In [None]:
if __name__ == '__main__':
    # Entry point: build the shared model and optimizer, then dispatch on
    # params.task ('train', 'eval' or 'develop').
    os.environ['OMP_NUM_THREADS'] = '1'
    params = Params()
    torch.manual_seed(params.seed)

    # A throw-away environment is created to read the observation and
    # action spaces needed to size the network.
    env = create_atari_env(params.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()
    optimizer = SharedAdam(shared_model.parameters(), lr=params.lr)
    optimizer.share_memory()

    if params.task == 'train':
        # One evaluation process (ranked after the trainers) ...
        evaluator = mp.Process(target=test, args=(params.num_processes, params, shared_model))
        evaluator.start()
        processes = [evaluator]

        # ... plus params.num_processes training processes.
        for rank in range(params.num_processes):
            worker = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
            worker.start()
            processes.append(worker)

        for proc in processes:
            proc.join()

    elif params.task == 'eval':
        # Evaluate a saved checkpoint in the current process.
        shared_model.load_state_dict(torch.load(params.load_ckpt))
        test(params.num_processes, params, shared_model)

    elif params.task == 'develop':
        # Single training worker in the current process.
        train(0, params, shared_model, optimizer)


Time 00h 00m 10s, episode reward 445.0, episode length 1072
Time 00h 01m 18s, episode reward 320.0, episode length 683
Time 00h 02m 30s, episode reward 715.0, episode length 1191
Time 00h 03m 35s, episode reward 210.0, episode length 506
Time 00h 04m 41s, episode reward 365.0, episode length 640
Time 00h 05m 49s, episode reward 435.0, episode length 831
Time 00h 06m 59s, episode reward 600.0, episode length 1184
Time 00h 08m 09s, episode reward 780.0, episode length 1140
Time 00h 09m 16s, episode reward 315.0, episode length 714
Time 00h 10m 26s, episode reward 515.0, episode length 951
Time 00h 11m 33s, episode reward 360.0, episode length 659
Time 00h 12m 43s, episode reward 575.0, episode length 1125
Time 00h 13m 52s, episode reward 490.0, episode length 935
Time 00h 15m 01s, episode reward 490.0, episode length 914
Time 00h 16m 11s, episode reward 570.0, episode length 1147
Time 00h 17m 23s, episode reward 715.0, episode length 1273
Time 00h 18m 32s, episode reward 600.0, episode l

Time 02h 35m 22s, episode reward 485.0, episode length 846
Time 02h 36m 25s, episode reward 570.0, episode length 921
Time 02h 37m 27s, episode reward 435.0, episode length 823
Time 02h 38m 30s, episode reward 515.0, episode length 854
Time 02h 39m 32s, episode reward 600.0, episode length 967
Time 02h 40m 35s, episode reward 470.0, episode length 973
Time 02h 41m 40s, episode reward 945.0, episode length 1650
Time 02h 42m 43s, episode reward 540.0, episode length 995
Time 02h 43m 45s, episode reward 520.0, episode length 926
Time 02h 44m 50s, episode reward 720.0, episode length 1537
Time 02h 45m 52s, episode reward 575.0, episode length 1032
Time 02h 46m 56s, episode reward 570.0, episode length 1237
Time 02h 47m 58s, episode reward 415.0, episode length 729
Time 02h 49m 01s, episode reward 485.0, episode length 896
Time 02h 50m 04s, episode reward 465.0, episode length 1066
Time 02h 51m 06s, episode reward 555.0, episode length 965
Time 02h 52m 09s, episode reward 605.0, episode len

Time 05h 05m 41s, episode reward 515.0, episode length 863
Time 05h 06m 48s, episode reward 405.0, episode length 745
Time 05h 07m 58s, episode reward 460.0, episode length 890
Time 05h 09m 11s, episode reward 600.0, episode length 1263
Time 05h 10m 21s, episode reward 600.0, episode length 1068
Time 05h 11m 33s, episode reward 570.0, episode length 1172
Time 05h 12m 48s, episode reward 695.0, episode length 1439
Time 05h 13m 57s, episode reward 360.0, episode length 754
Time 05h 15m 08s, episode reward 570.0, episode length 1132
Time 05h 16m 18s, episode reward 465.0, episode length 867
Time 05h 17m 29s, episode reward 600.0, episode length 1100
Time 05h 18m 41s, episode reward 545.0, episode length 1127
Time 05h 19m 50s, episode reward 405.0, episode length 751
Time 05h 20m 57s, episode reward 345.0, episode length 691
Time 05h 22m 04s, episode reward 305.0, episode length 641
Time 05h 23m 11s, episode reward 280.0, episode length 617
Time 05h 24m 24s, episode reward 545.0, episode l

Time 12h 51m 30s, episode reward 430.0, episode length 849
Time 12h 52m 33s, episode reward 605.0, episode length 1030
Time 12h 53m 35s, episode reward 575.0, episode length 1000
Time 12h 54m 38s, episode reward 550.0, episode length 947
Time 12h 55m 41s, episode reward 570.0, episode length 1045
Time 12h 56m 43s, episode reward 605.0, episode length 1068
Time 12h 57m 46s, episode reward 540.0, episode length 915
Time 12h 58m 49s, episode reward 570.0, episode length 967
Time 12h 59m 54s, episode reward 760.0, episode length 1517
Time 13h 00m 57s, episode reward 640.0, episode length 1179
Time 13h 02m 00s, episode reward 515.0, episode length 1041
Time 13h 03m 02s, episode reward 355.0, episode length 773
Time 13h 04m 05s, episode reward 540.0, episode length 1093
Time 13h 05m 08s, episode reward 680.0, episode length 1196
Time 13h 06m 10s, episode reward 455.0, episode length 838
Time 13h 07m 13s, episode reward 435.0, episode length 840
Time 13h 08m 15s, episode reward 295.0, episode

Time 15h 15m 55s, episode reward 310.0, episode length 681
Time 15h 16m 59s, episode reward 755.0, episode length 1307
Time 15h 18m 03s, episode reward 680.0, episode length 1207
Time 15h 19m 06s, episode reward 580.0, episode length 1133
Time 15h 20m 08s, episode reward 485.0, episode length 763
Time 15h 21m 11s, episode reward 600.0, episode length 1234
Time 15h 22m 15s, episode reward 870.0, episode length 1680
Time 15h 23m 18s, episode reward 580.0, episode length 1051
Time 15h 24m 21s, episode reward 575.0, episode length 1071
Time 15h 25m 23s, episode reward 305.0, episode length 634
Time 15h 26m 25s, episode reward 435.0, episode length 814
Time 15h 27m 27s, episode reward 540.0, episode length 1102
Time 15h 28m 30s, episode reward 355.0, episode length 764
Time 15h 29m 32s, episode reward 485.0, episode length 812
Time 15h 30m 35s, episode reward 695.0, episode length 1136
Time 15h 31m 38s, episode reward 570.0, episode length 1125
Time 15h 32m 41s, episode reward 460.0, episod