# Prepare

In [None]:
# Mount Google Drive so the /content/drive/MyDrive paths used for checkpoints and GIFs resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install gym[atari]
!pip install autorom[accept-rom-license]

In [3]:
import gym
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import time
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython import display

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

from collections import namedtuple

import copy
from tqdm.notebook import tqdm
import random

# Wrappers

In [5]:
class ConcatObs(gym.Wrapper):
    """Repeat each action up to ``k`` times and return the last ``k`` frames stacked.

    An episode is reported as finished as soon as the wrapped environment
    says so, or as soon as ``info['lives']`` drops below ``life_threshold``.

    Args:
        env: Environment to wrap; its ``info`` dict must contain ``'lives'``.
        k: Number of frames kept in the stack and maximum number of times an
            action is repeated per ``step`` call.
        life_threshold: The episode ends once the remaining lives fall below
            this value. The default of 4 keeps the original behaviour.
    """

    def __init__(self, env, k=4, life_threshold=4):
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.life_threshold = life_threshold
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        # The stacked observation gains a leading axis of length k.
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=((k,) + shp), dtype=env.observation_space.dtype)

    def reset(self, **kwargs):
        """Reset the wrapped env and fill the stack with k copies of the first frame."""
        ob = self.env.reset(**kwargs)
        for _ in range(self.k):
            self.frames.append(ob)

        return self._get_ob()

    def step(self, action):
        """Apply ``action`` up to k times, summing the rewards.

        Returns:
            ``(stacked_frames, total_reward, done, info)`` where ``info`` is
            the dict from the last underlying step.
        """
        total_reward = 0.0
        done = False
        info = {}
        for _ in range(self.k):
            obs, reward, env_done, info = self.env.step(action)
            total_reward += reward
            self.frames.append(obs)

            # Keep the env's own termination signal (e.g. a time limit) and
            # additionally end the episode once a life has been lost; stepping
            # an env that already reported done is undefined.
            done = bool(env_done) or info['lives'] < self.life_threshold
            if done:
                break
        return self._get_ob(), total_reward, done, info

    def _get_ob(self):
        """Return the current frame stack as one array with the frame index first."""
        return np.array(self.frames)

In [6]:
# A bunch of wrappers to get us started, please use these
class ObservationWrapper(gym.ObservationWrapper):
    """Optionally rescale and grayscale an observation, then crop and resize it to 84x84.

    Args:
        env: Environment to wrap.
        GRAYSCALE: If True, convert the observation with ``tf.image.rgb_to_grayscale``.
        NORMALIZE: If True, divide the observation by 255.0 first.
    """

    def __init__(self, env, GRAYSCALE=False, NORMALIZE=False):
        self.GRAYSCALE = GRAYSCALE
        self.NORMALIZE = NORMALIZE
        super().__init__(env)

    def observation(self, obs):
        """Return the processed observation as a TensorFlow tensor.

        NOTE(review): the author's recorded shapes suggest a stack of 4 frames
        going from (4, 84, 84, 1) to (4, 84, 84) — confirm against the env.
        """
        frames = obs / 255.0 if self.NORMALIZE else obs

        if self.GRAYSCALE:
            frames = tf.image.rgb_to_grayscale(frames)

        # Crop along axes 1 and 2: drop the first 2 and last 9 entries of
        # axis 1 and the first 8 entries of axis 2.
        cropped = frames[:, 2:-9, 8:, :]
        resized = tf.image.resize(cropped, [84, 84])

        # Reshape to the same dimensions minus the trailing axis.
        return tf.reshape(resized, resized.shape[:-1])

class RewardWrapper(gym.RewardWrapper):
    """Reward hook that currently forwards every reward unchanged."""

    def __init__(self, env):
        super().__init__(env)

    def reward(self, reward):
        """Return ``reward`` as received."""
        # Clipping to [0, 1] via np.clip(reward, 0, 1) is currently disabled.
        return reward
    
class ActionWrapper(gym.ActionWrapper):
    """Action hook that currently forwards every action unchanged."""

    def __init__(self, env):
        super().__init__(env)

    def action(self, action):
        """Return ``action`` as received."""
        return action

class FireResetEnv(gym.Wrapper):
    """Take actions 1 (FIRE) and 2 on reset for environments that are fixed until firing."""

    def __init__(self, env):
        super().__init__(env)
        meanings = env.unwrapped.get_action_meanings()
        # Check the length first so a too-short action set fails the assertion
        # instead of raising IndexError on the lookup below.
        assert len(meanings) >= 3
        assert meanings[1] == 'FIRE'

    def reset(self, **kwargs):
        """Reset the env, then step actions 1 and 2 and return the resulting observation.

        If either step ends the episode the env is reset again; when that
        happens after the second step, the observation of the fresh episode is
        returned rather than the last frame of the finished one.
        """
        self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(2)
        if done:
            obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Forward the action to the wrapped environment unchanged."""
        return self.env.step(ac)

# Environment

In [None]:
# Riverraid is the game used here; "ALE/Breakout-v5" was the previously tried alternative.
env = gym.make("ALE/Riverraid-v5")

# Wrapper stack, applied innermost first.
env = FireResetEnv(env)
env = ConcatObs(env, k=4)
env = ActionWrapper(env)
env = RewardWrapper(env)
env = ObservationWrapper(env, GRAYSCALE=True, NORMALIZE=True)

In [None]:
# Reset once to obtain a sample observation; its shape is read by the hyper-parameter cell.
obs = env.reset()
print(obs.shape)

# Actor Critic

In [9]:
# Hyper parameters
INPUT_SHAPE = obs.shape
N_ACTION = env.action_space.n
# The observation is channel-first (stacked frames, height, width), so the
# frame count is axis 0 and the image side is axis 1.
IMG_SIZE = INPUT_SHAPE[1]
IMG_CHANNELS = INPUT_SHAPE[0]

EPISODES = 100    # how many episodes will run
MAX_STEP = 100000 # maximum number of steps in one episode
LR = 1e-3         # learning rate
HIDDEN = 512      # hidden layer width
MEM_SIZE = 10000  # replay memory size (number of transitions)
BATCH_SIZE = 64   # batch size for one gradient update
GAMMA = 0.99      # reward discount factor
RUNNING_REWARD = 10  # initial value of the exponentially smoothed episode reward

ADVANTAGE = True  # if True, use A2C (advantage-weighted policy gradient)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(DEVICE)

In [None]:
# Echo the derived configuration values, one per line.
for config_value in (INPUT_SHAPE, IMG_SIZE, IMG_CHANNELS, N_ACTION):
    print(config_value)

In [12]:
def process_state(state):
    """Convert an observation to a float32 torch tensor on ``DEVICE``.

    Args:
        state: Anything ``np.array`` accepts.

    Returns:
        A float tensor on ``DEVICE``.
        NOTE(review): the author recorded torch.Size([4, 84, 84]) here — confirm.
    """
    as_array = np.array(state)
    return torch.from_numpy(as_array).float().to(DEVICE)

In [13]:
# One entry per action taken while training: its log-probability and the critic's value.
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

class ActorCritic(torch.nn.Module):
    """Convolutional actor-critic network with its own optimizer and episode buffers.

    A shared three-layer CNN feeds two separate two-layer heads: the actor
    (action probabilities) and the critic (a single state-value estimate).

    Args:
        input_shape: Stored on the instance for reference only; the first
            convolution is fixed at 4 input channels and the flatten size
            assumes the conv stack outputs 64 x 7 x 7 features.
        hidden_size: Width of the hidden layer in both heads.
        n_action: Number of discrete actions produced by the actor head.
        advantage: If True the policy gradient is weighted by
            ``return - V(s)``; otherwise by the return itself.
    """

    def __init__(self, input_shape=INPUT_SHAPE, hidden_size=HIDDEN, n_action=N_ACTION, advantage=ADVANTAGE):
        super().__init__()
        self.input_shape = input_shape
        self.action_space = n_action
        self.advantage = advantage

        # Shared convolutional feature extractor
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # Flattened size of the conv output, used by both heads and by forward()
        self.cnn_output_size = 7 * 7 * 64

        # Actor head
        self.actor_fc1 = nn.Linear(self.cnn_output_size, hidden_size)
        self.actor_fc2 = nn.Linear(hidden_size, self.action_space)

        # Critic head
        self.critic_fc1 = nn.Linear(self.cnn_output_size, hidden_size)
        self.critic_fc2 = nn.Linear(hidden_size, 1)

        # Optimizer over all parameters of both heads and the CNN
        self.optimizer = optim.Adam(self.parameters(), lr=LR)

        # Per-episode buffers, filled by choose_action() and by the training loop
        self.saved_actions = []
        self.rewards = []

    def forward(self, state):
        """Return ``(value, probs)`` for ``state``.

        ``probs`` has one row per flattened sample and is a softmax over
        actions along dim 1; ``value`` has one column.
        """
        features = self.conv(state)

        # Flatten to (rows, cnn_output_size) so the softmax below runs over dim 1.
        features = features.reshape((-1, self.cnn_output_size))

        # Actor: probability of each action in the given state
        logits = self.actor_fc2(F.relu(self.actor_fc1(features)))
        probs = F.softmax(logits, dim=1)

        # Critic: value estimate of the given state
        value = self.critic_fc2(F.relu(self.critic_fc1(features)))

        return value, probs

    def choose_action(self, state):
        """Sample an action for ``state`` and return it as a Python int.

        In training mode the action's log-probability and the critic value are
        appended to ``saved_actions`` for ``finish_episode``. In eval mode
        nothing is recorded and no autograd graph is built, so long
        evaluation roll-outs do not accumulate memory.
        """
        with torch.set_grad_enabled(self.training):
            value, probs = self(state)

        # Categorical distribution over the action probabilities
        probs_dist = Categorical(probs)
        action = probs_dist.sample()

        if self.training:
            self.saved_actions.append(
                SavedAction(probs_dist.log_prob(action), torch.squeeze(value))
            )

        return action.item()

    def finish_episode(self):
        """Run one gradient step on the buffered episode, then clear the buffers.

        Computes discounted returns from ``self.rewards``, normalises them when
        the episode has more than one step, and minimises the mean policy loss
        plus the mean smooth-L1 critic loss. Does nothing except clear the
        buffers when no reward or no action was recorded.
        """
        if not self.rewards or not self.saved_actions:
            del self.rewards[:]
            del self.saved_actions[:]
            return

        eps = np.finfo(np.float32).eps.item()

        # Discounted returns, accumulated backwards and reversed once (O(n)).
        returns = []
        R = 0.0
        for r in reversed(self.rewards):
            R = r + GAMMA * R
            returns.append(R)
        returns.reverse()

        returns = torch.tensor(returns, dtype=torch.float32, device=DEVICE)
        # The standard deviation of a single element is NaN, so a one-step
        # episode is left un-normalised.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + eps)

        policy_losses = []  # actor (policy) loss terms
        value_losses = []   # critic (value) loss terms
        for (log_prob, value), R in zip(self.saved_actions, returns):
            if self.advantage:
                # The critic value is a baseline only; detach it so the policy
                # term does not back-propagate into the critic.
                weight = R - value.detach()
            else:
                # Weight by the return itself: weighting by V(s), which does not
                # depend on the action, yields a zero expected policy gradient.
                weight = R
            policy_losses.append(-log_prob * weight)

            # Critic regresses towards the return with the smooth-L1 loss
            value_losses.append(F.smooth_l1_loss(value, R))

        self.optimizer.zero_grad()
        loss = torch.stack(policy_losses).mean() + torch.stack(value_losses).mean()
        loss.backward()
        self.optimizer.step()

        # Reset rewards and action buffer
        del self.rewards[:]
        del self.saved_actions[:]



# Train with REINFORCE

In [None]:
# Build the network on the selected device and report which loss variant is active.
agent = ActorCritic().to(DEVICE)
print(agent.advantage)

# Model parameters: name and tensor size
print("Model's state_dict:")
for name, tensor in agent.state_dict().items():
    print(name, "\t", tensor.size())

print()

# Optimizer state: top-level keys and their contents
print("Optimizer's state_dict:")
for key, entry in agent.optimizer.state_dict().items():
    print(key, "\t", entry)

In [None]:
reward_list = []            # total reward of every finished episode
run_steps = []              # index of the last step of every episode

best_reward = 0
average_reward = 0          # running SUM of episode rewards; divided by i where reported
episode_number = []
average_reward_number = []

start_time = time.time()

# Train for exactly EPISODES episodes (i runs 1..EPISODES inclusive, so the
# running averages below never divide by zero).
for i in tqdm(range(1, EPISODES + 1)):
    state = process_state(env.reset())

    ep_reward = 0

    for step in range(MAX_STEP):
        # Sample an action and advance the environment
        action = agent.choose_action(state)
        new_state, reward, done, info = env.step(action)

        # Buffer the reward for the end-of-episode update
        agent.rewards.append(reward)
        ep_reward += reward

        state = process_state(new_state)

        if step%100==0 and step!=0:
            print("   Current step {}/{}".format(step, MAX_STEP))

        if done or step == MAX_STEP-1:
            reward_list.append(ep_reward)
            run_steps.append(step)

            if ep_reward > best_reward:
                best_reward = ep_reward

            average_reward += ep_reward

            if i%10==0:
                print("Episode {} Average Reward {} Best Reward {} Last Reward {}".format(i, average_reward/i, best_reward, ep_reward))

            break

    # Exponentially smoothed episode reward
    RUNNING_REWARD = 0.05 * ep_reward + (1 - 0.05) * RUNNING_REWARD

    # One gradient update per episode
    agent.finish_episode()

    episode_number.append(i)
    average_reward_number.append(average_reward/i)

# Stop the clock before plotting so the runtime covers training only.
end_time = time.time()

plt.plot(episode_number, average_reward_number)
plt.title('Running average reward')
plt.xlabel('Episodes')
plt.ylabel('Average reward')
plt.show()

In [None]:
runtime = end_time - start_time
print("Running time:", runtime)

# Average over the episodes actually recorded, not the configured EPISODES,
# so the figure stays correct whatever number of episodes the loop ran.
average_reward = sum(reward_list)/len(reward_list)
max_reward = max(reward_list)
print('Average reward:', average_reward)
print('Max reward:', max_reward)

plt.plot(reward_list)
plt.title('Reward for episodes')
plt.ylabel('Reward')
plt.xlabel('Episodes')
# plt.savefig('/content/drive/MyDrive/INF581/rewardDDQN.jpg')
plt.show()

plt.plot(run_steps, color='green')
plt.title('Run steps for episodes')
plt.ylabel('Steps')
plt.xlabel('Episodes')
# plt.savefig('/content/drive/MyDrive/INF581/runtimeDDQN.jpg')
plt.show()


In [None]:
# File name is "A2C_parameter<avg>.pkl" with ADVANTAGE set, "AC_parameter<avg>.pkl" otherwise.
path = (
    '/content/drive/MyDrive/INF581-RL-Shared/A'
    + ('2' if ADVANTAGE else '')
    + 'C_parameter'
    + str(average_reward)
    + '.pkl'
)
print(path)

torch.save(agent.state_dict(), path)

# Train with Experience Replay

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of ``(state, action, reward, next_state, done)`` tuples."""

    def __init__(self):
        # The oldest transitions are dropped once MEM_SIZE entries are held.
        self.memory = deque(maxlen=MEM_SIZE)

    def add(self, experience):
        """Append one transition tuple."""
        self.memory.append(experience)

    def sample(self):
        """Draw BATCH_SIZE transitions without replacement.

        Returns:
            ``(states, actions, rewards, next_states, dones)``; the two state
            entries are stacked tensors, the other three are built with
            ``torch.tensor``.
        """
        picked = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*picked)

        return (
            torch.stack(states),
            torch.tensor(actions),
            torch.tensor(rewards),
            torch.stack(next_states),
            torch.tensor(dones),
        )

In [None]:
class AnC:
    """Actor-critic agent trained from a replay buffer with one-step TD targets.

    Args:
        advantage: If True the policy gradient is weighted by the TD error
            ``target - V(s)``; otherwise by the TD target itself.
    """

    def __init__(self, advantage=ADVANTAGE):
        self.replay = ReplayBuffer()
        self.actor_critic = ActorCritic().to(DEVICE)
        self.advantage = advantage

    def choose_action(self, state):
        """Sample an action for ``state`` from the current policy; returns a Python int."""
        # No gradient is needed for acting; learn() recomputes everything.
        with torch.no_grad():
            _, probs = self.actor_critic(state)

        # Categorical distribution p(a|s) over the action probabilities
        return Categorical(probs).sample().item()

    def learn(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Returns immediately while the buffer holds fewer than BATCH_SIZE
        transitions. The whole minibatch is evaluated in two batched forward
        passes (current and next states).
        """
        if len(self.replay.memory) < BATCH_SIZE:
            return

        states, actions, rewards, next_states, dones = self.replay.sample()
        actions = actions.to(DEVICE)
        rewards = rewards.to(DEVICE, dtype=torch.float32)
        # 0 for terminal transitions so no future value is bootstrapped from them
        not_done = 1.0 - dones.to(DEVICE, dtype=torch.float32)

        # Values and log-probabilities of the stored actions, one per sample
        values, probs = self.actor_critic(states)
        values = values.squeeze(-1)
        log_probs = Categorical(probs).log_prob(actions)

        # Bootstrapped value of the next states; treated as a constant
        with torch.no_grad():
            next_values, _ = self.actor_critic(next_states)
            next_values = next_values.squeeze(-1)

        # One-step TD target
        targets = rewards + GAMMA * next_values * not_done

        if self.advantage:
            # The critic value acts as a baseline only, hence the detach.
            weights = targets - values.detach()
        else:
            # Weight by the TD target: weighting by V(s), which does not depend
            # on the action, yields a zero expected policy gradient.
            weights = targets

        policy_loss = (-log_probs * weights).mean()
        value_loss = F.smooth_l1_loss(values, targets)

        self.actor_critic.optimizer.zero_grad()
        loss = policy_loss + value_loss
        loss.backward()
        self.actor_critic.optimizer.step()


In [None]:
# Build the replay-based agent; its network lives in agent.actor_critic.
agent = AnC()

# Model parameters: name and tensor size
print("Model's state_dict:")
for name, tensor in agent.actor_critic.state_dict().items():
    print(name, "\t", tensor.size())

print()

# Optimizer state: top-level keys and their contents
print("Optimizer's state_dict:")
for key, entry in agent.actor_critic.optimizer.state_dict().items():
    print(key, "\t", entry)

In [None]:
reward_list = []            # total reward of every finished episode
run_steps = []              # index of the last step of every episode

best_reward = 0
average_reward = 0          # running SUM of episode rewards; divided by i where reported
episode_number = []
average_reward_number = []

start_time = time.time()

# Train for exactly EPISODES episodes (i runs 1..EPISODES inclusive, so the
# running averages below never divide by zero).
for i in tqdm(range(1, EPISODES + 1)):
    state = process_state(env.reset())

    score = 0

    for step in range(MAX_STEP):
        # Sample an action and advance the environment
        action = agent.choose_action(state)
        new_state, reward, done, info = env.step(action)
        new_state = process_state(new_state)

        # Store the transition, then update the network from a random minibatch
        agent.replay.add((state, action, reward, new_state, done))
        agent.learn()

        state = new_state
        score += reward

        if step%100==0 and step!=0:
            print("   Current step {}/{}".format(step, MAX_STEP))

        if done or step == MAX_STEP-1:
            reward_list.append(score)
            run_steps.append(step)

            if score > best_reward:
                best_reward = score

            average_reward += score

            if i%10==0:
                print("Episode {} Average Reward {} Best Reward {} Last Reward {}".format(i, average_reward/i, best_reward, score))

            break

    episode_number.append(i)
    average_reward_number.append(average_reward/i)

# Stop the clock before plotting so the runtime covers training only.
end_time = time.time()

plt.plot(episode_number, average_reward_number)
plt.title('Running average reward')
plt.xlabel('Episodes')
plt.ylabel('Average reward')
plt.show()

In [None]:
runtime = end_time - start_time
print("Running time:", runtime)

# Average over the episodes actually recorded, not the configured EPISODES,
# so the figure stays correct whatever number of episodes the loop ran.
average_reward = sum(reward_list)/len(reward_list)
max_reward = max(reward_list)
print('Average reward:', average_reward)
print('Max reward', max_reward)

plt.plot(reward_list)
plt.title('Reward for episodes')
plt.ylabel('Reward')
plt.xlabel('Episodes')
# plt.savefig('/content/drive/MyDrive/INF581/rewardDDQN.jpg')
plt.show()

plt.plot(run_steps, color='green')
plt.title('Run steps for episodes')
plt.ylabel('Steps')
plt.xlabel('Episodes')
# plt.savefig('/content/drive/MyDrive/INF581/runtimeDDQN.jpg')
plt.show()


In [None]:
# File name is "A2C_parameter<avg>.pkl" with ADVANTAGE set, "AC_parameter<avg>.pkl" otherwise.
path = (
    '/content/drive/MyDrive/INF581-RL-Shared/A'
    + ('2' if ADVANTAGE else '')
    + 'C_parameter'
    + str(average_reward)
    + '.pkl'
)
print(path)

torch.save(agent.actor_critic.state_dict(), path)

# Test

In [43]:
def show_frame(env, step=0, info=""):
    """Redraw the env's current RGB frame in figure 3, replacing the previous cell output.

    Args:
        env: Environment providing ``render(mode='rgb_array')`` and ``spec.id``.
        step: Step number shown in the title.
        info: Extra value appended to the title via ``%s`` formatting.
    """
    plt.figure(3)
    plt.clf()

    frame = env.render(mode='rgb_array')
    plt.imshow(frame)

    caption = "%s | Step: %d %s" % (env.spec.id, step, info)
    plt.title(caption)
    plt.axis('off')

    # wait=True delays clearing until the new output is ready to be shown.
    display.clear_output(wait=True)
    display.display(plt.gcf())

def save_gif(frames, path):
    """Animate ``frames`` on the current figure and save the result to ``path``.

    Args:
        frames: Indexable, non-empty sequence of images accepted by ``plt.imshow``.
        path: Output file path passed to the pillow writer.
    """
    image = plt.imshow(frames[0])
    plt.axis("off")

    def _draw(index):
        # Swap the pixel data of the existing image artist for frame `index`.
        image.set_data(frames[index])

    figure = plt.gcf()
    movie = animation.FuncAnimation(figure, _draw, frames = len(frames), interval = 30)
    movie.save(path, writer="pillow", fps = 30)

In [None]:
# Load model parameters
path = "/content/drive/MyDrive/INF581-RL-Shared/model/A2C_parameter1250.2855.pkl"
agent = ActorCritic().to(DEVICE)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime also loads on a CPU-only one.
agent.load_state_dict(torch.load(path, map_location=DEVICE))
agent.eval()

In [None]:
total_reward = 0
frames = []

state = process_state(env.reset())

for step in range(MAX_STEP):

    # Sample an action; no gradients are needed during evaluation, so avoid
    # building an autograd graph on every step.
    with torch.no_grad():
        action = agent.choose_action(state)

    # Step forward
    new_state, reward, done, info = env.step(action)
    state = process_state(new_state)

    # Show the current frame and keep a copy for the GIF
    show_frame(env, step, info)
    frames.append(env.render(mode = "rgb_array"))

    total_reward += reward

    # Play until every life is gone rather than stopping at the first lost life.
    if done and info['lives']==0:
        break

# Report the score even when the loop ended by reaching MAX_STEP.
print(total_reward)

In [None]:
# Write the frames collected by the evaluation loop to an animated GIF on Drive.
path = "/content/drive/MyDrive/INF581-RL-Shared/gif/A2C_1250.3050.gif"
save_gif(frames, path)