# REINFORCE Algorithm

## Installation

In [None]:
# Install the third-party packages for this notebook via IPython shell escapes.
# NOTE(review): this cell installs `gymnasium`, but the import cell loads
# `gym` -- confirm which package the runtime is expected to provide.
!pip install gymnasium
!pip install box2d
!pip install mediapy



In [None]:
import torch
import numpy as np
import torch.optim as optim
import math
import gym
import mediapy as media
import torch.nn as nn
import torch.nn.functional as F

## Config Params

In [None]:
# ---- Compute device ----
# Index of the CUDA device to use; the CPU is the fallback when CUDA is absent.
GPU = 0
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:{}".format(GPU))
else:
    DEVICE = torch.device("cpu")

# ---- Environment ----
RAM_ENV_NAME = 'LunarLander-v2'  # id handed to gym.make
STATE_SIZE = 8    # NOTE(review): not read by the visible code; sizes come from the env spaces
ACTION_SIZE = 4   # NOTE(review): not read by the visible code; sizes come from the env spaces

# ---- Agent hyper-parameters ----
LEARNING_RATE = 0.005  # step size for both Adam optimizers
GAMMA = 0.99           # discount factor
CRITIC = False         # subtract the critic's value estimate (advantage) when True
NORMALIZE = True       # standardize the return/advantage signal when True
MODE = 'MC'            # 'MC' -> Monte-Carlo returns, anything else -> one-step bootstrapped target

# ---- Training-loop settings ----
RAM_NUM_EPISODE = 2000  # number of training episodes
SCALE = 0.01            # multiplier applied to each reward before learning
MAX_T = 2000            # cap on steps per episode
BATCH_SIZE = 128        # NOTE(review): not referenced by the visible code



import random
import numpy as np
import torch

def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available the CUDA generators are seeded as well.

    Args:
        seed: Integer seed forwarded unchanged to every generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


# Fix the seed once for the whole notebook run.
set_seed(42)

## Create the gym env & env visualization

In [None]:
# Build the environment named by RAM_ENV_NAME for the random-policy preview below.
env = gym.make(RAM_ENV_NAME)

In [None]:
# Play one episode with uniformly sampled actions, recording every rendered
# frame, then show the episode as a video and print its total reward.
done = False
state = env.reset()
images = []
total_reward = 0

while not done:
    # Random action drawn from the environment's action space (no policy yet).
    action = env.action_space.sample()
    # step() is unpacked into four values: next state, reward, done flag, info.
    state, reward, done, info = env.step(action)
    img = env.render(mode='rgb_array')
    images.append(img)
    total_reward += reward

env.close()

media.show_video(images, fps=30)
print('total_reward:', total_reward)

0
This browser does not support the video tag.


total_reward: -393.9566513150628


## Create the Actor & Critic Networks

In [None]:
class Actor(nn.Module):
    """Policy network: one hidden ReLU layer followed by a log-softmax head.

    Args:
        state_size: Number of input features per state.
        action_size: Number of discrete actions (width of the output layer).
        hidden: Sequence of hidden-layer widths. Only ``hidden[0]`` is used,
            since the network has a single hidden layer. The default is an
            immutable tuple so it cannot be shared and mutated across calls.
    """

    def __init__(self, state_size, action_size, hidden=(128,)):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden[0])
        self.fc2 = nn.Linear(hidden[0], action_size)

    def forward(self, state):
        """Return log-probabilities over the actions.

        The softmax is taken over the last dimension, so a single state of
        shape ``(state_size,)`` and a batch of shape ``(N, state_size)`` are
        both accepted.
        """
        x = F.relu(self.fc1(state))
        # dim=-1 is always the action axis, with or without a leading batch dim.
        log_probs = F.log_softmax(self.fc2(x), dim=-1)
        return log_probs


class Critic(nn.Module):
    """State-value network: an MLP with ReLU hidden layers and a scalar output.

    Args:
        state_size: Number of input features per state.
        hidden: Sequence (list or tuple) of hidden-layer widths. One linear
            layer is created per consecutive pair in
            ``[state_size, *hidden, 1]``. The default is an immutable tuple
            so it cannot be shared and mutated across calls.
    """

    def __init__(self, state_size, hidden=(128,)):
        super().__init__()
        # list(...) lets callers pass a tuple as well as a list.
        widths = [state_size] + list(hidden) + [1]
        self.forward_list = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])]
        )

    def forward(self, state):
        """Return the value estimate; the last dimension of the output is 1."""
        x = state
        # Every layer except the last is followed by a ReLU; the final layer
        # is left linear so the value estimate is unbounded.
        for layer in self.forward_list[:-1]:
            x = F.relu(layer(x))
        values = self.forward_list[-1](x)
        return values

## Create the whole training agent

In [None]:
class Agent:
    """REINFORCE agent with an optional critic baseline.

    Holds an ``Actor`` (policy) and a ``Critic`` (state-value) network, each
    with its own Adam optimizer, and updates both from one episode at a time.

    Args:
        state_size: Number of features per state.
        action_size: Number of discrete actions.
        lr: Learning rate used for both optimizers.
        gamma: Discount factor.
        device: ``torch.device`` on which the networks and tensors live.
        mode: ``'MC'`` uses Monte-Carlo returns as the target; any other value
            uses the one-step bootstrapped target ``r + gamma * V(s')``.
        use_critic: When true, the critic's value is subtracted from the
            target, so the policy is weighted by an advantage.
        normalize: When true, the policy weights are standardized
            (zero mean, unit standard deviation) within the episode.
    """

    def __init__(self, state_size, action_size, lr, gamma, device, mode='MC', use_critic=False, normalize=False):
        self.state_size = state_size
        self.action_size = action_size
        self.lr = lr
        self.gamma = gamma
        self.device = device
        self.mode = mode
        self.use_critic = use_critic
        self.normalize = normalize

        self.Actor = Actor(state_size, action_size).to(self.device)
        self.Critic = Critic(state_size).to(self.device)
        self.actor_optimizer = optim.Adam(self.Actor.parameters(), lr)
        self.critic_optimizer = optim.Adam(self.Critic.parameters(), lr)

    def act(self, states):
        """Sample one action for a single state from the current policy.

        Args:
            states: One state holding ``state_size`` values (array or list).

        Returns:
            The sampled action index.
        """
        with torch.no_grad():
            # Force float32: a float64 observation would otherwise clash with
            # the float32 weights of the linear layers.
            states = torch.as_tensor(states, dtype=torch.float, device=self.device).view(-1, self.state_size)
            log_probs = self.Actor(states)
            probs = log_probs.exp().view(-1).cpu().numpy()
            action = np.random.choice(a=self.action_size, size=1, replace=False, p=probs)[0]
        return action

    def process_data(self, states, actions, rewards, dones):
        """Turn one episode's history into the tensors consumed by ``learn``.

        Args:
            states: ``L + 1`` states (every visited state plus the final one).
            actions: ``L`` action indices.
            rewards: ``L`` rewards.
            dones: ``L`` done flags.

        Returns:
            Tuple ``(state_values, log_probs, rewards_np, discounted_rewards,
            dones)``: critic values for all ``L + 1`` states, log-probability
            of each taken action with shape ``(L, 1)``, the rewards as a NumPy
            array, the rewards multiplied by ``gamma ** t``, and the done
            flags as a float tensor of shape ``(L, 1)``.
        """
        states = torch.as_tensor(states, dtype=torch.float, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.long, device=self.device).view(-1, 1)
        dones = torch.as_tensor(dones, dtype=torch.float, device=self.device).view(-1, 1)

        log_probs_all = self.Actor(states)     # one row per state
        state_values = self.Critic(states)     # one value per state

        # The last state has no action attached, so drop its row before
        # picking out the log-probability of each action actually taken.
        log_probs = torch.gather(log_probs_all[:-1, :], dim=1, index=actions)

        rewards_np = np.array(rewards)
        discounts = self.gamma ** np.arange(len(rewards))
        discounted_rewards = rewards_np * discounts

        return state_values, log_probs, rewards_np, discounted_rewards, dones

    def learn(self, state_values, log_probs, rewards, discounted_rewards, dones):
        """Run one critic update (MSE) and one actor update (policy gradient).

        The critic minimizes ``0.5 * (V(s_t) - target_t)^2``; the actor
        minimizes ``-log pi(a_t | s_t) * weight_t`` where the weight is the
        target, optionally turned into an advantage and/or standardized.

        Args:
            state_values: Critic output for the ``L + 1`` states.
            log_probs: Log-probabilities of the taken actions, shape ``(L, 1)``.
            rewards: The ``L`` per-step rewards.
            discounted_rewards: Accepted for interface compatibility; unused.
            dones: Done flags as a float tensor of shape ``(L, 1)``.
        """
        num_steps = len(rewards)
        if num_steps == 0:
            # Nothing to learn from; the mean of an empty loss would be NaN.
            return

        with torch.no_grad():
            if self.mode == 'MC':
                # Accumulate returns backwards in plain Python floats. This
                # avoids calling .numpy() on a tensor that may live on the
                # GPU and avoids building a tensor from a list of ndarrays.
                reward_list = np.asarray(rewards, dtype=np.float64).reshape(-1).tolist()
                done_list = torch.as_tensor(dones).reshape(-1).tolist()
                returns = [0.0] * num_steps
                running = 0.0
                for i in range(num_steps - 1, -1, -1):
                    running = reward_list[i] + self.gamma * (1.0 - done_list[i]) * running
                    returns[i] = running
                targets = torch.as_tensor(returns, dtype=torch.float, device=self.device).view(-1, 1)
            else:
                reward_t = torch.as_tensor(rewards, dtype=torch.float, device=self.device).view(-1, 1)
                targets = reward_t + self.gamma * (1 - dones) * state_values[1:, :]

        critic_loss = 0.5 * (state_values[:-1, :] - targets).pow(2).mean()

        with torch.no_grad():
            weights = targets
            if self.use_critic:
                weights = weights - state_values[:-1, :]  # advantage
            # std() of a single element is NaN, which would corrupt every
            # actor weight, so one-step episodes are left unnormalized.
            if self.normalize and weights.numel() > 1:
                weights = (weights - weights.mean()) / (weights.std() + 0.00001)

        actor_loss = (-log_probs * weights).mean()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

## Training pipeline

In [None]:
def train(agent, env, n_episode, max_t, scale=1):
    """Train ``agent`` on ``env`` for ``n_episode`` episodes.

    Each episode is rolled out with ``agent.act`` for at most ``max_t`` steps,
    then handed to ``agent.process_data`` and ``agent.learn`` as one batch.
    Progress is printed on a single rewritten line, a newline is emitted every
    100 episodes, and the module-level ``eval`` helper is called every 500
    episodes.

    Args:
        agent: Object providing ``act``, ``process_data`` and ``learn``.
        env: Environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        n_episode: Number of episodes to run.
        max_t: Maximum number of steps per episode.
        scale: Factor applied to each reward before it is stored for learning;
            the logged episode reward stays unscaled.

    Returns:
        Tuple ``(rewards_log, average_log)``: the unscaled total reward of
        every episode, and the running mean over the last 100 episodes.
    """
    rewards_log = []
    average_log = []

    for i in range(1, n_episode+1):
        state = env.reset()
        done = False
        t = 0
        # One more state than actions: the final next_state is kept as well.
        state_history = [list(state)]
        action_history = []
        done_history = []
        reward_history = []
        episodic_reward = 0

        while not done and t < max_t:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            episodic_reward += reward
            action_history.append(action)
            done_history.append(done)
            reward_history.append(reward * scale)
            state = next_state
            state_history.append(list(state))
            t += 1

        state_values, log_probs, rewards, discounted_rewards, dones = agent.process_data(state_history, action_history, reward_history, done_history)
        agent.learn(state_values, log_probs, rewards, discounted_rewards, dones)

        rewards_log.append(episodic_reward)
        average_log.append(np.mean(rewards_log[-100:]))

        print('\rEpisode {} Reward {:.2f}, Average Reward {:.2f}'.format(i, episodic_reward, average_log[-1]), end='')
        if i % 100 == 0:
            print()
        if i % 500 == 0:
            eval(agent, env, max_t)

    # Return the per-episode log, not `rewards`, which only holds the scaled
    # per-step rewards of the last episode (and is unbound when n_episode == 0).
    return rewards_log, average_log



def eval(agent, env, max_t):
    """Roll out one rendered episode and display it as a video.

    Actions are sampled with ``agent.act`` for at most ``max_t`` steps or
    until the environment reports ``done``. Every step is rendered as an RGB
    frame; afterwards the environment is closed, the frames are shown at
    30 fps, and the episode's total reward is printed.

    Note that this function shadows the builtin ``eval`` inside this module.

    Args:
        agent: Object providing ``act(state)``.
        env: Environment with ``reset``, ``step``, ``render`` and ``close``.
        max_t: Maximum number of steps to play.
    """
    frames = []
    episode_return = 0
    steps_taken = 0
    finished = False
    observation = env.reset()

    while steps_taken < max_t and not finished:
        chosen = agent.act(observation)
        observation, reward, finished, _ = env.step(chosen)
        frames.append(env.render(mode='rgb_array'))
        episode_return += reward
        steps_taken += 1

    env.close()
    media.show_video(frames, fps=30)
    print('total_reward:', episode_return)


# Entry point: build the environment and the agent, train, and save the log.
if __name__ == '__main__':
    env = gym.make(RAM_ENV_NAME)
    # State and action sizes are read from the environment's spaces rather
    # than from the STATE_SIZE / ACTION_SIZE constants.
    agent = Agent(env.observation_space.shape[0], env.action_space.n, LEARNING_RATE, GAMMA, DEVICE, MODE, CRITIC, NORMALIZE)
    # Only the first value returned by train() is kept; the second is discarded.
    rewards_log, _ = train(agent, env, RAM_NUM_EPISODE, MAX_T, SCALE)
    # Saved as "<env name>_rewards.npy" in the current working directory.
    np.save('{}_rewards.npy'.format(RAM_ENV_NAME), rewards_log)

  if not isinstance(terminated, (bool, np.bool8)):
  G = torch.as_tensor(G, dtype=torch.float).view(-1, 1).to(self.device)


Episode 100 Reward -52.21, Average Reward -124.99
Episode 200 Reward -56.80, Average Reward -99.06
Episode 300 Reward 35.72, Average Reward -11.72
Episode 400 Reward 32.86, Average Reward 1.61
Episode 500 Reward -98.54, Average Reward -11.05


See here for more information: https://www.gymlibrary.ml/content/api/
  deprecation(


0
This browser does not support the video tag.


total_reward: 25.165020157503925
Episode 600 Reward 59.97, Average Reward -4.56
Episode 700 Reward -8.14, Average Reward -17.29
Episode 800 Reward -84.85, Average Reward 7.94
Episode 900 Reward -15.03, Average Reward 4.37
Episode 1000 Reward -47.65, Average Reward 8.85


0
This browser does not support the video tag.


total_reward: -4.081993387304523
Episode 1100 Reward 53.20, Average Reward 12.72
Episode 1200 Reward 99.23, Average Reward 1.11
Episode 1265 Reward 23.08, Average Reward 54.26