Required Imports

In [1]:
import torch
from torch import nn
import gym
from torch.distributions import Categorical
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("You are using device: %s" % device)

REINFORCE neural network

In [3]:
def _weights_init(m):
    """Xavier-uniform initialise the weight matrix of ``nn.Linear`` modules.

    Meant to be passed to ``nn.Module.apply``. Modules of any other type are
    left untouched, and biases keep PyTorch's default initialisation.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)


class Reinforce(torch.nn.Module):
    """Two-layer policy network that maps an observation to action probabilities.

    Besides the network itself, the instance keeps two per-episode buffers:
    ``log_probs`` (filled by :meth:`act`) and ``rewards`` (filled by the caller).
    """

    def __init__(self, H, n_observations=8, n_actions=4):
        """
        Args:
            H: number of hidden units.
            n_observations: size of the observation vector. Defaults to 8,
                the value this network was originally hard-coded with.
            n_actions: number of discrete actions. Defaults to 4, the value
                this network was originally hard-coded with.
        """
        super().__init__()
        self.reinforce = nn.Sequential(
            nn.Linear(n_observations, H),
            nn.ReLU(),
            nn.Linear(H, n_actions),
            # Normalise over the action axis. For the (batch, actions) input
            # produced by act() this is the same as dim=1, but it also works
            # for a single unbatched observation.
            nn.Softmax(dim=-1),
        )
        self.apply(_weights_init)
        self.log_probs = []  # log pi(a_t | s_t) for every action taken this episode
        self.rewards = []    # rewards appended by the training loop

    def forward(self, x):
        """Return the action probabilities for the observation(s) ``x``."""
        return self.reinforce(x)

    def act(self, state):
        """Sample an action for ``state`` and remember its log-probability.

        Args:
            state: a NumPy array holding a single observation.

        Returns:
            The sampled action as a Python ``int``.
        """
        # Run on whatever device the weights live on rather than relying on a
        # notebook-level global.
        own_device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(own_device)
        probs = self.forward(state)
        m = Categorical(probs=probs)
        selected_action = m.sample()
        # Kept with its graph so the policy-gradient loss can backpropagate.
        self.log_probs.append(m.log_prob(selected_action))
        return selected_action.item()

In [7]:
def _plot_scores(scores, window=25):
    """Plot the raw per-episode scores together with their rolling mean."""
    moving_averages = pd.Series(scores).rolling(window).mean().tolist()
    plt.plot(scores)
    plt.plot(moving_averages)
    plt.legend(["Raw Score Per Episode ", "Moving Average Score"], loc='lower right')
    plt.xlabel('Episode (REINFORCE)')
    plt.ylabel('Episode Score')
    plt.show()


def reinforce(num_episodes, gamma):
    """Train the global ``agent`` on the global ``env`` with REINFORCE.

    Each episode is played to completion while rewards are appended to
    ``agent.rewards``; ``finish_episode(gamma)`` is then called to perform the
    policy update. After the last episode the scores are plotted.

    Args:
        num_episodes: number of episodes to play.
        gamma: discount factor forwarded to ``finish_episode``.
    """
    episode_scores = []
    try:
        for episode in range(num_episodes):
            # Newer Gym releases return (observation, info) from reset();
            # older ones return the observation alone.
            reset_out = env.reset()
            state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
            done = False
            score = 0
            while not done:
                action = agent.act(state)
                step_out = env.step(action)
                if len(step_out) == 5:
                    # (obs, reward, terminated, truncated, info)
                    state, reward, terminated, truncated, _ = step_out
                    done = terminated or truncated
                else:
                    # (obs, reward, done, info)
                    state, reward, done, _ = step_out
                agent.rewards.append(reward)
                score += reward

            if episode % 100 == 0:
                print(f"Currently on episode {episode}")
            episode_scores.append(score)
            finish_episode(gamma)
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

    if episode_scores:
        _plot_scores(episode_scores)

def finish_episode(gamma):
    """Run one REINFORCE policy-gradient update from the finished episode.

    Reads the rewards and action log-probabilities buffered on the global
    ``agent``, weights each log-probability by the (normalised) discounted
    return that followed that action, steps the global ``optimizer`` once and
    finally clears both buffers.

    Args:
        gamma: discount factor applied to future rewards.
    """
    eps = np.finfo(np.float32).eps.item()

    # Discounted return-to-go, accumulated backwards through the episode:
    #   G_t = r_t + gamma * G_{t+1}
    # Each action must be credited with everything that happened after it,
    # not just with its own (discounted) immediate reward.
    returns = []
    running_return = 0.0
    for reward in reversed(agent.rewards):
        running_return = reward + gamma * running_return
        returns.append(running_return)
    returns.reverse()

    # Nothing to learn from an episode without any recorded step.
    if returns and agent.log_probs:
        returns = torch.tensor(returns, dtype=torch.float32,
                               device=agent.log_probs[0].device)
        # Normalising reduces gradient variance. It is skipped for a one-step
        # episode, where the sample standard deviation is NaN.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + eps)

        policy_loss = [-log_prob * ret
                       for log_prob, ret in zip(agent.log_probs, returns)]
        optimizer.zero_grad()
        torch.cat(policy_loss).sum().backward()
        optimizer.step()

    del agent.rewards[:]
    del agent.log_probs[:]

Running the agent

In [None]:
!!pip3 install box2d-py

In [None]:
# Create the environment the training loop will interact with.
env = gym.make('LunarLander-v2')

# Policy network with 256 hidden units; Module.to() moves the parameters
# in place, so the instance can be built first and moved afterwards.
agent = Reinforce(H=256)
agent.to(device)
optimizer = torch.optim.Adam(params=agent.parameters(), lr=0.01)

# Train for 300 episodes with a discount factor of 0.99.
reinforce(num_episodes=300, gamma=0.99)