# Assignment 2

Instructions: Implement both PG and an evolutionary algorithm to solve the Open AI Gym Lunar Lander problem, and then apply it to my area of choice, which is chess.

First, we need to do some setup

In [1]:
import torch
import numpy as np
import gym

# Set the device
if torch.cuda.is_available():
    device = "gpu" # 🧮
# elif torch.backends.mps.is_available():
#     device = "mps" # 🧠
else:
    device = "cpu" # 🥺
    
print(f"Using device: {device}")

Using device: cpu


First, we need to write the code for our Policy Gradient function with a baseline (taken from REINFORCE). I'm going to use PyTorch as my neural network library (I want to try JAX, but this is the more practical choice for me at the moment. Exploration-Exploitation tradeoff 🤷‍♂️).

I'm going to start with a basic feed forward net for both the network that chooses the policy and the network that learns states' values.

First, the policy network for choosing actions

In [2]:
from torch import nn

class PolicyChoice(nn.Module):
    def __init__(self):
        super(PolicyChoice, self).__init__()
        self.flatten = nn.Flatten()
        self.policy = nn.Sequential(
            nn.Linear(8, 8),
            nn.ReLU(),
            nn.Linear(8, 4), # TODO: try reducing to one hidden layer if learning proves initially dificult
            nn.ReLU(),
            nn.Linear(4, 4),
            nn.Softmax(dim=0) # log softmax for a nice interpretation as probabilities of choosing actions
        )

    def forward(self, x):
        probs = self.policy(x)
        return probs

policy_model = PolicyChoice().to(device)
policy_adam = torch.optim.Adam(policy_model.parameters(), 1e-3)

For our loss function for the policy network, we want to adjust just the parameters with the primary aim of affecting the probability of taking the action that we took on that time step. If the return of the resulting state is better than expected, we want to increase it proportionally. If it is less than expected, we want to decrease it proportionally. Thus, we multiply the gradient of the parameter weights w.r.t. the taken action's probability by the difference of the return for that state-action pair.

Importantly, there is an extra factor however that we must consider; when we decide that we want to take the gradient of the parameters w.r.t. a specific action's return, the policy expectancy must be multiplied by the specific action's likelihood to determine the value it contributes to the policy. Thus, we end up with the gradient of the action's probability conditioned on the state and parameters. 

Thus, the general concept of loss to backpropogate in the REINFORCE algorithm is:


$\Large (G_t - \hat{\upsilon}) \frac{\nabla\pi(A_t|S_t, \theta)}{\pi(A_t|S_t, \theta)}$

This can be expressed as:

$\Large (G_t - \hat{\upsilon}) \nabla \ln{\pi(A_t|S_t, \theta)}$


The code below just worries about the loss and not the gradient, as PyTorch provides autograd differentiation behind the scenes.

In [3]:
# TODO: pretty sure I need to change this have all 4 softmax outputs
def policy_loss(prob, state_util_difference):
    nll_loss = nn.NLLLoss()
    return nll_loss(prob, torch.ones(1)) * state_util_difference

Now, the network for approximating state utililities.

In [4]:
class StateUtility(nn.Module):
    def __init__(self):
        super(StateUtility, self).__init__()
        self.flatten = nn.Flatten()
        self.state_utility = nn.Sequential(
            nn.Linear(8, 8),
            nn.ReLU(),
            nn.Linear(8, 4), # TODO: try reducing to one hidden layer if learning proves initially dificult
            nn.ReLU(),
            nn.Linear(4, 1), # output a tensor of a scalar value
        )

    def forward(self, x):
        state_utility = self.state_utility(x)
        return state_utility

state_util_model = StateUtility().to(device)
state_util_adam = torch.optim.Adam(state_util_model.parameters(), 1e-3)

For the state utilities network, we just use L1 loss with the gradients of W with respect to state utility.

$\Large (G_t - \hat{\upsilon}(S_t, W)) \nabla \hat{\upsilon}(S_t, W)$

Like above, the code below just worries about the loss and not the gradient, as PyTorch provides autograd differntiation.

In [5]:
def state_util_loss(calculated_state_value, episode_state_value):
    # the overall state value is the input, and the individual state value is our target
    l1_loss = nn.L1Loss()
    return l1_loss(calculated_state_value, episode_state_value)


Let's define our hyperparameters

In [6]:
gamma = .99

Let's load the Lunar Lander environment now

In [7]:
# TODO: use a custom dataloader class and see if speed up

env = gym.make(
    "LunarLander-v2",
    #render_mode="human"
)

action_space_seed = np.random.seed(13)

observation, info = env.reset(seed=13)

episodes_total_rewards = []
# for debug of state-value funtion
episode_total_state_err = []

# index i in the lists below corresponds to the timestep i of the current episode
observations = []
rewards = []
# for debug of state-value funtion
state_err = []



for timestep in range(100000):
    
    # use policy gradient to get action probabilities; sample stochastically
    action_weights = np.array(policy_model(torch.tensor(observation, device=device)).tolist())
    action_array = np.random.multinomial(n=1, pvals=action_weights)
    action = np.argmax(action_array)
    
    observation, reward, terminated, truncated, info = env.step(action)
    observations.append(observation)
    rewards.append(reward)
    
    # end of episode
    if terminated or truncated:
        ep_length = len(observations)
        ep_total_reward = np.sum(np.array(rewards))
        episodes_total_rewards.append(ep_total_reward)
        returns = np.zeros(len(observations))
        for timestep in reversed(range(ep_length)):

            # calculate state's actual return by looking at reward + future rewards
            terminal = timestep == len(rewards) - 1
            returns[timestep] = rewards[timestep] + (gamma * returns[timestep+1]) if not terminal else rewards[timestep]
            
            pred_state_util = state_util_model(torch.tensor(observations[timestep], device=device))
            actual_state_util = torch.tensor([returns[timestep]], device=device, dtype=torch.float32)
            loss_state_utility = state_util_loss(pred_state_util, actual_state_util)
            with torch.no_grad():
                state_pred_err = np.abs(loss_state_utility.item())
                state_err.append(state_pred_err)
            
            state_util_adam.zero_grad()
            # print(returns[timestep])
            # with torch.no_grad():
                # _before_update = state_util_model(torch.tensor(observations[timestep], device=device, dtype=torch.float32))
                # print(_before_update)
            loss_state_utility.backward()
            state_util_adam.step()
            # with torch.no_grad():
                # _after_update = state_util_model(torch.tensor(observations[timestep], device=device, dtype=torch.float32))
                # print(_after_update)

        episode_total_state_err.append(np.sum(np.array(state_err)))

        observation, info = env.reset()
        observations, rewards = [], []
        state_err = []

print(episode_total_state_err)
print(f'The state val prediction error on the first quarter of episodes was: {np.sum(episode_total_state_err[:len(episode_total_state_err)//4])}')
print(f'The state val prediction error on the second quarter of episodes was: {np.sum(episode_total_state_err[len(episode_total_state_err)//4:2 * len(episode_total_state_err)//4])}')
print(f'The state val prediction error on the third quarter of episodes was: {np.sum(episode_total_state_err[2 * len(episode_total_state_err)//4:3 *len(episode_total_state_err)//4])}')
print(f'The state val prediction error on the fourth quarter of episodes was: {np.sum(episode_total_state_err[3 *len(episode_total_state_err)//4:len(episode_total_state_err)])}')
env.close()

[6941.991863250732, 16199.033813476562, 14587.366539001465, 17708.747940063477, 1813.0881671905518, 11828.75845336914, 3936.683624267578, 5558.287757873535, 8201.771430969238, 7016.337368011475, 1721.964900970459, 1753.5125560760498, 2301.5042877197266, 613.7211799621582, 533.0879364013672, 6485.171836853027, 650.1154403686523, 10026.791114807129, 1170.4592094421387, 2693.27001953125, 6969.311126708984, 2133.174362182617, 17417.772270202637, 2391.0910873413086, 3139.4989891052246, 8991.500480651855, 10585.202880859375, 11633.80337524414, 5597.613437652588, 6394.007350921631, 2481.911632537842, 2001.418025970459, 1314.5760803222656, 732.0543823242188, 8012.825832366943, 5088.1151695251465, 1573.9752311706543, 1295.7779731750488, 1385.4859352111816, 971.548168182373, 1410.4799728393555, 550.0129508972168, 24082.834175109863, 2004.9600067138672, 8964.507669448853, 5144.946460723877, 4546.897125244141, 2693.2973861694336, 5437.583400726318, 743.7057952880859, 2118.2358016967773, 2147.36110