In [15]:
!pip install wandb --quiet
!pip install swig --quiet
!pip install gym[all] --quiet
!pip install pygame --quiet

In [16]:
import warnings
warnings.filterwarnings("ignore")

In [17]:
## fixing the pygame error

import os
os.environ['SDL_VIDEODRIVER']='dummy'
import pygame
pygame.display.set_mode((640,480))

<Surface(640x480x32 SW)>

In [18]:
import gym
from gym.envs import box2d
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [30]:
class Policy_NN(nn.Module):
    def __init__(self,observation_space,action_space,gamma,learning_rate):
        super().__init__()

        self.net = nn.Sequential(
            nn.Linear(in_features= observation_space, out_features = 128),
            nn.ReLU(),
            nn.Linear(in_features= 128, out_features = action_space))

        self.observation_space = observation_space
        self.action_space = action_space
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.parameters(),lr=self.learning_rate)

    def forward(self,x):
      return self.net(x) #forward propagation

    def act(self,obs):
        #With obs, find the maximum Q value and output the corresponding action
        obs_tensor = torch.as_tensor(obs, dtype=torch.float32)
        q_value = self(obs_tensor.unsqueeze(0)) #convert to row vector
        highest_q_value = torch.argmax(input=q_value)
        log_value = torch.log(q_value.squeeze(0)[highest_q_value])
        action = abs(int(log_value.detach().item())) #action corresponding to max Q index
        return action, log_value

In [20]:
class reinforce_update:
    def __init__(self, Policy_NN, rewards, log_value):
        self.network = Policy_NN
        self.rewards = rewards
        self.log_value = log_value

    def update_function(self):
        discount_rewards = []
        Gt,pw = 0, 0
        for r in reversed(rewards):
            Gt = Gt + gamma**pw * r
            pw = pw + 1
            discount_rewards.append(Gt)

        steps_value = np.arange(rewards.size)

        discount_rewards = torch.tensor(discount_rewards)
        discount_rewards = discount_rewards[::-1].cumsum()[::-1] / gamma**steps_value

        # def discount_rewards(rewards, gamma):
        # t_steps = np.arange(rewards.size)
        # r = rewards * gamma**t_steps
        # r = r[::-1].cumsum()[::-1] / gamma**t_steps
        # return r

        policy_gradient = []
        for log_prob, e_d_r in zip(self.log_value, discount_rewards):
            policy_gradient.append(-log_prob*e_d_r)

        policy_network.optimizer.zero_grad()
        policy_gradient = torch.stack(policy_gradient).sum()
        policy_gradient.backward()
        policy_network.optimizer.step()

In [21]:
gamma = 0.97
learning_rate = 0.1

In [32]:
env = gym.make("LunarLander-v2", render_mode="human")
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
policy_network = Policy_NN(observation_space,action_space,gamma,learning_rate)

episode_array, rewards_array = [] , []

n_episode = 1000
n_timesteps = 1000


Reward_list = np.empty(shape=n_episode)

s = env.reset()

for episode in range(n_episode):

    log_probs,rewards = [], []
    for step in range(n_timesteps):
        action,log_prob = policy_network.act(s)
        s_next, r, done, _ = env.step(action)
        log_probs.append(log_prob)
        rewards.append(r)
        if done:
            reinforce_update.update_function(policy_network, rewards,log_probs)
            rewards_array.append(np.sum(rewards))
            s = env.reset()
        else:
            s = s_next

TypeError: reinforce_update.update_function() takes 1 positional argument but 3 were given

In [None]:
x = episode_array
y = rewards_array
#plotting the comparison between the two
plt.title("Performance of REINFORCE")
plt.plot(x, y, label = "REINFORCE")
plt.xlabel("Episodes")
plt.ylabel("Rewards")
plt.legend()
plt.show()

# Adding WandB section

In [None]:
!wandb login

In [None]:
import wandb
import random

In [None]:
# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="my-awesome-project",

    # track hyperparameters and run metadata
    config={
    "learning_rate": 0.02,
    "architecture": "CNN",
    "dataset": "CIFAR-100",
    "epochs": 10,
    }
)

# simulate training
epochs = 10
offset = random.random() / 5
for epoch in range(2, epochs):
    acc = 1 - 2 ** -epoch - random.random() / epoch - offset
    loss = 2 ** -epoch + random.random() / epoch + offset

    # log metrics to wandb
    wandb.log({"acc": acc, "loss": loss})

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()