In [4]:
# original : https://github.com/DeepReinforcementLearning/DeepReinforcementLearningInAction/blob/master/Chapter%204/Ch4_book.ipynb

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import time
from tqdm import tqdm
import wandb
from torch.utils.tensorboard import SummaryWriter

In [5]:
args = {
    'env_id': 'CartPole-v1',
    'algorithm': 'REINFORCE',
    'algorithm_version': 'v1',
    'truncated' : 500,
    'seed': 42,
    'cuda': True,
    'learning_rate' : 0.0003,
    'total_timesteps' : 300000,
    'wandb_entity' : None,
    'gamma' : 0.99,
    'capture_video' : False,
}

project_path = args['env_id'].split('/')[-1]
device = torch.device("cuda" if torch.cuda.is_available() and args["cuda"] else "cpu")
run_name=f"{args['algorithm']}_{args['algorithm_version']}_{int(time.time())}"

print(f'project_path: {project_path}, device : {device}, run_name : {run_name}')

project_path: CartPole-v1, device : cuda, run_name : REINFORCE_v1_1674898628


In [6]:
class QNetwork(torch.nn):
    def __init__(self, env:gym.Env):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(env.observation_space.shape[0], 128),
            nn.LeakyReLU(),
            nn.Linear(128, env.action_space.n),
            nn.Softmax()
        )
    def forward(self, x):
        return self.network(x)

TypeError: module() takes at most 2 arguments (3 given)

In [None]:
env = gym.make(args["env_id"], render_mode=None)
if args['truncated']:
    env = gym.wrappers.TimeLimit(env, args['truncated'])
env = gym.wrappers.AutoResetWrapper(env)
env = gym.wrappers.RecordEpisodeStatistics(env)

In [None]:
wandb.init(
    # set the wandb project where this run will be logged
    name=run_name,
    project=project_path,
    entity=args['wandb_entity'],
    # sync_tensorboard=True,
    config=args,
    monitor_gym=True,
    save_code=True
)

writer = SummaryWriter(f'runs/{project_path}/{run_name}')
writer.add_text(
    "hyperparameters",
    "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in args.items()])),
)

[34m[1mwandb[0m: Currently logged in as: [33miamhelpingstar[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
def discount_rewards(rewards, gamma=0.99):
    lenr = len(rewards)
    #A Compute exponentially decaying rewards
    disc_return = torch.pow(gamma,torch.arange(lenr).float()) * rewards
    #B Normalize the rewards to be within the [0,1] interval to improve numerical stability
    disc_return /= disc_return.max()
    return disc_return

In [None]:
#A The loss function expects an array of action probabilities for the actions that were taken and the discounted rewards.
def loss_fn(preds, r):
    #B It computes the log of the probabilities, multiplies by the discounted rewards, sums them all and flips the sign.
    return -1 * torch.sum(r * torch.log(preds)) #B

$$-\sum \gamma r_{t+1} \log \pi (a_t \mid \theta, S_t)$$

In [None]:
q_network = QNetwork()
optimizer = optim.Adam(q_network.parameters(), lr=args['learning_rate'])
obs, _ = env.reset()
for global_step in tqdm(range(args['total_timesteps'])):
    obs, _ = env.reset()