In [None]:
# Libraries and dependencies
import gymnasium as gym
import numpy as np
from tqdm import tqdm
from gymnasium.wrappers import RecordVideo
import torch
import torch.nn as nn
import torch.distributions as distributions
import torch.optim as optim
import matplotlib.pyplot as plt

In [None]:
# Settings shared by the training and the video-recording environment.
# Wind is disabled for BOTH environments (enable_wind=False); wind_power and
# turbulence_power are still passed through unchanged.
# NOTE(review): if the recorded/evaluation runs are meant to include wind,
# video_env needs enable_wind=True — confirm the intended setup.
lander_kwargs = dict(
    continuous=False,         # discrete action space
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.0,
    render_mode="rgb_array",  # render frames as arrays so they can be recorded
)

# Training environment
env = gym.make("LunarLander-v3", **lander_kwargs)

# Separate environment instance wrapped for video recording.
# A video is written for every 1000th episode (episode ids 0, 1000, 2000, ...).
video_env = RecordVideo(
    gym.make("LunarLander-v3", **lander_kwargs),
    video_folder="a2c_1e6_256_2048_3e4",
    episode_trigger=lambda episode_id: episode_id % 1000 == 0,
)

In [None]:
# Network input/output sizes, read from the training environment.
space_dim, action_dim = (
    env.observation_space.shape[0],  # length of the observation vector (8 for LunarLander)
    env.action_space.n,              # number of discrete actions (4 for LunarLander)
)

# Training parameters (following paper)
gamma = 0.99                # Discount factor
lr = 3e-4                   # Learning rate
lamb = 0.95                 # Generalised Advantage Estimation (GAE) lambda
epsilon = 0.2               # Clipping value
h = 0.01                    # Entropy coefficient
v = 0.5                     # Value loss coefficient
max_timesteps = 1e6         # Total budget of environment timesteps for training
eval_episodes = 100         # Episodes for evaluation
N = 1                       # Number of agents collecting training data
T = 2048                    # Number of steps collected per rollout
K = 10                      # Number of epochs per update
minibatch_size = 256        # Size of a mini batch
# Number of mini batches per epoch. Floor division keeps this an int so it can
# be used with range() or as an index; any remainder of N * T that does not
# fill a whole mini batch is not counted.
number_minibatches = (N * T) // minibatch_size

# Histories collected for plotting
actor_losses = []
critic_losses = []
eval_rewards = []
index = []

In [None]:
def _make_mlp(in_features, out_features, hidden=128):
    """Build a two-hidden-layer MLP with LeakyReLU activations and a linear output."""
    return nn.Sequential(
        nn.Linear(in_features, hidden),
        nn.LeakyReLU(),
        nn.Linear(hidden, hidden),
        nn.LeakyReLU(),
        nn.Linear(hidden, out_features),
    )


# Policy network: maps a state to one raw (unnormalised) score per action
ActorNetwork = _make_mlp(space_dim, action_dim)

# Value network: maps a state to a single scalar value estimate
CriticNetwork = _make_mlp(space_dim, 1)

# Each network is trained by its own Adam optimizer with the same learning rate
actor_optimizer, critic_optimizer = (
    optim.Adam(network.parameters(), lr=lr)
    for network in (ActorNetwork, CriticNetwork)
)

In [None]:
# Start the first episode on the training environment.
# NOTE(review): gymnasium's Env.reset() returns an (observation, info) pair, so
# `state` presumably holds that tuple rather than the bare observation —
# confirm the rollout code unpacks it before feeding it to the networks.
state = env.reset()
# Counter that the training loop compares against max_timesteps.
timesteps = 0

while timesteps < max_timesteps:
    states, actions, rewards, log_probs, values, dones = [], [], [], [], [], []
    print(timesteps)

    for _ in range(T):
        