In [7]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo
import os
import shutil
import torch
import time
from collections import deque
import random
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
import seaborn as sns
import torch.nn as nn

In [5]:
# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

# Get the number of available GPUs
gpu_count = torch.cuda.device_count()
print(f"Number of GPUs available: {gpu_count}")

# Get current device information
if cuda_available:
    current_device = torch.cuda.current_device()
    print(f"Current device: {current_device}")
    
    # Print information for each available GPU
    for i in range(gpu_count):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"Memory allocated: {torch.cuda.memory_allocated(i) / 1e9:.2f} GB")
        print(f"Memory reserved: {torch.cuda.memory_reserved(i) / 1e9:.2f} GB")
        print("-" * 40)
else:
    print("No GPU available. Using CPU.")

CUDA available: True
Number of GPUs available: 1
Current device: 0
GPU 0: NVIDIA GeForce RTX 3060
Memory allocated: 0.00 GB
Memory reserved: 0.00 GB
----------------------------------------


In [6]:
def build_env(name = 'LunarLander-v3', record_name = None):
    # Delete all contents in lunar-agent folder

    # Initialise the environment
    env = gym.make(name, render_mode="rgb_array")

    if record_name != None and record_name != "":
        if os.path.exists(record_name):
            shutil.rmtree(record_name)

        env = RecordVideo(
            env,
            video_folder=record_name,
            episode_trigger=lambda x: True,  # Record every episode
            name_prefix="training",
            video_length=3000,  # Maximum number of steps to record per episode
        )

    return env

env = build_env()

# Reset the environment to generate the first observation
observation, info = env.reset(seed=42)

for _ in range(1000):
    # this is where you would insert your policy
    action = env.action_space.sample()

    # step (transition) through the environment with the action
    # receiving the next observation, reward and if the episode has terminated or truncated
    observation, reward, terminated, truncated, info = env.step(action)

    # If the episode has ended then we can reset to start a new episode
    if terminated or truncated:
        observation, info = env.reset()

env.close()


### PPO

in value based, we use value function (Qnet, Q-table) to estimate the policy

policy-based directly optimize the policy function without using intermediate value function

In [19]:
class Policy(nn.Module):
    def __init__(self, state_size = 8, hidden_size = 16, action_size = 4):
        super(Policy, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(8, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size),
            nn.Softmax(dim=1) #turn output into probability distribution
        )

    def forward(self, x):
        return self.model(x)

    def act(self, state):
        tensor = torch.tensor(state, dtype=torch.float32)
        probs = self.forward(tensor)
        m = torch.distributions.Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

# Test
model = Policy()

state = torch.rand(1, 8)

action, log_prob = model.act(state)
print(action)
print(log_prob.item())

3
-1.0543193817138672


  tensor = torch.tensor(state, dtype=torch.float32)


In [None]:
#train
policy = Policy()
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-2)
gamma = 0.99
num_episodes = 1000
reward_threshold = 200
print_every = 10
total_steps = 1000

env = build_env()

for i in range(total_steps):
    
    




env.close()

