In [2]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from gym import ActionWrapper
from gym.wrappers.monitoring.video_recorder import VideoRecorder


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Users/kpradjinata/opt/anaconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/kpradjinata/opt/anaconda3/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/kpradjinata/Documents/Research/OpenAIGym/venv/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/kpradjinata/Documents/Research/OpenAIGym/venv/lib/python3.9/site-packages/traitlets/con

In [3]:
class PolicyNetwork(nn.Module):
    """Small feed-forward policy: state vector in, one raw score per action out.

    The layers live in ``self.fc`` as Linear -> ReLU -> Linear with a hidden
    width of 64. No softmax is applied; ``forward`` returns the final linear
    layer's output as-is.

    Args:
        state_dim: Number of input features (4 for the CartPole setup used
            in this notebook).
        action_dim: Number of output scores (2 for the CartPole setup used
            in this notebook).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return its raw output."""
        scores = self.fc(x)
        return scores

In [4]:
# Build the CartPole environment and size the policy from its spaces.
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

policy = PolicyNetwork(state_dim, action_dim)
optimizer = optim.Adam(policy.parameters(), lr=0.01)

# reset() is unpacked as an (observation, info) pair; a single call is all
# that is needed to obtain a starting observation.
state, _ = env.reset()
print("Initial state:", state)

# Smoke-test one forward pass. Despite the variable name, this holds the
# network's raw outputs (PolicyNetwork applies no softmax), not probabilities.
action_probs = policy(torch.FloatTensor(state))

Initial state: [ 0.0015697  -0.03605074  0.01477886  0.02003803]


In [5]:
# Pull a fresh observation; a dict-style observation is unwrapped to its
# 'observation' entry, anything else is used unchanged.
state, _ = env.reset()
if isinstance(state, dict):
    state = state['observation']

# Report the observation shape next to the input width of the first Linear layer.
print("State shape:", state.shape)
print("PolicyNetwork input shape:", policy.fc[0].in_features)

# Tensors are passed straight through; anything else is converted first.
action_probs = policy(torch.FloatTensor(state) if not isinstance(state, torch.Tensor) else state)

State shape: (4,)
PolicyNetwork input shape: 4


In [6]:
import torch
import torch.nn.functional as F
from torch.distributions import Categorical

# Example logits for a two-way choice, shaped (1, 2): one row, two entries
logits = torch.tensor([1.3031, -0.3031]).unsqueeze(0)

# Hand the raw logits to Categorical directly (no explicit softmax step)
dist = Categorical(logits=logits)

# Draw one sample per row, then look up the log-probability of that sample
action = dist.sample()
log_prob = dist.log_prob(action)

In [11]:
from tqdm import tqdm

num_episodes = 1000
max_steps = 500
rewards = []

# NOTE(review): step_with_bool_fix is not defined in any cell visible in this
# notebook. It must already exist in the kernel before this cell runs —
# confirm it is defined in an earlier cell so Restart & Run All works.

# One policy-gradient update per episode, with a progress bar over episodes.
for episode in tqdm(range(num_episodes), desc="Training Progress"):
    state, _ = env.reset()
    episode_reward = 0
    log_probs = []

    for step in range(max_steps):
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        # Build the distribution straight from the network's raw outputs
        # instead of applying softmax first and passing probabilities. This
        # describes the same distribution without a separate softmax step.
        action_logits = policy(state_tensor)
        distribution = torch.distributions.Categorical(logits=action_logits)
        action = distribution.sample()
        log_prob = distribution.log_prob(action)

        # Use the wrapper function to avoid np.bool8 error
        next_state, reward, done, truncated, _ = step_with_bool_fix(env, action.item())
        episode_reward += reward
        log_probs.append(log_prob)

        if done or truncated:
            break

        state = next_state

    rewards.append(episode_reward)

    # Update policy only if we have collected any log probabilities.
    # Every step's log-probability is weighted by the same total episode
    # reward (no discounting and no baseline).
    # NOTE(review): per-step returns-to-go may reduce gradient variance.
    if log_probs:
        loss = -torch.stack(log_probs).sum() * episode_reward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if episode % 10 == 0:
        # Mean over the most recent (up to) 10 episodes.
        avg_reward = np.mean(rewards[-10:])
        # tqdm.write prints the message without tearing the active progress
        # bar, which a bare print() does.
        tqdm.write(f"Episode {episode}, Average Reward: {avg_reward:.2f}")

env.close()


Training Progress:   0%|                       | 5/1000 [00:00<00:47, 20.96it/s]

Episode 0, Average Reward: 121.00


Training Progress:   1%|▎                     | 14/1000 [00:00<00:44, 22.27it/s]

Episode 10, Average Reward: 113.70


Training Progress:   2%|▌                     | 23/1000 [00:01<00:47, 20.75it/s]

Episode 20, Average Reward: 120.30


Training Progress:   4%|▊                     | 35/1000 [00:01<00:46, 20.62it/s]

Episode 30, Average Reward: 127.30


Training Progress:   4%|▉                     | 44/1000 [00:02<00:55, 17.23it/s]

Episode 40, Average Reward: 130.80


Training Progress:   5%|█▏                    | 52/1000 [00:02<00:55, 17.01it/s]

Episode 50, Average Reward: 158.00


Training Progress:   6%|█▍                    | 63/1000 [00:03<00:55, 16.94it/s]

Episode 60, Average Reward: 151.90


Training Progress:   7%|█▌                    | 73/1000 [00:04<00:58, 15.72it/s]

Episode 70, Average Reward: 165.80


Training Progress:   8%|█▊                    | 83/1000 [00:04<00:59, 15.52it/s]

Episode 80, Average Reward: 168.50


Training Progress:   9%|██                    | 93/1000 [00:05<01:07, 13.46it/s]

Episode 90, Average Reward: 177.50


Training Progress:  10%|██▏                  | 103/1000 [00:06<01:13, 12.18it/s]

Episode 100, Average Reward: 205.80


Training Progress:  11%|██▎                  | 113/1000 [00:06<01:07, 13.09it/s]

Episode 110, Average Reward: 196.30


Training Progress:  12%|██▌                  | 123/1000 [00:07<01:05, 13.37it/s]

Episode 120, Average Reward: 196.00


Training Progress:  13%|██▊                  | 133/1000 [00:08<01:03, 13.66it/s]

Episode 130, Average Reward: 186.00


Training Progress:  14%|███                  | 143/1000 [00:09<01:02, 13.64it/s]

Episode 140, Average Reward: 194.80


Training Progress:  15%|███▏                 | 151/1000 [00:09<01:01, 13.70it/s]

Episode 150, Average Reward: 186.70


Training Progress:  16%|███▍                 | 161/1000 [00:10<01:04, 12.95it/s]

Episode 160, Average Reward: 206.20


Training Progress:  17%|███▌                 | 171/1000 [00:11<01:20, 10.26it/s]

Episode 170, Average Reward: 259.50


Training Progress:  18%|███▊                 | 182/1000 [00:12<01:32,  8.84it/s]

Episode 180, Average Reward: 281.70


Training Progress:  19%|████                 | 192/1000 [00:13<01:17, 10.39it/s]

Episode 190, Average Reward: 275.40


Training Progress:  20%|████▏                | 202/1000 [00:14<01:17, 10.24it/s]

Episode 200, Average Reward: 216.70


Training Progress:  21%|████▍                | 212/1000 [00:15<01:32,  8.50it/s]

Episode 210, Average Reward: 224.80


Training Progress:  22%|████▋                | 222/1000 [00:17<01:26,  8.97it/s]

Episode 220, Average Reward: 301.90


Training Progress:  23%|████▊                | 231/1000 [00:18<02:28,  5.16it/s]

Episode 230, Average Reward: 324.40


Training Progress:  24%|█████                | 242/1000 [00:19<01:23,  9.04it/s]

Episode 240, Average Reward: 254.10


Training Progress:  25%|█████▎               | 252/1000 [00:20<01:09, 10.74it/s]

Episode 250, Average Reward: 257.80


Training Progress:  26%|█████▌               | 262/1000 [00:21<00:55, 13.32it/s]

Episode 260, Average Reward: 195.90


Training Progress:  27%|█████▋               | 272/1000 [00:22<00:57, 12.74it/s]

Episode 270, Average Reward: 201.60


Training Progress:  28%|█████▉               | 282/1000 [00:23<00:55, 12.84it/s]

Episode 280, Average Reward: 198.00


Training Progress:  29%|██████▏              | 292/1000 [00:24<01:08, 10.36it/s]

Episode 290, Average Reward: 249.50


Training Progress:  30%|██████▎              | 302/1000 [00:25<01:12,  9.61it/s]

Episode 300, Average Reward: 259.30


Training Progress:  31%|██████▌              | 312/1000 [00:26<01:52,  6.12it/s]

Episode 310, Average Reward: 388.60


Training Progress:  32%|██████▊              | 322/1000 [00:28<01:18,  8.59it/s]

Episode 320, Average Reward: 333.30


Training Progress:  33%|██████▉              | 332/1000 [00:29<01:24,  7.89it/s]

Episode 330, Average Reward: 333.40


Training Progress:  34%|███████▏             | 343/1000 [00:30<01:05, 10.10it/s]

Episode 340, Average Reward: 287.60


Training Progress:  35%|███████▍             | 353/1000 [00:31<00:51, 12.48it/s]

Episode 350, Average Reward: 213.80


Training Progress:  36%|███████▌             | 363/1000 [00:32<00:45, 13.87it/s]

Episode 360, Average Reward: 185.70


Training Progress:  37%|███████▊             | 373/1000 [00:32<00:44, 14.06it/s]

Episode 370, Average Reward: 191.70


Training Progress:  38%|████████             | 383/1000 [00:33<00:51, 12.07it/s]

Episode 380, Average Reward: 222.80


Training Progress:  39%|████████▏            | 391/1000 [00:34<00:49, 12.25it/s]

Episode 390, Average Reward: 214.20


Training Progress:  40%|████████▍            | 403/1000 [00:35<01:05,  9.10it/s]

Episode 400, Average Reward: 330.40


Training Progress:  41%|████████▋            | 413/1000 [00:36<00:53, 11.06it/s]

Episode 410, Average Reward: 246.60


Training Progress:  42%|████████▉            | 423/1000 [00:37<00:44, 13.07it/s]

Episode 420, Average Reward: 237.40


Training Progress:  43%|█████████            | 434/1000 [00:38<00:38, 14.78it/s]

Episode 430, Average Reward: 181.80


Training Progress:  44%|█████████▎           | 442/1000 [00:38<00:34, 16.35it/s]

Episode 440, Average Reward: 159.50


Training Progress:  45%|█████████▌           | 454/1000 [00:39<00:34, 15.81it/s]

Episode 450, Average Reward: 190.70


Training Progress:  46%|█████████▋           | 462/1000 [00:40<00:38, 14.01it/s]

Episode 460, Average Reward: 181.60


Training Progress:  47%|█████████▉           | 474/1000 [00:41<00:33, 15.60it/s]

Episode 470, Average Reward: 218.10


Training Progress:  48%|██████████▏          | 484/1000 [00:41<00:33, 15.21it/s]

Episode 480, Average Reward: 178.10


Training Progress:  49%|██████████▎          | 494/1000 [00:42<00:26, 18.98it/s]

Episode 490, Average Reward: 157.00


Training Progress:  50%|██████████▌          | 504/1000 [00:42<00:27, 17.92it/s]

Episode 500, Average Reward: 130.80


Training Progress:  51%|██████████▊          | 513/1000 [00:43<00:25, 18.95it/s]

Episode 510, Average Reward: 145.40


Training Progress:  52%|███████████          | 524/1000 [00:43<00:26, 17.96it/s]

Episode 520, Average Reward: 157.50


Training Progress:  53%|███████████▏         | 533/1000 [00:44<00:27, 17.06it/s]

Episode 530, Average Reward: 149.80


Training Progress:  54%|███████████▍         | 543/1000 [00:45<00:32, 14.08it/s]

Episode 540, Average Reward: 168.30


Training Progress:  55%|███████████▌         | 551/1000 [00:45<00:32, 13.95it/s]

Episode 550, Average Reward: 188.50


Training Progress:  56%|███████████▊         | 562/1000 [00:46<00:34, 12.81it/s]

Episode 560, Average Reward: 198.00


Training Progress:  57%|████████████         | 572/1000 [00:47<00:31, 13.54it/s]

Episode 570, Average Reward: 193.20


Training Progress:  58%|████████████▏        | 582/1000 [00:48<00:28, 14.79it/s]

Episode 580, Average Reward: 185.30


Training Progress:  59%|████████████▍        | 592/1000 [00:48<00:33, 12.06it/s]

Episode 590, Average Reward: 205.10


Training Progress:  60%|████████████▋        | 602/1000 [00:50<00:41,  9.53it/s]

Episode 600, Average Reward: 280.10


Training Progress:  61%|████████████▉        | 614/1000 [00:50<00:25, 14.85it/s]

Episode 610, Average Reward: 198.60


Training Progress:  62%|█████████████        | 622/1000 [00:51<00:26, 14.45it/s]

Episode 620, Average Reward: 175.20


Training Progress:  63%|█████████████▎       | 634/1000 [00:52<00:24, 15.07it/s]

Episode 630, Average Reward: 189.60


Training Progress:  64%|█████████████▌       | 644/1000 [00:52<00:24, 14.57it/s]

Episode 640, Average Reward: 175.10


Training Progress:  65%|█████████████▋       | 654/1000 [00:53<00:23, 14.67it/s]

Episode 650, Average Reward: 187.60


Training Progress:  66%|█████████████▉       | 663/1000 [00:54<00:22, 14.73it/s]

Episode 660, Average Reward: 174.10


Training Progress:  67%|██████████████▏      | 673/1000 [00:55<00:26, 12.38it/s]

Episode 670, Average Reward: 188.90


Training Progress:  68%|██████████████▎      | 683/1000 [00:55<00:23, 13.33it/s]

Episode 680, Average Reward: 195.80


Training Progress:  69%|██████████████▌      | 693/1000 [00:56<00:22, 13.92it/s]

Episode 690, Average Reward: 175.20


Training Progress:  70%|██████████████▊      | 703/1000 [00:57<00:20, 14.73it/s]

Episode 700, Average Reward: 181.60


Training Progress:  71%|██████████████▉      | 713/1000 [00:57<00:17, 16.53it/s]

Episode 710, Average Reward: 159.20


Training Progress:  72%|███████████████▏     | 724/1000 [00:58<00:15, 18.01it/s]

Episode 720, Average Reward: 149.30


Training Progress:  73%|███████████████▍     | 733/1000 [00:58<00:13, 19.66it/s]

Episode 730, Average Reward: 123.90


Training Progress:  74%|███████████████▋     | 745/1000 [00:59<00:12, 21.20it/s]

Episode 740, Average Reward: 130.50


Training Progress:  75%|███████████████▊     | 752/1000 [00:59<00:14, 16.85it/s]

Episode 750, Average Reward: 155.90


Training Progress:  76%|████████████████     | 762/1000 [01:00<00:18, 12.88it/s]

Episode 760, Average Reward: 190.50


Training Progress:  77%|████████████████▏    | 772/1000 [01:01<00:17, 12.69it/s]

Episode 770, Average Reward: 199.60


Training Progress:  78%|████████████████▍    | 782/1000 [01:02<00:20, 10.81it/s]

Episode 780, Average Reward: 248.70


Training Progress:  79%|████████████████▋    | 792/1000 [01:03<00:18, 11.46it/s]

Episode 790, Average Reward: 228.70


Training Progress:  80%|████████████████▊    | 802/1000 [01:04<00:15, 12.49it/s]

Episode 800, Average Reward: 210.40


Training Progress:  81%|█████████████████    | 812/1000 [01:05<00:16, 11.26it/s]

Episode 810, Average Reward: 241.70


Training Progress:  82%|█████████████████▎   | 824/1000 [01:06<00:13, 13.28it/s]

Episode 820, Average Reward: 208.00


Training Progress:  83%|█████████████████▍   | 833/1000 [01:06<00:10, 16.58it/s]

Episode 830, Average Reward: 163.40


Training Progress:  84%|█████████████████▋   | 843/1000 [01:07<00:10, 14.29it/s]

Episode 840, Average Reward: 165.60


Training Progress:  85%|█████████████████▉   | 853/1000 [01:07<00:10, 14.47it/s]

Episode 850, Average Reward: 165.70


Training Progress:  86%|██████████████████   | 863/1000 [01:08<00:08, 15.82it/s]

Episode 860, Average Reward: 174.10


Training Progress:  87%|██████████████████▎  | 873/1000 [01:09<00:07, 16.87it/s]

Episode 870, Average Reward: 157.30


Training Progress:  88%|██████████████████▌  | 883/1000 [01:09<00:07, 16.38it/s]

Episode 880, Average Reward: 175.60


Training Progress:  89%|██████████████████▊  | 893/1000 [01:10<00:07, 14.76it/s]

Episode 890, Average Reward: 183.80


Training Progress:  90%|██████████████████▉  | 901/1000 [01:11<00:07, 14.07it/s]

Episode 900, Average Reward: 184.00


Training Progress:  91%|███████████████████▏ | 912/1000 [01:12<00:12,  6.78it/s]

Episode 910, Average Reward: 337.80


Training Progress:  92%|███████████████████▎ | 922/1000 [01:14<00:14,  5.27it/s]

Episode 920, Average Reward: 500.00


Training Progress:  93%|███████████████████▌ | 932/1000 [01:15<00:07,  8.80it/s]

Episode 930, Average Reward: 321.60


Training Progress:  94%|███████████████████▊ | 941/1000 [01:17<00:14,  3.93it/s]

Episode 940, Average Reward: 462.70


Training Progress:  95%|███████████████████▉ | 951/1000 [01:20<00:13,  3.57it/s]

Episode 950, Average Reward: 500.00


Training Progress:  96%|████████████████████▏| 961/1000 [01:22<00:08,  4.77it/s]

Episode 960, Average Reward: 500.00


Training Progress:  98%|████████████████████▍| 975/1000 [01:24<00:01, 12.78it/s]

Episode 970, Average Reward: 378.10


Training Progress:  98%|████████████████████▋| 984/1000 [01:24<00:00, 19.62it/s]

Episode 980, Average Reward: 118.00


Training Progress:  99%|████████████████████▊| 994/1000 [01:24<00:00, 24.19it/s]

Episode 990, Average Reward: 106.00


Training Progress: 100%|████████████████████| 1000/1000 [01:25<00:00, 11.74it/s]
