In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from gym import ActionWrapper
from gym.wrappers.monitoring.video_recorder import VideoRecorder

In [2]:
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),  # state_dim should be 4
            nn.ReLU(),
            nn.Linear(64, action_dim)  # action_dim should be 2
        )

    def forward(self, x):
        return self.fc(x)

In [3]:
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

policy = PolicyNetwork(state_dim, action_dim)
optimizer = optim.Adam(policy.parameters(), lr=0.01)


state, _ = env.reset()  # Use this instead of just env.reset()
state, _ = env.reset()
print("Initial state:", state)

action_probs = policy(torch.FloatTensor(state))

Initial state: [-0.03948366 -0.01858574 -0.00191666  0.02249132]


In [4]:
state, _ = env.reset()
state = state['observation'] if isinstance(state, dict) else state
print("State shape:", state.shape)
print("PolicyNetwork input shape:", policy.fc[0].in_features)
action_probs = policy(state if isinstance(state, torch.Tensor) else torch.FloatTensor(state))

State shape: (4,)
PolicyNetwork input shape: 4


In [5]:
import torch
import torch.nn.functional as F
from torch.distributions import Categorical

# Your original logits
logits = torch.tensor([[1.3031, -0.3031]])

# Create the Categorical distribution with logits
dist = Categorical(logits=logits)

# You can now use the distribution for sampling or other operations
action = dist.sample()
log_prob = dist.log_prob(action)

In [7]:
num_episodes = 1000
max_steps = 500
rewards = []

for episode in range(num_episodes):
    state, _ = env.reset()
    episode_reward = 0
    log_probs = []
    
    for step in range(max_steps):
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        action_probs = policy(state_tensor)
        distribution = torch.distributions.Categorical(action_probs)
        action = distribution.sample()
        log_prob = distribution.log_prob(action)
        
        next_state, reward, done, _, _ = env.step(action.item())
        episode_reward += reward
        log_probs.append(log_prob)
        
        if done:
            break
        
        state = next_state
    
    rewards.append(episode_reward)
    
    # Update policy only if we have collected any log probabilities
    if log_probs:
        loss = -torch.stack(log_probs).sum() * episode_reward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if episode % 10 == 0:
        avg_reward = np.mean(rewards[-10:])
        print(f"Episode {episode}, Average Reward: {avg_reward:.2f}")

env.close()

AttributeError: module 'numpy' has no attribute 'bool8'