In [2]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:

env = gym.make('Pong-v4', render_mode = "human")

observation, info = env.reset()

for _ in range(1000):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)

    if terminated or truncated:
        observation, info = env.reset()

env.close()

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [5]:
class PolicyNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super(PolicyNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, output_size)
        )
    
    def forward(self, x):
        return torch.softmax(self.network(x), dim=-1)


In [6]:
env = gym.make('Pong-v4')
input_size = np.prod(env.observation_space.shape)  # Flatten the observation space
output_size = env.action_space.n  # Number of actions in Pong

policy = PolicyNetwork(input_size, output_size)
optimizer = optim.Adam(policy.parameters(), lr=0.01)


In [7]:
def select_action(policy, state):
    state = torch.FloatTensor(np.array(state).flatten()).unsqueeze(0)  # Convert to tensor and flatten state
    probs = policy(state)
    action = torch.multinomial(probs, 1).item()  # Sample action from probability distribution
    return action


In [8]:
def train_policy(policy, optimizer, trajectories, gamma=0.99):
    policy_loss = []
    returns = []
    
    # Calculate discounted rewards for each trajectory
    for trajectory in trajectories:
        rewards = [step[2] for step in trajectory]
        discounted_rewards = []
        cumulative_reward = 0
        for r in reversed(rewards):
            cumulative_reward = r + gamma * cumulative_reward
            discounted_rewards.insert(0, cumulative_reward)
        
        for (state, action, _, log_prob), R in zip(trajectory, discounted_rewards):
            policy_loss.append(-log_prob * R)
    
    # Perform optimization step
    optimizer.zero_grad()
    loss = torch.stack(policy_loss).mean()
    loss.backward()
    optimizer.step()


In [17]:


# Main training loop
num_episodes = 1000
max_timesteps = 10000
gamma = 0.99

for episode in tqdm(range(num_episodes)):
    state, info = env.reset()  # Unpack observation and additional info
    trajectory = []
    for t in range(max_timesteps):
        action = select_action(policy, state)
        next_state, reward, done, _, _ = env.step(action)
        
        log_prob = torch.log(policy(torch.FloatTensor(np.array(state).flatten()).unsqueeze(0))[0, action])
        trajectory.append((state, action, reward, log_prob))
        
        state = next_state
        if done:
            break
    
    # After each episode, update policy with the collected trajectory
    train_policy(policy, optimizer, [trajectory], gamma=gamma)
    
    if episode % 100 == 0:
        print(f'Episode {episode}: Finished training step')

env.close()

  0%|                                                  | 0/1000 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x13824 and 2592x64)

In [9]:
# Define a policy network with convolutional layers
class PolicyNetwork(nn.Module):
    def __init__(self, input_channels, output_size):
        super(PolicyNetwork, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_channels, 16, kernel_size=8, stride=4),  # Convolutional layer
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2),  # Second conv layer
            nn.ReLU(),
            nn.Flatten()
        )
        
        self.fc_layers = nn.Sequential(
            nn.Linear(32 * 9 * 9, 64),  # Adjust input size based on image processing
            nn.ReLU(),
            nn.Linear(64, output_size)
        )
    
    def forward(self, x):
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return torch.softmax(x, dim=-1)

# Update the environment and policy initialization
env = gym.make('Pong-v4')
input_channels = env.observation_space.shape[2]  # Number of channels in the image (usually 3 for RGB)
output_size = env.action_space.n  # Number of actions in Pong

policy = PolicyNetwork(input_channels, output_size)
optimizer = optim.Adam(policy.parameters(), lr=0.01)

# Adjust the state preprocessing for CNN input
def preprocess_state(state):
    state = np.transpose(state, (2, 0, 1))  # Change from HxWxC to CxHxW format for PyTorch
    state = torch.FloatTensor(state).unsqueeze(0)  # Add batch dimension
    return state

# Update the select_action function to handle image input
def select_action(policy, state):
    state = preprocess_state(state)  # Preprocess state for CNN
    probs = policy(state)
    action = torch.multinomial(probs, 1).item()  # Sample action from probability distribution
    return action

# Main training loop remains the same


In [10]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm import tqdm

# Define a policy network with convolutional layers
class PolicyNetwork(nn.Module):
    def __init__(self, input_channels, output_size):
        super(PolicyNetwork, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_channels, 16, kernel_size=8, stride=4),  # Convolutional layer
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2),  # Second conv layer
            nn.ReLU(),
        )
        
        # Dynamically calculate the size of the flattened input from the conv layers
        dummy_input = torch.zeros(1, input_channels, 210, 160)  # Example Pong input image (C, H, W)
        conv_output_size = self.conv_layers(dummy_input).view(1, -1).size(1)
        
        self.fc_layers = nn.Sequential(
            nn.Linear(conv_output_size, 64),  # Adjusted input size
            nn.ReLU(),
            nn.Linear(64, output_size)
        )
    
    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)  # Flatten the conv output
        x = self.fc_layers(x)
        return torch.softmax(x, dim=-1)

# Initialize environment and network
env = gym.make('Pong-v4')
input_channels = env.observation_space.shape[2]  # Number of channels in the image (usually 3 for RGB)
output_size = env.action_space.n  # Number of actions in Pong

policy = PolicyNetwork(input_channels, output_size)
optimizer = optim.Adam(policy.parameters(), lr=0.01)

# Preprocess state to prepare it for CNN
def preprocess_state(state):
    state = np.transpose(state, (2, 0, 1))  # Change from HxWxC to CxHxW format for PyTorch
    state = torch.FloatTensor(state).unsqueeze(0)  # Add batch dimension
    return state

# Select action based on policy network's output
def select_action(policy, state):
    state = preprocess_state(state)  # Preprocess state for CNN
    probs = policy(state)
    action = torch.multinomial(probs, 1).item()  # Sample action from probability distribution
    return action


In [None]:

# Training loop
num_episodes = 1000
max_timesteps = 500
gamma = 0.99

for episode in tqdm(range(num_episodes)):
    state, info = env.reset()  # Unpack observation and additional info
    trajectory = []
    for t in range(max_timesteps):
        action = select_action(policy, state)
        next_state, reward, done, _, _ = env.step(action)
        
        log_prob = torch.log(policy(preprocess_state(state))[0, action])
        trajectory.append((state, action, reward, log_prob))
        
        state = next_state
        if done:
            break
    
    # After each episode, update policy with the collected trajectory
    train_policy(policy, optimizer, [trajectory], gamma=gamma)
    
    if episode % 100 == 0:
        print(f'Episode {episode}: Finished training step')

env.close()


In [6]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm import tqdm

# Define a policy network with convolutional layers
class PolicyNetwork(nn.Module):
    def __init__(self, input_channels, output_size):
        super(PolicyNetwork, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(input_channels, 16, kernel_size=8, stride=4),  # Convolutional layer
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2),  # Second conv layer
            nn.ReLU(),
        )
        
        # Dynamically calculate the size of the flattened input from the conv layers
        dummy_input = torch.zeros(1, input_channels, 210, 160)  # Example Pong input image (C, H, W)
        conv_output_size = self.conv_layers(dummy_input).view(1, -1).size(1)
        
        self.fc_layers = nn.Sequential(
            nn.Linear(conv_output_size, 64),  # Adjusted input size
            nn.ReLU(),
            nn.Linear(64, output_size)
        )
    
    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)  # Flatten the conv output
        x = self.fc_layers(x)
        return torch.softmax(x, dim=-1)

# Preprocess state to prepare it for CNN
def preprocess_state(state):
    state = np.transpose(state, (2, 0, 1))  # Change from HxWxC to CxHxW format for PyTorch
    state = torch.FloatTensor(state).unsqueeze(0)  # Add batch dimension
    return state

# Select action based on policy network's output
def select_action(policy, state):
    state = preprocess_state(state)  # Preprocess state for CNN
    probs = policy(state)
    action = torch.multinomial(probs, 1).item()  # Sample action from probability distribution
    return action

# Function to compute discounted rewards
def compute_discounted_rewards(trajectory, gamma):
    rewards = [r for _, _, r, _ in trajectory]
    discounted_rewards = []
    cumulative_reward = 0
    for reward in reversed(rewards):
        cumulative_reward = reward + gamma * cumulative_reward
        discounted_rewards.insert(0, cumulative_reward)  # Insert at the beginning to reverse the order
    return discounted_rewards

# Train the policy using trajectories
def train_policy(policy, optimizer, trajectories, gamma=0.99):
    policy.train()  # Set the policy network to training mode
    
    total_loss = 0
    for trajectory in trajectories:
        # Compute discounted rewards for each step in the trajectory
        discounted_rewards = compute_discounted_rewards(trajectory, gamma)
        discounted_rewards = torch.FloatTensor(discounted_rewards)
        
        # Normalize rewards to stabilize training
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)
        
        # Compute the policy loss (negative log prob * discounted reward)
        policy_loss = []
        for (_, action, reward, log_prob), Gt in zip(trajectory, discounted_rewards):
            policy_loss.append(-log_prob * Gt)
        
        total_loss += torch.stack(policy_loss).sum()  # Combine all the losses for this trajectory
    
    # Backpropagate the loss
    optimizer.zero_grad()  # Reset gradients
    total_loss.backward()  # Compute gradients
    optimizer.step()  # Update the policy network

# Initialize environment and network
env = gym.make('Pong-v4')
input_channels = env.observation_space.shape[2]  # Number of channels in the image (usually 3 for RGB)
output_size = env.action_space.n  # Number of actions in Pong

policy = PolicyNetwork(input_channels, output_size)
optimizer = optim.Adam(policy.parameters(), lr=0.01)


In [5]:

# Training loop
num_episodes = 1000
max_timesteps = 500
gamma = 0.99

for episode in tqdm(range(num_episodes)):
    state, info = env.reset()  # Unpack observation and additional info
    trajectory = []
    for t in range(max_timesteps):
        action = select_action(policy, state)
        next_state, reward, done, _, _ = env.step(action)
        
        log_prob = torch.log(policy(preprocess_state(state))[0, action])
        trajectory.append((state, action, reward, log_prob))
        
        state = next_state
        if done:
            break
    
    # After each episode, update policy with the collected trajectory
    train_policy(policy, optimizer, [trajectory], gamma=gamma)
    
    if episode % 100 == 0:
        print(f'Episode {episode}: Finished training step')

env.close()


  0%|                                                  | 0/1000 [00:00<?, ?it/s]


NameError: name 'select_action' is not defined

In [3]:
# Save the trained model after training (skip if already done)
torch.save(policy.state_dict(), 'pong_policy_network.pth')


NameError: name 'policy' is not defined

In [4]:
# Load the trained model
policy = PolicyNetwork(input_channels, output_size)
policy.load_state_dict(torch.load('pong_policy_network.pth'))
policy.eval()  # Set the policy to evaluation mode


NameError: name 'PolicyNetwork' is not defined

In [11]:
import time

# Function to visualize the trained policy
def evaluate_policy(policy, env, num_episodes=5):
    policy.eval()  # Set the policy to evaluation mode (no gradients calculated)

    for episode in range(num_episodes):
        state, info = env.reset()  # Reset the environment
        done = False
        total_reward = 0

        while not done:
            env.render()  # Render the game (visualize the performance)
            action = select_action(policy, state)  # Use the trained policy to select an action
            next_state, reward, done, _, _ = env.step(action)  # Take the action in the environment
            total_reward += reward  # Accumulate the reward
            state = next_state  # Move to the next state
            time.sleep(0.03)  # Slow down the rendering to make it viewable

        print(f"Episode {episode + 1}: Total Reward: {total_reward}")

    env.close()

# Initialize the environment with render_mode='human' to visualize
env = gym.make('Pong-v4', render_mode='human')

# Visualize the performance of the trained policy
evaluate_policy(policy, env, num_episodes=5)  # Play 5 episodes to visualize performance


  logger.warn(


KeyboardInterrupt: 

In [8]:
import matplotlib.pyplot as plt

# Function to evaluate and track rewards
def evaluate_policy(policy, env, num_episodes=10):
    policy.eval()  # Set the policy to evaluation mode (no gradients are calculated)
    rewards_per_episode = []  # To store total rewards per episode

    for episode in range(num_episodes):
        state, info = env.reset()  # Reset the environment
        done = False
        total_reward = 0

        while not done:
            action = select_action(policy, state)  # Use the trained policy to select an action
            next_state, reward, done, _, _ = env.step(action)  # Take the action in the environment
            total_reward += reward  # Accumulate the reward
            state = next_state  # Move to the next state

        rewards_per_episode.append(total_reward)  # Store the total reward for this episode
        print(f"Episode {episode + 1}: Total Reward: {total_reward}")

    env.close()
    return rewards_per_episode

# Initialize the environment with render_mode='human' if you want to visualize during evaluation
env = gym.make('Pong-v4')

# Evaluate the policy and collect the total rewards
num_episodes = 1000  # Define how many episodes you want to evaluate
rewards = evaluate_policy(policy, env, num_episodes)

# Plot the rewards per episode
plt.plot(range(1, num_episodes + 1), rewards)
plt.title('Total Rewards per Episode')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.show()


Episode 1: Total Reward: -21.0
Episode 2: Total Reward: -21.0
Episode 3: Total Reward: -21.0
Episode 4: Total Reward: -21.0
Episode 5: Total Reward: -21.0
Episode 6: Total Reward: -21.0
Episode 7: Total Reward: -21.0
Episode 8: Total Reward: -21.0
Episode 9: Total Reward: -21.0
Episode 10: Total Reward: -21.0
Episode 11: Total Reward: -21.0
Episode 12: Total Reward: -21.0
Episode 13: Total Reward: -21.0
Episode 14: Total Reward: -21.0
Episode 15: Total Reward: -21.0
Episode 16: Total Reward: -21.0
Episode 17: Total Reward: -21.0
Episode 18: Total Reward: -21.0
Episode 19: Total Reward: -21.0
Episode 20: Total Reward: -21.0
Episode 21: Total Reward: -21.0
Episode 22: Total Reward: -21.0
Episode 23: Total Reward: -21.0
Episode 24: Total Reward: -21.0
Episode 25: Total Reward: -21.0
Episode 26: Total Reward: -21.0
Episode 27: Total Reward: -21.0
Episode 28: Total Reward: -21.0
Episode 29: Total Reward: -21.0
Episode 30: Total Reward: -21.0
Episode 31: Total Reward: -21.0
Episode 32: Total

KeyboardInterrupt: 