In [1]:
import gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque


In [2]:

# Set up the Pong environment
env = gym.make('Pong-v4')

# Hyperparameters
gamma = 0.99  # Discount factor
epsilon = 1.0  # Initial exploration rate
epsilon_min = 0.01  # Minimum exploration rate
epsilon_decay = 0.98  # Faster decay rate for epsilon
learning_rate = 0.00025  # Learning rate
batch_size = 32  # Batch size for experience replay
target_update = 500  # Update target network every x steps (less frequent)
memory_size = 5000  # Replay buffer size (smaller for quick demo)
num_episodes = 25  # Reduced number of training episodes

# gamma = 0.99  # Discount factor
# epsilon = 1.0  # Initial exploration rate
# epsilon_min = 0.01  # Minimum exploration rate
# epsilon_decay = 0.995  # Epsilon decay rate
# learning_rate = 0.00025  # Learning rate
# batch_size = 32  # Batch size for experience replay
# target_update = 1000  # Update target network every x steps
# memory_size = 10000  # Replay buffer size
# num_episodes = 100  # Number of training episodes


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [3]:
class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()

        # Convolutional layers
        self.conv1 = nn.Conv2d(4, 16, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1)  # This is the missing conv3 layer
        
        # Fully connected layers
        self.fc1 = nn.Linear(64 * 6 * 6, 256)  # Adjust this based on the output size of conv3
        self.fc2 = nn.Linear(256, env.action_space.n)
        
        # self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        # self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        # self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        
        # # Update this based on the output size after convolutions
        # self.fc1 = nn.Linear(64 * 6 * 6, 512)  # Adjusted to 2304 input size
        # self.fc2 = nn.Linear(512, env.action_space.n)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)


In [4]:
# Initialize networks and optimizer
policy_net = DQN().to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
target_net = DQN().to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

# Experience Replay Memory
memory = deque(maxlen=memory_size)

# Function to process state (convert to grayscale and resize)
def preprocess_state(state):
    state = np.array(state)  # Ensure state is a NumPy array
    if len(state.shape) == 3:  # Check if the state is 3D (height, width, channels)
        state = np.mean(state, axis=2).astype(np.uint8)  # Convert to grayscale
    elif len(state.shape) == 1:  # If the state is 1D, reshape and process it
        state = np.array(state).reshape(210, 160, 3)  # Reshape to match Pong frame dimensions
        state = np.mean(state, axis=2).astype(np.uint8)  # Convert to grayscale
    
    state = state[35:195]  # Crop
    state = state[::2, ::2]  # Downsample by factor of 2
    return state

# Function to select an action using epsilon-greedy policy
def select_action(state, epsilon):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
            q_values = policy_net(state)
            return torch.argmax(q_values).item()

# Function to store experience in replay memory
def store_experience(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))

# Function to perform experience replay and train the DQN
def replay():
    if len(memory) < batch_size:
        return

    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = torch.tensor(states, dtype=torch.float32).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    actions = torch.tensor(actions).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    rewards = torch.tensor(rewards, dtype=torch.float32).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    next_states = torch.tensor(next_states, dtype=torch.float32).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    dones = torch.tensor(dones, dtype=torch.float32).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    next_q_values = target_net(next_states).max(1)[0]
    target_q_values = rewards + (gamma * next_q_values * (1 - dones))

    loss = nn.MSELoss()(q_values, target_q_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [5]:
def get_conv_output_size():
    with torch.no_grad():
        dummy_input = torch.zeros(1, 4, 80, 80)  # (batch_size, num_frames, height, width)
        conv_out = policy_net.conv1(dummy_input)
        conv_out = policy_net.conv2(conv_out)
        print(f"Output shape after conv layers: {conv_out.shape}")

get_conv_output_size()


Output shape after conv layers: torch.Size([1, 32, 8, 8])


In [6]:
from tqdm import tqdm

# Wrap the episode loop in tqdm for a progress bar
for episode in tqdm(range(num_episodes), desc="Training Progress"):
    state, _ = env.reset()  # Adjust to capture the state correctly if reset returns a tuple
    state = preprocess_state(state)
    state = np.stack([state] * 4, axis=0)  # Stack 4 frames to create state
    total_reward = 0
    done = False

    while not done:
        action = select_action(state, epsilon)

        # Unpack 5 values returned from env.step()
        next_state, reward, done, truncated, info = env.step(action)

        next_state = preprocess_state(next_state)
        next_state = np.stack([next_state] * 4, axis=0)  # Stack 4 frames
        store_experience(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        replay()

        # Handle both 'done' and 'truncated' conditions
        if done or truncated:
            break

    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

    epsilon = max(epsilon_min, epsilon_decay * epsilon)

    tqdm.write(f"Episode: {episode}, Total Reward: {total_reward}")

env.close()


  states = torch.tensor(states, dtype=torch.float32).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
Training Progress:   4%|▉                       | 1/25 [03:15<1:18:17, 195.72s/it]

Episode: 0, Total Reward: -21.0


Training Progress:   8%|█▉                      | 2/25 [07:11<1:23:57, 219.00s/it]

Episode: 1, Total Reward: -19.0


Training Progress:  12%|██▉                     | 3/25 [10:27<1:16:30, 208.66s/it]

Episode: 2, Total Reward: -20.0


Training Progress:  16%|███▊                    | 4/25 [13:55<1:12:53, 208.26s/it]

Episode: 3, Total Reward: -20.0


Training Progress:  20%|████▊                   | 5/25 [16:49<1:05:22, 196.13s/it]

Episode: 4, Total Reward: -21.0


Training Progress:  24%|█████▊                  | 6/25 [20:01<1:01:40, 194.79s/it]

Episode: 5, Total Reward: -21.0


Training Progress:  28%|███████▎                  | 7/25 [23:28<59:35, 198.63s/it]

Episode: 6, Total Reward: -21.0


Training Progress:  32%|███████▋                | 8/25 [27:42<1:01:14, 216.14s/it]

Episode: 7, Total Reward: -19.0


Training Progress:  36%|████████▋               | 9/25 [32:14<1:02:21, 233.84s/it]

Episode: 8, Total Reward: -17.0


Training Progress:  40%|██████████               | 10/25 [35:33<55:45, 223.05s/it]

Episode: 9, Total Reward: -21.0


Training Progress:  44%|███████████              | 11/25 [38:43<49:40, 212.89s/it]

Episode: 10, Total Reward: -21.0


Training Progress:  48%|████████████             | 12/25 [42:41<47:47, 220.55s/it]

Episode: 11, Total Reward: -21.0


Training Progress:  52%|█████████████            | 13/25 [46:09<43:22, 216.86s/it]

Episode: 12, Total Reward: -21.0


Training Progress:  56%|██████████████           | 14/25 [49:32<38:58, 212.58s/it]

Episode: 13, Total Reward: -21.0


Training Progress:  60%|███████████████          | 15/25 [52:49<34:39, 207.93s/it]

Episode: 14, Total Reward: -20.0


Training Progress:  64%|████████████████         | 16/25 [55:55<30:11, 201.33s/it]

Episode: 15, Total Reward: -20.0


Training Progress:  68%|█████████████████        | 17/25 [59:25<27:11, 203.92s/it]

Episode: 16, Total Reward: -20.0


Training Progress:  72%|████████████████▌      | 18/25 [1:02:28<23:02, 197.45s/it]

Episode: 17, Total Reward: -21.0


Training Progress:  76%|█████████████████▍     | 19/25 [1:05:59<20:09, 201.66s/it]

Episode: 18, Total Reward: -20.0


Training Progress:  80%|██████████████████▍    | 20/25 [1:09:48<17:29, 209.92s/it]

Episode: 19, Total Reward: -20.0


Training Progress:  84%|███████████████████▎   | 21/25 [1:12:41<13:15, 198.87s/it]

Episode: 20, Total Reward: -21.0


Training Progress:  88%|████████████████████▏  | 22/25 [1:15:35<09:33, 191.27s/it]

Episode: 21, Total Reward: -21.0


Training Progress:  92%|█████████████████████▏ | 23/25 [1:19:04<06:33, 196.57s/it]

Episode: 22, Total Reward: -19.0


Training Progress:  96%|██████████████████████ | 24/25 [1:22:21<03:16, 196.69s/it]

Episode: 23, Total Reward: -21.0


Training Progress: 100%|███████████████████████| 25/25 [1:26:18<00:00, 207.14s/it]

Episode: 24, Total Reward: -19.0





In [1]:
import gym

# Set up the Pong environment with render_mode='human'
env = gym.make('Pong-v4', render_mode='human')

epsilon = 0  # Disable exploration to purely exploit the learned policy

for episode in range(5):  # Evaluate for 5 episodes
    state, _ = env.reset()
    state = preprocess_state(state)
    state = np.stack([state] * 4, axis=0)
    total_reward = 0
    done = False

    while not done:
        action = select_action(state, epsilon)
        next_state, reward, done, truncated, info = env.step(action)
        next_state = preprocess_state(next_state)
        next_state = np.stack([next_state] * 4, axis=0)
        state = next_state
        total_reward += reward

        env.render()  # This will now work correctly with mode='human'

        if done or truncated:
            print(f"Episode: {episode}, Total Reward: {total_reward}")
            break

env.close()


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


NameError: name 'preprocess_state' is not defined