In [9]:
!pip install gymnasium torch numpy swig gymnasium[box2d]


Collecting swig
  Using cached swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pygame>=2.1.3 (from gymnasium[box2d])
  Using cached pygame-2.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Using cached swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
Using cached pygame-2.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (pyproject.toml) ... [?25ldone
[?25h  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp313-cp313-linux_x86_64.whl size=2571560 sha256=83d3b8d62e38ca63a4a31349afe8975e9efd88613da072d097339ce2ced

In [14]:
import gymnasium as gym
import numpy as np

# Create the Lunar Lander environment
env = gym.make('LunarLander-v3')

# Check environment details
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

print(f"State Dimensions: {state_dim}")
print(f"Action Dimensions: {action_dim}")

# Test the environment with random actions
state, info = env.reset()
done = False
score = 0

while not done:
    env.render()  # Renders the environment
    action = env.action_space.sample()  # Take a random action
    next_state, reward, done, truncated, info = env.step(action)
    score += reward
    state = next_state

print(f"Final Score: {score}")
env.close()


State Dimensions: 8
Action Dimensions: 4
Final Score: -90.30743934077333


  gym.logger.warn(


In [18]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

# Define the Q-network (Neural Network)
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_dim)
    
    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Hyperparameters
learning_rate = 1e-3
gamma = 0.99
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01
batch_size = 64
buffer_size = 10000
target_update_freq = 10
max_episodes = 1000

# Create the environment
env = gym.make('LunarLander-v3')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Initialize networks, optimizer, and loss function
policy_net = DQN(state_dim, action_dim).float()
target_net = DQN(state_dim, action_dim).float()
target_net.load_state_dict(policy_net.state_dict())
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# Experience replay buffer
replay_buffer = deque(maxlen=buffer_size)

# Epsilon-greedy action selection
def select_action(state):
    if random.random() < epsilon:
        return env.action_space.sample()
    else:
        with torch.no_grad():
            state_tensor = torch.tensor(state, dtype=torch.float32)
            q_values = policy_net(state_tensor)
            return torch.argmax(q_values).item()

# Train the model
def train():
    global epsilon
    for episode in range(max_episodes):
        state, info = env.reset()
        done = False
        total_reward = 0
        
        while not done:
            action = select_action(state)
            next_state, reward, done, truncated, info = env.step(action)
            total_reward += reward
            
            # Store the experience in replay buffer
            replay_buffer.append((state, action, reward, next_state, done))
            
            # Sample a batch from replay buffer
            if len(replay_buffer) >= batch_size:
                batch = random.sample(replay_buffer, batch_size)
                states, actions, rewards, next_states, dones = zip(*batch)
                
                states = torch.tensor(states, dtype=torch.float32)
                actions = torch.tensor(actions, dtype=torch.int64)
                rewards = torch.tensor(rewards, dtype=torch.float32)
                next_states = torch.tensor(next_states, dtype=torch.float32)
                dones = torch.tensor(dones, dtype=torch.float32)
                
                # Q-values from current policy network
                q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
                
                # Q-values from target network
                with torch.no_grad():
                    next_q_values = target_net(next_states).max(1)[0]
                    target_q_values = rewards + (gamma * next_q_values * (1 - dones))
                
                # Compute loss
                loss = criterion(q_values, target_q_values)
                
                # Optimize the model
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Update state and epsilon
            state = next_state
            epsilon = max(epsilon_min, epsilon * epsilon_decay)
        
        # Update target network
        if episode % target_update_freq == 0:
            target_net.load_state_dict(policy_net.state_dict())
        
        print(f"Episode {episode + 1}/{max_episodes}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")

# Run training
train()

# Save the model
torch.save(policy_net.state_dict(), 'dqn_lunar_lander.pth')

# Close the environment
env.close()


Episode 1/1000, Total Reward: -92.66950692436035, Epsilon: 0.751
Episode 2/1000, Total Reward: -86.7523770698207, Epsilon: 0.508
Episode 3/1000, Total Reward: -215.56334519128535, Epsilon: 0.152
Episode 4/1000, Total Reward: -247.0729990098024, Epsilon: 0.071
Episode 5/1000, Total Reward: -188.09204252705092, Epsilon: 0.039
Episode 6/1000, Total Reward: -239.36442458816424, Epsilon: 0.021
Episode 7/1000, Total Reward: -116.87168398339173, Epsilon: 0.010
Episode 8/1000, Total Reward: -137.96694913653377, Epsilon: 0.010
Episode 9/1000, Total Reward: -97.61610444942764, Epsilon: 0.010
Episode 10/1000, Total Reward: -114.57392850325809, Epsilon: 0.010
Episode 11/1000, Total Reward: -188.17676545304522, Epsilon: 0.010
Episode 12/1000, Total Reward: -125.32645196870855, Epsilon: 0.010
Episode 13/1000, Total Reward: -176.62015795456284, Epsilon: 0.010
Episode 14/1000, Total Reward: -135.0992580986205, Epsilon: 0.010
Episode 15/1000, Total Reward: -191.52391718326783, Epsilon: 0.010
Episode 16

KeyboardInterrupt: 