In [None]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [None]:
# Define the custom environment
class SimpleGridEnv:
    """Square grid world where an agent walks from the top-left cell to a target cell.

    A state is the agent's ``[row, col]`` position. Actions are integer codes:
    0 = up, 1 = down, 2 = left, 3 = right. A move that would leave the grid
    keeps the agent on the edge; any other action code leaves it where it is.
    """

    # (action code, axis to change, direction): axis 0 is the row, axis 1 the column.
    _MOVES = ((0, 0, -1), (1, 0, 1), (2, 1, -1), (3, 1, 1))

    def __init__(self, grid_size=5, target_position=(4, 4)):
        self.grid_size = grid_size
        self.target_position = target_position
        self.reset()

    def reset(self):
        """Place the agent back on ``[0, 0]`` and return that state."""
        self.agent_position = [0, 0]
        return self.get_state()

    def get_state(self):
        """Return the current agent position as a freshly built NumPy array."""
        return np.array(self.agent_position)

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        The reward is 10 and ``done`` is True when the agent stands on the
        target after the move; otherwise the reward is -1 and ``done`` is False.
        """
        for code, axis, direction in self._MOVES:
            if action == code:
                shifted = self.agent_position[axis] + direction
                if direction < 0:
                    # Moving towards index 0: only the lower edge can be crossed.
                    self.agent_position[axis] = max(0, shifted)
                else:
                    # Moving away from index 0: only the upper edge can be crossed.
                    self.agent_position[axis] = min(self.grid_size - 1, shifted)
                break

        reached_target = self.agent_position == list(self.target_position)
        reward = 10 if reached_target else -1
        return self.get_state(), reward, reached_target

    def sample_action(self):
        """Return one of the four action codes, chosen uniformly at random."""
        return random.randint(0, 3)

In [None]:
# Define the DQN model
class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers of 64 units and a linear output.

    ``input_dim`` is the size of the state vector and ``output_dim`` the number
    of values produced per input row (one per action in this notebook).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Return the output-layer activations for the input batch ``x``."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        # The output layer stays linear: no activation is applied to its result.
        return self.fc3(activations)

In [None]:
# Hyperparameters
# Training schedule and optimisation.
EPISODES = 500            # number of training episodes
LEARNING_RATE = 1e-3      # step size handed to the Adam optimizer
GAMMA = 0.99              # discount applied to the next state's best Q-value

# Epsilon-greedy exploration.
EPSILON = 1.0             # starting probability of taking a random action
EPSILON_MIN = 1e-2        # floor below which epsilon is never decayed
EPSILON_DECAY = 0.995     # multiplicative decay applied once per episode

# Experience replay.
MEMORY_SIZE = 1_000       # capacity of the replay buffer, in transitions
BATCH_SIZE = 64           # transitions sampled per gradient step

In [None]:
# Replay buffer
class ReplayMemory:
    """Fixed-capacity store of transitions that overwrites the oldest entry when full.

    ``memory`` holds the stored transitions and ``position`` is the index the
    next ``push`` will write to.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, transition):
        """Store ``transition`` at the current write index and advance the index."""
        write_index = self.position
        still_growing = len(self.memory) < self.capacity
        if still_growing:
            # Reserve a slot so the index assignment below is valid.
            self.memory.append(None)
        self.memory[write_index] = transition
        # Wrap around so that, once full, the oldest entries are overwritten first.
        self.position = (write_index + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

In [None]:
def train_dqn():
    """Train a DQN agent on ``SimpleGridEnv`` using epsilon-greedy play and replay.

    Runs ``EPISODES`` episodes of at most 100 steps each. Every step is stored
    in a ``ReplayMemory`` of size ``MEMORY_SIZE``; once it holds ``BATCH_SIZE``
    transitions, one Adam step is taken per environment step on the MSE between
    the online network's Q-values and targets bootstrapped from a target
    network. The target network is synchronised every 10 episodes.

    Prints the total reward and the exploration rate after every episode, and
    the visited path every 20 episodes. Returns nothing.

    The exploration rate is tracked in a local variable initialised from
    ``EPSILON``. The module-level constant is not modified, so calling this
    function again (for example by re-running the notebook cell) starts from
    the configured exploration rate instead of an already-decayed one.
    """
    max_steps_per_episode = 100   # hard cap so an episode cannot run forever
    target_sync_interval = 10     # episodes between target-network refreshes
    path_log_interval = 20        # episodes between path printouts

    epsilon = EPSILON
    env = SimpleGridEnv()
    state_dim = 2
    action_dim = 4

    dqn = DQN(state_dim, action_dim)
    target_dqn = DQN(state_dim, action_dim)
    target_dqn.load_state_dict(dqn.state_dict())

    optimizer = optim.Adam(dqn.parameters(), lr=LEARNING_RATE)
    loss_fn = nn.MSELoss()  # stateless, so build it once instead of on every step
    memory = ReplayMemory(MEMORY_SIZE)

    for episode in range(EPISODES):
        state = env.reset()
        total_reward = 0
        # ``tolist`` yields plain Python ints, so the printed path looks the same
        # on every NumPy version (NumPy 2 prints scalars as ``np.int64(0)``).
        path = [state.tolist()]

        for step in range(max_steps_per_episode):
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = env.sample_action()
            else:
                state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():
                    action = dqn(state_tensor).argmax().item()

            next_state, reward, done = env.step(action)
            memory.push((state, action, reward, next_state, done))

            # Append the new position to the path
            path.append(next_state.tolist())

            state = next_state
            total_reward += reward

            if len(memory) >= BATCH_SIZE:
                # Sample a batch from memory
                batch = memory.sample(BATCH_SIZE)
                states, actions, rewards, next_states, dones = zip(*batch)

                # Stack the per-transition arrays into one ndarray first: building
                # a tensor straight from a sequence of ndarrays is very slow and
                # makes PyTorch emit a UserWarning.
                batch_state = torch.as_tensor(np.array(states), dtype=torch.float32)
                batch_next_state = torch.as_tensor(np.array(next_states), dtype=torch.float32)
                batch_action = torch.tensor(actions, dtype=torch.int64).unsqueeze(1)
                batch_reward = torch.tensor(rewards, dtype=torch.float32)
                batch_done = torch.tensor(dones, dtype=torch.float32)

                # Compute the target Q values; (1 - done) removes the bootstrap
                # term for transitions that ended the episode.
                with torch.no_grad():
                    next_q_values = target_dqn(batch_next_state).max(1)[0]
                    target_q_values = batch_reward + GAMMA * next_q_values * (1 - batch_done)

                # Get current Q values for the actions that were actually taken
                current_q_values = dqn(batch_state).gather(1, batch_action).squeeze(1)

                # Compute loss and optimize
                loss = loss_fn(current_q_values, target_q_values)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break

        # Update the target network periodically
        if episode % target_sync_interval == 0:
            target_dqn.load_state_dict(dqn.state_dict())

        # Decay epsilon
        epsilon = max(EPSILON_MIN, epsilon * EPSILON_DECAY)

        # Print the path every 20 episodes
        if episode % path_log_interval == 0:
            print(f"Episode {episode}, Path: {path}")

        print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon}")


In [None]:
# Start training only when this code runs as the main module, not when it is imported.
if __name__ == "__main__":
    train_dqn()

Episode 0, Path: [[0, 0], [0, 1], [0, 0], [0, 0], [0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 0], [4, 0], [4, 0], [4, 1], [4, 2], [4, 1], [4, 0], [4, 1], [4, 1], [4, 2], [4, 2], [4, 3], [4, 4]]
Episode 0, Total Reward: -11, Epsilon: 0.995
Episode 1, Total Reward: -31, Epsilon: 0.990025


  batch_state = torch.FloatTensor(batch_state)


Episode 2, Total Reward: -25, Epsilon: 0.985074875
Episode 3, Total Reward: -100, Epsilon: 0.9801495006250001
Episode 4, Total Reward: -44, Epsilon: 0.9752487531218751
Episode 5, Total Reward: -27, Epsilon: 0.9703725093562657
Episode 6, Total Reward: -100, Epsilon: 0.9655206468094844
Episode 7, Total Reward: -61, Epsilon: 0.960693043575437
Episode 8, Total Reward: -53, Epsilon: 0.9558895783575597
Episode 9, Total Reward: -100, Epsilon: 0.9511101304657719
Episode 10, Total Reward: -68, Epsilon: 0.946354579813443
Episode 11, Total Reward: -18, Epsilon: 0.9416228069143757
Episode 12, Total Reward: -64, Epsilon: 0.9369146928798039
Episode 13, Total Reward: -8, Epsilon: 0.9322301194154049
Episode 14, Total Reward: -44, Epsilon: 0.9275689688183278
Episode 15, Total Reward: -4, Epsilon: 0.9229311239742362
Episode 16, Total Reward: -100, Epsilon: 0.918316468354365
Episode 17, Total Reward: -58, Epsilon: 0.9137248860125932
Episode 18, Total Reward: -10, Epsilon: 0.9091562615825302
Episode 19, T