In [16]:
# Imports
import torch
import numpy as np
import gymnasium as gym
from collections import deque
import pygame
import random
import torch.nn as nn

In [17]:
# DQN model which takes in the state as an input and outputs predicted q values for every possible action
class DQN(torch.nn.Module):
    def __init__(self, state_space, action_space):
        super().__init__()
        # Add your architecture parameters here
        # You can use nn.Functional
        # Remember that the input is of size batch_size x state_space
        # and the output is of size batch_size x action_space (ulta ho sakta hai dekh lo)
        # TODO: Add code here
        self.fc1 = nn.Linear(state_space, 128)
        self.fc2 = nn.Linear(128, 128)
        self.out = nn.Linear(128, action_space)

    def forward(self, input):
        # TODO: Complete based on your implementation
        x = input
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.out(x)


In [18]:
# While training neural networks, we split the data into batches.
# To improve the training, we need to remove the "correlation" between game states
# The buffer starts storing states and once it reaches maximum capacity, it replaces
# states at random which reduces the correlation.
class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)


In [19]:
class ChaseEscapeEnv(gym.Env):
    metadata = {"render_modes": ["human"], "render_fps": 30}

    def __init__(self, render_mode=None):
        super().__init__()

        self.dt = 0.1
        self.max_speed = 0.4
        self.agent_radius = 0.05
        self.target_radius = 0.05
        self.chaser_radius = 0.07
        self.chaser_speed = 0.03
        self.steps = 0
        self.max_steps = 200

        self.action_space = gym.spaces.MultiDiscrete([3, 3])  # actions in {0,1,2} map to [-1,0,1]
        self.observation_space = gym.spaces.Box(
            low=-1,
            high=1,
            shape=(8,),
            dtype=np.float32,
        )

        self.render_mode = render_mode
        self.screen_size = 500
        self.np_random = None

        if render_mode == "human":
            pygame.init()
            self.screen = pygame.display.set_mode((self.screen_size, self.screen_size))
            self.clock = pygame.time.Clock()

    def sample_pos(self, far_from=None, min_dist=0.5):
        while True:
            pos = self.np_random.uniform(low=-0.8, high=0.8, size=(2,))
            if far_from is None or np.linalg.norm(pos - far_from) >= min_dist:
                return pos

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.steps = 0
        self.agent_pos = self.sample_pos()
        self.agent_vel = np.zeros(2, dtype=np.float32)
        self.target_pos = self.sample_pos(far_from=self.agent_pos, min_dist=0.5)
        self.chaser_pos = self.sample_pos(far_from=self.agent_pos, min_dist=0.7)

        return self._get_obs(), {}

    def _get_obs(self):
        # TODO: Decide how to pass the state (don't use pixel values)
        # Observation is concatenation of positions and velocity:
        # [agent_x, agent_y, agent_vx, agent_vy, target_x, target_y, chaser_x, chaser_y]
        obs = np.concatenate((
            self.agent_pos,
            self.agent_vel,
            self.target_pos,
            self.chaser_pos
        )).astype(np.float32)
        return obs

    def _get_info(self):
        return {}

    def step(self, action):
        # TODO: Add reward scheme
        # 1) Try to make the agent stay within bounds
        # 2) The agent shouldn't idle around
        # 3) The agent should go for the reward
        # 4) The agent should avoid the chaser

        self.steps += 1

        # Map discrete actions to acceleration values (-1, 0, +1)
        accel = (np.array(action) - 1) * 0.1
        self.agent_vel += accel
        self.agent_vel = np.clip(self.agent_vel, -self.max_speed, self.max_speed)
        self.agent_pos += self.agent_vel * self.dt

        # Keep agent inside bounds [-1, 1]
        self.agent_pos = np.clip(self.agent_pos, -1, 1)

        # Move chaser towards agent
        direction = self.agent_pos - self.chaser_pos
        dist = np.linalg.norm(direction)
        if dist > 1e-5:
            self.chaser_pos += self.chaser_speed * direction / dist

        # Compute distances
        dist_to_target = np.linalg.norm(self.agent_pos - self.target_pos)
        dist_to_chaser = np.linalg.norm(self.agent_pos - self.chaser_pos)

        reward = 0.0

        if(dist_to_chaser < dist_to_target) :
          reward += 0.5
        else :
          reward -= 0.5


        terminated = False

        # Reward for reaching target
        if dist_to_target < self.agent_radius + self.target_radius:
            reward += 10.0
            self.target_pos = self.sample_pos(far_from=self.agent_pos, min_dist=0.5)

        # Terminate if caught by chaser
        if dist_to_chaser < self.agent_radius + self.chaser_radius:
            reward -= 10.0
            terminated = True

        # Small per-timestep penalty to encourage quick behavior
        reward -= 0.01

        # Optional: truncate episode after max steps
        truncated = self.steps >= self.max_steps

        return self._get_obs(), reward, terminated, truncated, {}

    def render(self):
        if self.render_mode != "human":
            return

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                self.close()

        self.screen.fill((255, 255, 255))

        def to_screen(p):
            x = int((p[0] + 1) / 2 * self.screen_size)
            y = int((1 - (p[1] + 1) / 2) * self.screen_size)
            return x, y

        pygame.draw.circle(self.screen, (0, 255, 0), to_screen(self.target_pos), int(self.target_radius * self.screen_size))
        pygame.draw.circle(self.screen, (0, 0, 255), to_screen(self.agent_pos), int(self.agent_radius * self.screen_size))
        pygame.draw.circle(self.screen, (255, 0, 0), to_screen(self.chaser_pos), int(self.chaser_radius * self.screen_size))

        pygame.display.flip()
        self.clock.tick(self.metadata["render_fps"])

    def close(self):
        if self.render_mode == "human":
            pygame.quit()


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

env = ChaseEscapeEnv(render_mode=None)
obs_shape = env.observation_space.shape[0]  # 8
n_actions = np.prod(env.action_space.nvec)  # 3 * 3 = 9 possible action pairs

# Your DQN model should take 8 inputs and output 9 Q-values (for each action pair)
model = DQN(obs_shape, n_actions)
target_model = DQN(obs_shape, n_actions)
target_model.load_state_dict(model.state_dict())

optimizer = optim.Adam(model.parameters(), lr=1e-3)
buffer = ExperienceBuffer(10000)
batch_size = 64
gamma = 0.99
epsilon_start = 1.0
epsilon_final = 0.05
epsilon_decay = 10000
epsilon = epsilon_start
steps_done = 0

def select_action(state):
    global steps_done, epsilon
    steps_done += 1
    epsilon = epsilon_final + (epsilon_start - epsilon_final) * \
              np.exp(-1. * steps_done / epsilon_decay)
    if random.random() < epsilon:
        # Sample random discrete actions for 2 dimensions:
        a1 = random.choice([0,1,2])
        a2 = random.choice([0,1,2])
        action = a1 * 3 + a2  # encode to single int from 0 to 8
        return torch.tensor([[action]], dtype=torch.long)
    else:
        with torch.no_grad():
            state = torch.FloatTensor(state).unsqueeze(0)
            q_values = model(state)
            action = q_values.max(1)[1].view(1,1)
            return action

def decode_action(action_idx):
    # inverse of a1 * 3 + a2 encoding
    a1 = action_idx // 3
    a2 = action_idx % 3
    return [a1, a2]

# Training loop sketch
num_episodes = 1000
for episode in range(num_episodes):
    state, _ = env.reset()
    total_reward = 0
    done = False

    while not done:
        action_idx = select_action(state).item()
        action = decode_action(action_idx)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        buffer.push(state.copy(), action_idx, reward, next_state.copy(), done)
        state = next_state
        total_reward += reward

        # Sample from buffer and train model here (omitted for brevity)
        # ...

    print(f"Episode {episode} Total Reward: {total_reward:.2f}")

env.close()


Episode 0 Total Reward: 4.70
Episode 1 Total Reward: 16.95
Episode 2 Total Reward: 18.87
Episode 3 Total Reward: 1.17
Episode 4 Total Reward: 7.89
Episode 5 Total Reward: 20.83
Episode 6 Total Reward: 1.97
Episode 7 Total Reward: -2.65
Episode 8 Total Reward: -0.20
Episode 9 Total Reward: -30.76
Episode 10 Total Reward: 12.54
Episode 11 Total Reward: 6.71
Episode 12 Total Reward: -4.03
Episode 13 Total Reward: 12.91
Episode 14 Total Reward: 12.03
Episode 15 Total Reward: 14.81
Episode 16 Total Reward: -2.52
Episode 17 Total Reward: 10.30
Episode 18 Total Reward: 15.48
Episode 19 Total Reward: 1.27
Episode 20 Total Reward: -6.36
Episode 21 Total Reward: 14.01
Episode 22 Total Reward: 0.62
Episode 23 Total Reward: 22.75
Episode 24 Total Reward: -13.79
Episode 25 Total Reward: 7.64
Episode 26 Total Reward: 16.32
Episode 27 Total Reward: -4.22
Episode 28 Total Reward: -9.20
Episode 29 Total Reward: 2.25
Episode 30 Total Reward: 7.15
Episode 31 Total Reward: 16.46
Episode 32 Total Reward: -

In [21]:
def evaluate_agent(env, model, episodes=20, render=False):
    total_rewards = []
    for ep in range(episodes):
        state, _ = env.reset()
        done = False
        ep_reward = 0
        steps = 0
        while not done:
            if render:
                env.render()
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                q_values = model(state_tensor)
                action_idx = torch.argmax(q_values).item()
            # Decoding the action index to the (a1, a2) pair
            a1 = action_idx // 3
            a2 = action_idx % 3
            action = [a1, a2]
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            ep_reward += reward
            state = next_state
            steps += 1
        print(f"Test Episode {ep+1}: Reward={ep_reward:.2f}, Steps={steps}")
        total_rewards.append(ep_reward)
    avg = np.mean(total_rewards)
    print(f"\nAverage test reward over {episodes} episodes: {avg:.2f}")
    env.close()
    return total_rewards

# Example usage (should be after training, with trained model):
test_rewards = evaluate_agent(env, model, episodes=20, render=False)


Test Episode 1: Reward=9.60, Steps=40
Test Episode 2: Reward=-15.60, Steps=60
Test Episode 3: Reward=14.71, Steps=79
Test Episode 4: Reward=-5.10, Steps=10
Test Episode 5: Reward=8.87, Steps=63
Test Episode 6: Reward=-6.58, Steps=58
Test Episode 7: Reward=-7.17, Steps=67
Test Episode 8: Reward=18.83, Steps=67
Test Episode 9: Reward=-2.09, Steps=59
Test Episode 10: Reward=12.48, Steps=52
Test Episode 11: Reward=-23.58, Steps=58
Test Episode 12: Reward=14.50, Steps=50
Test Episode 13: Reward=16.46, Steps=54
Test Episode 14: Reward=23.75, Steps=75
Test Episode 15: Reward=13.01, Steps=49
Test Episode 16: Reward=-1.50, Steps=50
Test Episode 17: Reward=12.05, Steps=45
Test Episode 18: Reward=25.28, Steps=72
Test Episode 19: Reward=13.03, Steps=47
Test Episode 20: Reward=1.76, Steps=24

Average test reward over 20 episodes: 6.14
