In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# Define the UAV environment (Custom)
class UAVEnv(gym.Env):
    """Toy 2-D grid environment in which a UAV must fly from (0, 0) to (10, 10).

    Observation: float32 array ``[x, y, speed, angle]``. Only ``x`` and ``y``
    are changed by ``step``; ``speed`` and ``angle`` keep their reset values.

    Actions: 0 = up (y + 1), 1 = down (y - 1), 2 = left (x - 1),
    3 = right (x + 1). Any other value leaves the position unchanged.

    Rewards: +100 on getting within distance 1 of the goal, -100 on leaving
    the [-10, 10] x [-10, 10] square, -0.1 on every other step. Both the goal
    and the boundary violation end the episode.
    """

    def __init__(self):
        super().__init__()
        self.action_space = gym.spaces.Discrete(4)  # up, down, left, right
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32
        )  # UAV state: x, y, speed, angle
        self.reset()

    def reset(self):
        """Put the UAV back at the origin and return the initial observation.

        Returns:
            A float32 array ``[x, y, speed, angle]``. It is a copy, so callers
            may store it (e.g. in a replay buffer) without it being changed by
            later calls to ``step``.
        """
        # float32 matches the dtype declared in observation_space.
        self.state = np.array([0, 0, 1, 0], dtype=np.float32)  # [x, y, speed, angle]
        self.goal = np.array([10, 10], dtype=np.float32)  # Goal position (target)
        self.done = False
        return self.state.copy()

    def step(self, action):
        """Apply one move and return ``(observation, reward, done, info)``.

        Args:
            action: Integer in ``{0, 1, 2, 3}`` selecting up/down/left/right.

        Returns:
            A 4-tuple of a copy of the new state, the scalar reward, the
            episode-finished flag and an empty info dict.
        """
        # Update UAV position based on action.
        if action == 0:  # Move up
            self.state[1] += 1
        elif action == 1:  # Move down
            self.state[1] -= 1
        elif action == 2:  # Move left
            self.state[0] -= 1
        elif action == 3:  # Move right
            self.state[0] += 1

        # Reward is based on the Euclidean distance to the goal.
        dist_to_goal = np.linalg.norm(self.state[:2] - self.goal)
        if dist_to_goal < 1.0:
            reward = 100  # Reached goal
            self.done = True
        else:
            # Small per-step penalty to encourage short paths.
            reward = -0.1

        # Leaving the arena overrides the reward above and ends the episode.
        x, y = self.state[0], self.state[1]
        if x < -10 or x > 10 or y < -10 or y > 10:
            reward = -100  # Collision penalty
            self.done = True

        # Return a copy: self.state is mutated in place on the next step.
        return self.state.copy(), reward, self.done, {}

# Define the Q-Network (Deep Q-Learning Model)
class QNetwork(nn.Module):
    """Fully connected Q-value estimator: state vector in, one value per action out.

    Architecture: ``state_dim -> 128 -> 64 -> action_dim`` with ReLU after the
    first two linear layers and no activation on the output.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        wide, narrow = 128, 64
        # Attribute names fc1/fc2/fc3 are the keys used in saved state dicts.
        self.fc1 = nn.Linear(state_dim, wide)
        self.fc2 = nn.Linear(wide, narrow)
        self.fc3 = nn.Linear(narrow, action_dim)

    def forward(self, x):
        """Return the Q-values for input ``x`` (last dimension ``state_dim``)."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# Define the experience replay buffer
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry once it is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` distinct stored transitions.

        Raises ValueError (from ``random.sample``) when ``batch_size`` is
        larger than the number of stored transitions.
        """
        return random.sample(self.buffer, k=batch_size)

    def size(self):
        """Return how many transitions are currently stored."""
        return len(self.buffer)

# Training parameters
state_dim = 4  # [x, y, speed, angle]
action_dim = 4  # 4 possible actions
batch_size = 32
gamma = 0.99  # Discount factor
learning_rate = 0.001
epsilon_start = 1.0  # Initial exploration rate of the epsilon-greedy policy
epsilon_end = 0.1  # Lower bound for epsilon
epsilon_decay = 0.995  # Multiplicative decay applied to epsilon once per episode
num_episodes = 1000
buffer_capacity = 10000
target_update_frequency = 10  # Update the target network every 10 episodes
# Safety cap on episode length: without it, a mostly-greedy policy that
# oscillates inside the arena would keep the episode running indefinitely.
max_steps_per_episode = 500

# Initialize environment, model, and optimizer
env = UAVEnv()
model = QNetwork(state_dim, action_dim)
target_model = QNetwork(state_dim, action_dim)
target_model.load_state_dict(model.state_dict())  # Target model starts as a copy of the model
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# Initialize experience replay buffer
buffer = ReplayBuffer(buffer_capacity)

# Training loop
epsilon = epsilon_start
for episode in range(num_episodes):
    # Take a private float32 copy of the observation: the environment may hand
    # back its own state array, which it then mutates in place on every step,
    # and that would corrupt the transition stored in the replay buffer.
    state = np.array(env.reset(), dtype=np.float32)
    done = False
    episode_reward = 0
    steps = 0

    while not done and steps < max_steps_per_episode:
        # Choose action using epsilon-greedy policy
        if random.random() < epsilon:
            action = env.action_space.sample()  # Random action (exploration)
        else:
            # Action selection needs no gradients.
            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float32)
                action = torch.argmax(model(state_tensor)).item()  # Greedy action (exploitation)

        # Take action and observe the next state and reward
        next_state, reward, done, _ = env.step(action)
        next_state = np.array(next_state, dtype=np.float32)
        steps += 1

        # Store experience in replay buffer. When the step cap ends the episode,
        # `done` stays False, so the transition is still bootstrapped from.
        buffer.push(state, action, reward, next_state, done)

        # Sample a batch from the replay buffer
        if buffer.size() >= batch_size:
            batch = buffer.sample(batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            # Stack into one ndarray first; building a tensor directly from a
            # sequence of ndarrays is much slower.
            states = torch.tensor(np.stack(states), dtype=torch.float32)
            actions = torch.tensor(actions, dtype=torch.int64)
            rewards = torch.tensor(rewards, dtype=torch.float32)
            next_states = torch.tensor(np.stack(next_states), dtype=torch.float32)
            dones = torch.tensor(dones, dtype=torch.bool)

            # Q-learning target. The target network is only read here, so no
            # autograd graph is needed for it.
            with torch.no_grad():
                max_next_q_values = target_model(next_states).max(1)[0]
                # Terminal transitions contribute the reward only.
                targets = rewards + (gamma * max_next_q_values) * (~dones)

            # Get current Q-values of the actions that were actually taken
            q_values = model(states)
            current_q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

            # Compute loss
            loss = criterion(current_q_values, targets)

            # Backpropagate and update the model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Update state and accumulate reward
        state = next_state
        episode_reward += reward

    # Decay epsilon
    epsilon = max(epsilon_end, epsilon * epsilon_decay)

    # Periodically update the target model
    if episode % target_update_frequency == 0:
        target_model.load_state_dict(model.state_dict())

    # Log episode results
    print(f"Episode {episode+1}/{num_episodes}, Reward: {episode_reward}, Epsilon: {epsilon:.2f}")

# Save the trained model
torch.save(model.state_dict(), "uav_dqn_model.pth")


Episode 1/1000, Reward: -122.10000000000005, Epsilon: 0.99
Episode 2/1000, Reward: -120.50000000000003, Epsilon: 0.99
Episode 3/1000, Reward: -109.09999999999998, Epsilon: 0.99
Episode 4/1000, Reward: -103.7, Epsilon: 0.98
Episode 5/1000, Reward: -111.39999999999998, Epsilon: 0.98
Episode 6/1000, Reward: -121.20000000000003, Epsilon: 0.97
Episode 7/1000, Reward: -104.5, Epsilon: 0.97
Episode 8/1000, Reward: -119.30000000000001, Epsilon: 0.96
Episode 9/1000, Reward: -105.5, Epsilon: 0.96
Episode 10/1000, Reward: -108.1, Epsilon: 0.95
Episode 11/1000, Reward: -107.19999999999999, Epsilon: 0.95
Episode 12/1000, Reward: -110.29999999999998, Epsilon: 0.94
Episode 13/1000, Reward: -131.20000000000016, Epsilon: 0.94
Episode 14/1000, Reward: -113.19999999999997, Epsilon: 0.93
Episode 15/1000, Reward: -118.6, Epsilon: 0.93
Episode 16/1000, Reward: -124.80000000000008, Epsilon: 0.92
Episode 17/1000, Reward: -103.8, Epsilon: 0.92
Episode 18/1000, Reward: -105.8, Epsilon: 0.91
Episode 19/1000, Rew