# In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class UAVEnv(gym.Env):
    """Toy UAV environment with a 4-component state and 4 discrete actions.

    The dynamics are placeholders: every step adds 1 to the first state
    component and yields a reward of -1, regardless of the chosen action.

    Args:
        max_steps: Number of steps after which ``step`` reports ``done=True``.
            The placeholder dynamics have no goal or collision condition, so
            without this cap an episode would never end.
    """

    def __init__(self, max_steps=200):
        super(UAVEnv, self).__init__()
        # Define observation and action spaces
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32
        )
        self.action_space = gym.spaces.Discrete(4)  # Example: 4 actions (up, down, left, right)
        self.max_steps = max_steps
        # Initialise here as well so step() works even if reset() was not called first.
        self.steps_taken = 0
        self.state = np.zeros(4, dtype=np.float32)

    def reset(self):
        """Reset the state and step counter; return the initial observation."""
        self.steps_taken = 0
        self.state = np.zeros(4, dtype=np.float32)  # Example initial state
        # Return a copy so callers cannot mutate the internal state in place.
        return self.state.copy()

    def step(self, action):
        """Advance the environment by one step.

        Args:
            action: Index of the chosen action (currently unused by the
                placeholder dynamics).

        Returns:
            Tuple ``(next_state, reward, done, info)``.
        """
        # Persist the transition; the new state must be stored or every call
        # would start from the same state again.
        self.state = self.state + np.array([1, 0, 0, 0], dtype=np.float32)  # Example: move UAV
        self.steps_taken += 1
        reward = -1  # Reward calculation logic (collision check, goal reached, etc.)
        done = self.steps_taken >= self.max_steps  # End of episode
        return self.state.copy(), reward, done, {}

class QNetwork(nn.Module):
    """Two-layer fully connected network mapping a state to per-action Q-values.

    Args:
        state_dim: Size of the input state vector.
        action_dim: Number of actions, i.e. size of the output vector.
        hidden_dim: Width of the single hidden layer.
    """

    def __init__(self, state_dim=4, action_dim=4, hidden_dim=128):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return Q-values for ``x``, whose last dimension must be ``state_dim``."""
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

# Hyperparameters
NUM_EPISODES = 1000
GAMMA = 0.99  # Discount factor applied to the bootstrapped next-state value
MAX_STEPS_PER_EPISODE = 200  # Safety cap so an episode ends even if the env never reports done

env = UAVEnv()
model = QNetwork()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training loop: one-step Q-learning update after every environment step
for episode in range(NUM_EPISODES):
    state = env.reset()
    done = False
    steps = 0
    while not done and steps < MAX_STEPS_PER_EPISODE:
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # Add batch dimension
        q_values = model(state_tensor)
        # NOTE: action selection is purely greedy; there is no exploration.
        action = torch.argmax(q_values).item()

        # Take action and get next state and reward
        next_state, reward, done, _ = env.step(action)
        next_state_tensor = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

        # Build the TD target without tracking gradients, so the loss only
        # back-propagates through the Q-value of the action actually taken.
        with torch.no_grad():
            next_q_max = model(next_state_tensor).max()
            # Drop the bootstrap term once the episode has ended.
            target = reward + GAMMA * next_q_max * (1.0 - float(done))
        current_q_value = q_values[0, action]  # Select Q-value for the action taken

        # Compute loss
        loss = criterion(current_q_value, target)

        # Backpropagate and update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Advance to the next state; without this every update would use
        # the state returned by reset().
        state = next_state
        steps += 1
