In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt

# # 6x6 Energy Cost Matrix
energy_cost = np.array([
    [0.83120665, 0.85904648, 0.86740642, 0.80785085, 0.72221092, 0.6821429],
    [0.8677406, 0.95105498, 1., 0.95264647, 0.88683092, 0.86433875],
    [0.89544802, 0.91301993, 0.97772431, 0.89844197, 0.92461879, 0.85710395],
    [0.69357809, 0.63251866, 0.70333501, 0.95006792, 0.91024824, 0.81895411],
    [0.50223527, 0.60647121, 0.56991673, 0.77890135, 0.65087462, 0.72998675],
    [0., 0.04277713, 0.24311663, 0.49470234, 0.46368594, 0.46379897]
])
energy_cost = np.flip(energy_cost, 0)
# Convert energy cost to reward (lower energy = higher reward)
# map obstacle when energy is < 0.95
reward_matrix = 1 - energy_cost
for i in range(5):
    for j in range(5):
        if energy_cost[i, j] > 0.90:
            reward_matrix[i, j] = -10

reward_matrix[0, 0] = 10

print(reward_matrix)

[[ 10.           0.95722287   0.75688337   0.50529766   0.53631406
    0.53620103]
 [  0.49776473   0.39352879   0.43008327   0.22109865   0.34912538
    0.27001325]
 [  0.30642191   0.36748134   0.29666499 -10.         -10.
    0.18104589]
 [  0.10455198 -10.         -10.           0.10155803 -10.
    0.14289605]
 [  0.1322594  -10.         -10.         -10.           0.11316908
    0.13566125]
 [  0.16879335   0.14095352   0.13259358   0.19214915   0.27778908
    0.3178571 ]]


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Environment settings
GRID_SIZE = 6
STATE_DIM = GRID_SIZE * GRID_SIZE
ACTIONS = [(0, 1), (1, 0), (0, -1), (-1, 0), (1, 1), (-1, -1), (1, -1), (-1, 1)]  # Right, Down, Left, Up, Diagonals
NUM_ACTIONS = len(ACTIONS)

def get_next_state(state, action):
    x, y = state // GRID_SIZE, state % GRID_SIZE
    dx, dy = ACTIONS[action]
    nx, ny = x + dx, y + dy
    if 0 <= nx < GRID_SIZE and 0 <= ny < GRID_SIZE:
        return nx * GRID_SIZE + ny
    return state  # Stay in place if out of bounds

# Q-Network
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )
    
    def forward(self, x):
        return self.fc(x)

In [6]:

# Hyperparameters
learning_rate = 0.001
gamma = 0.99
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01
batch_size = 32
memory_size = 10000
episodes = 1000

dqn = DQN(STATE_DIM, NUM_ACTIONS).to(device)
optimizer = optim.Adam(dqn.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

# Experience Replay
memory = deque(maxlen=memory_size)

def select_action(state):
    if random.random() < epsilon:
        return random.randint(0, NUM_ACTIONS - 1)
    state_tensor = torch.FloatTensor(state).to(device)
    with torch.no_grad():
        return torch.argmax(dqn(state_tensor)).item()

def train():
    if len(memory) < batch_size:
        return
    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)
    
    states = torch.FloatTensor(states).to(device)
    actions = torch.LongTensor(actions).to(device)
    rewards = torch.FloatTensor(rewards).to(device)
    next_states = torch.FloatTensor(next_states).to(device)
    dones = torch.FloatTensor(dones).to(device)
    
    q_values = dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    next_q_values = dqn(next_states).max(1)[0]
    targets = rewards + gamma * next_q_values * (1 - dones)
    
    loss = loss_fn(q_values, targets.detach())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [7]:
# Training Loop
for episode in range(episodes):
    state = 35  # Start from bottom-right (5,5)
    done = False
    total_reward = 0
    
    while not done:
        action = select_action(np.eye(STATE_DIM)[state])
        next_state = get_next_state(state, action)
        reward = reward_matrix[state // GRID_SIZE, state % GRID_SIZE]
        done = (reward == 10)  # Goal reached
        
        memory.append((np.eye(STATE_DIM)[state], action, reward, np.eye(STATE_DIM)[next_state], done))
        state = next_state
        total_reward += reward
        
        train()
    
    global epsilon
    epsilon = max(epsilon * epsilon_decay, epsilon_min)
    
    if episode % 100 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.2f}")


  dones = torch.FloatTensor(dones).to(device)


Episode 0, Total Reward: -123.26278973999993, Epsilon: 0.99


KeyboardInterrupt: 