In [1]:
# # Define the Actor (Policy Network)
# class Actor(tf.keras.Model):
#     def __init__(self, num_actions):
#         super(Actor, self).__init__()
#         self.dense1 = tf.keras.layers.Dense(64, activation="relu")
#         self.dense2 = tf.keras.layers.Dense(64, activation="relu")
#         self.output_layer = tf.keras.layers.Dense(num_actions, activation="softmax")

#     def call(self, inputs):
#         x = self.dense1(inputs)
#         x = self.dense2(x)
#         return self.output_layer(x)

In [2]:
# # Define the Critic (Value Network)
# class Critic(tf.keras.Model):
#     def __init__(self):
#         super(Critic, self).__init__()
#         self.dense1 = tf.keras.layers.Dense(64, activation="relu")
#         self.dense2 = tf.keras.layers.Dense(64, activation="relu")
#         self.output_layer = tf.keras.layers.Dense(1)

#     def call(self, inputs):
#         x = self.dense1(inputs)
#         x = self.dense2(x)
#         return self.output_layer(x)

In [3]:
import tensorflow as tf
import numpy as np
import networkx as nx
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from enum import Enum
from collections import deque
import random


In [4]:
FREE: int = 0
OBSTACLE_SOFT: int = 1
OBSTACLE_HARD: int = 2
# AGENT: int = 3
LEADER: int = 3
FOLLOWER: int = 5
TARGET: int = 4


In [5]:
agents: np.ndarray | list[dict]
agents = [
    {"id": 0, "role": "leader"},
    {"id": 1, "role": "follower"}
]

In [6]:
class ACTION_SPACE(Enum):
    UP = (0, -1)
    DOWN = (0, 1)
    LEFT = (-1, 0)
    RIGHT = (1, 0)
    STAY = (0, 0)
    UP_LEFT = (-1, -1)
    UP_RIGHT = (1, -1)
    DOWN_LEFT = (-1, 1)
    DOWN_RIGHT = (1, 1)


In [7]:
# Diagnol
def new_pos(agent_position, action):
  x,y = agent_position[0]
  dx,dy = action.value

  new_pos = (x + dx, y+ dy)

  if not (0 <= new_pos[0] < 10 and 0 <= new_pos[1] < 10):
        return agent_position

  if env[new_pos] in [LEADER, FOLLOWER, OBSTACLE_SOFT, OBSTACLE_HARD]:
    return agent_position

  # elif env[new_pos] == 0 or env[new_pos] == 4:
  #   env[new_pos] == AGENT
  #   env[agent_position] = FREE

  return new_pos

In [8]:
# def calculate_reward(env, leader_pos, follower_pos, target):
#   reward = 0
#   if env[leader_pos] == target or env[follower_pos] == target:
#     reward = 50
#   elif env[leader_pos] == OBSTACLE_SOFT or env[follower_pos] == OBSTACLE_SOFT:
#     reward = -0
#   return reward


# Message structure :

# - Distance to the nearest obstacle (obs_dist): int or float
# - Relative position of the goal (xg): int, -1 if goal is not in partial observability.
# - Relative position of the goal (yg): int, -1 if goal is not in partial observability.
# - Whether the path is clear or blocked(path_blocked): bool or 0/1 int
# - Leader's action (action.name): str
# - Leader can observe the follower or not (follower_visibility): bool or 0/1 int
# - Leaders distance to follower (follower_dist): int or float


In [9]:
# - Suggested movement: e.g., (0,0) if leader suggests follower to stay or (0,-1) if suggested going left. Not the coordinates here!
# - Urgency level: int ranging between 1-5, the lower the more urgent.

In [10]:
from ast import IsNot
#window: 3X3
def get_leader_message(pos):
  # message = []
  follower_visibility = 0 #leader cannot observe it
  follower_dist, obs_dist, counter = -1,-1, 0
  obstacles_pos, distances = [],[]
  path_blocked = 0 #not blocked
  x,y = pos[0]
  xg,yg = (-1,-1)


  for dx in range(-2, 2):
      for dy in range(-2, 2):
            nx, ny = x + dx, y + dy
            if 0 <= nx < 10 and 0 <= ny < 10:
                counter += 1
                #relative position to the goal
                if env[nx, ny] == TARGET:
                  xg,yg = nx, ny
                  # action = ACTION_SPACE((xg-x, yg-y))

                #leader can see follower
                if env[nx,ny] == FOLLOWER:
                  follower_visibility = 1
                  follower_dist = np.sqrt((x - nx)**2 + (y - ny)**2)

                #nearest obstacle
                if env[nx, ny] in [OBSTACLE_SOFT, OBSTACLE_HARD]:
                    obstacles_pos.append(nx,ny)
                    dist = np.sqrt((x - nx)**2 + (y - ny)**2)
                    distances.append(dist)

  if len(distances) > 0:
    obs_dist = min(distances)
  if len(obstacles_pos) == counter:
      path_blocked = 1
  # if action == ACTION_SPACE.STAY.value:
  #   if len(obstacles_pos) == counter:
  #     path_blocked = 1
    # else:
    #   while action in obstacles_pos or env[action] == 3:
    #     random_actions = random.sample(list(ACTION_SPACE.value), 1)[0]
    #     if  0 <= x+random_actions.value[0] < 10 and  0 <= y+random_actions.value[1] < 10:
    #         action = random_actions


  return [xg, yg, obs_dist, follower_visibility, follower_dist, path_blocked]


In [11]:
# LSTM
from tensorflow.keras.layers import Reshape

def build_encoder_decoder():
    input_layer = Input(shape=(8,))
    reshaped = Reshape((1, 8))(input_layer)
    x = LSTM(64, return_sequences=True)(reshaped)
    x = LSTM(32)(x)
    output_layer = Dense(8, activation="linear")(x)
    return Model(input_layer, output_layer)

encoder_decoder = build_encoder_decoder()

In [12]:
# MLP MAPPO
def build_policy_network():
    input_layer = Input(shape=(8,))
    x = Dense(64, activation="relu")(input_layer)
    x = Dense(64, activation="relu")(x)
    output_layer = Dense(len(ACTION_SPACE), activation="softmax")(x)
    return Model(input_layer, output_layer)

leader_policy = build_policy_network()
follower_policy = build_policy_network()

In [13]:
class MAPPO:
    def __init__(self, leader_model, follower_model, encoded_model, lr=0.001):
        self.leader_model = leader_model
        self.follower_model = follower_model
        self.encoded_model = encoded_model
        self.optimizer = Adam(learning_rate=lr)

    def compute_loss(self, state_leader, decoded_msg, action_leader, action_follower, reward, leader_message, encoded_message, decoded_message):
        # Compute Advantage (A = R + γV(s') - V(s))
        value = self.leader_model(state_leader.reshape(1, -1))[0, 0]  # Predicted value
        advantage = reward - value  # TD error as Advantage Estimate
        print("loss")

        # Policy Gradient Loss (A2C)
        action_prob_leader = self.leader_model(state_leader.reshape(1, -1))
        action_prob_follower = self.follower_model(decoded_msg.reshape(1, -1))
        policy_loss = -tf.reduce_mean(advantage * tf.math.log(action_prob_leader + 1e-8))
        print('Policy Gradient Loss')
        # Contrastive Loss (CACL) for Communication Alignment
        contrastive_loss_value = contrastive_loss(tf.convert_to_tensor([encoded_message]), positive_pairs=[0])

        print('Contrastive Loss')
        # Message Reconstruction Loss (L_recon)
        print(f'leader_message={leader_message}')
        print(f'decoded_message= {decoded_message}')
        reconstruction_loss = tf.reduce_mean(tf.keras.losses.MSE(leader_message, decoded_message))
        print('Entropy')
        # Entropy Bonus for Exploration
        entropy_bonus = -tf.reduce_mean(action_prob_leader * tf.math.log(action_prob_leader + 1e-8))

        # Final loss function
        total_loss = policy_loss + 0.01 * entropy_bonus + 0.5 * contrastive_loss_value + 0.2 * reconstruction_loss

        return total_loss


    def apply_gradients(self, state_leader, decoded_msg, action_leader, action_follower, reward):
        with tf.GradientTape() as tape:
            loss = self.compute_loss(state_leader, decoded_msg, action_leader, action_follower, reward)
        grads = tape.gradient(loss, self.leader_model.trainable_variables + self.follower_model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.leader_model.trainable_variables + self.follower_model.trainable_variables))

In [14]:
# =======================
# Contrastive Learning for Communication
# =======================
def contrastive_loss(messages, positive_pairs, temperature=0.1):
    messages = tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(messages)
    sim_matrix = tf.matmul(messages, messages, transpose_b=True) / temperature
    labels = tf.one_hot(positive_pairs, depth=len(messages))
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)(labels, sim_matrix)
    return loss

In [20]:
def train_MAPPO(episodes, leader_model, follower_model, encoded_model, leader_pos, target_pos,follower_pos):

  optimizer = Adam(learning_rate=lr)

  for episode in range(episodes):
    # reset the environment
    env[:] = FREE
    env[leader_pos] = LEADER
    env[follower_pos] = FOLLOWER
    env[target_pos] = TARGET

    total_reward = 0
    leader_path = [leader_pos]
    follower_path = [follower_pos]
    episode_reset = False

    for step in range(2):

        # Leader moves
        leader_message = get_leader_message(leader_pos)
        leader_message.append(-1)
        leader_message.append(-1)
        print("*********")
        leader_action_probs = leader_model.predict(np.array(leader_message[:8]).reshape(1, -1))
        leader_action = list(ACTION_SPACE)[np.argmax(leader_action_probs)]

        print('kimia')
        leader_message[6], leader_message[7] = leader_action.value
        new_leader_pos = new_pos(leader_pos, leader_action)
        # new_leader_pos = move_agent(leader_pos, leader_action)
        print(leader_action.value)

        # encoded decoded
        print(len(leader_message))
        encoded_msg = encoded_model.predict(np.array(leader_message[:8]).reshape(1, -1))
        decoded_msg = encoded_msg.reshape(-1)
        print('leader')

        # Follower moves
        follower_action_probs = follower_model.predict(decoded_msg.reshape(1,-1))
        follower_action = list(ACTION_SPACE)[np.argmax(follower_action_probs)]
        new_follower_pos = new_pos(follower_pos, follower_action)
        print("follower")
        # compute distance
        distance = np.sqrt((new_leader_pos[0][0] - new_follower_pos[0][0])**2 + (new_leader_pos[0][1] - new_follower_pos[0][1])**2)

        x_l , y_l = new_leader_pos[0]
        x_f , y_f = new_follower_pos[0]

        if distance > 2 or distance < 1:
          print(f"Episode {episode+1}: Distance constraint violated (Distance: {distance:.2f}). Resetting...")
          break


        elif env[x_l, y_l] == OBSTACLE_HARD or env[x_f , y_f] == OBSTACLE_HARD:
          print(f"Episode {episode+1}: Hard obstacle constraint violated. Resetting...")
          break

        # update the path and position
        env[follower_pos] = FREE
        follower_pos = new_follower_pos
        env[follower_pos] = FOLLOWER
        follower_path.append(follower_pos)

        env[leader_pos] = FREE
        leader_pos = new_leader_pos
        env[leader_pos] = LEADER
        leader_path.append(leader_pos)


        # compute reward
        reward = 0
        if env[x_l, y_l] == TARGET or env[x_f , y_f] == TARGET:
          reward += 10
        elif env[x_l, y_l] == OBSTACLE_SOFT or env[x_f , y_f] == OBSTACLE_SOFT:
          reward -= 2

        total_reward += reward

        mappo_model = MAPPO(leader_model, follower_model, encoded_model)
        print("mappo")
        with tf.GradientTape() as tape:
                loss = mappo_model.compute_loss(
                    np.array(leader_message[:8]), decoded_msg,
                    leader_action, follower_action, reward,
                    leader_message, encoded_msg, decoded_msg
                )

        # Update Policy
        print("Update Policy")
        grads = tape.gradient(loss, leader_model.trainable_variables + follower_model.trainable_variables)
        optimizer.apply_gradients(zip(grads, leader_model.trainable_variables + follower_model.trainable_variables))


  if not episode_reset:
            print(f"\nEpisode {episode+1} finished with Reward: {total_reward}")
            print(f"Leader Path: {leader_path}")
            print(f"Follower Path: {follower_path}\n")

In [16]:
env = np.array([
    [0, 0, 5, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 3, 4, 0, 0, 0, 0, 1, 1],
    [0, 0, 0, 0, 0, 0, 2, 0, 0, 1],
    [0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
    [0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 2, 0],
    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 2, 0, 0, 0, 0, 0, 0, 1],
    [0, 1, 0, 0, 1, 0, 1, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 2, 0, 0]
  ])

leader_pos= np.argwhere(env == LEADER)
follower_pos= np.argwhere(env == FOLLOWER)
target_pos = np.argwhere(env == TARGET)

lr = 0.001

print(leader_pos)
print(follower_pos)
print(target_pos)

[[1 2]]
[[0 2]]
[[1 3]]


In [21]:
train_MAPPO(2, leader_policy, follower_policy, encoder_decoder,leader_pos, target_pos, follower_pos,)


*********
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
kimia
(-1, 1)
8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
leader
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
follower
mappo
loss
Policy Gradient Loss
Contrastive Loss
leader_message=[np.int64(1), np.int64(3), -1, 1, np.float64(1.4142135623730951), 0, -1, 1]
decoded_message= [ 0.0171809  -0.02600081  0.03360142  0.00800042  0.03094879 -0.05869333
 -0.02111227  0.00025879]
Entropy
Update Policy
*********


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
kimia
(-1, 1)
8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
leader
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
follower
mappo
loss
Policy Gradient Loss
Contrastive Loss
leader_message=[-1, -1, -1, 1, np.float64(1.4142135623730951), 0, -1, 1]
decoded_message= [ 0.00652463 -0.012621    0.00657111 -0.01526564 -0.00891721 -0.02557101
 -0.01264657  0.00010886]
Entropy
Update Policy
*********
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
kimia
(-1, 1)
8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
leader
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
follower
mappo
loss
Policy Gradient Loss
Contrastive Loss
leader_message=[np.int64(1), np.int64(3), -1, 1, np.float64(1.4142135623730951), 0, -1, 1]
decoded_message= [ 0.0171809  -0.02600081  0.03360142  0.00800042  0.03094879 -0.05869333
 -0.

In [None]:
x,y = leader_pos[0]
env[x,y]
