### Import

In [None]:
!pip install supersuit
!pip install stable-baselines3
!pip install pettingzoo==1.21.0
!pip install git+https://github.com/Markus28/tianshou@support_pz

In [None]:
from stable_baselines3.ppo import CnnPolicy
from stable_baselines3 import PPO
from pettingzoo.butterfly import pistonball_v6
import supersuit as ss
import pettingzoo

In [None]:
%matplotlib inline
import random
import copy
from collections import namedtuple, deque
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from supersuit import color_reduction_v0, frame_stack_v1, resize_v1
from torch.distributions.categorical import Categorical

from pettingzoo.butterfly import pistonball_v6
from tianshou.env.pettingzoo_env import PettingZooEnv

### Parameters

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# --- Replay memory ---------------------------------------------------------
BUFFER_SIZE = 1_000_000       # maximum number of stored experiences (deque maxlen)
BATCH_SIZE = 512              # experiences drawn per learning step

# --- Learning --------------------------------------------------------------
GAMMA = 0.95                  # discount applied to the bootstrapped Q target
TAU = 0.05                    # interpolation factor for the soft target update
LR_ACTOR = 1e-3               # Adam learning rate of the actor
LR_CRITIC = 1e-3              # Adam learning rate of the critic
WEIGHT_DECAY = 0              # weight decay passed to the critic optimizer
UPDATE_EVERY = 1              # learn once every UPDATE_EVERY calls to Agent.step
NO_UPDATES = 1                # not referenced elsewhere in the visible notebook

# --- Not referenced elsewhere in the visible notebook ----------------------
EPSILON = 1e-5
ALPHA = 0.6

# --- Training schedule -----------------------------------------------------
TOTAL_EPISODES = 10000        # default episode budget of maddpg()
NOISE_SCALE = 1               # initial multiplier of the exploration noise
NOISE_DECAY_LIMIT = 1200      # episode at which the noise multiplier reaches 0
BETA_EPISODES_LIMIT = 2000    # episode at which scale_beta() saturates at 1.0

# --- Epsilon-greedy settings (not referenced elsewhere in the visible notebook)
GREEDY_EPSILON = 0.01
GREEDY_EPSILON_MIN = 0.01
GREEDY_EPSILON_DECAY = 5e-4

### Environment Setup

In [None]:
# Settings consumed by the environment constructor and the preprocessing wrappers.
max_cycles = 125          # forwarded to pistonball_v6.parallel_env
frame_size = (64, 64)     # target size handed to resize_v1
stack_size = 4            # number of frames handed to frame_stack_v1
total_episodes = 5        # not referenced elsewhere in the visible notebook

""" ENV SETUP """
env = pistonball_v6.parallel_env(max_cycles=max_cycles, continuous=False)

In [None]:
# Wrap the raw environment: colour reduction -> resize -> frame stacking.
# Re-running this cell wraps `env` again, so rebuild `env` first when re-executing.
env = frame_stack_v1(
    resize_v1(color_reduction_v0(env), *frame_size),
    stack_size=stack_size,
)

# Sizes derived from the wrapped environment, read from the first listed agent.
num_agents = len(env.possible_agents)
observation_size = env.observation_space(env.possible_agents[0]).shape
num_actions = env.action_space(env.possible_agents[0]).n

### Model

In [None]:
def hidden_init(layer):
    """Return the symmetric uniform range ``(-1/sqrt(fan_in), 1/sqrt(fan_in))`` for a layer.

    Args:
        layer: module exposing a ``weight`` tensor whose first dimension indexes
            the output units (as ``nn.Linear`` does with its
            ``(out_features, in_features)`` weight).

    Returns:
        tuple: ``(-lim, lim)`` suitable for ``tensor.uniform_(*hidden_init(layer))``.
    """
    # fan-in is the number of inputs feeding ONE output unit, i.e. the element
    # count of a single row of the weight; dimension 0 is the number of outputs.
    fan_in = layer.weight.data[0].numel()
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    """Actor (policy) network: three fully connected layers mapping a joint state to an action.

    The first layer consumes ``state_size * num_agents`` features, i.e. the
    concatenated observations of all agents.
    """

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64, num_agents=2):
        """Build the layers and initialise their weights.

        Args:
            state_size (int): number of state features of a single agent.
            action_size (int): number of action components produced.
            seed (int): value passed to ``torch.manual_seed`` before the layers are created.
            fc1_units (int): width of the first hidden layer.
            fc2_units (int): width of the second hidden layer.
            num_agents (int): number of agents whose states are concatenated in
                the input. Defaults to 2, the previously hard-coded value.
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size * num_agents, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the layer weights: hidden layers from ``hidden_init``, output layer from +/-3e-3."""
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        # Small output weights keep the initial tanh outputs close to zero.
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Map a batch of joint states to actions squashed into (-1, 1) by tanh."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))


class Critic(nn.Module):
    """Critic (value) network: scores a joint state together with a joint action.

    The state passes through a first layer on its own; the joint action is
    concatenated to that layer's output before the second layer.
    """

    def __init__(self, state_size, action_size, seed, fcs1_units=64, fc2_units=64, num_agents=2):
        """Build the layers and initialise their weights.

        Args:
            state_size (int): number of state features of a single agent.
            action_size (int): number of action components of a single agent.
            seed (int): value passed to ``torch.manual_seed`` before the layers are created.
            fcs1_units (int): width of the state-only first layer.
            fc2_units (int): width of the second hidden layer.
            num_agents (int): number of agents whose states and actions are
                concatenated in the inputs. Defaults to 2, the previously
                hard-coded value.
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fcs1 = nn.Linear(state_size * num_agents, fcs1_units)
        self.fc2 = nn.Linear(fcs1_units + (action_size * num_agents), fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the layer weights: hidden layers from ``hidden_init``, output layer from +/-3e-3."""
        self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Return one Q-value per row for the given (joint state, joint action) batch."""
        xs = F.leaky_relu(self.fcs1(state))
        # Actions join the network after the first state layer, along the feature axis.
        x = torch.cat((xs, action), dim=1)
        x = F.leaky_relu(self.fc2(x))
        return self.fc3(x)

In [None]:
class Agent():
    """One DDPG learner (local/target actor and critic, replay memory, exploration noise).

    The agent receives joint states and joint actions (all agents concatenated)
    and replaces only its own slice of the joint action when it learns.
    Networks are built with the Actor/Critic default sizing.
    """

    def __init__(self, state_size, action_size,  random_seed):
        """Create the networks, optimizers, replay memory and noise process.

        Args:
            state_size (int): number of state features of a single agent.
            action_size (int): number of action components of a single agent.
            random_seed (int): seed forwarded to ``random.seed`` and to every component.
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so this attribute only records that seeding happened.
        self.seed = random.seed(random_seed)
        self.batch_size = BATCH_SIZE
        self.t_step = 0
        self.decay_step = 0

        # Actor Network (w/ Target Network); both are built from the same seed.
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network); both are built from the same seed.
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        self.memory = ReplayBuffer(BATCH_SIZE, BUFFER_SIZE, random_seed)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def step(self, state, action, reward, next_state, done,agent_number, beta):
        """Store one transition and, every UPDATE_EVERY calls, learn from a sampled batch.

        Learning only starts once the memory holds at least one full batch.
        """
        self.memory.add(state, action, reward, next_state, done)

        self.t_step += 1

        if self.t_step % UPDATE_EVERY == 0:
            if self.memory.is_filled():
                self.learn(agent_number, GAMMA, beta)

    def numpy_to_torch(self,data):
        """Convert a numpy array to a float tensor on the configured device."""
        return torch.from_numpy(data).float().to(device)

    def act(self, state, episode_num, add_noise=True ):
        """Return the action for ``state`` under the current policy, clipped to [-1, 1].

        Args:
            state (numpy.ndarray): state handed to the local actor.
            episode_num (int): current episode, used to schedule the exploration noise.
            add_noise (bool): when True, apply ``add_random_noise`` to the policy output.
        """
        state = torch.from_numpy(state).float().to(device)
        # Evaluate without tracking gradients, then restore training mode.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action = self.add_random_noise(action, episode_num)
        return np.clip(action, -1, 1)

    def noise_decay_schedule(self,episode_num):
        """Noise multiplier decaying linearly with the episode number, never below 0."""
        return max(0.0, NOISE_SCALE * (1 - (episode_num / NOISE_DECAY_LIMIT)))

    def add_random_noise(self,action,episode_num):
        """Return an exploratory action.

        During the first 500 episodes the policy output is discarded and a
        standard-normal action is returned instead; afterwards decayed process
        noise is added to ``action`` in place.
        """
        if episode_num < 500:
            return np.random.randn(1, self.action_size)
        action += self.noise_decay_schedule(episode_num) * self.noise.sample()
        return action

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def _splice_own_action(self, joint_actions, own_actions, agent_number):
        """Return ``joint_actions`` with this agent's columns replaced by ``own_actions``.

        Agent ``k`` owns columns ``[k * action_size, (k + 1) * action_size)`` of
        the joint action, so the slot works for any action width and agent count.
        """
        start = agent_number * self.action_size
        stop = start + self.action_size
        return torch.cat((joint_actions[:, :start], own_actions, joint_actions[:, stop:]), dim=1)

    def learn(self, agent_number, gamma,beta):
        """Run one critic update, one actor update and a soft target update.

        Args:
            agent_number (int): index of this agent inside the joint action vector.
            gamma (float): discount factor applied to the bootstrapped target.
            beta: accepted for interface compatibility; not used by this method.
        """
        states, actions, rewards, next_states, dones = self.memory.sample()

        # ---- critic update ------------------------------------------------
        # Targets are constants for the critic loss, so build them without autograd.
        with torch.no_grad():
            actions_next = self._splice_own_action(
                actions, self.actor_target(next_states), agent_number)
            Q_targets_next = self.critic_target(next_states, actions_next)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---- actor update -------------------------------------------------
        # Only this agent's slice comes from the policy; the rest stays as sampled.
        actions_pred = self._splice_own_action(
            actions, self.actor_local(states), agent_number)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---- target networks ----------------------------------------------
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Blend parameters in place: target = tau * local + (1 - tau) * target."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)


class OUNoise:
    """Mean-reverting noise process whose state is pulled towards ``mu`` and perturbed by Gaussian noise."""

    def __init__(self, size, seed, mu=0., theta=0.13, sigma=0.2):
        """Initialize parameters and noise process.

        Args:
            size (int): number of noise components.
            seed (int): seed applied to both the stdlib and the NumPy global generators.
            mu (float): value the state reverts to.
            theta (float): strength of the pull towards ``mu`` per sample.
            sigma (float): scale of the Gaussian perturbation per sample.
        """
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # random.seed() returns None; the attribute is kept for compatibility.
        self.seed = random.seed(seed)
        # The samples below come from NumPy's global generator, so that is the
        # generator that has to be seeded for the noise to be reproducible.
        np.random.seed(seed)
        self.size = size
        self.reset()

    def reset(self):
        """Reset the internal state to a fresh copy of ``mu``."""
        self.state = self.mu.copy()

    def sample(self):
        """Advance the process by one step and return the new state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.size)
        self.state = x + dx
        return self.state

    
class ReplayBuffer:
    """Bounded FIFO store of experience tuples with uniform random sampling."""

    def __init__(self, batch_size, buffer_size, seed):
        """Create an empty buffer.

        Args:
            batch_size (int): number of experiences returned by ``sample``.
            buffer_size (int): capacity; the oldest entries are dropped beyond it.
            seed (int): value passed to ``random.seed``.
        """
        self.seed = random.seed(seed)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` experiences at random and return them as stacked float tensors.

        Returns:
            tuple: ``(states, actions, rewards, next_states, dones)``, each built
            with ``np.vstack`` over the drawn experiences and moved to ``device``.
        """
        drawn = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        def stacked(field, cast=None):
            # Stack one field of every drawn experience row-wise, optionally cast it,
            # and convert the result to a float tensor on the configured device.
            rows = np.vstack([getattr(e, field) for e in drawn])
            if cast is not None:
                rows = rows.astype(cast)
            return torch.from_numpy(rows).float().to(device)

        states = stacked("state")
        actions = stacked("action")
        rewards = stacked("reward")
        next_states = stacked("next_state")
        # Done flags go through uint8 before becoming floats.
        dones = stacked("done", np.uint8)

        return (states, actions, rewards, next_states, dones)

    def is_filled(self):
        """Return True once at least ``batch_size`` experiences are stored."""
        return not (len(self.memory) < self.batch_size)


### Multi Agent

In [None]:
class MADDPGAgent():
    """Container that drives several ``Agent`` learners on joint states and joint actions."""

    def __init__(self,state_size,action_size,num_agents,random_seed):
        """Create ``num_agents`` learners, all built with the same ``random_seed``.

        Args:
            state_size (int): number of state features of a single agent.
            action_size (int): number of action components of a single agent.
            num_agents (int): number of learners to create.
            random_seed (int): seed forwarded to every learner.
        """
        self.agents = [Agent(state_size, action_size, random_seed) for _ in range(num_agents)]
        self.num_agents = num_agents
        self.total_state_size = state_size * num_agents
        self.total_action_size = action_size * num_agents
        self.t_step = 0

    def reset(self):
        """Reset every learner."""
        for agent in self.agents:
            agent.reset()

    def act(self,states,episode_num):
        """Return the joint action, shaped ``(1, total_action_size)``.

        Every learner is given the same joint state, reshaped to
        ``(1, total_state_size)``, and contributes its own action.
        """
        all_states = np.reshape(states, (1, self.total_state_size))
        actions = [member.act(all_states, episode_num) for member in self.agents]
        return np.reshape(actions, (1, self.total_action_size))

    def step(self,states,actions,rewards,next_states,dones,beta):
        """Forward one joint transition to every learner.

        Each learner receives the joint state/action/next state together with
        its own entry of ``rewards`` and ``dones`` and its index.
        """
        states = np.reshape(states, (1, self.total_state_size))
        actions = np.reshape(actions, (1, self.total_action_size))
        next_states = np.reshape(next_states, (1, self.total_state_size))
        for agent_num, member in enumerate(self.agents):
            member.step(states, actions, rewards[agent_num], next_states, dones[agent_num], agent_num, beta)

    def save_checkpt(self):
        """Save the local actor and critic weights of every learner.

        Files are written to the working directory as
        ``checkpoint_actor<k>_ddpgv.pth`` / ``checkpoint_critic<k>_ddpgv.pth``
        with ``k`` counting from 1.
        """
        for position, member in enumerate(self.agents, start=1):
            torch.save(member.actor_local.state_dict(), 'checkpoint_actor{}_ddpgv.pth'.format(position))
            torch.save(member.critic_local.state_dict(), 'checkpoint_critic{}_ddpgv.pth'.format(position))

In [None]:
# Per-agent feature and action widths handed to the MADDPG agent below.
# NOTE(review): these are literals, not derived from `observation_size` / `num_actions`
# computed for the wrapped environment — confirm they describe the same environment.
state_size, action_size = 180, 3
num_agents = env.unwrapped.num_agents

20

In [None]:
# Multi-agent controller used by the training loop; every learner is seeded with 0.
agent = MADDPGAgent(
    state_size=state_size,
    action_size=action_size,
    num_agents=num_agents,
    random_seed=0,
)

def scale_beta(episode_num):
    """Return ``episode_num / BETA_EPISODES_LIMIT``, capped at 1.0."""
    progress = episode_num / BETA_EPISODES_LIMIT
    return min(1.0, progress)
    
def noise_decay_schedule(episode_num):
    """Return the noise multiplier for an episode: linear decay in the episode number, floored at 0.0."""
    remaining_fraction = 1 - (episode_num / NOISE_DECAY_LIMIT)
    return max(0.0, NOISE_SCALE * remaining_fraction)

In [None]:
def maddpg(n_episodes=TOTAL_EPISODES, max_t=100, print_every=200):
    """Train the global `agent` on the global `env` and return the score history.

    Args:
        n_episodes (int): maximum number of episodes to run.
        max_t (int): not referenced in the body; episodes run until any `done` flag is set.
        print_every (int): a persistent progress line is printed every `print_every` episodes.

    Returns:
        tuple: ``(all_scores, rolling_avg_100)`` — per episode, the best agent score
        and the mean of the last (up to) 100 such scores.

    NOTE(review): `brain_name` is not defined in this function or in the visible
    notebook, and `env.reset(train_mode=True)[brain_name]` /
    `env_info.vector_observations` look like a different environment API than the
    PettingZoo parallel env created earlier — confirm before running.
    """
    scores_deque = deque(maxlen=100)
    all_scores = []
    rolling_avg_100 = []
    # The two totals below are computed but not used later in this function.
    total_state_size = state_size*num_agents
    total_action_size = action_size * num_agents

    for i_episode in range(1, n_episodes+1):
        beta = scale_beta(i_episode)
        env_info = env.reset(train_mode=True)[brain_name] # reset the environment
        states = env_info.vector_observations
        agent.reset()
        scores = np.zeros(num_agents)

        # Step until at least one agent reports done.
        while True:
            actions = agent.act(states,i_episode)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states,actions,rewards,next_states,dones,beta)
            scores += rewards
            states = next_states
            if np.any(dones):
                break
        # The episode score is the best individual agent score.
        max_score = np.max(scores)
        scores_deque.append(max_score)
        all_scores.append(max_score)
        rolling_avg_100.append(np.mean(scores_deque))
        # '\r' with end="" rewrites the same console line on every episode.
        print('\rEpisode {} Episode Score:{:.2f} \tAverage Score: {:.3f}'.format(i_episode, max_score, np.mean(scores_deque)), end="")
        if i_episode % print_every == 0:
            print('\rEpisode {}\t  Episode Score:{:.2f} \t Average Score: {:.3f}'.format(i_episode, max_score,np.mean(scores_deque)))

        # Stop once the rolling mean reaches 0.5. The window may hold fewer than
        # 100 episodes at that point, in which case `i_episode-100` is negative.
        if np.mean(scores_deque)>=0.5:
            agent.save_checkpt()
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode-100, np.mean(scores_deque)))
            break

    env.close()
    return all_scores,rolling_avg_100
# Training starts as soon as this cell runs; with the defaults it may take up to TOTAL_EPISODES episodes.
scores,rolling_mean = maddpg()

### Plot Result

In [None]:
def plot_result(scores,mean_scores,title_name):
    """Draw ``mean_scores`` against its index as a line chart and show the figure.

    Args:
        scores: accepted but not used by the plot.
        mean_scores: sequence of values plotted on the y axis.
        title_name (str): figure title.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    episode_index = np.arange(len(mean_scores))
    ax.plot(episode_index, mean_scores)
    ax.set_title(title_name)
    ax.set_ylabel('Rewards')
    ax.set_xlabel('Episode #')
    plt.show()

In [None]:
# Plot the per-episode scores returned by maddpg(); `rewards_episode` is never
# assigned in this notebook, so referencing it raised NameError on a fresh kernel.
plot_result(list(range(100)), scores, "Rewards Per Episode")

In [None]:
# NOTE(review): `policy_loss` is not assigned anywhere in the visible notebook
# (Agent.learn computes the actor loss but does not record it), so this cell is
# expected to raise NameError on a fresh kernel — confirm where it should come from.
plot_result(list(range(100)),policy_loss,"Policy Loss Per Episode")

In [None]:
# NOTE(review): `value_loss` is not assigned anywhere in the visible notebook
# (Agent.learn computes the critic loss but does not record it), so this cell is
# expected to raise NameError on a fresh kernel — confirm where it should come from.
plot_result(list(range(100)),value_loss,"Value Loss per Episode")

### *Reference*

- https://github.com/Zorrorulz/MultiAgentDDPG-Tennis
- https://github.com/starry-sky6688/MADDPG
- https://github.com/shariqiqbal2810/MAAC
- https://github.com/openai/maddpg