# Tennis Multi-Agent RL
This notebook solves the Unity ML-Agents Tennis environment, in which two agents compete against one another to win a tennis match.
## Exploring the environment
The code below loads the environment and reports the size of its action space and state space.

In [1]:
from unityagents import UnityEnvironment
import numpy as np
# Launch the Unity Tennis build.
# NOTE(review): the path points at a Windows executable relative to the notebook's
# working directory — adjust it when running on another platform.
env = UnityEnvironment(file_name="./Tennis_Windows_x86_64/Tennis.exe")
# get the default brain (the first brain name the environment reports)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# reset the environment in training mode and keep the info for the default brain
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space: one row of observations per agent
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 
Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [2]:
# (number of agents, per-agent state length), as printed by the cell above
states.shape

(2, 24)

# play random moves for each agent

In [2]:
from collections import deque
# Baseline: let every agent take random actions for 5 episodes and report the best score.
# NOTE(review): `horizon` is never read in the visible notebook, and `rewards` is assigned
# but the score update below reads env_info.rewards directly.
horizon = deque()
for i in range(1, 6):                                      # play game for 5 episodes
    env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
    states = env_info.vector_observations                  # get the current state (for each agent)
    scores = np.zeros(num_agents)                          # initialize the score (for each agent)

    while True:
        actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
        actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
        env_info = env.step(actions)[brain_name]           # send all actions to the environment
        next_states = env_info.vector_observations         # get next state (for each agent)
        rewards = env_info.rewards                         # get reward (for each agent)
        dones = env_info.local_done                        # see if episode finished
        scores += env_info.rewards                         # update the score (for each agent)
        states = next_states                               # roll over states to next time step
        if np.any(dones):                                  # exit loop if any agent's episode finished
            break
    print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))

Score (max over agents) from episode 1: 0.0
Score (max over agents) from episode 2: 0.0
Score (max over agents) from episode 3: 0.09000000171363354
Score (max over agents) from episode 4: 0.0
Score (max over agents) from episode 5: 0.0


In [12]:
# Shut down the exploration environment before a new one is opened for training.
# NOTE(review): the recorded output shows UnityEnvironmentException ("No Unity environment
# is loaded"), presumably because this cell was re-run after the environment was already closed.
env.close()

UnityEnvironmentException: No Unity environment is loaded.

# Define a memory/buffer class to store experiences and sample from them
The buffer also builds the joint state (the concatenation of all agents' observations) and `permute`s the sampled per-agent tensors so that they are laid out as
actors x batch_size x state_size (or action_size) instead of batch_size x actors x ...

In [1]:
from collections import namedtuple

import numpy as np
import torch
from numpy.random import default_rng

class Buffer():
    """Fixed-capacity ring buffer of multi-agent transitions.

    Each stored experience keeps the per-agent values (`obs`, `action`, `reward`,
    `next_obs`, `done`) together with flattened joint views (`state`, `all_actions`,
    `next_state`) obtained by concatenating the per-agent entries in order.

    Args:
        size: maximum number of experiences kept; once full, the oldest entry is
            overwritten first. Must be a positive integer (floats such as 1e6 are
            truncated with int()).

    Raises:
        ValueError: if `size` is not positive.
    """
    def __init__(self, size):
        super(Buffer,self).__init__()
        size = int(size)
        if size <= 0:
            # a zero capacity would otherwise only fail later, as a ZeroDivisionError in push()
            raise ValueError("Buffer size must be a positive integer, got {}".format(size))
        # the list starts empty and grows by appending until it holds max_size entries
        self.buffer = []
        self.next_idx = 0
        self.max_size = size
        self.rng = default_rng()
        self.experience = namedtuple("Experience", field_names=["obs","state", "action", "all_actions","reward", "next_obs", "next_state", "done"])
        # sampled batches are moved to the GPU when one is available
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.buffer)

    def push(self, obs, actions, rewards, next_obs, done):
        """push an experience (obs,actions,rewards,next_obs) to the memory buffer

        All arguments are per-agent sequences (one entry per agent). The joint
        state / joint action are built by concatenating the per-agent entries.
        """
        #combine all agent observations
        state = [item for values in obs for item in values]
        next_state = [item for values in next_obs for item in values]
        all_actions = [item for values in actions for item in values]
        data = self.experience(obs, state, actions, all_actions,rewards, next_obs, next_state, done)
        if self.next_idx >= len(self.buffer):
            self.buffer.append(data)
        else:
            # buffer is full: overwrite the oldest slot
            self.buffer[self.next_idx] = data
        self.next_idx = int((self.next_idx + 1) % self.max_size)

    def sample(self, batch_size):
        """return a random subset of size batch_size of the memory

        Indices are drawn with `Generator.choice` using its defaults, i.e. with
        replacement, so the same experience may appear more than once in a batch.

        Raises:
            AssertionError: if fewer than `batch_size` experiences are stored.
        """
        assert batch_size <= len(self.buffer), "not enough samples"
        idxs = list(self.rng.choice(len(self.buffer), batch_size))
        return self.encode_sample(idxs)

    def encode_sample(self,idxes):
        """Collate the experiences at `idxes` into float32 tensors on `self.device`.

        Returns:
            Tuple `(obs, states, actions, all_actions, rewards, next_obs,
            next_states, dones)`. The joint tensors (`states`, `all_actions`,
            `next_states`) are batch-major; the per-agent tensors are transposed to
            actors x batch x ..., with `rewards` and `dones` given a trailing
            dimension of size 1.
        """
        batch = [self.buffer[idx] for idx in idxes]

        def stack(field):
            # Build ONE ndarray first: torch.tensor() on a list of ndarrays converts
            # element by element and is very slow for large batches.
            values = np.asarray([getattr(experience, field) for experience in batch])
            return torch.as_tensor(values, dtype=torch.float32).to(self.device)

        obs = stack("obs")
        states = stack("state")
        actions = stack("action")
        all_actions = stack("all_actions")
        rewards = stack("reward")
        next_obs = stack("next_obs")
        next_states = stack("next_state")
        dones = stack("done")

        # change from batch x actors x ... to actors x batch x ...
        obs, actions, rewards, next_obs, dones = (
            x.permute(1, 0, 2)
            for x in (obs, actions, rewards.unsqueeze(-1), next_obs, dones.unsqueeze(-1))
        )
        return (obs, states, actions, all_actions,rewards, next_obs, next_states,dones)
    
# Smoke test: construct a small buffer. `my_buff` is not used elsewhere in the visible notebook.
my_buff = Buffer(10)


# define our actor class
We will be implementing an actor-critic agent (DDPG with soft target-network updates), so here we define our actor.

First we define a helper function `hidden_init` that returns the bounds used for the random (uniform) initialization of a layer's weights.

In [2]:
def hidden_init(layer):
    """Return the symmetric uniform-init bounds (-1/sqrt(fan_in), 1/sqrt(fan_in)) for a layer.

    Args:
        layer: a module with a 2-D `weight` laid out as (out_features, in_features),
            as `torch.nn.Linear` stores it.

    Returns:
        Tuple `(-lim, lim)` suitable for `weight.data.uniform_(*hidden_init(layer))`.
    """
    # nn.Linear keeps its weight as (out_features, in_features), so the number of
    # inputs (fan-in) is dimension 1; dimension 0 would be the fan-out.
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    """Deterministic policy network: maps one agent's observation to an action in [-1, 1].

    The network is a fully connected stack `state_size -> hidden_dims... -> action_size`
    with `activation_fc` after every layer except the last, which uses tanh.

    Args:
        state_size: length of a single observation vector.
        action_size: number of action components produced.
        hidden_dims: sizes of the hidden layers (at least one entry).
        activation_fc: activation applied after the input and hidden layers.

    Note:
        `OUNoise` and `hidden_init` are looked up when the actor is instantiated, so
        both must be defined before the first `Actor(...)` call.
    """
    def __init__(self, state_size, action_size, hidden_dims=(64,64), activation_fc = torch.tanh):
        super(Actor,self).__init__()
        self.ds = state_size
        self.da = action_size
        self.activation_fc = activation_fc

        self.input_layer = nn.Linear(self.ds, hidden_dims[0])
        self.input_layer.weight.data.uniform_(*hidden_init(self.input_layer))
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims)-1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i+1])
            hidden_layer.weight.data.uniform_(*hidden_init(hidden_layer))
            self.hidden_layers.append(hidden_layer)
        self.output_layer = nn.Linear(hidden_dims[-1],self.da)
        self.output_layer.weight.data.uniform_(*hidden_init(self.output_layer))

        # exploration noise process used by np_act
        self.Noise = OUNoise(self.da, scale=1.0)
        # move to GPU if available
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.to(self.device)

    def _format(self, state):
        """cast state to torch tensor and unroll

        Non-tensor input is converted to a float32 tensor on the actor's device;
        the result is always reshaped to (-1, state_size).
        """
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, dtype=torch.float32).to(self.device)
        x = x.view(-1,self.ds)
        return x

    def forward(self,states):
        """Return tanh-squashed actions of shape (batch, action_size) for `states`."""
        states = self._format(states)
        x = self.activation_fc(self.input_layer(states))
        for hidden_layer in self.hidden_layers:
            x = self.activation_fc(hidden_layer(x))
        x = torch.tanh(self.output_layer(x))
        return x

    def np_act(self,states, noise=0.0):
        """Return the actions for `states` as a NumPy array clipped to [-1, 1].

        Args:
            states: observation(s) accepted by `forward`.
            noise: scale applied to a sample of the actor's OU noise process before
                clipping. With the default 0.0 no noise is drawn and the result is
                the deterministic policy output.
        """
        with torch.no_grad():
            action = self.forward(states)
            if noise != 0.0:
                # OUNoise returns a CPU tensor; match the action's device/dtype before adding
                sample = self.Noise.noise().to(device=action.device, dtype=action.dtype)
                action = action + noise * sample
        np_action = action.cpu().numpy().clip(-1.0,1.0)
        return np_action


## Create Critic
Our critic is special: because this is a multi-agent system, it is centralized — it takes the observations and actions of all actors as input when it evaluates (criticizes) a policy.


In [37]:
class CentralActionValueFn(nn.Module):
    """Centralized Q-network scoring a joint (state, action) pair with a single value.

    The input layer takes the concatenation of all agents' states and all agents'
    actions (`num_agents * state_size + num_agents * action_size` features); the
    output layer has one unit and no activation.
    """
    def __init__(self, state_size, action_size, num_agents, hidden_dims=(64,64), activation_fc=torch.tanh):
        super().__init__()
        self.ds = state_size*num_agents
        self.da = action_size*num_agents
        self.num_agents = num_agents
        self.activation_fc = activation_fc

        # layers are created (and initialised) in input -> hidden -> output order
        self.input_layer = self._make_linear(self.ds+self.da, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for n_in, n_out in zip(hidden_dims[:-1], hidden_dims[1:]):
            self.hidden_layers.append(self._make_linear(n_in, n_out))
        self.output_layer = self._make_linear(hidden_dims[-1], 1)

        # run on the first GPU when one is available, otherwise on the CPU
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.to(self.device)

    @staticmethod
    def _make_linear(n_in, n_out):
        """Create a Linear layer whose weights are re-drawn uniformly within hidden_init's bounds."""
        layer = nn.Linear(n_in, n_out)
        layer.weight.data.uniform_(*hidden_init(layer))
        return layer

    def forward(self, states, actions):
        """Return Q-values of shape (batch, 1) for batch-major joint states and joint actions."""
        joint = torch.cat((states,actions), dim=1)
        hidden = self.activation_fc(self.input_layer(joint))
        for layer in self.hidden_layers:
            hidden = self.activation_fc(layer(hidden))
        return self.output_layer(hidden)

Define a class that adds exploration noise to actions.
Below is the Ornstein-Uhlenbeck noise class.

In [38]:
import numpy as np
import torch


# from https://github.com/songrotek/DDPG/blob/master/ou_noise.py
class OUNoise:
    """Ornstein-Uhlenbeck process that produces temporally correlated noise samples.

    The internal state starts at `mu` in every dimension; each call to `noise()`
    pulls it back toward `mu` with strength `theta`, adds Gaussian noise scaled by
    `sigma`, and returns the new state multiplied by `scale`.
    """

    def __init__(self, action_dimension, scale=0.1, mu=0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        # reset() creates the initial state
        self.reset()

    def reset(self):
        """Put the process back at its mean `mu` in every dimension."""
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        """Advance the process one step and return the scaled state as a float32 tensor."""
        previous = self.state
        pull_to_mean = self.theta * (self.mu - previous)
        gaussian_kick = self.sigma * np.random.randn(len(previous))
        self.state = previous + (pull_to_mean + gaussian_kick)
        scaled = self.state * self.scale
        return torch.tensor(scaled).float()

Create a DDPG class, which simply combines two `Actor` networks (current and target) and two critic networks (`CentralActionValueFn`, current and target).

In [39]:
class DDPG():
    """One DDPG agent: a local/target actor pair plus a local/target centralized critic.

    Args:
        state_size: length of this agent's observation vector.
        action_size: number of action components of this agent.
        total_agents: number of agents whose states/actions the critic receives.
        actor_activation_fc / critic_activation_fc: hidden activations of the networks.
        lr_actor / lr_critic: Adam learning rates.
        actor_hidden_dims / critic_hidden_dims: hidden layer sizes.
        gamma: discount factor, stored as `self.discount`.
        tau: soft-update mixing factor, stored as `self.network_update_factor`.

    The target networks start as exact copies of the local ones.
    """
    def __init__(self,
                 state_size, 
                 action_size, 
                 total_agents=1, 
                 actor_activation_fc=F.relu, 
                 lr_actor = 0.01,
                 critic_activation_fc=F.relu,
                 lr_critic = 0.01,
                 actor_hidden_dims = (64,64),
                 critic_hidden_dims = (64,64),
                 gamma = 0.99,
                 tau = 0.1):
        super(DDPG,self).__init__()
        self.ds = state_size
        self.da = action_size
        self.tot_agents = total_agents
        self.noise = OUNoise(action_dimension=self.da, scale=.1)
        # keep the hyper-parameters instead of silently dropping them
        self.discount = gamma
        self.network_update_factor = tau
        # create actor / critic networks

        self.critic = CentralActionValueFn(state_size=self.ds, 
                                           action_size=self.da, 
                                           num_agents=self.tot_agents, 
                                           hidden_dims=critic_hidden_dims, 
                                           activation_fc=critic_activation_fc)
        self.target_critic = CentralActionValueFn(state_size=self.ds, 
                                                  action_size=self.da, 
                                                  num_agents=self.tot_agents, 
                                                  hidden_dims=critic_hidden_dims, 
                                                  activation_fc=critic_activation_fc)

        self.actor = Actor(state_size=self.ds, 
                           action_size=self.da, 
                           hidden_dims=actor_hidden_dims, 
                           activation_fc = actor_activation_fc)
        self.target_actor = Actor(state_size=self.ds, 
                                  action_size=self.da, 
                                  hidden_dims=actor_hidden_dims, 
                                  activation_fc = actor_activation_fc)

        # make target & current networks identical
        self.soft_update(tau=1.0)
        # set up optimizers
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

    def _perturb(self, action, noise):
        """Add `noise`-scaled OU noise to `action` (if requested) and clip to [-1, 1]."""
        if noise != 0.0:
            # The OU sample is a CPU tensor; move it next to the action so the sum
            # also works when the networks live on the GPU. The process is only
            # advanced when noise is actually requested, so noise-free calls (e.g.
            # during learning) do not disturb the exploration trajectory.
            sample = self.noise.noise().to(device=action.device, dtype=action.dtype)
            action = action + noise * sample
        return action.clip(-1.0,1.0)

    def act(self, obs, noise=0.0):
        """Action of the local actor for `obs`, plus optional exploration noise, clipped to [-1, 1]."""
        return self._perturb(self.actor(obs), noise)

    def target_act(self, obs, noise=0.0):
        """Action of the target actor for `obs`, plus optional noise, clipped to [-1, 1]."""
        return self._perturb(self.target_actor(obs), noise)

    def soft_update(self,tau=0.01):
        """Soft update model parameters.

        Every target parameter becomes `tau * local + (1 - tau) * target`, for both
        the actor and the critic; `tau=1.0` copies the local networks outright.
        """
        network_pairs = ((self.target_actor, self.actor),
                         (self.target_critic, self.critic))
        for target_net, local_net in network_pairs:
            for target_param, local_param in zip(target_net.parameters(), local_net.parameters()):
                target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)


# Multi-Agent DDPG class
This class is the center of our learning. It creates one DDPG agent for each agent in the environment and performs a `learn_step` to train them.

In [30]:
import numpy as np 
from torch.optim import Adam
class MADDPG():
    """Multi-agent DDPG trainer: one `DDPG` agent per environment agent plus a shared replay buffer.

    Args:
        env: a Unity environment exposing `brain_names`, `brains`, `reset` and `step`.
        gamma: discount factor used for the critic targets.
        tau: mixing factor for the soft update of the target networks.
        memory_size: capacity of the replay buffer.
        batch_size: number of experiences sampled per learning step; learning starts
            once the buffer holds more than this many experiences.
    """
    def __init__(self,
                env,
                gamma=0.99,
                tau=0.1,
                memory_size = 1e6,
                batch_size = 1024):
        super(MADDPG,self).__init__()
        self.discount = gamma
        self.network_update_factor = tau
        self.env = env
        self.memory = Buffer(size=int(memory_size))
        self.batch_size = batch_size

        self.brain_name, self.num_agents, self.da, self.ds = self._get_env_params(env)
        self.agents = [DDPG(state_size = self.ds, 
                            action_size = self.da, 
                            total_agents=self.num_agents, 
                            actor_activation_fc=F.relu, 
                            lr_actor=0.01,
                            critic_activation_fc=F.relu,
                            lr_critic=0.01,
                            actor_hidden_dims = (64,64),
                            critic_hidden_dims = (64,64),
                            gamma = gamma,
                            tau = tau) for _ in range(self.num_agents)]

    def _get_env_params(self, env):
        """Reset `env` once and return (brain_name, num_agents, action_size, state_size)."""
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        # reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        # number of agents 
        num_agents = len(env_info.agents)
        # get the action size
        action_size = brain.vector_action_space_size
        # get the state space 
        states = env_info.vector_observations
        state_size = states.shape[1]
        return brain_name, num_agents, action_size, state_size

    def act_np(self, obs, noise=0.0):
        """Return the agents' actions for one observation each, as a NumPy array.

        The local (currently trained) actors are used — not the target actors — so
        the behaviour in the environment reflects what has been learned so far.
        """
        actions = []
        with torch.no_grad():
            for agent, observation in zip(self.agents, obs):
                action = agent.act(observation, noise)
                actions.append(action.detach().squeeze().cpu().data.numpy())
        return np.array(actions)

    def act(self,obs,noise=0.0):
        """Local actors' actions for per-agent batches `obs`, concatenated along dim 1."""
        actions = [agent.act(observation, noise)
                   for agent, observation in zip(self.agents, obs)]
        return torch.cat(actions, dim=1)

    def target_act(self,obs,noise=0.0):
        """Target actors' actions for per-agent batches `obs`, concatenated along dim 1."""
        target_actions = [agent.target_act(observation, noise)
                          for agent, observation in zip(self.agents, obs)]
        return torch.cat(target_actions, dim=1)

    def train(self, iterations, noise=0.0):
        """Run `iterations` training episodes, learning after every step once the buffer is warm.

        Args:
            iterations: number of episodes to play.
            noise: scale of the exploration noise added to the actions taken in the
                environment (0.0, the default, acts deterministically).
        """
        for it in range(iterations):
            print('\rIteration {}'.format(it))
            # restart every agent's exploration process at the start of an episode
            for agent in self.agents:
                agent.noise.reset()
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            states = env_info.vector_observations
            while True:
                actions = self.act_np(states, noise)
                env_info = self.env.step(actions)[self.brain_name]
                # keep `states` untouched until after the push, so the stored
                # transition really is (state before the step, state after the step)
                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done
                self.memory.push(states, actions, rewards, next_states, dones)
                if (len(self.memory)> self.batch_size):
                    self.learn_step()
                if np.any(dones):
                    break
                states = next_states

    def learn_step(self):
        """Sample one batch and update every agent's critic and actor, then the target networks."""
        obs, states, _, all_actions, rewards, next_obs, next_states, dones = self.memory.sample(self.batch_size)
        huber_loss = torch.nn.SmoothL1Loss()

        # The target actors do not change inside the loop below, so the joint target
        # action is computed once; no gradients are needed for the TD target.
        with torch.no_grad():
            future_actions = self.target_act(next_obs)

        for i, agent in enumerate(self.agents):
            # ---- critic: regress Q(s, a) onto r_i + gamma * Q_target(s', a') ----
            agent.critic_optimizer.zero_grad()
            with torch.no_grad():
                q_next = agent.target_critic(next_states, future_actions)
                y = rewards[i].view(-1, 1) + self.discount * q_next * (1 - dones[i].view(-1, 1))
            q = agent.critic(states, all_actions)
            critic_loss = huber_loss(q, y)
            critic_loss.backward()
            agent.critic_optimizer.step()

            # ---- actor: ascend the critic's value of the joint action ----
            agent.actor_optimizer.zero_grad()
            # only agent i's action carries gradients; the others are treated as constants
            new_actions = [ self.agents[num_agent].actor(ob) if num_agent == i \
                   else self.agents[num_agent].actor(ob).detach()
                   for num_agent, ob in enumerate(obs) ]
            new_actions = torch.cat(new_actions,dim=1)
            actor_loss = -agent.critic(states, new_actions).mean()
            actor_loss.backward()
            agent.actor_optimizer.step()

        # let the target networks slowly track the freshly updated local networks
        for agent in self.agents:
            agent.soft_update(tau=self.network_update_factor)

    def play(self, iterations):
        """Play `iterations` episodes without exploration noise and without learning."""
        for it in range(iterations):
            print('\rIteration {}'.format(it))
            env_info = self.env.reset(train_mode=False)[self.brain_name]
            states = env_info.vector_observations
            while True:
                actions = self.act_np(states)
                env_info = self.env.step(actions)[self.brain_name]
                dones = env_info.local_done
                if np.any(dones):
                    break
                states = env_info.vector_observations


In [8]:
from unityagents import UnityEnvironment
# Open a fresh Tennis environment for training (the exploration instance was closed earlier).
# NOTE(review): same Windows-specific relative path as in the first cell.
env = UnityEnvironment(file_name="./Tennis_Windows_x86_64/Tennis.exe")


INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [31]:
# Build the multi-agent trainer with its default hyper-parameters around the open environment.
dd = MADDPG(env)

In [34]:
# Train for 1000 episodes; one "Iteration N" line is printed per episode.
dd.train(1000)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
Iteration 54
Iteration 55
Iteration 56
Iteration 57
Iteration 58
Iteration 59
Iteration 60
Iteration 61
Iteration 62
Iteration 63
Iteration 64
Iteration 65
Iteration 66
Iteration 67
Iteration 68
Iteration 69
Iteration 70
Iteration 71
Iteration 72
Iteration 73
Iteration 74
Iteration 75
Iteration 76
Iteration

In [26]:
# NOTE(review): stale cell. No module-level `play` is defined anywhere in the visible
# notebook, so on a fresh kernel this line raises NameError; its execution count (26) also
# predates the cell that creates `dd`. MADDPG already defines a `play` method —
# presumably this cell can be deleted; confirm before removing.
dd.play = play

In [35]:
# Watch the agents play 10 episodes (the environment is reset with train_mode=False).
dd.play(10)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
