In [1]:
import matplotlib

matplotlib.use("TkAgg")
import gym
import multiagent
import multiagent.scenarios
import multiagent.scenarios.simple_tag as simple_tag
import multiagent.scenarios.simple_tag as simple_spread
import multiagent.scenarios.simple_tag as simple_adversary
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
from gym import wrappers, logger
import numpy as np
import copy
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios

import torch
import torchvision.datasets as datasets
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from torch import nn
from torch import optim
import copy

from random import sample
from collections import deque
# Compute device shared by the whole notebook: CUDA when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
def phi(obs):
    """Turn an observation into a double-precision tensor living on ``device``.

    ``obs`` is passed straight to ``torch.Tensor``, so it may be anything that
    constructor accepts. The result is then moved to the module-level
    ``device`` and cast to ``torch.double``.
    """
    as_tensor = torch.Tensor(obs)
    return as_tensor.to(device=device, dtype=torch.double)

def UONoise(theta=0.15, sigma=0.3, x0=0):
    """Endless generator of temporally correlated exploration noise.

    Every value is obtained from the previous one with the recurrence
    ``state += -theta * state + sigma * N(0, 1)``: a pull back towards zero
    plus a Gaussian kick (an Ornstein-Uhlenbeck-style process with a unit
    time step).

    Args:
        theta: strength of the pull towards zero applied at every step.
        sigma: scale of the standard-normal perturbation added at every step.
        x0: value yielded first, before any perturbation is applied.

    Yields:
        The current noise value. The first item is ``x0`` itself; each later
        item consumes one draw from ``np.random.randn()``.
    """
    state = x0
    while True:
        yield state
        # Same arithmetic order as a single in-place update so the default
        # sequence is unchanged for a given NumPy seed.
        state = state + (-theta * state + sigma * np.random.randn())

class NN_Q(nn.Module):
    """Critic network: scores an observation together with an action vector.

    Pipeline: BatchNorm on the observation -> Linear + ReLU -> BatchNorm ->
    concatenate the action vector -> Linear + ReLU -> BatchNorm -> Linear to a
    single output value.

    Args:
        obs_dim: width of the observation fed to ``forward`` (default 14).
        action_dim: width of the action vector concatenated after the first
            hidden layer (default 6, i.e. the second layer takes 100 + 6 = 106
            inputs as before).
        hidden_dim: width of both hidden layers (default 100).
    """

    def __init__(self, obs_dim=14, action_dim=6, hidden_dim=100):
        super(NN_Q, self).__init__()
        # Linear layers are created in the same order as before so that the
        # random initialisation under a fixed seed is unchanged.
        self.f1 = nn.Linear(obs_dim, hidden_dim)
        self.f2 = nn.Linear(hidden_dim + action_dim, hidden_dim)
        self.f3 = nn.Linear(hidden_dim, 1)

        self.bn0 = nn.BatchNorm1d(obs_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)

    def forward(self, x, actions):
        """Return one value per row of ``x``.

        Args:
            x: observations, shape ``(batch, obs_dim)``.
            actions: action vectors, shape ``(batch, action_dim)``; they are
                concatenated to the hidden features along ``dim=1``.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        x = self.bn0(x)
        x = torch.relu(self.f1(x))
        x = self.bn1(x)
        # Actions enter the network only after the first hidden layer.
        x = torch.cat((x, actions), dim=1)
        x = torch.relu(self.f2(x))
        x = self.bn2(x)
        x = self.f3(x)
        return x

class NN_mu(nn.Module):
    """Actor network: maps an observation to an action in ``[-1, 1]``.

    Pipeline: BatchNorm on the observation -> Linear + ReLU -> BatchNorm ->
    Linear + tanh.

    Args:
        obs_dim: width of the observation fed to ``forward`` (default 14).
        action_dim: number of action components produced (default 2).
        hidden_dim: width of the hidden layer (default 100).
    """

    def __init__(self, obs_dim=14, action_dim=2, hidden_dim=100):
        super(NN_mu, self).__init__()
        # Same creation order as before, so seeded initialisation is unchanged.
        self.f1 = nn.Linear(obs_dim, hidden_dim)
        self.f2 = nn.Linear(hidden_dim, action_dim)

        self.bn0 = nn.BatchNorm1d(obs_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)

    def forward(self, x):
        """Return the action for each row of ``x``.

        Args:
            x: observations, shape ``(batch, obs_dim)``.

        Returns:
            Tensor of shape ``(batch, action_dim)``; every entry lies in
            ``[-1, 1]`` because of the final ``tanh``.
        """
        x = self.bn0(x)
        x = torch.relu(self.f1(x))
        x = self.bn1(x)
        x = torch.tanh(self.f2(x))
        return x

class Memory():
    """Bounded replay buffer holding ``[last_obs, a, r, obs, done]`` records.

    Records live in a ``deque`` with ``maxlen=N``, so once the buffer is full
    the oldest record is discarded whenever a new one is stored.
    """

    def __init__(self, N=1000000):
        self.data = deque(maxlen=N)

    def sample(self, n):
        """Draw ``n`` distinct records at random and batch them field by field.

        Returns:
            ``(lastobs, action, r, obs, done)`` where

            * ``lastobs`` / ``obs`` stack the stored observations along
              ``dim=1`` (so index 0 of the result keeps indexing whatever
              index 0 of a stored observation indexed);
            * ``action`` stacks along ``dim=0`` and is moved to ``device`` as
              ``torch.double``;
            * ``r`` stacks the stored rewards along ``dim=0``;
            * ``done`` is a ``torch.Tensor`` column of shape ``(n, 1)`` on
              ``device``.
        """
        batch = sample(self.data, n)

        def column(idx):
            # One field of every sampled record, in sampling order.
            return [record[idx] for record in batch]

        lastobs = torch.stack(column(0), dim=1)
        action = torch.stack(column(1), dim=0).to(device, torch.double)
        r = torch.stack(column(2), dim=0)
        obs = torch.stack(column(3), dim=1)
        done = torch.Tensor(column(4)).unsqueeze(1).to(device)

        return lastobs, action, r, obs, done

    def store(self, last_obs, a, r, obs, done):
        """Append one transition record to the buffer."""
        record = [last_obs, a, r, obs, done]
        self.data.append(record)

class DDPG_agent():
    """One actor-critic learner with target copies and exploration noise.

    Holds an online critic ``Q`` and actor ``mu``, their target copies
    (initialised with the online weights), one Adam optimiser per online
    network, an MSE loss for the critic, and two independent noise generators,
    one per action component.
    """

    def __init__(self):

        # Critic: online network, target copy, loss and optimiser.
        self.Q = NN_Q().to(device, torch.double)
        self.Q_target = NN_Q().to(device, torch.double)
        self.Q_loss = nn.MSELoss()
        self.Q_target.load_state_dict(self.Q.state_dict())
        self.opt_Q = torch.optim.Adam(self.Q.parameters(), lr=0.01)

        # Actor: online network, target copy and optimiser.
        self.mu = NN_mu().to(device, torch.double)
        self.mu_target = NN_mu().to(device, torch.double)
        self.mu_target.load_state_dict(self.mu.state_dict())
        self.opt_mu = torch.optim.Adam(self.mu.parameters(), lr=0.01)
        # The online actor is kept in eval mode by default -- presumably
        # because act() feeds it a batch of one observation.
        self.mu.eval()

        # One noise stream per action component.
        self.explo_1 = UONoise()
        self.explo_2 = UONoise()

    def act(self, obs, explo):
        """Return a noisy action for a single observation.

        Args:
            obs: one observation tensor; a leading batch axis is added before
                it is passed to the actor.
            explo: multiplier applied to the exploration noise (0 disables it).

        Returns:
            Flat NumPy array: actor output plus ``explo`` times the next value
            of each noise generator.
        """
        with torch.no_grad():
            policy_out = self.mu.forward(obs.unsqueeze(0)).to(torch.device("cpu"))
            noise = torch.Tensor([next(self.explo_1), next(self.explo_2)])
            action = policy_out + noise * explo

        return np.array(action).reshape(-1)
    
class Multi_agent():
    """Set of DDPG learners trained from one shared replay memory.

    Each agent owns its actor and critic. A critic is fed its own agent's
    observation together with the concatenated actions of all agents.

    Args:
        num_agents: number of ``DDPG_agent`` instances to create. Each one
            builds its critic with default sizes, so the joint action width
            (agents x per-agent action size) must match what that critic
            expects.
        tau: interpolation factor of the soft target update.
        gamma: discount factor used in the TD target.
        batch: number of transitions sampled per gradient pass.
        update_freq: ``update`` is triggered on calls to ``act`` where
            ``episodes % update_freq == 0``.
        max_explo: number of episodes over which the exploration multiplier
            decays linearly from 1 to 0.
        epochs: gradient passes per call to ``update``.
        start_train: ``update`` is only triggered once ``episodes`` exceeds
            this value.
        writer: optional object with an ``add_scalar(tag, value, step)``
            method used for logging. When omitted, a module-level variable
            named ``writer`` is used if one exists; otherwise nothing is
            logged.
    """

    def __init__(self,num_agents=3,tau=0.1,gamma=0.95,batch=1024,update_freq=1,max_explo=250,epochs=1,start_train=150,writer=None):

        # Honour the requested team size instead of a fixed count.
        self.agents = [DDPG_agent() for _ in range(num_agents)]

        self.episodes = 0          # incremented by the caller after each episode
        self.update_freq = update_freq
        self.max_explo = max_explo
        self.start_train = start_train
        self.i = 0                 # number of gradient passes so far (logging step)

        self.epochs = epochs

        self.gamma = gamma
        self.tau = tau

        self.memory = Memory()
        self.batch = batch

        self.writer = writer

    def _log_scalar(self, tag, value):
        """Send one scalar to the configured writer, if any is available."""
        sink = self.writer if self.writer is not None else globals().get("writer")
        if sink is not None:
            sink.add_scalar(tag, value, self.i)

    def update(self):
        """Run ``epochs`` critic/actor gradient passes, then soft-update targets."""
        for agent in self.agents:
            agent.mu.train()

        for _ in range(self.epochs):
            self.i += 1
            lastobs, actions, r, obs, done = self.memory.sample(self.batch)

            # 1 where the episode continues, 0 on terminal transitions, so the
            # bootstrap term is dropped when there is no next state to value.
            not_done = 1.0 - done.to(r.dtype)
            # Width of one agent's slice inside the concatenated action vector.
            act_dim = actions.shape[1] // len(self.agents)

            with torch.no_grad():
                # Next joint action proposed by the target actors.
                mu = torch.cat([agent.mu_target(obs[i]) for i, agent in enumerate(self.agents)], dim=1)

            for i, agent in enumerate(self.agents):

                #1- Critic update (Q)
                agent.opt_Q.zero_grad()
                agent.opt_mu.zero_grad()
                with torch.no_grad():
                    y = r + self.gamma * not_done * agent.Q_target(obs[i], mu)
                Qloss = agent.Q_loss(agent.Q(lastobs[i], actions), y)
                self._log_scalar('QLoss_'+str(i), Qloss.item())
                Qloss.backward()
                agent.opt_Q.step()

                #2- Actor update (mu)
                agent.opt_Q.zero_grad()
                agent.opt_mu.zero_grad()
                # Stored joint action with this agent's slice replaced by the
                # differentiable output of its current actor.
                lo, hi = i * act_dim, (i + 1) * act_dim
                joint = torch.cat([actions[:, :lo], agent.mu(lastobs[i]), actions[:, hi:]], dim=1)
                mu_loss = -agent.Q(lastobs[i], joint).mean()
                # Logged with the sign flipped back, i.e. the mean critic value.
                self._log_scalar('mu_loss_'+str(i), -mu_loss.item())
                mu_loss.backward()
                agent.opt_mu.step()

        #3- smooth update of Q and mu (parameters only)
        with torch.no_grad():
            for agent in self.agents:
                for p_target, p in zip(agent.Q_target.parameters(), agent.Q.parameters()):
                    p_target.copy_(self.tau * p + (1 - self.tau) * p_target)
                for p_target, p in zip(agent.mu_target.parameters(), agent.mu.parameters()):
                    p_target.copy_(self.tau * p + (1 - self.tau) * p_target)

        for agent in self.agents:
            agent.mu.eval()

    def act(self,obs,done):
        """Pick one action per agent and, when due, run a training update.

        Args:
            obs: per-agent observations; iterated in parallel with the agents.
            done: not read by this method; kept for interface compatibility.

        Returns:
            List with one action array per agent.
        """
        # Linear decay of the noise scale; no exploration if max_explo <= 0.
        if self.max_explo > 0:
            explo = 1 - min(self.episodes, self.max_explo) / self.max_explo
        else:
            explo = 0.0
        actions = [agent.act(o, explo) for agent, o in zip(self.agents, obs)]

        due = self.episodes % self.update_freq == 0 and self.episodes > self.start_train
        # Sampling more transitions than are stored would raise, so wait
        # until the memory can supply a full batch.
        if due and len(self.memory.data) >= self.batch:
            self.update()

        return actions
        

In [3]:
def make_env(scenario_name, benchmark=False):
    '''
    Build a MultiAgentEnv for the given scenario and return it together with
    the scenario and world objects it was built from.

    The environment can be used much like a gym environment by calling
    env.reset() and env.step(). Use env.render() to view it on screen.

    Input:
        scenario_name   :   name of the scenario script to load from
                            ./scenarios/ (without the .py extension)
        benchmark       :   whether you want to produce benchmarking data
                            (usually only done during evaluation); when True
                            the scenario's benchmark_data callback is passed
                            to the environment as an extra argument

    Returns:
        (env, scenario, world)

    Some useful env properties (see environment.py):
        .observation_space  :   Returns the observation space for each agent
        .action_space       :   Returns the action space for each agent
        .n                  :   Returns the number of Agents
    '''
    # Instantiate the scenario defined in "<scenario_name>.py".
    scenario = scenarios.load(scenario_name + ".py").Scenario()

    # Build the world, then force dim_c to 0 before the environment is created
    # (presumably this removes the communication channel -- TODO confirm).
    world = scenario.make_world()
    world.dim_c = 0

    # Callbacks handed to the environment; benchmark data only on request.
    callbacks = [scenario.reset_world, scenario.reward, scenario.observation]
    if benchmark:
        callbacks.append(scenario.benchmark_data)
    env = MultiAgentEnv(world, *callbacks)

    # Both discrete-action flags are switched off on the built environment.
    env.discrete_action_space = False
    env.discrete_action_input = False

    scenario.reset_world(world)
    return env, scenario, world

#### Training agent

In [5]:
# Train the agents on simple_spread and log per-episode rewards to TensorBoard.
env,scenario,world = make_env('simple_spread')
writer = SummaryWriter("runs/multiagent/simple_spread")
nb_agents = 3
agent = Multi_agent(num_agents=nb_agents)
nb_episodes = 1000
steps_per_episode = 100

try:
    for i in range(nb_episodes):
        obs = env.reset()
        reward = []

        for _step in range(steps_per_episode):
            # The observation the action is chosen from is the start of the
            # stored transition, so the very first step of each episode
            # (from the reset observation) is recorded too.
            lastobs = obs
            state = phi(lastobs)
            a = agent.act(state,False)
            obs, r, _, _ = env.step(a)
            # Only the first agent's reward is stored for the whole team.
            agent.memory.store(state,
                               torch.Tensor(a).to(device).reshape(-1),
                               torch.Tensor([r[0]]).to(device,torch.double),
                               phi(obs),False)
            reward.append(r)

        # Sum of all agents' rewards over the episode, scaled by 1/1000.
        episode_score = round(sum([sum(R) for R in reward])/1000,1)
        writer.add_scalar('rewards',episode_score,i)
        if (i+1)%50==0:
            print("Episode ",i+1,": ","r =",episode_score)
        agent.episodes += 1
finally:
    # Runs on normal completion and on interruption alike, so pending
    # TensorBoard events are flushed and the environment is released.
    writer.close()
    env.close()

Episode  50 :  r = -5.5
Episode  100 :  r = -3.7
Episode  150 :  r = -6.1
Episode  200 :  r = -4.7
Episode  250 :  r = -2.5
Episode  300 :  r = -3.4


KeyboardInterrupt: 