In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from collections import namedtuple, deque
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
import numpy as np
from numpy.random import choice
import copy

In [None]:
# Launch the Unity Tennis environment from a local build.
# NOTE(review): the relative path points at a Windows executable (.exe);
# confirm the build location/platform before running on another machine.
env = UnityEnvironment(file_name="envs/tennis/Tennis.exe")

In [None]:

# Use the first brain exposed by the environment as the default brain.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset the environment in training mode and keep the info for that brain.
env_info = env.reset(train_mode=True)[brain_name]

# Number of agents, taken from the length of env_info.agents.
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# Size of each action, as reported by the brain.
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# Examine the observations: the first axis is reported as the agent count and
# the second axis is the per-agent state length used as state_size below.
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

In [2]:
def hidden_init(layer):
    """Return the uniform init range ``(-1/sqrt(fan_in), 1/sqrt(fan_in))`` for a layer.

    Args:
        layer: A layer exposing a 2-D ``weight`` tensor laid out as
            ``(out_features, in_features)``, as ``torch.nn.Linear`` does.

    Returns:
        tuple: ``(-lim, lim)``, ready to be unpacked into ``Tensor.uniform_``.
    """
    # nn.Linear stores its weight as (out_features, in_features), so the
    # number of inputs feeding each unit (the fan-in) is dimension 1;
    # dimension 0 would be the fan-out.
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    """Policy network: maps a state to an action with components in [-1, 1].

    Architecture: Linear(nS, 256) -> BatchNorm -> leaky ReLU ->
    Linear(256, 128) -> BatchNorm -> leaky ReLU -> Linear(128, nA) -> tanh.
    """

    def __init__(self, nA, nS, seed):
        """Build the layers and initialise their weights.

        Args:
            nA (int): Number of action components (output width).
            nS (int): Number of state features (input width).
            seed (int): Value passed to ``torch.manual_seed``; note this
                reseeds torch's global generator.
        """
        super(Actor, self).__init__()
        # torch.manual_seed returns the global Generator, which is what is kept here.
        self.seed = torch.manual_seed(seed)
        self.dense1 = nn.Linear(nS, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dense2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dense3 = nn.Linear(128, nA)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw the weights: ``hidden_init`` range for hidden layers, +/-3e-3 for the output."""
        self.dense1.weight.data.uniform_(*hidden_init(self.dense1))
        self.dense2.weight.data.uniform_(*hidden_init(self.dense2))
        # Small final-layer weights keep the initial tanh outputs close to zero.
        self.dense3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, training=True):
        """Compute the action(s) for ``state``.

        Args:
            state (torch.Tensor): Either a batch of shape ``(batch, nS)`` or
                a single unbatched state of shape ``(nS,)``.
            training (bool): Accepted for interface compatibility; unused.

        Returns:
            torch.Tensor: Actions of shape ``(batch, nA)``, or ``(nA,)`` when
            an unbatched state was given.
        """
        # BatchNorm1d only accepts 2-D/3-D input, so give a lone state a
        # batch axis and strip it again afterwards. In train() mode
        # BatchNorm still needs more than one sample per batch.
        unbatched = state.dim() == 1
        if unbatched:
            state = state.unsqueeze(0)

        x = F.leaky_relu(self.bn1(self.dense1(state)))
        x = F.leaky_relu(self.bn2(self.dense2(x)))
        # torch.tanh replaces the deprecated F.tanh.
        x = torch.tanh(self.dense3(x))

        return x.squeeze(0) if unbatched else x

class Critic(nn.Module):
    """Q-network: scores a (state, action) pair with a single value.

    The state goes through the first linear layer, the action is concatenated
    onto that hidden representation, and two more linear layers produce the
    scalar estimate. The BatchNorm modules are constructed but not applied in
    ``forward``.
    """

    def __init__(self, nA, nS, seed):
        """Create the layers for ``nS`` state features and ``nA`` action features.

        ``seed`` is passed to ``torch.manual_seed`` (reseeding torch's global
        generator) before any layer is created.
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)

        state_units, joint_units = 256, 128
        self.dense1 = nn.Linear(nS, state_units)
        self.bn1 = nn.BatchNorm1d(state_units)
        self.dense2 = nn.Linear(nA + state_units, joint_units)
        self.bn2 = nn.BatchNorm1d(joint_units)
        self.dense3 = nn.Linear(joint_units, 1)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw weights: ``hidden_init`` range for hidden layers, +/-4e-3 for the head."""
        for hidden in (self.dense1, self.dense2):
            low, high = hidden_init(hidden)
            hidden.weight.data.uniform_(low, high)
        self.dense3.weight.data.uniform_(-4e-3, 4e-3)

    def forward(self, state, action, training=True):
        """Return the value estimate for each (state, action) row.

        ``state`` and ``action`` are concatenated along dimension 1, so both
        must be 2-D with matching first dimension. ``training`` is unused.
        """
        state_features = F.leaky_relu(self.dense1(state))
        joint_input = torch.cat([state_features, action], dim=1)
        joint_features = F.leaky_relu(self.dense2(joint_input))
        return self.dense3(joint_features)

In [3]:
class ReplayBuffer:
    """Bounded FIFO memory of experience tuples with uniform random sampling.

    Prioritized replay is not implemented: ``prioritized`` and ``e`` are only
    stored, the ``a`` argument of :meth:`sample` is ignored, the stored
    ``tdE`` values are never read back, and :meth:`sample` always returns
    ``None`` in place of sampling probabilities.
    """

    def __init__(self, buffer_size, batch_size, nA, nS, seed, prioritized=False, e=0.1):
        """Create an empty buffer.

        Args:
            buffer_size (int): Maximum number of experiences kept; the oldest
                entries are dropped first once the buffer is full.
            batch_size (int): Number of experiences returned by :meth:`sample`.
            nA: Action size (stored, not otherwise used here).
            nS: State size (stored, not otherwise used here).
            seed: Seed applied to Python's global ``random`` module.
            prioritized (bool): Stored only; see the class docstring.
            e (float): Stored only; see the class docstring.
        """
        self.e = e
        self.nA = nA
        self.nS = nS
        self.memory = deque(maxlen=buffer_size)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.prioritized = prioritized
        # random.seed() returns None, so seed the generator and keep the
        # seed value itself rather than the call's return value.
        random.seed(seed)
        self.seed = seed
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done", "tdE"])

    def add(self, state, action, reward, next_state, done, tdE):
        """Append one experience tuple to the memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done, tdE))

    def sample(self, a=0):
        """Draw ``batch_size`` experiences uniformly at random, without replacement.

        Args:
            a: Ignored (placeholder for a prioritization exponent).

        Returns:
            tuple: ``(states, actions, rewards, next_states, dones,
            probabilities)``. The first five are float tensors moved to the
            module-level ``device``; ``dones`` is stacked with ``np.vstack``
            while the others use ``np.stack``. ``probabilities`` is always
            ``None``.

        Raises:
            ValueError: If fewer than ``batch_size`` experiences are stored.
        """
        if len(self.memory) < self.batch_size:
            raise ValueError(
                "Cannot sample {} experiences from a buffer holding {}".format(
                    self.batch_size, len(self.memory)))

        probabilities = None
        experiences = random.sample(self.memory, k=self.batch_size)
        # Filter once instead of once per field.
        batch = [exp for exp in experiences if exp is not None]

        def stacked(field):
            """Stack one field of every sampled experience into a float tensor."""
            values = np.stack([getattr(exp, field) for exp in batch])
            return torch.from_numpy(values).float().to(device)

        states = stacked("state")
        actions = stacked("action")
        rewards = stacked("reward")
        next_states = stacked("next_state")
        # Done flags may be booleans; go through uint8 before the float cast.
        done_flags = np.vstack([exp.done for exp in batch]).astype(np.uint8)
        dones = torch.from_numpy(done_flags).float().to(device)

        return (states, actions, rewards, next_states, dones, probabilities)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)

In [4]:
class OUNoise:
    """Ornstein-Uhlenbeck process.

    Each call to :meth:`sample` advances the state by
    ``theta * (mu - x) + sigma * N(0, 1)``, giving temporally correlated
    noise that reverts towards ``mu``.
    """

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process.

        Args:
            size (int): Number of noise components.
            seed: Seed applied to Python's global ``random`` module.
            mu (float): Long-run mean of every component.
            theta (float): Mean-reversion rate.
            sigma (float): Scale of the random increment.
        """
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # random.seed() returns None, so seed the generator and keep the
        # seed value itself rather than the call's return value.
        random.seed(seed)
        self.seed = seed
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        # The increment must be zero-mean Gaussian. A uniform [0, 1) draw has
        # mean 0.5 and would push the noise (and the actions it perturbs)
        # permanently towards positive values.
        gaussian = np.array([random.gauss(0., 1.) for _ in range(len(x))])
        dx = self.theta * (self.mu - x) + self.sigma * gaussian
        self.state = x + dx
        return self.state

In [5]:
# --- Replay memory (presumably passed to ReplayBuffer; confirm at the call site)
BUFFER_SIZE = 10 ** 6    # maximum number of stored experiences
BATCH_SIZE = 256         # experiences per sampled minibatch

# --- Learning
GAMMA = 0.95             # discount factor
TAU = 2e-3               # stored as MADDPG.tau; presumably the soft-update rate
LR_ACTOR = 1e-3          # Adam learning rate for the actor
LR_CRITIC = 1e-3         # Adam learning rate for the critic
WEIGHT_DECAY = 1e-4      # weight decay of the critic optimizer

# --- Update schedule (not referenced in the visible code; presumably steps
# --- between learning phases and updates per phase -- TODO confirm)
UPDATE_EVERY = 8
UPDATE_NUM = 1

# --- Exploration: probability of adding noise to an action
EPSILON = 1.0            # starting value
EPS_DECAY = 2e-5         # subtracted each time noise is applied
EPS_END = 0.03           # lower bound

# Run on the first GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
class ddpgAgent():
    """One DDPG agent: local/target actor and critic networks plus exploration noise.

    The actor sees a single agent's state (``nS`` features) while the critic
    is built for ``nS * nAgents`` state features.
    """

    def __init__(self, nA, nS, nAgents, idxAgent, room, seed, prioritized=False):
        """Build the networks, optimizers and noise process.

        Args:
            nA (int): Action size.
            nS (int): Per-agent state size.
            nAgents (int): Number of agents; scales the critic's state input.
            idxAgent: Index of this agent (stored only).
            room: Unused; kept for interface compatibility.
            seed: Seed forwarded to the networks and the noise process.
            prioritized (bool): Unused; kept for interface compatibility.
        """
        self.state_size = nS
        self.action_size = nA
        self.idxAgent = idxAgent
        self.nAgents = nAgents

        self.critic_local = Critic(nA, nS*nAgents, seed).to(device)
        self.critic_target = Critic(nA, nS*nAgents, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        self.actor_local = Actor(nA, nS, seed).to(device)
        self.actor_target = Actor(nA, nS, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        self.ouNoise = OUNoise(nA, seed)
        self.epsilon = EPSILON

    @staticmethod
    def _as_float_tensor(values):
        """Return ``values`` (ndarray or tensor) as a float32 tensor on ``device``."""
        if not torch.is_tensor(values):
            values = torch.from_numpy(np.asarray(values))
        return values.float().to(device)

    def act(self, state, training=True):
        """Pick an action with the local actor, optionally perturbed by noise.

        Args:
            state: ndarray or tensor of shape ``(nS,)`` or ``(batch, nS)``.
            training (bool): When true, noise is added with probability
                ``self.epsilon``.

        Returns:
            numpy.ndarray: Action(s) clipped to [-1, 1], shaped like the
            input without its feature axis replaced (``(nA,)`` or
            ``(batch, nA)``).
        """
        state = self._as_float_tensor(state)
        # Give a lone state a batch axis so networks containing BatchNorm1d
        # (which rejects 1-D input) can process it.
        unbatched = state.dim() == 1
        if unbatched:
            state = state.unsqueeze(0)

        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        self.actor_local.train()

        av2 = actions.cpu().numpy()
        if unbatched:
            av2 = av2[0]

        if training and random.random() < self.epsilon:
            # Epsilon only decays on steps where noise is actually applied,
            # and never drops below EPS_END.
            self.epsilon = max(EPS_END, self.epsilon - EPS_DECAY)
            av2 += self.ouNoise.sample()
        return np.clip(av2, -1, 1)

    def target_act(self, states):
        """Return the target actor's actions for ``states`` plus a noise sample.

        Args:
            states: ndarray or tensor accepted by the target actor.

        Returns:
            torch.Tensor: Float tensor on ``device``; gradients are not
            disabled here and the result is not clipped.
        """
        states = self._as_float_tensor(states)
        # The noise process yields a float64 NumPy array; convert it so the
        # sum stays a float32 tensor on the same device as the actor output
        # (adding the raw array breaks for CUDA or grad-tracking tensors).
        noise = self._as_float_tensor(self.ouNoise.sample())
        actions = self.actor_target(states) + noise

        return actions

In [None]:
class MADDPG:
    def __init__(self, nAgents, nS, nA, seed=0):
        self.nAgents=nAgents
        self.state_size = nS
        self.action_size = nA
        
        self.scores_all = None
        self.score_windows = None
        
        self.agents = [Agent(nA, nS, seed)]
        self.discount_factor = GAMMA
        self.tau = TAU
        self.t_step = 0
        
    def update(self, states, actions, rewards, next_states, )