In [1]:
# Environment
from unityagents import UnityEnvironment

# Standard Modules
import os
import ast
import ujson
import random
import itertools
import numpy as np
import pandas as pd
from collections import deque

# Deep Learning Modules
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical, MultivariateNormal
from torch.distributions.normal import Normal

# Plotting Modules
import matplotlib.pyplot as plt
%matplotlib inline

# Compute device used when tensors are created: the first CUDA GPU if PyTorch
# can see one, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Activate Auto-completer
%config Completer.use_jedi = False

from tqdm import tqdm

# Launch the Unity build of the multi-agent Reacher task.
env = UnityEnvironment(file_name='Reacher_Multi.app')

# The first registered brain is the one driven from Python.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Start a fresh episode in training mode and keep the info for our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Initial observations; the second axis is used as the per-agent state length.
states = env_info.vector_observations

# Problem dimensions used to size the networks and the exploration noise.
num_agents = len(env_info.agents)
state_size = states.shape[1]
action_size = brain.vector_action_space_size

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_size -> 5.0
		goal_speed -> 1.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [2]:
class MemoryReply:
    '''Fixed-size replay memory of transitions, sampled uniformly at random.

    memory_size (int) : maximum number of stored transitions; once full, the
                        oldest transition is dropped for every new one
    replay_size (int) : number of transitions returned by each `sample()` call
    seed        (int) : seed applied to Python's global `random` module
    '''
    def __init__(self, memory_size, replay_size, seed=123):
        self.memory = deque(maxlen=memory_size)
        self.replay_size = replay_size
        # random.seed() returns None, so seed first and then keep the value
        # itself for later inspection.
        random.seed(seed)
        self.seed = seed

    @staticmethod
    def _to_numpy(value):
        '''Return `value` as a NumPy array if it is a tensor, else unchanged.'''
        if torch.is_tensor(value):
            # .cpu() is required before .numpy() for tensors living on a GPU.
            return value.detach().cpu().numpy()
        return value

    def add(self, state, action, reward, next_state, done):
        '''Store one transition in memory.

        state      ([float]) : the state the action was taken in
        action     ([float]) : the action taken in `state`
        reward       (float) : the reward received for that action
        next_state ([float]) : the state observed after the action
        done          (bool) : whether the episode ended with this transition

        Any argument may also be a torch tensor; tensors are converted to
        NumPy before being stored.
        '''
        state = self._to_numpy(state)
        action = self._to_numpy(action)
        reward = self._to_numpy(reward)
        next_state = self._to_numpy(next_state)
        # Each field is converted based on its own type (the `done` conversion
        # used to be keyed on `state` by mistake).
        done = self._to_numpy(done)

        self.memory.append({"state":state, "action":np.array(action), "reward":reward, "next_state":next_state, "done":done})

    def sample(self):
        '''Draw `replay_size` distinct transitions uniformly at random.

        Returns a tuple (states, actions, rewards, next_states, dones) of float
        tensors moved to `device`; rewards and dones get a trailing axis of
        size 1. Raises ValueError (from `random.sample`) when fewer than
        `replay_size` transitions are stored.
        '''
        experiences = random.sample(self.memory, k=self.replay_size)

        states = torch.FloatTensor(np.array([e['state'] for e in experiences])).to(device)
        actions = torch.FloatTensor(np.array([e['action'] for e in experiences])).to(device)
        rewards = torch.FloatTensor(np.array([e['reward'] for e in experiences])).unsqueeze(1).to(device)
        next_states = torch.FloatTensor(np.array([e['next_state'] for e in experiences])).to(device)
        dones = torch.FloatTensor(np.array([float(e['done']) for e in experiences])).unsqueeze(1).to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current number of stored transitions."""
        return len(self.memory)
    

In [3]:
class ActorPolicy(nn.Module):
    '''Deterministic policy network mapping a batch of states to actions.

    Layers: BatchNorm1d -> Linear -> ReLU -> Linear -> ReLU -> Linear -> Tanh,
    so every action component lies in [-1, 1].

    state_size  (int)        : number of features per state
    action_size (int)        : number of action components
    hidden      ((int, int)) : widths of the two hidden layers
    '''
    def __init__(self,state_size,action_size,hidden=(66,66)):
        super().__init__()
        self.sequence = nn.Sequential(
            nn.BatchNorm1d(state_size),
            nn.Linear(state_size, hidden[0]),
            nn.ReLU(inplace=True),
            nn.Linear(hidden[0], hidden[1]),
            nn.ReLU(inplace=True),
            nn.Linear(hidden[1], action_size),
            nn.Tanh(),
        )

    def forward(self, state):
        '''Return the actions for `state`.

        state : (batch, state_size) tensor, NumPy array or nested sequence.
                Tensors are used as they are (keeping any autograd history);
                everything else is converted to float32 first.
        '''
        # Follow the module's own parameters so the input always lands on the
        # same device as the weights.
        module_device = next(self.parameters()).device
        if not torch.is_tensor(state):
            state = np.asarray(state, dtype=np.float32)
        x = torch.as_tensor(state, dtype=torch.float32, device=module_device)
        return self.sequence(x)
    
class CriticPolicy(nn.Module):
    '''Action-value network Q(state, action) returning one value per sample.

    The state goes through BatchNorm1d -> Linear -> ReLU; the result is
    concatenated with the action and passed through
    BatchNorm1d -> Linear -> ReLU -> Linear to a single output.

    state_size  (int)        : number of features per state
    action_size (int)        : number of action components
    hidden      ((int, int)) : widths of the two hidden layers
    '''
    def __init__(self,state_size,action_size,hidden=(66,66)):
        super().__init__()
        output_size = 1

        # State-only branch.
        self.sequence1 = nn.Sequential(
            nn.BatchNorm1d(state_size),
            nn.Linear(state_size, hidden[0]),
            nn.ReLU(inplace=True),
        )

        # Joint branch over [state features, action].
        self.sequence2 = nn.Sequential(
            nn.BatchNorm1d(hidden[0]+action_size),
            nn.Linear(hidden[0]+action_size, hidden[1]),
            nn.ReLU(inplace=True),
            nn.Linear(hidden[1], output_size),
        )

    def forward(self, state, action):
        '''Return Q(state, action) with shape (batch, 1).

        state  : (batch, state_size) tensor, array or nested sequence
        action : (batch, action_size) tensor, array or nested sequence

        The action is deliberately NOT detached: the policy-gradient step
        differentiates Q(s, actor(s)) with respect to the actor's output, so
        the autograd path through `action` has to stay intact.
        '''
        # Place both inputs on the same device as the module's weights.
        module_device = next(self.parameters()).device
        state = torch.as_tensor(state, dtype=torch.float32, device=module_device)
        action = torch.as_tensor(action, dtype=torch.float32, device=module_device)

        x = self.sequence1(state)
        x = torch.cat((x, action), dim=1)
        return self.sequence2(x)

In [4]:
# Learning rate passed to both the actor and the critic Adam optimizers.
lr = 1e-3

In [5]:
# Online networks: critic Q(s,a|θQ) and actor µ(s|θµ) with weights θQ and θµ.
actor = ActorPolicy(state_size, action_size)
critic = CriticPolicy(state_size, action_size)

# Target networks Q' and µ'; created here with their own random weights and
# synchronised with the online networks further down (θQ'← θQ, θµ'← θµ).
actor_target = ActorPolicy(state_size, action_size)
critic_target = CriticPolicy(state_size, action_size)

# One Adam optimizer per online network, sharing the same learning rate.
actor_optimizer = optim.Adam(params=actor.parameters(), lr=lr)
critic_optimizer = optim.Adam(params=critic.parameters(), lr=lr)

# Start the targets as exact copies of the online networks.
actor_target.load_state_dict(actor.state_dict())
critic_target.load_state_dict(critic.state_dict())

<All keys matched successfully>

In [6]:
def soft_update(target_net,local_net,tau=0.01):
    '''Blend the local network's parameters into the target network in place.

    For every parameter pair: θ_target = τ * θ_local + (1 - τ) * θ_target.
    Only entries yielded by `.parameters()` are touched.

    target_net (nn.Module) : network that is modified
    local_net  (nn.Module) : network whose parameters are blended in
    tau        (float)     : interpolation weight given to the local network

    Returns `target_net` (the same object that was passed in).
    '''
    keep = 1.0 - tau
    pairs = zip(target_net.parameters(), local_net.parameters())
    for tgt, src in pairs:
        blended = tau*src.data + keep*tgt.data
        tgt.data.copy_(blended)
    return target_net

In [7]:
# --- Episode control ---
n_episodes = 100          # number of training episodes
max_t = 1000              # upper bound on environment steps per episode
print_every = 1           # print a progress line every this many episodes

# --- Score bookkeeping ---
scores = []                        # per-episode mean score over all agents
scores_deque = deque(maxlen=100)   # the same scores, limited to the latest 100

# --- Replay / learning hyper-parameters ---
gamma = 0.99              # discount factor used in the TD target
memory_size = 500000      # capacity of the replay memory
replay_size = 500         # transitions drawn per learning step
replay_start_size = 1000  # stored transitions required before learning starts
learn_every = 10          # learn on steps whose index is a multiple of this


In [8]:
# Parameters of the exploration-noise update used in the training loop:
#   noise += theta * (mu - noise) * dt + std * sqrt(dt) * N(0, 1)
mu = 0        # value the noise is pulled towards
theta = 0.15  # strength of the pull towards mu
std = 0.15    # scale of the random perturbation
dt = 0.01     # step size of the update

In [9]:
# Switch for adding exploration noise to the actor's actions during training.
add_noise = True

# Replay memory shared by all agents of the environment.
memory = MemoryReply(memory_size, replay_size)

In [None]:
%%time
# Training run. This cell sets n_episodes itself, replacing any value assigned
# in an earlier cell.
n_episodes = 1000

for i_episode in range(1, n_episodes+1):
    env_info = env.reset(train_mode=True)[brain_name]
    rewards = []
    # Exploration-noise state (one row per agent), restarted at zero each episode.
    ohp = np.zeros((num_agents,action_size))
    state = env_info.vector_observations
    for i in range(max_t):

        if len(memory) < replay_start_size:
            # Warm-up phase: uniform random actions in [-1, 1) until the replay
            # memory holds enough transitions to learn from.
            action = np.random.random((num_agents,action_size))*2-1
        else:
            # Greedy action from the online actor, evaluated without autograd
            # and with the network switched to eval mode for the forward pass.
            actor.eval()
            with torch.no_grad():
                action = actor(state).detach().cpu().numpy()
            actor.train()

            # Temporally correlated exploration noise (#OUP), then clip the
            # noisy action back into [-1, 1].
            if add_noise:
                ohp = ohp + theta * (mu - ohp) * dt + std * np.sqrt(dt)* np.random.randn(num_agents,action_size)
                action = action + ohp
                action = action.clip(-1,1)

        env_info = env.step(action)[brain_name]
        reward = env_info.rewards
        rewards.extend(reward)
        done = env_info.local_done
        next_state = env_info.vector_observations

        # Store one transition per agent in the replay memory.
        for s_,a_,r_,ns_,d_ in zip(state, action, reward, next_state, done):
            memory.add(s_,a_,r_,ns_,d_)

        state = next_state

        if (len(memory) >= replay_start_size) and (i%learn_every==0):
            # sample from replay
            s, a, r, ns, d = memory.sample()

            # --- critic update ---
            # The TD target is a constant for this step, so build it without
            # autograd: no graph through the target networks and no gradients
            # accumulating on their parameters.
            with torch.no_grad():
                na = actor_target(ns)
                q_target_next = critic_target(ns, na)
                q_target = r + (gamma * (1-d) * q_target_next)

            q = critic(s, a)
            critic_loss = F.mse_loss(q, q_target)
            critic_optimizer.zero_grad()
            critic_loss.backward()
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(critic.parameters(), 1)
            critic_optimizer.step()

            # --- actor update ---
            # Maximise the critic's value of the actor's own actions.
            a_pred = actor(s)
            q_pred = critic(s, a_pred)
            actor_loss = -q_pred.mean()
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # soft update
            critic_target = soft_update(critic_target , critic)
            actor_target = soft_update(actor_target , actor)

        if np.any(done):
            break

    # Episode score: total reward collected, averaged over the agents.
    episode_score = sum(rewards)/num_agents
    scores_deque.append(episode_score)
    scores.append(episode_score)

    if i_episode % print_every == 0:
        print('Episode {}\tLast Score: {:.2f}\tAverage Score: {:.2f}'.format(i_episode, scores_deque[-1], np.mean(scores_deque)))
    # Only declare the task solved once 100 episode scores are available; the
    # message below reports i_episode-100, which is meaningless before then.
    if len(scores_deque) >= 100 and np.mean(scores_deque)>=30.0:
        print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
        break

Episode 1	Last Score: 0.33	Average Score: 0.33
Episode 2	Last Score: 0.37	Average Score: 0.35
Episode 3	Last Score: 0.47	Average Score: 0.39
Episode 4	Last Score: 0.32	Average Score: 0.37
Episode 5	Last Score: 0.44	Average Score: 0.39
Episode 6	Last Score: 0.46	Average Score: 0.40
Episode 7	Last Score: 0.46	Average Score: 0.41
Episode 8	Last Score: 0.40	Average Score: 0.41
Episode 9	Last Score: 0.46	Average Score: 0.41
Episode 10	Last Score: 0.46	Average Score: 0.42
Episode 11	Last Score: 0.38	Average Score: 0.42
Episode 12	Last Score: 0.38	Average Score: 0.41
Episode 13	Last Score: 0.50	Average Score: 0.42
Episode 14	Last Score: 0.37	Average Score: 0.42
Episode 15	Last Score: 0.43	Average Score: 0.42
Episode 16	Last Score: 0.51	Average Score: 0.42
Episode 17	Last Score: 0.42	Average Score: 0.42
Episode 18	Last Score: 0.47	Average Score: 0.42
Episode 19	Last Score: 0.37	Average Score: 0.42
Episode 20	Last Score: 0.38	Average Score: 0.42
Episode 21	Last Score: 0.44	Average Score: 0.42
E