* Trying to implement Prioritized Experience Replay following the explanation by Thomas Simonini [here](https://medium.freecodecamp.org/improvements-in-deep-q-learning-dueling-double-dqn-prioritized-experience-replay-and-fixed-58b130cc5682)
* https://github.com/google/dopamine/blob/master/dopamine/agents/dqn/dqn_agent.py
* https://github.com/rlcode/per/blob/master/cartpole_per.py

## Import the necessary libraries

In [1]:
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
from agents_maddpg.utils import OUNoise
from agents_maddpg.utils import soft_update

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Launch the Unity Crawler simulator from the local Windows build.
env = UnityEnvironment(file_name="./Crawler_Windows_x86_64/Crawler.exe")
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once (train_mode=False) to read the initial observations; the problem
# dimensions used by the later cells are derived from this first env_info.
env_info = env.reset(train_mode=False)[brain_name]
num_agents = len(env_info.agents)
# One row of observations per agent (the startup log reports 129 values per agent).
states = env_info.vector_observations
# The startup log reports an action vector of size 20 per agent.
action_size = brain.vector_action_space_size
state_size = states.shape[1]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: CrawlerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 129
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 20
        Vector Action descriptions: , , , , , , , , , , , , , , , , , , , 


## Method to plot the progress of the agent's score

In [3]:
def plot_result(scores):
    """Draw the score curve, one point per episode.

    Args:
        scores: sequence of per-episode scores; each entry is plotted against
            its position in the sequence.
    """
    _, axes = plt.subplots()
    episode_index = np.arange(len(scores))
    axes.plot(episode_index, scores)
    axes.set_xlabel('Episode #')
    axes.set_ylabel('Score')
    plt.show()

In [4]:
from agents_maddpg.storage import Storage
def train(agent, scores=None, n_episodes=500, noise = 1, noise_reduction=0.9999, train_mode=True, episode_start=1):
    """Run the training loop against the global Unity environment.

    Relies on the notebook-level globals `env`, `brain_name`, `num_agents`
    and `Storage`.

    Args:
        agent: object providing `reset()`, `act(states, noise=...)`,
            `step(states, actions, rewards, next_states, dones)` and an
            `ISW_IMPACT` attribute (only read here, for logging).
        scores: score history to continue from. It is extended in place and
            returned. When omitted, a fresh list is created for this call.
        n_episodes: number of the last episode to run (inclusive).
        noise: initial noise value handed to `agent.act`.
        noise_reduction: factor applied to `noise` after every episode; the
            result is floored at 0.01.
        train_mode: forwarded to `env.reset`.
        episode_start: number of the first episode to run.

    Returns:
        The list of per-episode scores (mean over all agents), including any
        history that was passed in.
    """
    # A mutable default ([]) would be shared between calls and leak the score
    # history of one run into the next, so the list is created per call.
    if scores is None:
        scores = []

    # Rolling window over the last 100 scores, pre-filled from the history.
    scores_window = deque(scores[-100:], maxlen=100)

    for i_episode in range(episode_start, n_episodes+1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        states = env_info.vector_observations
        scores_one_episode = np.zeros(num_agents)
        agent.reset()
        while True:
            actions = agent.act(states, noise=noise)              # select an action (for each agent)

            env_info = env.step(np.clip(actions, -1, 1))[brain_name]              # send all actions to the environment
            next_states = env_info.vector_observations                            # get next state (for each agent)
            rewards = env_info.rewards                                            # get reward (for each agent)
            dones = env_info.local_done                                           # see if episode finished
            agent.step(states, actions, rewards, next_states, dones)              # learn
            states = next_states                                                  # roll over states to next time step

            # NOTE(review): the logged runs in this notebook show scores turning
            # NaN; nothing here guards against non-finite rewards.
            scores_one_episode += rewards
            if np.any(dones):                                                     # exit loop if episode finished
                break

        noise = max(noise * noise_reduction, 0.01)
        score = np.average(scores_one_episode)
        scores.append(score)
        scores_window.append(score)
        mean_100 = np.mean(scores_window)

        if i_episode % 50 == 0:
            recent = np.array(scores_window)[-50:]
            print('\rEpisode {}\tAverage Score: {:.3f}\tMin: {:.3f}\tMax: {:.3f}\tNoise: {:.4f}\tISW: {:.4f}'.
                  format(i_episode, mean_100,
                         np.min(recent),
                         np.max(recent),
                         noise, agent.ISW_IMPACT))
            # Forward slashes: backslashes in a normal string literal are escape
            # sequences, and these paths now match the Storage.load calls.
            Storage.save("weights/MADDPG/eps_{}_avg_{}.pth".format(i_episode, mean_100), scores, agent=agent)

        if len(scores_window) >= 100 and mean_100 >= 2000:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode, mean_100))
            # The previous literal "weights\MADDPG\final.pth" contained "\f",
            # which Python reads as a form-feed character, corrupting the name.
            Storage.save("weights/MADDPG/final.pth", scores, agent=agent)
            break

    return scores

## Prepare the agent
* One instance of the memory ReplayBuffer is built here
* This MADDPG is a variant of DDPG that uses one local actor/critic network and one target actor/critic network, both shared by all agents.
* The hyperparameters are set and the agent is initialized
* Several instances of the noise generator are created and attached to the agent

In [5]:
from agents_maddpg.utils import ReplayBuffer, SimpleNoise
from agents_maddpg.storage import Storage
from agents_maddpg.model import ActorCritic
import agents_maddpg
import torch.nn.functional as F
import random

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

set_seed(2430)

In [6]:
# Build a fresh agent through Storage.new. Positional arguments: the network class,
# the per-agent observation size (states.shape[1]), the action size, states.shape[0]
# (presumably the number of agents — confirm against Storage.new), the device and
# the noise class.
# The upper/lower-case keyword hyperparameters are defined by agents_maddpg, which is
# not visible in this notebook; see that package for their exact meaning.
agent = Storage.new(ActorCritic, states.shape[1], action_size, states.shape[0], device,  SimpleNoise, 
                    memory_size=int(1e5),
                    batch_size=256,
                    ACTIVATION = F.relu,
                    GRADIENT_CLIP = 5,
                    TAU=1e-3,
                    UPDATE_EVERY=32,
                    TRANSFER_EVERY=1,
                    UPDATE_LOOP=64,
                    ADD_NOISE_EVERY=1,
                    BOOTSTRAP_SIZE=1,
                    LR_CRITIC = 1e-4,
                    LR_ACTOR = 1e-4,
                    MEMORY_RANDOMNESS = 0.75)
# NOTE(review): ISW presumably stands for importance-sampling weight (prioritized
# replay) — confirm in agents_maddpg. The training logs in this notebook show the
# agent's ISW_IMPACT value rising from about 0 to 1.0000 over the run.
agent.set_isw_impact(0)
agent.set_isw_impact_increment(2e-6)

### Test save and load

In [9]:
# Smoke-test checkpointing: save the new agent with an empty score history, then
# load it back. The training cell that follows continues with `loaded` and `scores`.
Storage.save("temp.ckp", [], agent)
loaded, scores = Storage.load("temp.ckp", device, states.shape[0])

### Run training

In [None]:
# Train from scratch: the noise starts at 1 and is multiplied by 0.9996 after every
# episode (train() floors it at 0.01). The score curve is plotted when train() returns.
scores = train(loaded, scores, n_episodes=60000, noise = 1, noise_reduction = 0.9996, train_mode=True)
plot_result(scores)



Episode 50	Average Score: 0.147	Min: -0.147	Max: 0.423	Noise: 0.9802	ISW: 0.0014
Episode 100	Average Score: 0.187	Min: -0.020	Max: 0.419	Noise: 0.9608	ISW: 0.0028
Episode 150	Average Score: 0.238	Min: -0.266	Max: 0.638	Noise: 0.9418	ISW: 0.0042
Episode 200	Average Score: 0.250	Min: -0.172	Max: 0.507	Noise: 0.9231	ISW: 0.0056
Episode 250	Average Score: 0.226	Min: -0.115	Max: 0.449	Noise: 0.9048	ISW: 0.0072
Episode 300	Average Score: 0.235	Min: -0.162	Max: 0.548	Noise: 0.8869	ISW: 0.0086
Episode 350	Average Score: 0.259	Min: -0.151	Max: 0.547	Noise: 0.8693	ISW: 0.0101
Episode 400	Average Score: 0.263	Min: -0.013	Max: 0.548	Noise: 0.8521	ISW: 0.0115
Episode 450	Average Score: 0.300	Min: -0.031	Max: 0.627	Noise: 0.8352	ISW: 0.0131
Episode 500	Average Score: 0.332	Min: 0.085	Max: 0.681	Noise: 0.8187	ISW: 0.0146
Episode 550	Average Score: 0.367	Min: 0.041	Max: 1.166	Noise: 0.8025	ISW: 0.0161
Episode 600	Average Score: 0.417	Min: 0.065	Max: 1.009	Noise: 0.7866	ISW: 0.0178
Episode 650	Average 

Episode 5000	Average Score: 41.196	Min: 17.728	Max: 78.357	Noise: 0.1353	ISW: 0.4973
Episode 5050	Average Score: 44.225	Min: 19.227	Max: 76.816	Noise: 0.1326	ISW: 0.5074
Episode 5100	Average Score: 50.015	Min: 24.703	Max: 98.541	Noise: 0.1300	ISW: 0.5183
Episode 5150	Average Score: 49.380	Min: 19.247	Max: 89.563	Noise: 0.1274	ISW: 0.5279
Episode 5200	Average Score: 45.593	Min: 2.382	Max: 101.901	Noise: 0.1249	ISW: 0.5372
Episode 5250	Average Score: 44.821	Min: 2.136	Max: 87.539	Noise: 0.1224	ISW: 0.5466
Episode 5300	Average Score: 47.226	Min: 24.108	Max: 95.742	Noise: 0.1200	ISW: 0.5568
Episode 5350	Average Score: 48.200	Min: 16.130	Max: 111.824	Noise: 0.1176	ISW: 0.5665
Episode 5400	Average Score: 45.872	Min: 22.540	Max: 85.240	Noise: 0.1153	ISW: 0.5757
Episode 5450	Average Score: 45.649	Min: 19.412	Max: 90.623	Noise: 0.1130	ISW: 0.5852
Episode 5500	Average Score: 46.454	Min: 20.218	Max: 81.345	Noise: 0.1108	ISW: 0.5948
Episode 5550	Average Score: 45.127	Min: 17.336	Max: 103.662	Noise

In [6]:
# Resume training from the checkpoint written at episode 7550.
loaded, scores = Storage.load("weights/MADDPG/eps_7550_avg_225.86042926408416.pth", device, states.shape[0])
# NOTE(review): with a factor of 1 this presumably copies the local network's weights
# into the target network — confirm against agents_maddpg.utils.soft_update.
soft_update(loaded.network_local, loaded.network_target, 1)
# noise is hard-coded to continue the decay schedule (0.9996 ** 7550 ≈ 0.0488);
# episode numbering continues after the loaded score history.
scores = train(loaded, scores, n_episodes=60000, noise = 0.0488, 
               noise_reduction = 0.9996, train_mode=True, episode_start=len(scores)+1)
plot_result(scores)



Episode 7600	Average Score: 214.122	Min: 21.638	Max: 1068.131	Noise: 0.0478	ISW: 1.0000
Episode 7650	Average Score: 198.673	Min: 16.651	Max: 1376.782	Noise: 0.0469	ISW: 1.0000
Episode 7700	Average Score: 281.999	Min: 8.317	Max: 1725.353	Noise: 0.0460	ISW: 1.0000
Episode 7750	Average Score: 215.733	Min: 11.157	Max: 789.195	Noise: 0.0450	ISW: 1.0000
Episode 7800	Average Score: 183.179	Min: 23.239	Max: 2588.496	Noise: 0.0442	ISW: 1.0000
Episode 7850	Average Score: 237.047	Min: 37.983	Max: 759.276	Noise: 0.0433	ISW: 1.0000
Episode 7900	Average Score: 255.290	Min: 15.941	Max: 1442.198	Noise: 0.0424	ISW: 1.0000
Episode 7950	Average Score: 251.743	Min: 16.020	Max: 1129.160	Noise: 0.0416	ISW: 1.0000
Episode 8000	Average Score: 191.365	Min: 17.582	Max: 1230.732	Noise: 0.0408	ISW: 1.0000
Episode 8050	Average Score: 159.523	Min: 16.471	Max: 810.156	Noise: 0.0400	ISW: 1.0000
Episode 8100	Average Score: 199.220	Min: 25.135	Max: 1200.228	Noise: 0.0392	ISW: 1.0000
Episode 8150	Average Score: 224.434	

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Episode 10900	Average Score: nan	Min: nan	Max: nan	Noise: 0.0128	ISW: 1.0000
Episode 10950	Average Score: nan	Min: -0.575	Max: -0.575	Noise: 0.0125	ISW: 1.0000
Episode 11000	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0123	ISW: 1.0000
Episode 11050	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0120	ISW: 1.0000
Episode 11100	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0118	ISW: 1.0000


KeyboardInterrupt: 

In [None]:
# Resume from the episode-10850 checkpoint; the previous run's log shows NaN scores
# from episode 10900 onwards, so this is the last checkpoint logged before that.
loaded, scores = Storage.load("weights/MADDPG/eps_10850_avg_172.67637378279608.pth", device, states.shape[0])
# noise is hard-coded to continue the decay schedule (0.9996 ** 10850 ≈ 0.0130);
# episode numbering continues after the loaded score history.
scores = train(loaded, scores, n_episodes=60000, noise = 0.0130, 
               noise_reduction = 0.9996, train_mode=True, episode_start=len(scores)+1)
plot_result(scores)



Episode 10900	Average Score: 146.060	Min: 15.676	Max: 544.177	Noise: 0.0127	ISW: 1.0000
Episode 10950	Average Score: 149.753	Min: 18.019	Max: 740.840	Noise: 0.0125	ISW: 1.0000
Episode 11000	Average Score: 209.379	Min: 9.605	Max: 1239.542	Noise: 0.0122	ISW: 1.0000
Episode 11050	Average Score: 283.733	Min: 13.840	Max: 2771.085	Noise: 0.0120	ISW: 1.0000


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Episode 11100	Average Score: nan	Min: nan	Max: nan	Noise: 0.0118	ISW: 1.0000
Episode 11150	Average Score: nan	Min: -0.575	Max: -0.575	Noise: 0.0115	ISW: 1.0000
Episode 11200	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0113	ISW: 1.0000
Episode 11250	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0111	ISW: 1.0000
Episode 11300	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0109	ISW: 1.0000
Episode 11350	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0106	ISW: 1.0000
Episode 11400	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0104	ISW: 1.0000
Episode 11450	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0102	ISW: 1.0000
Episode 11500	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 11550	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 11600	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 11650	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise:

Episode 15900	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 15950	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 16000	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 16050	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 16100	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 16150	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 16200	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 16250	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 16300	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 16350	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 16400	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 16450	Average Score: -0.575	Min: -0.575	Max: -

Episode 20700	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 20750	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 20800	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 20850	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 20900	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 20950	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 21000	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 21050	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 21100	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 21150	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 21200	Average Score: -0.575	Min: -0.575	Max: -0.575	Noise: 0.0100	ISW: 1.0000
Episode 21250	Average Score: -0.575	Min: -0.575	Max: -

In [7]:
# Resume from the episode-11050 checkpoint.
loaded, scores = Storage.load("weights/MADDPG/eps_11050_avg_283.7328178624791.pth", device, states.shape[0])
# The noise must continue the decay schedule of the checkpoint being resumed:
# 0.9996 ** 11050 ≈ 0.0120, which is also the value logged at episode 11050.
# (0.0130 belongs to the episode-10850 checkpoint.)
scores = train(loaded, scores, n_episodes=60000, noise = 0.0120,
               noise_reduction = 0.9996, train_mode=True, episode_start=len(scores)+1)
plot_result(scores)

## View the trained agent

In [11]:
# Load the checkpoint to visualise and switch its local network to evaluation mode.
loaded, scores = Storage.load("weights/MADDPG/eps_11050_avg_283.7328178624791.pth", device, states.shape[0])
loaded.network_local.eval()
env_info = env.reset(train_mode=False)[brain_name]
# Start from the observations of this reset. Without this the first action would be
# computed from whatever `states` an earlier cell left behind.
states = env_info.vector_observations

def act(network, states, device):
    """Run `network` on a batch of observations and return the result as a NumPy array.

    Args:
        network: callable that takes a float tensor and returns a tensor.
        states: NumPy array of observations; a leading batch axis of size 1 is
            added before the call.
        device: device the input tensor is moved to before the call.

    Returns:
        The network output with all size-1 axes squeezed out, as a NumPy array.
    """
    batch = torch.from_numpy(states).float().unsqueeze(0).to(device)
    # Inference only: disable autograd so no computation graph is built.
    with torch.no_grad():
        output = network(batch)
    # detach() replaces the legacy `.data` access and is safe even if the
    # network hands back a tensor that still requires grad.
    return output.squeeze().detach().cpu().numpy()

# Play ten rollouts with the loaded network; each one ends as soon as any agent
# reports done.
for i in range(10):
    finished = False
    while not finished:
        actions = act(loaded.network_local, states, device)
        # send all actions (clipped to [-1, 1]) to the environment
        env_info = env.step(np.clip(actions, -1, 1))[brain_name]
        # observations for the next step (for each agent)
        states = env_info.vector_observations
        # per-agent done flags
        dones = env_info.local_done
        finished = np.any(dones)

1. https://towardsdatascience.com/soft-actor-critic-demystified-b8427df61665
1. https://ai.googleblog.com/2019/01/soft-actor-critic-deep-reinforcement.html