Trying to implement Prioritized Experience Replay, following the explanation by Thomas Simonini [here](https://medium.freecodecamp.org/improvements-in-deep-q-learning-dueling-double-dqn-prioritized-experience-replay-and-fixed-58b130cc5682)

## Import the necessary libraries

In [1]:
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
from agents_maddpg.utils import OUNoise

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Start the Unity Crawler build that sits next to this notebook.
env = UnityEnvironment(file_name="./Crawler_Windows_x86_64/Crawler.exe")

# Use the first brain the environment exposes as the default one.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once (not in training mode) so the problem dimensions can be read off.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations

num_agents = len(env_info.agents)                # how many agents the environment reports
state_size = states.shape[1]                     # second dimension of the observation array
action_size = brain.vector_action_space_size     # action dimension reported by the brain

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: CrawlerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 129
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 20
        Vector Action descriptions: , , , , , , , , , , , , , , , , , , , 


## Method to plot the progress of the agent's score

In [3]:
def plot_result(scores):
    """Draw a line chart of the score obtained in each episode.

    Args:
        scores: sequence holding one score per episode, in episode order.
    """
    fig, ax = plt.subplots()
    episode_index = np.arange(len(scores))
    ax.plot(episode_index, scores)
    ax.set_xlabel('Episode #')
    ax.set_ylabel('Score')
    plt.show()

In [28]:
from agents_maddpg.storage import Storage
def train(agent, n_episodes=500, noise = 1, noise_reduction=0.9999, train_mode=True,
          target_score=2000, report_every=50, checkpoint_dir="weights/MADDPG"):
    """Train ``agent`` in the Unity environment and return the per-episode scores.

    The function relies on the notebook globals ``env``, ``brain_name`` and
    ``num_agents`` created in the environment-setup cell, and on ``Storage``
    for the periodic checkpoints.

    Args:
        agent: object exposing ``reset()``, ``act(states, noise=...)``,
            ``step(states, actions, rewards, next_states, dones)`` and ``save(path)``.
        n_episodes: maximum number of episodes to run.
        noise: initial exploration-noise value handed to ``agent.act``.
        noise_reduction: factor the noise is multiplied by after every episode
            (the noise never drops below 0.01).
        train_mode: forwarded to ``env.reset``.
        target_score: training stops once the mean of the last 100 episode
            scores reaches this value.
        report_every: print progress and write a checkpoint every this many
            episodes (must be a positive integer).
        checkpoint_dir: folder the checkpoints are written to.

    Returns:
        list with one entry per finished episode: the episode reward summed
        per agent and then averaged over all agents.
    """
    import os

    # Create the checkpoint folder up front so that a missing directory cannot
    # abort a long run at the first save.
    os.makedirs(checkpoint_dir, exist_ok=True)

    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        states = env_info.vector_observations
        scores_one_episode = np.zeros(num_agents)
        agent.reset()
        while True:
            actions = agent.act(states, noise=noise)                   # select an action (for each agent)

            # The environment receives actions clipped to [-1, 1]; the agent is
            # handed the unclipped ones, exactly as it produced them.
            env_info = env.step(np.clip(actions, -1, 1))[brain_name]   # send all actions to the environment
            next_states = env_info.vector_observations                 # get next state (for each agent)
            rewards = env_info.rewards                                 # get reward (for each agent)
            dones = env_info.local_done                                # see if episode finished
            agent.step(states, actions, rewards, next_states, dones)   # learn
            states = next_states                                       # roll over states to next time step

            scores_one_episode += rewards
            if np.any(dones):                                          # exit loop if any agent is done
                break

        noise = max(noise * noise_reduction, 0.01)
        score = np.average(scores_one_episode)
        scores.append(score)
        scores_window.append(score)
        mean_100 = np.mean(scores_window)

        if i_episode % report_every == 0:
            print('\rEpisode {}\tAverage Score: {:.3f}\tMax Score: {:.3f}\tNoise: {:.3f}'.
                      format(i_episode,
                         mean_100,
                         np.max(scores_window),
                        noise))
            Storage.save("{}/eps_{}_avg_{}.pth".format(checkpoint_dir, i_episode, mean_100), agent=agent)

        # The window must be full (100 episodes) before the run can count as solved.
        if len(scores_window) >= 100 and mean_100 >= target_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode, mean_100))
            # NOTE(review): periodic checkpoints go through Storage.save while the
            # final one uses agent.save -- confirm both produce compatible files.
            agent.save("{}/final.pth".format(checkpoint_dir))
            break

    return scores

## Prepare the agent
* One instance of the memory ReplayBuffer is built here
* MADDPG is used here as a variant of DDPG in which a single local actor/critic network and a single target actor/critic network are shared by all agents.
* The hyperparameters are set and the agent is initialized
* Several instances of the noise generator are created and attached to the agent

In [25]:
from agents_maddpg.utils import ReplayBuffer, SimpleNoise
from agents_maddpg.storage import Storage
from agents_maddpg.model import ActorCritic
import agents_maddpg
import torch.nn.functional as F
import random

# Everything runs on the CPU; the commented line below selects a GPU when one is available.
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"

# Seed every random number generator used in this notebook with the same value.
seed = 257
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# NOTE(review): this buffer is not handed to Storage.new below; the agent is
# given memory_size / buffer_size instead -- confirm whether `memory` is still needed.
memory = ReplayBuffer(action_size, device, int(1e5), 256, seed)

# Keyword settings forwarded unchanged to Storage.new.
agent_hyperparameters = dict(
    memory_size=int(1e5),
    buffer_size=256,
    ACTIVATION = F.relu,
    GRADIENT_CLIP = 5,
    TAU=1e-3,
    UPDATE_EVERY=32,
    TRANSFER_EVERY=1,
    UPDATE_LOOP=16,
    ADD_NOISE_EVERY=1,
    BOOTSTRAP_SIZE=4,
    LR_CRITIC = 3e-4,
    LR_ACTOR = 3e-4,
)

agent = Storage.new(
    ActorCritic,
    states.shape[1],     # second dimension of the observation array
    action_size,
    states.shape[0],     # first dimension of the observation array
    device,
    SimpleNoise,
    seed,
    **agent_hyperparameters
)

### Test save and load

In [26]:
# Write the freshly built agent to disk and read it straight back.
checkpoint_file = "temp.ckp"
Storage.save(checkpoint_file, agent)
loaded = Storage.load(checkpoint_file, device, states.shape[0])

### Run training

In [None]:
# Train the agent that was restored from the checkpoint, then chart its scores.
scores = train(
    loaded,
    n_episodes=60000,
    noise=1,
    noise_reduction=0.9996,
    train_mode=True,
)
plot_result(scores)



Episode 50	Average Score: 0.041	Max Score: 0.401	Noise: 0.980
Episode 100	Average Score: 0.386	Max Score: 3.892	Noise: 0.961
Episode 150	Average Score: 0.874	Max Score: 3.892	Noise: 0.942
Episode 200	Average Score: 1.236	Max Score: 5.020	Noise: 0.923
Episode 250	Average Score: 1.470	Max Score: 5.079	Noise: 0.905
Episode 300	Average Score: 1.351	Max Score: 5.079	Noise: 0.887
Episode 350	Average Score: 1.070	Max Score: 3.645	Noise: 0.869


In [None]:
# Load previously saved weights into the agent (presumably a checkpoint from an earlier run).
agent.load("weights/MADDPG/eps_2850_avg_570.9131032581789.pth")

# Build fresh noise generators and attach them to the agent.
# NOTE(review): the count below is states.shape[1]; the agent itself was built
# with states.shape[0] -- confirm which dimension is intended here.
noise_count = int(states.shape[1])
noises = [SimpleNoise(action_size, scale=1) for _ in range(noise_count)]
agent.set_noise(noises)

scores = train(
    agent,
    n_episodes=60000,
    noise=1,
    noise_reduction=0.99996,
    train_mode=True,
)
plot_result(scores)

## Continue the training with small noise

In [None]:
# Start at the noise floor used by train (0.01) so exploration stays minimal.
scores = train(
    agent,
    n_episodes=2000,
    noise=0.01,
    noise_reduction=0.9996,
    train_mode=True,
)
plot_result(scores)

## View the trained agent

In [None]:
from agents.model import ActorCritic
import torch.nn.functional as F
device = "cpu"

# Build the network and switch it to evaluation mode before loading the weights.
network = ActorCritic(state_size, action_size, state_size * 2 , F.leaky_relu ).to(device)
network.eval()

# map_location makes the checkpoint loadable on this device even if it was
# saved from a different one (e.g. a GPU).
network.load_state_dict(torch.load("./final_weights/final_maddpg_local.pth", map_location=device))

# Reset the environment and start from the observations of that reset,
# not from whatever `states` held from an earlier cell.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations

def act(network, states, device):
    """Run ``network`` on the given observations and return the result as a NumPy array.

    Args:
        network: callable taking a float tensor and returning a tensor.
        states: NumPy array of observations; a leading batch dimension of
            size 1 is added before the call.
        device: device the input tensor is moved to before the call.

    Returns:
        numpy.ndarray: the network output with all size-1 dimensions removed.
    """
    batch = torch.from_numpy(states).float().unsqueeze(0).to(device)
    # Pure inference: disable autograd so no graph is built for the forward pass.
    with torch.no_grad():
        output = network(batch)
    return output.squeeze().detach().cpu().numpy()

# Step the environment with the loaded network for five rounds; a round ends
# as soon as any agent reports done.
for i in range(5):
    episode_over = False
    while not episode_over:
        actions = act(network, states, device)
        clipped_actions = np.clip(actions, -1, 1)
        env_info = env.step(clipped_actions)[brain_name]   # send all actions to the environment
        states = env_info.vector_observations              # next state (for each agent)
        dones = env_info.local_done                        # per-agent done flags
        episode_over = np.any(dones)