# Target
1. From OpenAI
    1. At test time, to see how well the policy exploits what it has learned, remove stochasticity and use the mean action instead of a sample from the distribution. This tends to improve performance over the original stochastic policy.
    1. Explore randomly prior to start SAC befor n_steps
    1. Use Value Network
    1. Set proper entropy based on formula from OpenAI H = - log(x)

## Import the necessary library

In [1]:
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
env = UnityEnvironment(file_name="./Crawler_Windows_x86_64/Crawler.exe")
# env = UnityEnvironment(file_name="./Reacher_Windows_x86_64/Reacher.exe")
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=False)[brain_name]
num_agents = len(env_info.agents)
states = env_info.vector_observations
action_size = brain.vector_action_space_size
state_size = states.shape[1]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: CrawlerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 129
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 20
        Vector Action descriptions: , , , , , , , , , , , , , , , , , , , 


## Method to plot the progress of the agent's score

In [3]:
def plot_result(scores):
    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

In [4]:
def train(agent, scores=[], n_episodes=500, train_mode=True, episode_start=1, start_at = 1000):
    scores_window = deque(maxlen=100)  # last 100 scores
    for s in scores[-100:]:
        scores_window.append(s)
    frame_no = 0
    for i_episode in range(episode_start, n_episodes+1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        states = env_info.vector_observations
        scores_one_episode = np.zeros(num_agents)
        while True:
            frame_no += 1
            if(frame_no < start_at):
                actions = np.random.randn(num_agents, action_size) # select a random action (for each agent)
            else:
                actions = agent.act(states)              # select an action (for each agent)
                
            env_info = env.step(np.clip(actions, -1, 1))[brain_name]              # send all actions to the environment
            next_states = env_info.vector_observations                            # get next state (for each agent)
            rewards = env_info.rewards                                            # get reward (for each agent)
            dones = env_info.local_done                                           # see if episode finished
            agent.step(states, actions, rewards, next_states, dones)              # learn
            states = next_states                                                  # roll over states to next time step
            
            scores_one_episode += rewards
            if np.any(dones):                                                     # exit loop if episode finished
                break
                       
        score = np.average(scores_one_episode)
        scores.append(score)
        scores_window.append(score)
        mean_100 = np.mean(scores_window)
        
        if i_episode % 50 == 0:
            print('\rEpisode {}\tAvg: {:.3f}\tMin: {:.3f}\tMax: {:.3f}\talpha: {:.3f}\tPLoss: {:.3f}\tCLoss: {:.3f}\tEst: {:.3f}'.
                  format(i_episode, mean_100, 
                             np.min(np.array(scores_window)[-50:]),
                             np.max(np.array(scores_window)[-50:]),
                             agent.network.log_alpha.exp().cpu().detach().numpy().item(),
                             agent.policy_loss, np.mean(agent.critics_losses),
                             agent.estimation))
            Storage.save("weights\SAC_Value\eps_{}_avg_{:.3f}.pth".format(i_episode, mean_100), scores, agent=agent)
            
        if len(scores_window) >= 100 and np.mean(scores_window)>=2000:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode, mean_100))
            Storage.save("weights\SAC_Value\final.pth", scores, agent=agent)
            break
            
    return scores

In [5]:
import agents_maddpg
import random

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seed(354)

1. Every step, there are 12 new experiences. So before any update, there are 32x12 new experiences. If, MADDPG is sample efficient by 4 times, therefore, every update, 32x12 experiences should be learned 4 times!

In [28]:
from agents_maddpg.storage_sac import Storage
from agents_maddpg.model import TanhGaussianActorCriticValue
import torch.nn.functional as F
agent = Storage.new_sac_value( TanhGaussianActorCriticValue, states.shape[1], action_size, device,  
                    memory_size=int(1e5),
                    batch_size=32,
                    ACTIVATION = F.leaky_relu,
                    TAU=1e-2,
                    LR_CRITIC = 1e-4,
                    LR_ACTOR = 1e-4,
                    LR_ALPHA = 1e-4,
                    UPDATE_EVERY=1,
                    TRANSFER_EVERY=1,
                    UPDATE_LOOP=1,
                    GAMMA=0.99,
                    TARGET_ENTROPY=1.2,
                    Q_NUMBER = 3,
                    WEIGHT_DECAY = 0)

### Test save and load

In [29]:
from agents_maddpg.sac_value import SAC_Value
Storage.save("temp.ckp", [], agent)
loaded, scores = Storage.load("temp.ckp", device, agent_class = SAC_Value)

### Run training

In [None]:
# loaded.network.log_alpha = torch.nn.Parameter(torch.tensor(-0.5, dtype=torch.float32))
scores = train(loaded, scores, n_episodes=60000,train_mode=True)
plot_result(scores)

Episode 50	Avg: 0.565	Min: -0.120	Max: 3.021	alpha: 0.946	PLoss: -49.631	CLoss: 48.030	Est: 36.290
Episode 100	Avg: 0.503	Min: -9.767	Max: 12.849	alpha: 0.854	PLoss: -137.085	CLoss: 7.346	Est: 124.477
Episode 150	Avg: 0.966	Min: -8.776	Max: 11.872	alpha: 0.652	PLoss: -293.127	CLoss: 5.822	Est: 284.478
Episode 200	Avg: 1.735	Min: -4.921	Max: 15.248	alpha: 0.490	PLoss: -358.814	CLoss: 10.067	Est: 352.681
Episode 250	Avg: 2.904	Min: -4.654	Max: 20.957	alpha: 0.372	PLoss: -372.314	CLoss: 7.204	Est: 367.173
Episode 300	Avg: 3.183	Min: -6.475	Max: 13.052	alpha: 0.282	PLoss: -374.410	CLoss: 3.291	Est: 370.747
Episode 350	Avg: 2.112	Min: -7.204	Max: 13.285	alpha: 0.202	PLoss: -321.441	CLoss: 2.247	Est: 318.748
Episode 400	Avg: 1.507	Min: -10.376	Max: 17.065	alpha: 0.150	PLoss: -280.228	CLoss: 1.754	Est: 278.406
Episode 450	Avg: 1.391	Min: -9.667	Max: 14.051	alpha: 0.112	PLoss: -244.433	CLoss: 1.207	Est: 243.252
Episode 500	Avg: 0.453	Min: -20.580	Max: 11.557	alpha: 0.089	PLoss: -208.950	CLoss:

In [None]:
loaded, scores = Storage.load("weights/SAC_Value/eps_4250_avg_53.859.pth", device)
scores = train(loaded, scores, n_episodes=60000, train_mode=True, episode_start=len(scores)+1, start_at=0)
plot_result(scores)

### View the trained agent

In [None]:
loaded, scores = Storage.load("weights/SAC_Value/eps_3700_avg_1.200.pth", device)
loaded.network.eval()
env_info = env.reset(train_mode=False)[brain_name]

def act(network, states, device):
    states = torch.from_numpy(states).float().unsqueeze(0).to(device)
    ret = network(states).squeeze().cpu().data.numpy()
    return ret

for i in range(20):
    print(i)
    while True:
        actions = act(loaded.network, states, device)
        env_info = env.step(np.clip(actions, -1, 1))[brain_name]  # send all actions to the environment
        states = env_info.vector_observations                     # get next state (for each agent)
        dones = env_info.local_done                               # see if episode finished
        if np.any(dones):                                         # exit loop if episode finished
            break