## Import the necessary library

In [1]:
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
env = UnityEnvironment(file_name="./Crawler_Windows_x86_64/Crawler.exe")
# env = UnityEnvironment(file_name="./Reacher_Windows_x86_64/Reacher.exe")
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

env_info = env.reset(train_mode=False)[brain_name]
num_agents = len(env_info.agents)
states = env_info.vector_observations
action_size = brain.vector_action_space_size
state_size = states.shape[1]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: CrawlerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 129
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 20
        Vector Action descriptions: , , , , , , , , , , , , , , , , , , , 


## Method to plot the progress of the agent's score

In [3]:
def plot_result(scores):
    # plot the scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

In [22]:
from agents_maddpg.storage_sac import Storage
def train(agent, scores=[], n_episodes=500, train_mode=True, episode_start=1, start_to_learn_at = 0):
    scores_window = deque(maxlen=100)  # last 100 scores
    for s in scores[-100:]:
        scores_window.append(s)
    
    frame_no = 0
    for i_episode in range(episode_start, n_episodes+1):
        env_info = env.reset(train_mode=train_mode)[brain_name]
        states = env_info.vector_observations
        scores_one_episode = np.zeros(num_agents)
        while True:
            frame_no += 1
            if(frame_no < start_to_learn_at):
                actions = agent.test(states)
            else:
                actions = agent.act(states)              # select an action (for each agent)
                
            env_info = env.step(np.clip(actions * 1.001, -1, 1))[brain_name]              # send all actions to the environment
            next_states = env_info.vector_observations                            # get next state (for each agent)
            rewards = env_info.rewards                                            # get reward (for each agent)
            dones = env_info.local_done                                           # see if episode finished
            
            if(frame_no < start_to_learn_at):
                agent.add_to_memory(states, actions, rewards, next_states, dones) # add only to memory                      
            else:
                agent.step(states, actions, rewards, next_states, dones)          # learn
            states = next_states                                                  # roll over states to next time step
            
            scores_one_episode += rewards
            if np.any(dones):                                                     # exit loop if episode finished
                break
                       
        score = np.average(scores_one_episode)
        scores.append(score)
        scores_window.append(score)
        mean_100 = np.mean(scores_window)
        
        if i_episode % 50 == 0:
            print('\rEpisode {}\tAvg: {:.3f}\tMin: {:.3f}\tMax: {:.3f}\talpha: {:.3f}\tPLoss: {:.3f}\tCLoss: {:.3f}\tEst: {:.3f}'.
                  format(i_episode, mean_100, 
                             np.min(np.array(scores_window)[-50:]),
                             np.max(np.array(scores_window)[-50:]),
                             agent.network.log_alpha.exp().cpu().detach().numpy().item(),
                             agent.policy_loss, np.mean(agent.critics_losses),
                             agent.estimation))
            Storage.save("weights\SAC\eps_{}_avg_{:.3f}.pth".format(i_episode, mean_100), scores, agent=agent)
            
        if len(scores_window) >= 100 and np.mean(scores_window)>=2000:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode, mean_100))
            Storage.save("weights\SAC\final.pth", scores, agent=agent)
            break
            
    return scores

In [23]:
import agents_maddpg
import random

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seed(354)

1. Consider data in the future with higher GAMMA
2. Bigger Q_Number
3. Amplifier of move before sending to environment
4. 3 loops of update every 2 timesteps
5. Bigger hidden layer 128 instead of 112

In [26]:
from agents_maddpg.model import TanhGaussianActorCritic
import torch.nn.functional as F
agent = Storage.new( TanhGaussianActorCritic, states.shape[1], action_size, device,  
                    memory_size=int(1e5),
                    batch_size=16,
                    ACTIVATION = F.leaky_relu,
                    TAU=1e-2,
                    LR_CRITIC = 1e-4,
                    LR_ACTOR = 1e-4,
                    LR_ALPHA = 1e-4,
                    UPDATE_EVERY=2,
                    TRANSFER_EVERY=1,
                    UPDATE_LOOP=3,
                    GAMMA=0.992,
                    TARGET_ENTROPY=2,
                    Q_NUMBER = 3,
                    WEIGHT_DECAY = 0)

### Test save and load

In [27]:
Storage.save("temp.ckp", [], agent)
loaded, scores = Storage.load("temp.ckp", device)

### Run training

In [28]:
# loaded.network.log_alpha = torch.nn.Parameter(torch.tensor(-0.5, dtype=torch.float32))
scores = train(loaded, scores, n_episodes=60000,train_mode=True, start_to_learn_at=0)
plot_result(scores)

Episode 50	Avg: 1.464	Min: -8.353	Max: 15.257	alpha: 0.774	PLoss: -264.682	CLoss: 132.077	Est: 255.743
Episode 100	Avg: 2.123	Min: -6.902	Max: 13.754	alpha: 0.532	PLoss: -355.901	CLoss: 137.241	Est: 352.960
Episode 150	Avg: 4.202	Min: -3.879	Max: 19.025	alpha: 0.361	PLoss: -343.427	CLoss: 63.450	Est: 341.852
Episode 200	Avg: 4.228	Min: -9.098	Max: 11.883	alpha: 0.266	PLoss: -301.306	CLoss: 93.403	Est: 298.919
Episode 250	Avg: 4.777	Min: -6.602	Max: 20.546	alpha: 0.175	PLoss: -242.357	CLoss: 49.654	Est: 241.328
Episode 300	Avg: 4.744	Min: -14.370	Max: 20.704	alpha: 0.132	PLoss: -201.047	CLoss: 44.774	Est: 199.968
Episode 350	Avg: 2.520	Min: -5.901	Max: 9.455	alpha: 0.106	PLoss: -164.944	CLoss: 33.613	Est: 163.805
Episode 400	Avg: 2.289	Min: -10.037	Max: 12.388	alpha: 0.076	PLoss: -108.193	CLoss: 13.181	Est: 107.848
Episode 450	Avg: 1.522	Min: -3.985	Max: 8.872	alpha: 0.068	PLoss: -96.182	CLoss: 19.965	Est: 95.674
Episode 500	Avg: 3.312	Min: -2.219	Max: 22.777	alpha: 0.055	PLoss: -63.815

Episode 4100	Avg: 38.407	Min: 6.915	Max: 185.954	alpha: 0.065	PLoss: -38.411	CLoss: 1.166	Est: 38.337
Episode 4150	Avg: 40.683	Min: 5.894	Max: 150.560	alpha: 0.066	PLoss: -40.070	CLoss: 5.408	Est: 40.098
Episode 4200	Avg: 42.489	Min: 8.606	Max: 136.555	alpha: 0.067	PLoss: -35.455	CLoss: 15.958	Est: 35.438
Episode 4250	Avg: 45.635	Min: 7.156	Max: 169.107	alpha: 0.064	PLoss: -34.889	CLoss: 6.271	Est: 34.591
Episode 4300	Avg: 51.830	Min: 4.154	Max: 247.731	alpha: 0.068	PLoss: -41.211	CLoss: 3.820	Est: 41.112
Episode 4350	Avg: 63.992	Min: 5.151	Max: 288.366	alpha: 0.077	PLoss: -43.944	CLoss: 8.362	Est: 43.845
Episode 4400	Avg: 50.859	Min: 2.452	Max: 91.680	alpha: 0.068	PLoss: -39.917	CLoss: 46.308	Est: 39.740
Episode 4450	Avg: 37.080	Min: 4.278	Max: 180.478	alpha: 0.068	PLoss: -33.552	CLoss: 20.318	Est: 33.294
Episode 4500	Avg: 48.499	Min: 9.557	Max: 208.146	alpha: 0.065	PLoss: -42.173	CLoss: 19.544	Est: 42.206
Episode 4550	Avg: 70.892	Min: 11.735	Max: 313.520	alpha: 0.065	PLoss: -45.949	C

Episode 8100	Avg: 295.563	Min: 8.614	Max: 892.614	alpha: 0.069	PLoss: -96.542	CLoss: 2.481	Est: 96.327
Episode 8150	Avg: 343.926	Min: 20.376	Max: 968.904	alpha: 0.069	PLoss: -96.707	CLoss: 3.931	Est: 96.425
Episode 8200	Avg: 292.141	Min: 5.415	Max: 840.246	alpha: 0.088	PLoss: -89.400	CLoss: 30.407	Est: 89.546
Episode 8250	Avg: 220.093	Min: 12.348	Max: 864.883	alpha: 0.080	PLoss: -95.144	CLoss: 47.435	Est: 94.957
Episode 8300	Avg: 316.114	Min: 5.655	Max: 997.563	alpha: 0.074	PLoss: -102.428	CLoss: 2.923	Est: 102.086
Episode 8350	Avg: 364.575	Min: 20.314	Max: 1038.500	alpha: 0.068	PLoss: -101.441	CLoss: 1.522	Est: 101.643
Episode 8400	Avg: 323.291	Min: 29.724	Max: 917.779	alpha: 0.071	PLoss: -97.730	CLoss: 2.085	Est: 97.851
Episode 8450	Avg: 322.792	Min: 38.377	Max: 950.004	alpha: 0.074	PLoss: -94.466	CLoss: 38.056	Est: 94.328
Episode 8500	Avg: 357.610	Min: 4.117	Max: 1038.434	alpha: 0.062	PLoss: -103.073	CLoss: 1.490	Est: 103.054
Episode 8550	Avg: 357.860	Min: 3.765	Max: 1051.876	alpha:

KeyboardInterrupt: 

## To do next
1. From OpenAI
    1. At test time, to see how well the policy exploits what it has learned, remove stochasticity and use the mean action instead of a sample from the distribution. This tends to improve performance over the original stochastic policy.
    1. Explore randomly prior to start SAC befor n_steps
    1. Use Value Network
    1. Set proper entropy based on formula from OpenAI H = - log(x)

## To experiment
1. Set entropy target as -np.prod(self.env.action_space.shape).item()  # heuristic value from Tuomas according to rlkit
1. Use Prioritized Experience Replay
1. Set fixed alpha
1. What about using that approximator of tanh likelihood trick in PPO Model?
1. What about using target actor as well?
1. What about using MADDPG with alpha optimization, entropy, and with the approximator of tanh likelihood
1. What about clipped double-Q trick on PPO and DDPG?


OpenAI
1. https://spinningup.openai.com/en/latest/algorithms/sac.html#pseudocode

1. Use Value network local and target instead of Q Network local and Target

Giving the agent a vision of what will happen in the future should help. But we have seen that a high bootstrap size at early training will make the agent failing to get enough data. However, at this stage, the agent is able to move a bit so gradually increasing the bootstrap might help.

In [32]:
torch.set_num_threads(6)
torch.manual_seed(486)
loaded, scores = Storage.load("weights/SAC/eps_9350_avg_328.937.pth", device)
loaded.memory.memory.clear()
scores = train(loaded, scores, n_episodes=60000, train_mode=True, episode_start=len(scores)+1, start_to_learn_at=int(2e4))
plot_result(scores)

Episode 9400	Avg: 410.832	Min: 0.670	Max: 1221.947	alpha: 0.058	PLoss: -30.175	CLoss: 0.239	Est: 29.878
Episode 9450	Avg: 254.566	Min: -0.327	Max: 39.428	alpha: 0.055	PLoss: -50.425	CLoss: 3.992	Est: 50.403
Episode 9500	Avg: 9.475	Min: -0.799	Max: 24.305	alpha: 0.054	PLoss: -53.621	CLoss: 5.634	Est: 53.455
Episode 9550	Avg: 5.134	Min: -4.915	Max: 15.599	alpha: 0.049	PLoss: -57.712	CLoss: 106.740	Est: 57.351
Episode 9600	Avg: 2.346	Min: -3.498	Max: 7.154	alpha: 0.049	PLoss: -50.163	CLoss: 2.999	Est: 50.227
Episode 9650	Avg: 6.674	Min: -3.908	Max: 50.557	alpha: 0.045	PLoss: -54.673	CLoss: 6.979	Est: 54.738
Episode 9700	Avg: 12.164	Min: 0.845	Max: 53.947	alpha: 0.054	PLoss: -49.331	CLoss: 8.921	Est: 49.678
Episode 9750	Avg: 11.711	Min: -3.054	Max: 35.900	alpha: 0.058	PLoss: -41.747	CLoss: 6.393	Est: 41.286
Episode 9800	Avg: 9.714	Min: 0.132	Max: 19.789	alpha: 0.055	PLoss: -32.320	CLoss: 5.177	Est: 31.945
Episode 9850	Avg: 9.209	Min: 4.024	Max: 16.471	alpha: 0.053	PLoss: -23.405	CLoss: 9.5

Episode 13400	Avg: 138.018	Min: 8.536	Max: 358.630	alpha: 0.071	PLoss: -79.283	CLoss: 35.009	Est: 79.105
Episode 13450	Avg: 115.744	Min: 8.180	Max: 295.568	alpha: 0.071	PLoss: -76.243	CLoss: 7.693	Est: 76.099
Episode 13500	Avg: 88.205	Min: 7.116	Max: 274.749	alpha: 0.076	PLoss: -72.092	CLoss: 4.538	Est: 71.766
Episode 13550	Avg: 102.060	Min: 7.746	Max: 462.206	alpha: 0.069	PLoss: -75.891	CLoss: 6.149	Est: 75.848
Episode 13600	Avg: 140.766	Min: 2.976	Max: 836.732	alpha: 0.061	PLoss: -80.308	CLoss: 8.908	Est: 80.092
Episode 13650	Avg: 172.777	Min: 10.109	Max: 675.530	alpha: 0.061	PLoss: -80.981	CLoss: 1.127	Est: 80.614
Episode 13700	Avg: 155.438	Min: 12.400	Max: 604.810	alpha: 0.063	PLoss: -84.736	CLoss: 1.358	Est: 84.667
Episode 13750	Avg: 143.163	Min: 22.552	Max: 466.370	alpha: 0.075	PLoss: -85.620	CLoss: 10.142	Est: 84.976
Episode 13800	Avg: 143.367	Min: 11.878	Max: 873.261	alpha: 0.075	PLoss: -81.335	CLoss: 3.686	Est: 81.261
Episode 13850	Avg: 128.440	Min: 0.959	Max: 488.869	alpha: 0

Episode 17300	Avg: 176.276	Min: 11.227	Max: 860.437	alpha: 0.096	PLoss: -100.302	CLoss: 5.456	Est: 100.550
Episode 17350	Avg: 215.956	Min: 37.681	Max: 930.180	alpha: 0.095	PLoss: -113.980	CLoss: 27.427	Est: 113.854
Episode 17400	Avg: 224.305	Min: 17.233	Max: 1056.956	alpha: 0.103	PLoss: -117.419	CLoss: 3.760	Est: 116.370
Episode 17450	Avg: 243.160	Min: 19.062	Max: 1211.840	alpha: 0.100	PLoss: -119.690	CLoss: 5.648	Est: 119.123
Episode 17500	Avg: 252.286	Min: 28.750	Max: 1236.513	alpha: 0.096	PLoss: -115.197	CLoss: 7.390	Est: 115.156
Episode 17550	Avg: 244.899	Min: 16.049	Max: 961.475	alpha: 0.092	PLoss: -110.491	CLoss: 5.196	Est: 109.962
Episode 17600	Avg: 215.137	Min: 1.151	Max: 681.357	alpha: 0.089	PLoss: -110.806	CLoss: 4.232	Est: 110.889
Episode 17650	Avg: 207.496	Min: 16.218	Max: 664.123	alpha: 0.098	PLoss: -118.968	CLoss: 10.112	Est: 119.182
Episode 17700	Avg: 224.719	Min: 4.929	Max: 623.731	alpha: 0.103	PLoss: -109.638	CLoss: 32.974	Est: 109.273
Episode 17750	Avg: 231.852	Min: 1

KeyboardInterrupt: 

In [36]:
scores = train(loaded, scores, n_episodes=60000, train_mode=True, episode_start=len(scores)+1, start_to_learn_at=0)

Episode 18250	Avg: 332.126	Min: 42.862	Max: 1678.263	alpha: 0.100	PLoss: -135.858	CLoss: 7.311	Est: 135.700
Episode 18300	Avg: 381.175	Min: 27.181	Max: 1060.235	alpha: 0.098	PLoss: -134.425	CLoss: 8.434	Est: 134.212
Episode 18350	Avg: 311.476	Min: 8.455	Max: 1452.880	alpha: 0.092	PLoss: -127.259	CLoss: 11.939	Est: 126.759
Episode 18400	Avg: 256.611	Min: 16.155	Max: 738.656	alpha: 0.091	PLoss: -121.881	CLoss: 29.024	Est: 121.934
Episode 18450	Avg: 231.901	Min: 27.529	Max: 1059.038	alpha: 0.089	PLoss: -115.977	CLoss: 22.155	Est: 115.386
Episode 18500	Avg: 264.555	Min: 16.324	Max: 1335.507	alpha: 0.090	PLoss: -124.919	CLoss: 3.045	Est: 124.617
Episode 18550	Avg: 271.432	Min: 24.845	Max: 873.671	alpha: 0.101	PLoss: -121.271	CLoss: 4.413	Est: 120.969
Episode 18600	Avg: 255.597	Min: 14.721	Max: 925.944	alpha: 0.091	PLoss: -110.807	CLoss: 12.170	Est: 110.691
Episode 18650	Avg: 253.568	Min: 7.019	Max: 930.653	alpha: 0.098	PLoss: -122.028	CLoss: 30.531	Est: 122.202
Episode 18700	Avg: 278.282	Mi

Episode 22050	Avg: 526.429	Min: 60.651	Max: 1648.696	alpha: 0.106	PLoss: -176.659	CLoss: 6.553	Est: 176.574
Episode 22100	Avg: 624.167	Min: 36.237	Max: 1907.172	alpha: 0.126	PLoss: -181.807	CLoss: 4.045	Est: 181.015
Episode 22150	Avg: 534.804	Min: 18.001	Max: 1502.241	alpha: 0.127	PLoss: -179.619	CLoss: 31.183	Est: 179.615
Episode 22200	Avg: 539.316	Min: 9.881	Max: 1664.664	alpha: 0.130	PLoss: -165.356	CLoss: 10.807	Est: 164.822
Episode 22250	Avg: 554.482	Min: 29.078	Max: 1751.266	alpha: 0.117	PLoss: -170.278	CLoss: 14.612	Est: 169.897
Episode 22300	Avg: 580.170	Min: 22.163	Max: 1482.407	alpha: 0.116	PLoss: -173.965	CLoss: 9.996	Est: 173.657
Episode 22350	Avg: 581.960	Min: 23.860	Max: 1221.010	alpha: 0.111	PLoss: -167.835	CLoss: 13.827	Est: 166.746
Episode 22400	Avg: 391.680	Min: 15.070	Max: 1191.366	alpha: 0.112	PLoss: -150.033	CLoss: 6.369	Est: 149.095
Episode 22450	Avg: 388.087	Min: 50.209	Max: 1275.619	alpha: 0.120	PLoss: -162.916	CLoss: 15.241	Est: 162.698
Episode 22500	Avg: 469.3

KeyboardInterrupt: 

In [38]:
scores = train(loaded, scores, n_episodes=60000, train_mode=True, episode_start=len(scores)+1, start_to_learn_at=0)
plot_result(scores)

Episode 24400	Avg: 449.323	Min: 19.725	Max: 1487.055	alpha: 0.119	PLoss: -171.707	CLoss: 10.099	Est: 171.339
Episode 24450	Avg: 478.351	Min: 73.782	Max: 1888.674	alpha: 0.135	PLoss: -183.072	CLoss: 16.098	Est: 182.765
Episode 24500	Avg: 581.737	Min: 13.387	Max: 1793.168	alpha: 0.129	PLoss: -178.102	CLoss: 5.008	Est: 177.959
Episode 24550	Avg: 570.867	Min: 54.205	Max: 1810.905	alpha: 0.127	PLoss: -181.217	CLoss: 4.325	Est: 181.142
Episode 24600	Avg: 502.182	Min: 21.262	Max: 1239.237	alpha: 0.131	PLoss: -185.252	CLoss: 4.955	Est: 184.581
Episode 24650	Avg: 412.710	Min: 44.833	Max: 892.714	alpha: 0.134	PLoss: -179.438	CLoss: 7.444	Est: 179.411
Episode 24700	Avg: 439.143	Min: 73.746	Max: 1745.300	alpha: 0.130	PLoss: -165.489	CLoss: 27.109	Est: 165.591
Episode 24750	Avg: 488.138	Min: 10.340	Max: 1578.800	alpha: 0.119	PLoss: -186.340	CLoss: 25.348	Est: 185.846
Episode 24800	Avg: 527.559	Min: 23.000	Max: 1447.916	alpha: 0.125	PLoss: -183.033	CLoss: 29.267	Est: 182.580
Episode 24850	Avg: 566.5

Episode 28200	Avg: 674.667	Min: 56.575	Max: 1744.921	alpha: 0.142	PLoss: -196.184	CLoss: 5.811	Est: 195.991
Episode 28250	Avg: 605.016	Min: 54.834	Max: 1434.539	alpha: 0.135	PLoss: -194.978	CLoss: 11.583	Est: 194.507


KeyboardInterrupt: 

In [41]:
scores = train(loaded, scores, n_episodes=60000, train_mode=True, episode_start=len(scores)+1, start_to_learn_at=0)
plot_result(scores)

Episode 28300	Avg: 593.443	Min: 16.308	Max: 2143.458	alpha: 0.139	PLoss: -179.618	CLoss: 15.395	Est: 179.592
Episode 28350	Avg: 564.736	Min: 48.521	Max: 1860.572	alpha: 0.124	PLoss: -191.683	CLoss: 27.710	Est: 191.031
Episode 28400	Avg: 538.991	Min: 46.343	Max: 1155.595	alpha: 0.129	PLoss: -191.583	CLoss: 16.037	Est: 191.607
Episode 28450	Avg: 492.697	Min: 16.739	Max: 1498.646	alpha: 0.127	PLoss: -181.786	CLoss: 16.579	Est: 181.961


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Episode 28500	Avg: nan	Min: nan	Max: nan	alpha: nan	PLoss: nan	CLoss: nan	Est: nan
Episode 28550	Avg: nan	Min: -0.575	Max: -0.575	alpha: nan	PLoss: nan	CLoss: nan	Est: nan
Episode 28600	Avg: -0.575	Min: -0.575	Max: -0.575	alpha: nan	PLoss: nan	CLoss: nan	Est: nan
Episode 28650	Avg: -0.575	Min: -0.575	Max: -0.575	alpha: nan	PLoss: nan	CLoss: nan	Est: nan
Episode 28700	Avg: -0.575	Min: -0.575	Max: -0.575	alpha: nan	PLoss: nan	CLoss: nan	Est: nan
Episode 28750	Avg: -0.575	Min: -0.575	Max: -0.575	alpha: nan	PLoss: nan	CLoss: nan	Est: nan
Episode 28800	Avg: -0.575	Min: -0.575	Max: -0.575	alpha: nan	PLoss: nan	CLoss: nan	Est: nan
Episode 28850	Avg: -0.575	Min: -0.575	Max: -0.575	alpha: nan	PLoss: nan	CLoss: nan	Est: nan
Episode 28900	Avg: -0.575	Min: -0.575	Max: -0.575	alpha: nan	PLoss: nan	CLoss: nan	Est: nan
Episode 28950	Avg: -0.575	Min: -0.575	Max: -0.575	alpha: nan	PLoss: nan	CLoss: nan	Est: nan
Episode 29000	Avg: -0.575	Min: -0.575	Max: -0.575	alpha: nan	PLoss: nan	CLoss: nan	Est: nan


KeyboardInterrupt: 

In [48]:
loaded, scores = Storage.load("weights/SAC/eps_28150_avg_783.199.pth", device)
loaded.UPDATE_EVERY = 2
loaded.UPDATE_LOOP = 2
scores = train(loaded, scores, n_episodes=60000, train_mode=True, episode_start=len(scores)+1, start_to_learn_at=0)
plot_result(scores)

TypeError: an integer is required

## View the trained agent

In [39]:
loaded, scores = Storage.load("weights/SAC/eps_28250_avg_605.016.pth", device)
loaded.network.eval()
env_info = env.reset(train_mode=False)[brain_name]

def act(network, states, device):
    states = torch.from_numpy(states).float().unsqueeze(0).to(device)
    ret = network.actor.test(states).squeeze().cpu().data.numpy()
    return ret

for i in range(2):
    while True:
        actions = loaded.test(states)
        env_info = env.step(np.clip(actions, -1, 1))[brain_name]  # send all actions to the environment
        states = env_info.vector_observations                     # get next state (for each agent)
        dones = env_info.local_done                               # see if episode finished
        if np.any(dones):                                         # exit loop if episode finished
            break

1. https://towardsdatascience.com/soft-actor-critic-demystified-b8427df61665
1. https://ai.googleblog.com/2019/01/soft-actor-critic-deep-reinforcement.html