In [4]:
import numpy as np
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
from collections import deque
import torch
import torch.optim as optim
from research.all_code import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [5]:
# Launch the Reacher Unity build. The path is relative to the notebook's working
# directory and points at a Windows executable, so this cell is platform-specific.
# The resulting `env` handle is a notebook-level global used by the cells below.
env = UnityEnvironment(file_name='./Reacher_Windows_x86_64/Reacher.exe')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [6]:
# Select the first brain exposed by the environment.
# `brain_name` is the key used to index every env.reset()/env.step() result in
# the cells below; `brain` provides the action/observation space sizes.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [7]:
# Reset the environment in training mode and keep the info for our brain.
# `env_info` stays in the kernel namespace and is read again by a later cell.
env_info = env.reset(train_mode=True)[brain_name]

# Number of agents reported by the reset info.
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# Size of each action vector, taken from the brain parameters.
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)


Number of agents: 20
Size of each action: 4


In [17]:
class BaseTaskUnity:
    """Thin task wrapper around a Unity environment.

    The wrapper reads the environment from ``self.env``, which this base class
    does not set itself (subclasses are expected to assign it), and indexes the
    environment results with the module-level ``brain_name``.
    """

    def __init__(self, train_mode):
        """Remember whether the environment should be reset in training mode."""
        self.train_mode = train_mode

    def reset(self):
        """Reset the environment and return the observations as an array."""
        info = self.env.reset(train_mode=self.train_mode)[brain_name]
        return np.array(info.vector_observations)

    def step(self, action):
        """Advance the environment by one step.

        Returns a 4-tuple ``(next_states, rewards, dones, None)`` where the
        first three entries are numpy arrays. If any agent reports done, the
        environment is reset and the returned states are the post-reset
        observations, while rewards and dones still describe the step that
        just finished.
        """
        info = self.env.step(action)[brain_name]
        observations = info.vector_observations
        rewards = info.rewards
        dones = info.local_done
        # Auto-reset: replace the observations with fresh ones once any agent is done.
        observations = self.reset() if np.any(dones) else observations
        return np.array(observations), np.array(rewards), np.array(dones), None

    def seed(self, random_seed):
        """Accept a seed for interface compatibility; intentionally a no-op."""
        pass

    
class ReacherV1(BaseTaskUnity):
    """Task wrapper bound to the notebook-level ``env`` and ``brain`` globals."""

    def __init__(self, name, train_mode):
        """Store the task name and take the environment and space sizes from the globals."""
        super().__init__(train_mode)
        self.name = name
        self.env = env
        self.state_dim = brain.vector_observation_space_size
        self.action_dim = brain.vector_action_space_size

    def step(self, action):
        """Clip the action to [-1, 1] and delegate to the base-class step."""
        bounded_action = np.clip(action, -1, 1)
        return super().step(bounded_action)

In [21]:
def run_steps_unity(agent, max_t=None):
    """Run ``agent.step()`` until the rolling score reaches the solve threshold.

    After every step the function prints the episode count, the mean of
    ``agent.scores_deque``, the latest score and ``agent.score_max``. Every
    100 episodes it saves a checkpoint, and once at least 100 scores are
    available and their mean reaches ``max_t * 30.0 / 950`` it saves the
    solved model and returns.

    Args:
        agent: object exposing ``config``, ``step()``, ``save(path)``,
            ``scores_deque``, ``scores_list`` and ``score_max``.
        max_t: number of steps used to scale the solve threshold. Defaults to
            ``agent.config.rollout_length``.
            NOTE(review): the original code read an undefined ``max_t`` here;
            the rollout length is assumed to be the intended value - confirm.

    Returns:
        Tuple ``(success, avg_score, scores_list, path)``: ``True``, the mean
        score at the time of solving, ``agent.scores_list`` and the path of the
        saved solved model. The loop has no other exit, so the function only
        returns after the threshold is reached.
    """
    random_seed()
    torch.manual_seed(np.random.randint(int(1e6)))
    config = agent.config
    if max_t is None:
        max_t = config.rollout_length
    solve_threshold = max_t * 30.0 / 950

    while True:
        agent.step()

        scores = agent.scores_deque
        mean_100 = np.mean(scores)
        i_episode = len(agent.scores_list)
        print('Episode {}\tAverage Score: {:.3f}\tLast Score: {:.3f}\tMax Score: {:.3f}'.format(
            i_episode, mean_100, scores[-1], agent.score_max))
        if i_episode % 100 == 0:
            # np.max(scores) is the best single score in the window, despite the label.
            print('\rEpisode {}\tAverage Score: {:.3f}\tMax Avg Score: {:.3f}'.format(
                i_episode, mean_100, np.max(scores)))
            save_path = 'PPO-ReacherV2-checkpoint.bin'
            agent.save(save_path)
        if len(scores) >= 100 and mean_100 >= solve_threshold:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode, mean_100))
            save_path = 'PPO-ReacherV2-solved.bin'
            agent.save(save_path)
            # The notebook caller unpacks four values from this result.
            return True, mean_100, agent.scores_list, save_path

In [22]:
def ppo_continuous_unity(train_mode=True):
    """Configure a PPO agent for the Reacher task and hand it to ``run_steps_unity``.

    Args:
        train_mode: forwarded to ``ReacherV1`` when the task factory is called.

    Returns:
        Whatever ``run_steps_unity`` returns for the constructed agent.
    """
    config = Config()

    # Task factory and fixed problem dimensions.
    config.task_fn = lambda: ReacherV1('ReacherV1',train_mode)
    config.state_dim = 33
    config.action_dim = 4

    def build_network():
        # Actor and critic each get their own fully connected body over the state.
        return GaussianActorCriticNet(
            config.state_dim,
            config.action_dim,
            actor_body=FCBody(config.state_dim),
            critic_body=FCBody(config.state_dim),
        )

    def build_optimizer(params):
        return torch.optim.Adam(params, 3e-4, eps=1e-5)

    config.network_fn = build_network
    config.optimizer_fn = build_optimizer

    # Remaining settings, applied in the same order as plain attribute assignments.
    settings = {
        'discount': 0.99,
        'use_gae': True,
        'gae_tau': 0.95,
        'gradient_clip': 5,
        'rollout_length': 256,
        'optimization_epochs': 10,
        'num_mini_batches': 256,
        'ppo_ratio_clip': 0.2,
        'log_interval': 2048,
        'max_steps': 2e7,
    }
    for attribute, value in settings.items():
        setattr(config, attribute, value)

    ppo_agent = PPOAgent(config)
    return run_steps_unity(ppo_agent)

In [27]:
# NOTE(review): this star import repeats the one in the first cell; with
# `%autoreload 2` active it is presumably redundant - confirm before removing.
from research.all_code import *
# Start PPO training in train mode. The call is expected to return a 4-tuple
# (success, avg_score, scores_list, path); unpacking fails if it returns anything else.
success, avg_score, scores_list, path = ppo_continuous_unity(True)

Episode 1	Average Score: 0.020	Last Score: 0.020	Max Score: 0.410
Episode 2	Average Score: 0.016	Last Score: 0.011	Max Score: 0.220
Episode 3	Average Score: 0.012	Last Score: 0.004	Max Score: 0.080
Episode 4	Average Score: 0.020	Last Score: 0.046	Max Score: 0.450
Episode 5	Average Score: 0.037	Last Score: 0.104	Max Score: 0.720
Episode 6	Average Score: 0.047	Last Score: 0.093	Max Score: 0.610
Episode 7	Average Score: 0.056	Last Score: 0.116	Max Score: 0.740
Episode 8	Average Score: 0.070	Last Score: 0.168	Max Score: 0.730
Episode 9	Average Score: 0.082	Last Score: 0.177	Max Score: 0.710
Episode 10	Average Score: 0.093	Last Score: 0.186	Max Score: 0.870
Episode 11	Average Score: 0.102	Last Score: 0.197	Max Score: 1.130
Episode 12	Average Score: 0.115	Last Score: 0.255	Max Score: 1.160
Episode 13	Average Score: 0.113	Last Score: 0.085	Max Score: 0.430
Episode 14	Average Score: 0.119	Last Score: 0.202	Max Score: 1.030
Episode 15	Average Score: 0.123	Last Score: 0.177	Max Score: 0.740
Epis

KeyboardInterrupt: 

In [None]:
# Train the project's own PPO implementation on the already-running `env`.
# NOTE(review): this cell depends on kernel state: `env_info` and `brain` come from
# earlier cells, and `train` is defined in a cell further down the notebook, so
# that cell must be executed first on a fresh kernel.
from agents.ppo import PPO
from agents.model_ppo import Gaussian
# NOTE(review): `random` is not referenced anywhere in this cell.
import random

# `random_seed` is not defined in the notebook; presumably it comes from the
# `research.all_code` star import - confirm.
random_seed()
device = "cpu"
# Observations from the earlier reset; only their second dimension is used, as the network input size.
states = env_info.vector_observations
action_size = brain.vector_action_space_size
network = Gaussian(states.shape[1], action_size).to(device)
agent = PPO(network, device,
                 LR=3e-4,
                 GRADIENT_CLIP=5, 
                 EPOCHS=10, 
                 BATCH_SIZE=256,
                GAMMA=0.99,
                GAE_TAU=0.95,
                CLIP_EPSILON=0.2)
# Up to 6000 episodes of at most 256 steps each; `train` returns the per-episode scores.
scores = train(agent, n_episodes=6000, max_t=256, train_mode=True)
# `plot_result` is not defined in the notebook; presumably provided by the star import - confirm.
plot_result(scores)

Episode 1	Average Score: 0.002	Last Score: 0.002	Max Score: 0.050
Episode 2	Average Score: 0.004	Last Score: 0.006	Max Score: 0.130
Episode 3	Average Score: 0.010	Last Score: 0.020	Max Score: 0.260
Episode 4	Average Score: 0.011	Last Score: 0.015	Max Score: 0.270
Episode 5	Average Score: 0.016	Last Score: 0.034	Max Score: 0.570
Episode 6	Average Score: 0.027	Last Score: 0.083	Max Score: 1.140
Episode 7	Average Score: 0.033	Last Score: 0.065	Max Score: 0.430
Episode 8	Average Score: 0.040	Last Score: 0.093	Max Score: 0.440
Episode 9	Average Score: 0.073	Last Score: 0.339	Max Score: 1.910
Episode 10	Average Score: 0.078	Last Score: 0.116	Max Score: 0.520
Episode 11	Average Score: 0.096	Last Score: 0.279	Max Score: 1.020
Episode 12	Average Score: 0.109	Last Score: 0.257	Max Score: 0.920
Episode 13	Average Score: 0.127	Last Score: 0.337	Max Score: 0.980
Episode 14	Average Score: 0.142	Last Score: 0.330	Max Score: 1.070
Episode 15	Average Score: 0.146	Last Score: 0.213	Max Score: 0.690
Epis

Episode 123	Average Score: 0.912	Last Score: 1.435	Max Score: 2.820
Episode 124	Average Score: 0.922	Last Score: 1.407	Max Score: 2.680
Episode 125	Average Score: 0.936	Last Score: 1.698	Max Score: 3.240
Episode 126	Average Score: 0.949	Last Score: 1.720	Max Score: 3.520
Episode 127	Average Score: 0.961	Last Score: 1.654	Max Score: 3.650
Episode 128	Average Score: 0.972	Last Score: 1.473	Max Score: 2.760
Episode 129	Average Score: 0.983	Last Score: 1.532	Max Score: 2.730
Episode 130	Average Score: 0.992	Last Score: 1.220	Max Score: 2.870
Episode 131	Average Score: 1.007	Last Score: 1.859	Max Score: 3.870
Episode 132	Average Score: 1.018	Last Score: 1.500	Max Score: 2.920
Episode 133	Average Score: 1.030	Last Score: 1.678	Max Score: 3.430
Episode 134	Average Score: 1.036	Last Score: 1.385	Max Score: 3.080
Episode 135	Average Score: 1.048	Last Score: 1.612	Max Score: 2.970
Episode 136	Average Score: 1.058	Last Score: 1.639	Max Score: 3.410
Episode 137	Average Score: 1.069	Last Score: 1.4

Episode 243	Average Score: 2.079	Last Score: 2.377	Max Score: 3.760
Episode 244	Average Score: 2.090	Last Score: 2.489	Max Score: 4.890
Episode 245	Average Score: 2.103	Last Score: 3.076	Max Score: 7.340
Episode 246	Average Score: 2.110	Last Score: 2.367	Max Score: 4.440
Episode 247	Average Score: 2.117	Last Score: 2.129	Max Score: 3.490
Episode 248	Average Score: 2.115	Last Score: 1.978	Max Score: 3.650
Episode 249	Average Score: 2.128	Last Score: 2.990	Max Score: 5.830
Episode 250	Average Score: 2.128	Last Score: 2.015	Max Score: 4.190
Episode 251	Average Score: 2.133	Last Score: 2.411	Max Score: 4.530
Episode 252	Average Score: 2.134	Last Score: 2.152	Max Score: 4.510
Episode 253	Average Score: 2.135	Last Score: 1.787	Max Score: 4.020
Episode 254	Average Score: 2.137	Last Score: 2.224	Max Score: 5.460
Episode 255	Average Score: 2.147	Last Score: 2.747	Max Score: 6.190
Episode 256	Average Score: 2.149	Last Score: 2.128	Max Score: 4.660
Episode 257	Average Score: 2.158	Last Score: 2.7

Episode 363	Average Score: 3.048	Last Score: 3.810	Max Score: 6.430
Episode 364	Average Score: 3.064	Last Score: 3.722	Max Score: 6.470
Episode 365	Average Score: 3.076	Last Score: 3.673	Max Score: 7.560
Episode 366	Average Score: 3.088	Last Score: 4.042	Max Score: 6.600
Episode 367	Average Score: 3.098	Last Score: 3.587	Max Score: 5.890
Episode 368	Average Score: 3.112	Last Score: 3.605	Max Score: 5.650
Episode 369	Average Score: 3.126	Last Score: 4.193	Max Score: 6.770
Episode 370	Average Score: 3.139	Last Score: 3.967	Max Score: 8.720
Episode 371	Average Score: 3.146	Last Score: 3.721	Max Score: 5.800
Episode 372	Average Score: 3.159	Last Score: 4.123	Max Score: 8.290
Episode 373	Average Score: 3.176	Last Score: 4.253	Max Score: 7.530
Episode 374	Average Score: 3.190	Last Score: 4.174	Max Score: 7.520
Episode 375	Average Score: 3.210	Last Score: 4.516	Max Score: 6.920
Episode 376	Average Score: 3.223	Last Score: 4.086	Max Score: 6.770
Episode 377	Average Score: 3.232	Last Score: 3.5

Episode 483	Average Score: 4.928	Last Score: 5.616	Max Score: 8.520
Episode 484	Average Score: 4.939	Last Score: 5.300	Max Score: 8.290
Episode 485	Average Score: 4.950	Last Score: 5.181	Max Score: 8.670
Episode 486	Average Score: 4.962	Last Score: 5.504	Max Score: 7.820
Episode 487	Average Score: 4.972	Last Score: 5.460	Max Score: 8.100
Episode 488	Average Score: 4.990	Last Score: 5.904	Max Score: 9.050
Episode 489	Average Score: 5.012	Last Score: 5.958	Max Score: 9.420
Episode 490	Average Score: 5.021	Last Score: 5.073	Max Score: 8.300
Episode 491	Average Score: 5.034	Last Score: 5.527	Max Score: 9.130
Episode 492	Average Score: 5.046	Last Score: 5.561	Max Score: 9.120
Episode 493	Average Score: 5.061	Last Score: 5.377	Max Score: 9.360
Episode 494	Average Score: 5.062	Last Score: 5.462	Max Score: 8.490
Episode 495	Average Score: 5.066	Last Score: 5.586	Max Score: 9.030
Episode 496	Average Score: 5.073	Last Score: 5.505	Max Score: 9.060
Episode 497	Average Score: 5.081	Last Score: 4.9

Episode 602	Average Score: 6.779	Last Score: 8.011	Max Score: 9.510
Episode 603	Average Score: 6.794	Last Score: 7.122	Max Score: 8.760
Episode 604	Average Score: 6.815	Last Score: 7.898	Max Score: 9.260
Episode 605	Average Score: 6.839	Last Score: 8.099	Max Score: 9.340
Episode 606	Average Score: 6.858	Last Score: 7.352	Max Score: 9.590
Episode 607	Average Score: 6.880	Last Score: 7.790	Max Score: 9.330
Episode 608	Average Score: 6.896	Last Score: 7.758	Max Score: 9.420
Episode 609	Average Score: 6.917	Last Score: 7.814	Max Score: 9.580
Episode 610	Average Score: 6.938	Last Score: 7.966	Max Score: 9.650
Episode 611	Average Score: 6.965	Last Score: 8.308	Max Score: 9.640
Episode 612	Average Score: 6.987	Last Score: 8.279	Max Score: 9.700
Episode 613	Average Score: 7.009	Last Score: 7.957	Max Score: 9.010
Episode 614	Average Score: 7.038	Last Score: 8.057	Max Score: 9.730
Episode 615	Average Score: 7.054	Last Score: 7.626	Max Score: 9.300
Episode 616	Average Score: 7.073	Last Score: 7.8

In [25]:
def train(agent, n_episodes=500, max_t=500, train_mode=True):
    """Collect rollouts from the global ``env`` and update ``agent`` once per episode.

    Each episode resets the environment, runs up to ``max_t`` steps (stopping
    early if any agent reports done), passes the collected trajectory to
    ``agent.learn`` and records the mean of the per-agent returns as the
    episode score. The agent is saved every 100 episodes and again when the
    100-episode mean reaches ``max_t*30.0/950``, at which point training stops.

    Args:
        agent: object exposing ``act(states)``, ``learn(...)`` and ``save()``.
        n_episodes: maximum number of episodes to run.
        max_t: maximum number of environment steps per episode.
        train_mode: forwarded to ``env.reset``.

    Returns:
        List with one score per completed episode.
    """
    # presumably scales a target score of 30 to the rollout length - TODO confirm
    solve_threshold = max_t*30.0/950
    episode_line = 'Episode {}\tAverage Score: {:.3f}\tLast Score: {:.3f}\tMax Score: {:.3f}'

    all_scores = []                     # score of every episode so far
    recent_scores = deque(maxlen=100)   # sliding window over the last 100 scores

    for i_episode in range(1, n_episodes+1):
        states = env.reset(train_mode=train_mode)[brain_name].vector_observations
        per_agent_return = np.zeros(states.shape[0])
        # One list per trajectory field, in the positional order agent.learn expects:
        # states, actions, log_probs, values, rewards, next_states, dones.
        buffers = [[] for _ in range(7)]

        for _ in range(max_t):
            actions, log_probs, values = agent.act(states)
            # The environment receives clipped actions; the raw actions are what gets stored.
            step_info = env.step(np.clip(actions, -1, 1))[brain_name]
            next_states = step_info.vector_observations
            rewards = step_info.rewards
            dones = step_info.local_done
            per_agent_return += rewards
            transition = (states, actions, log_probs, values, rewards, next_states, dones)
            for buffer, item in zip(buffers, transition):
                buffer.append(item)
            states = next_states
            if np.any(dones):
                break

        agent.learn(*buffers)

        episode_score = np.mean(per_agent_return)
        all_scores.append(episode_score)
        recent_scores.append(episode_score)
        mean_100 = np.mean(recent_scores)
        print(episode_line.format(i_episode, mean_100, episode_score, np.max(per_agent_return)))

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.3f}\tMax Avg Score: {:.3f}'.format(i_episode, mean_100, np.max(recent_scores)))
            agent.save()

        solved = len(recent_scores) >= 100 and mean_100 >= solve_threshold
        if solved:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(i_episode, mean_100))
            agent.save()
            break

    return all_scores