In [1]:
import numpy as np
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment
from collections import deque
import torch
import torch.optim as optim
from all_code import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Launch the Unity Reacher build. The relative path points at a Windows
# executable, so this cell has to be adjusted on other platforms/layouts.
env = UnityEnvironment(file_name='../Reacher_Windows_x86_64/Reacher.exe')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Use the first brain listed by the environment as the default brain.
# `brain_name` is also the key used to index the results of env.reset()/env.step().
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [5]:
# Restart the simulation in training mode and keep the default brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Gather the quantities of interest first ...
num_agents = len(env_info.agents)              # one entry per simulated agent
action_size = brain.vector_action_space_size   # length of one agent's action vector

# ... then report them.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)


Number of agents: 20
Size of each action: 4


In [7]:
class BaseTaskUnity:
    """Thin reset/step/seed adapter around a Unity ML-Agents environment.

    The environment is read from ``self.env`` (this base class does not set
    it; a subclass or the caller must), and results are indexed with the
    notebook-level ``brain_name``.
    """

    def __init__(self):
        """Nothing to initialise here; ``self.env`` is assigned elsewhere."""

    def reset(self):
        """Restart the environment in training mode.

        Returns:
            The vector observations of the reset environment as a NumPy array.
        """
        brain_info = self.env.reset(train_mode=True)[brain_name]
        observations = np.array(brain_info.vector_observations)
        return observations

    def step(self, action):
        """Send ``action`` to the environment and collect the outcome.

        Returns:
            A ``(next_states, rewards, dones, None)`` tuple of NumPy arrays
            (the last slot is always ``None``). When any entry of ``dones``
            is truthy the environment is reset and ``next_states`` holds the
            observations produced by that reset.
        """
        brain_info = self.env.step(action)[brain_name]
        rewards = np.array(brain_info.rewards)
        dones = np.array(brain_info.local_done)
        if dones.any():
            # An episode finished: hand back the fresh post-reset observations.
            next_states = self.reset()
        else:
            next_states = np.array(brain_info.vector_observations)
        return next_states, rewards, dones, None

    def seed(self, random_seed):
        """Accept a seed for interface compatibility; it is ignored."""

    
class ReacherV1(BaseTaskUnity):
    """Reacher task bound to the notebook-level Unity ``env`` and ``brain``."""

    def __init__(self, name, log_dir=None):
        """Store the task name and wire up the environment.

        Args:
            name: Label kept on the instance as ``self.name``.
            log_dir: Accepted but not used by this class.
        """
        super().__init__()
        self.name = name
        self.env = env
        # Dimensions are taken from the brain's declared action/observation sizes.
        self.action_dim = brain.vector_action_space_size
        self.state_dim = brain.vector_observation_space_size

    def step(self, action):
        """Clip ``action`` element-wise to [-1, 1], then step the environment."""
        bounded_action = np.clip(action, -1, 1)
        return super().step(bounded_action)

In [33]:
def run_steps_unity(agent, target_score=30.0, solved_window=100, checkpoint_every=100):
    """Call ``agent.step()`` until the task is solved or the step budget is spent.

    The agent must expose ``config`` (with ``max_steps``), ``scores_deque``
    (running window of episode scores), ``scores_list`` (all episode scores),
    ``total_steps``, ``step()``, ``save(path)`` and ``close()``.

    Args:
        agent: The agent to train.
        target_score: Mean score over the window that counts as solved.
        solved_window: Number of episodes the window must hold before the
            task may be declared solved. If ``scores_deque`` has a smaller
            ``maxlen``, a full deque is required instead.
        checkpoint_every: Save a checkpoint each time the episode count
            crosses a multiple of this value. Must be a positive integer.

    Returns:
        ``(success, scores_deque, scores_list, save_path)``. ``success`` is
        True when the target was reached and False when ``config.max_steps``
        ran out. In both cases the scores and the path of the file just
        saved are returned.
    """
    # Seed NumPy from OS entropy and derive the torch seed from it, so
    # separate runs are deliberately not reproducible.
    np.random.seed()
    torch.manual_seed(np.random.randint(int(1e6)))
    config = agent.config
    episodes_reported = 0  # episode count at the time of the last progress line
    while True:
        scores = agent.scores_deque
        episodes = len(agent.scores_list)
        # Avoid np.mean on an empty window (it warns and yields nan anyway).
        mean_score = np.mean(scores) if len(scores) else float('nan')

        # Report only when at least one new episode finished since the last
        # report. Otherwise the same line, and the same checkpoint, would be
        # emitted on every loop iteration.
        if episodes > episodes_reported:
            print('Episode {}\tAverage Score Last {} Episodes: {:.3f}\tAvg. Score (All Agents) Last Episode: {:.3f}'.format(episodes, len(scores), mean_score, agent.scores_list[-1]))
            # Integer-division comparison also catches a multiple that was
            # skipped because several episodes ended within one agent step.
            if episodes // checkpoint_every > episodes_reported // checkpoint_every:
                save_path = 'PPO-ReacherV2-checkpoint.bin'
                agent.save(save_path)
                print('Episode {}\tAverage Score Last {} Episodes: {:.3f}'.format(episodes, len(scores),
                    mean_score))
            episodes_reported = episodes

        # Only a full window may declare success. Without this, a few lucky
        # early episodes would "solve" the task and the message below would
        # report a negative episode count.
        maxlen = getattr(scores, 'maxlen', None)
        required = min(solved_window, maxlen) if maxlen else solved_window
        if len(scores) >= required and mean_score >= target_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(max(episodes - required, 0), mean_score))
            save_path = 'PPO-ReacherV2-solved.bin'
            agent.save(save_path)
            res = True, agent.scores_deque, agent.scores_list, save_path
            agent.close()
            return res

        if config.max_steps and agent.total_steps >= config.max_steps:
            print('\nMax steps reached!\tFinal Average Score: {:.3f}'.format(mean_score))
            save_path = 'PPO-ReacherV2-max-steps.bin'
            # Persist the final weights and hand the scores back so an
            # unsuccessful run can still be inspected.
            agent.save(save_path)
            res = False, agent.scores_deque, agent.scores_list, save_path
            agent.close()
            return res

        agent.step()

In [43]:
def ppo_continuous_unity():
    """Assemble the PPO configuration for the Reacher task and train with it.

    Returns:
        Whatever ``run_steps_unity`` returns for the constructed ``PPOAgent``.
    """
    def make_task():
        return ReacherV1('ReacherV1')

    def make_network():
        # Separate fully connected bodies for the actor and the critic.
        return GaussianActorCriticNet(
            config.state_dim,
            config.action_dim,
            actor_body=FCBody(config.state_dim),
            critic_body=FCBody(config.state_dim),
        )

    def make_optimizer(params):
        return torch.optim.Adam(params, 3e-4, eps=1e-5)

    config = Config()
    config.num_workers = num_agents
    config.task_fn = make_task
    config.state_dim = 33
    config.action_dim = 4
    config.network_fn = make_network
    config.optimizer_fn = make_optimizer

    # Remaining hyperparameters, applied in the listed order.
    hyperparameters = (
        ('discount', 0.99),
        ('use_gae', True),
        ('gae_tau', 0.95),
        ('gradient_clip', 5),
        ('rollout_length', 1024),
        ('optimization_epochs', 10),
        ('num_mini_batches', 256),
        ('ppo_ratio_clip', 0.2),
        ('log_interval', 2048),
        ('max_steps', 2e7),
    )
    for attribute, value in hyperparameters:
        setattr(config, attribute, value)

    return run_steps_unity(PPOAgent(config))

In [44]:
# NOTE(review): this wildcard re-import rebinds every public name exported by
# all_code. If all_code defines names that this notebook also defines (for
# example run_steps_unity or ReacherV1), the notebook versions would be
# shadowed from here on — TODO confirm.
from all_code import *
# Run training and unpack the 4-tuple that ppo_continuous_unity passes through
# from run_steps_unity.
success, avg_score, scores_list, path = ppo_continuous_unity()

Episode 1	Average Score Last 1 Episodes: 0.056	Avg. Score (All Agents) Last Episode: 0.056
Episode 2	Average Score Last 2 Episodes: 0.097	Avg. Score (All Agents) Last Episode: 0.138
Episode 3	Average Score Last 3 Episodes: 0.140	Avg. Score (All Agents) Last Episode: 0.225
Episode 4	Average Score Last 4 Episodes: 0.209	Avg. Score (All Agents) Last Episode: 0.417
Episode 5	Average Score Last 5 Episodes: 0.311	Avg. Score (All Agents) Last Episode: 0.717
Episode 6	Average Score Last 6 Episodes: 0.459	Avg. Score (All Agents) Last Episode: 1.202
Episode 7	Average Score Last 7 Episodes: 0.550	Avg. Score (All Agents) Last Episode: 1.095
Episode 8	Average Score Last 8 Episodes: 0.673	Avg. Score (All Agents) Last Episode: 1.535
Episode 9	Average Score Last 9 Episodes: 0.795	Avg. Score (All Agents) Last Episode: 1.763
Episode 10	Average Score Last 10 Episodes: 0.894	Avg. Score (All Agents) Last Episode: 1.790
Episode 11	Average Score Last 11 Episodes: 0.998	Avg. Score (All Agents) Last Episode: 2

KeyboardInterrupt: 

In [None]:
# Draw the per-episode score curve, but only when training reported success.
if success:
    fig, ax = plt.subplots()
    episode_numbers = np.arange(1, len(scores_list) + 1)
    ax.plot(episode_numbers, scores_list)
    ax.set_ylabel('Score')
    ax.set_xlabel('Episode #')
    plt.show()

In [None]:
from deep_rl.network.network_heads import GaussianActorCriticNet
from deep_rl.network.network_bodies import FCBody, DummyBody
import torch
from torch.autograd import Variable
from torchviz import make_dot
import numpy as np

In [None]:
# Build a standalone FCBody for a 33-wide input and render its autograd graph.
model = FCBody(33)

# One random input vector; unsqueeze(0) adds a leading dimension of size 1.
x = Variable(torch.randn(33)).unsqueeze(0)
model.eval()
y = model(x)
             
# Last expression of the cell, so the object returned by make_dot is displayed.
make_dot(y, params=dict(list(model.named_parameters())))

In [None]:
# Build a separate FCBody for the critic and render *its* autograd graph.
# The forward pass and the parameter dict both use critic_model; the
# previous version of this cell drew the `model` object from the cell above.
critic_model = FCBody(33)

# One random input vector; unsqueeze(0) adds a leading dimension of size 1.
x = Variable(torch.randn(33)).unsqueeze(0)
critic_model.eval()
y = critic_model(x)

# Last expression of the cell, so the object returned by make_dot is displayed.
make_dot(y, params=dict(list(critic_model.named_parameters())))

In [9]:
# Scratch: a float tensor built from the scalar 5., used to try out detach().
a = torch.tensor(5.)

In [10]:
# Display the tensor's repr.
a

tensor(5.)

In [11]:
# Display the result of detach(); the recorded output shows the same value, tensor(5.).
a.detach()

tensor(5.)