# Reacher - PPO

## Import packages

In [1]:
import os
import sys
import random
from collections import namedtuple, deque

import tqdm
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from unityagents import UnityEnvironment
from IPython.display import clear_output

from model import PPOPolicyNetwork
from agent import PPOAgent

## Create Unity environment

In [2]:
# Start the Unity "Reacher" environment.
# NOTE(review): file_name is a bare name, so this presumably expects the
# Reacher build to be reachable from the notebook's working directory -- confirm.
env = UnityEnvironment(file_name="Reacher")
# Brains are looked up by name; this notebook drives the first one listed.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
# Reset in training mode and keep this brain's info object: the config cell
# reads vector_observations and agents from it to size the problem.
env_info = env.reset(train_mode=True)[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


## Constants

In [6]:
# Central configuration for the whole notebook. The same dict is passed to
# PPOPolicyNetwork and PPOAgent (model.py / agent.py, not shown here).
# NOTE(review): those modules presumably look values up by key, so do not
# rename any key -- including the misspelled 'entropy_coefficent' -- without
# updating them as well.
config = {
    # Problem dimensions, read from the running Unity environment instead of
    # being hard-coded.
    'environment': {
        'state_size':  env_info.vector_observations.shape[1],
        'action_size': brain.vector_action_space_size,
        'number_of_agents': len(env_info.agents)
    },
    # Use the first CUDA device when one is available, otherwise the CPU.
    'pytorch': {
        'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    },
    # Unless noted otherwise, these values are not read in this notebook; their
    # exact meaning is defined by how agent.py / model.py use them.
    'hyperparameters': {
        'discount_rate': 0.99,
        # presumably the GAE lambda -- TODO confirm against agent.py
        'tau': 0.95,
        'gradient_clip': 5,
        'rollout_length': 2048,
        'optimization_epochs': 10,
        # NOTE(review): the PPO paper's usual clipping range is 0.2; confirm
        # that 0.02 is intentional and not a typo.
        'ppo_clip': 0.02,
        'log_interval': 2048,
        'max_steps': 2e7,
        'mini_batch_number': 32,
        'entropy_coefficent': 0.01,
        # Number of iterations of the training loop in the cell below.
        'episode_count': 20,
        'hidden_size': 512,
        # Passed to optim.Adam in the training cell (learning rate and eps).
        'adam_learning_rate': 3e-4,
        'adam_epsilon': 1e-5
    }
}

In [8]:
# Train a PPO agent on Reacher and score it after every training iteration.
#
# Each loop iteration:
#   1. calls agent.step() (defined in agent.py, not shown here; presumably one
#      rollout-and-optimise pass -- confirm there),
#   2. plays one episode with the current policy and records the mean score
#      across all agents in `all_scores`,
#   3. saves the policy weights whenever the average of the last (up to) 100
#      recorded scores is above 30.
hyperparameters = config['hyperparameters']
hidden_size = hyperparameters['hidden_size']

policy = PPOPolicyNetwork(config)
optimizier = optim.Adam(policy.parameters(), hyperparameters['adam_learning_rate'],
                        eps=hyperparameters['adam_epsilon'])
agent = PPOAgent(env, brain_name, policy, optimizier, config)

# Checkpoints go to models/; create the directory up front so the first save
# cannot fail just because it does not exist yet.
os.makedirs("models", exist_ok=True)

all_scores = []
for i in tqdm.tqdm(range(hyperparameters['episode_count'])):
    agent.step()

    # Scoring episode: step the environment with the policy's actions until any
    # agent reports done, accumulating per-agent rewards.
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(config['environment']['number_of_agents'])
    # Nothing is optimised here, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        while True:
            # Only the actions are needed; the other three outputs are ignored.
            actions, _, _, _ = policy(states)
            env_info = env.step(actions.cpu().detach().numpy())[brain_name]
            scores += env_info.rewards
            states = env_info.vector_observations
            if np.any(env_info.local_done):
                break

    episode_score = np.mean(scores)
    all_scores.append(episode_score)

    # Slicing already returns the whole list while fewer than 100 scores exist,
    # so no separate branch is needed for the warm-up phase.
    recent_scores = all_scores[-100:]
    average_score = np.mean(recent_scores)

    if average_score > 30.0:
        # Fixed: hidden_size lives directly under 'hyperparameters'. The old
        # lookup went through 'episode_count' (an int) and raised TypeError the
        # first time a checkpoint was due.
        torch.save(policy.state_dict(),
                   f"models/ppo-{average_score}-episode-{i}-hiddensize-{hidden_size}.pth")

    clear_output(True)
    print('Episode: {} Total score this episode: {} Last {} average: {}'.format(
        i + 1, episode_score, len(recent_scores), average_score))


100%|██████████| 20/20 [05:35<00:00, 17.07s/it][A
[A

Episode: 20 Total score this episode: 0.6594999852590263 Last 20 average: 0.4708499894756824
