In [1]:
import os
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3 import PPO
from stable_baselines3.common import policies

In [2]:
import EnvironmentConfigurations as EnvConfig

In [3]:
# Shared interval: passed as `check_freq` to AgentCallback (checkpointing)
# and as `eval_freq` to EvalCallback (periodic evaluation).
evaluation_freq = 25000

class AgentCallback(BaseCallback):
  """Periodically checkpoint the model that is being trained.

  Every `check_freq` calls of the callback, the current model is written to
  `<save_path>/model_<n_calls>`.

  Args:
    check_freq: Number of callback calls between two consecutive checkpoints.
    save_path: Directory that receives the checkpoints. `None` disables
      checkpointing.
    verbose: Verbosity level forwarded to `BaseCallback`.
  """

  def __init__(self, check_freq, save_path, verbose=1):
    super().__init__(verbose)
    self.check_freq = check_freq
    self.save_path = save_path

  def _init_callback(self):
    """Create the checkpoint directory before training starts."""
    # The hook must be spelled `_init_callback` (single leading underscore):
    # that is the name BaseCallback invokes. A double-underscore name is
    # mangled to `_AgentCallback__init_callback` and would never be called.
    if self.save_path is not None:
      os.makedirs(self.save_path, exist_ok=True)

  def _on_step(self):
    """Save a checkpoint every `check_freq` calls.

    Returns:
      Always True, so training is never interrupted by this callback.
    """
    # Skip saving when no directory was configured, mirroring the guard in
    # `_init_callback` (os.path.join would raise on a None path).
    if self.save_path is not None and self.n_calls % self.check_freq == 0:
      model_path = os.path.join(self.save_path, f"model_{self.n_calls}")
      self.model.save(model_path)
    return True
  
  
# Checkpoints go to a directory named after the currently selected configuration.
agent_model_dir = f"{EnvConfig.AGENT_MODEL_PATH_PREFIX}{EnvConfig.configurations[EnvConfig.CURRENT_CONFIGURATION_INDEX]['name']}"
agentCallback = AgentCallback(
    check_freq=evaluation_freq,
    save_path=agent_model_dir,
)



In [4]:
from Cnn import CustomCNN
from EnvironmentHelpers import create_vectorised_environment
from utils.layer_activation_monitoring import LayerActivationMonitoring, register_hooks
from utils.layer_activation_monitoring import plot_activations

# Configuration entry currently selected in EnvironmentConfigurations.
current_config = EnvConfig.configurations[EnvConfig.CURRENT_CONFIGURATION_INDEX]

# Keyword arguments for the training environments (reward shaping enabled).
env_params = dict(
    env_config=current_config,
    is_reward_shaping_on=True,
    render=False,
)

# Evaluation environments share the same settings, with reward shaping off.
evaluation_env_params = {**env_params, "is_reward_shaping_on": False}

# Keyword arguments forwarded to the PPO constructor.
agent_params = dict(
    tensorboard_log=f"{EnvConfig.TENSORBOARD_LOG_PATH_PREFIX}{current_config['name']}",
    verbose=1,
    n_epochs=3,
    n_steps=4096,
    learning_rate=1e-4,
    batch_size=64,
    seed=0,
    policy_kwargs=dict(features_extractor_class=CustomCNN),
)


In [6]:
# RUN THE ENVIRONMENT IN PARALLEL MODE WITH 2 ENVS

from EnvironmentHelpers import create_vectorised_environment
from stable_baselines3.common.callbacks import EvalCallback

from utils.initialisation import initialise_network_weights

config_name = EnvConfig.configurations[EnvConfig.CURRENT_CONFIGURATION_INDEX]["name"]

# Two parallel training environments and one separate evaluation environment.
env = create_vectorised_environment(**env_params, n_envs=2)
evaluation_env = create_vectorised_environment(**evaluation_env_params, n_envs=1)

# Periodic evaluation over 10 episodes; results and the best model are stored
# under directories named after the selected configuration.
# NOTE(review): SB3 counts `eval_freq` in callback calls, not timesteps, so with
# n_envs=2 an evaluation happens every 2 * evaluation_freq timesteps. Confirm
# that this is the intended cadence.
eval_log_dir = f"{EnvConfig.TENSORBOARD_LOG_PATH_PREFIX}{config_name}"
best_model_dir = f"models/{config_name}"
evaluation_callback = EvalCallback(
    evaluation_env,
    n_eval_episodes=10,
    eval_freq=evaluation_freq,
    log_path=eval_log_dir,
    best_model_save_path=best_model_dir,
)

model = PPO(
    policy=policies.ActorCriticCnnPolicy,
    env=env,
    device="cuda",
    **agent_params,
)
# To resume from a saved checkpoint instead of starting from scratch:
#   model = PPO.load(f"{EnvConfig.AGENT_MODEL_PATH_PREFIX}{config_name}/model_2650000", **agent_params)
#   model.set_env(env)
# and pass reset_num_timesteps=False to model.learn() below.

register_hooks(model)
initialise_network_weights(model.policy)

training_callbacks = [agentCallback, evaluation_callback]
model.learn(total_timesteps=30_000_000, callback=training_callbacks)


Logging to ./logs/logs_for_deathmatch\PPO_43
--------------------------------
| time/              |         |
|    fps             | 25      |
|    iterations      | 1       |
|    time_elapsed    | 319     |
|    total_timesteps | 5308192 |
--------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 23          |
|    iterations           | 2           |
|    time_elapsed         | 700         |
|    total_timesteps      | 5316384     |
| train/                  |             |
|    approx_kl            | 0.052063845 |
|    clip_fraction        | 0.54        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.23       |
|    explained_variance   | -0.115      |
|    learning_rate        | 0.0001      |
|    loss                 | 0.286       |
|    n_updates            | 1941        |
|    policy_gradient_loss | 0.0255      |
|    value_loss           | 0.594       |
----------



Eval num_timesteps=5350000, episode_reward=4.40 +/- 3.23
Episode length: 99.10 +/- 16.83
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 99.1        |
|    mean_reward          | 4.4         |
| time/                   |             |
|    total_timesteps      | 5350000     |
| train/                  |             |
|    approx_kl            | 0.025176445 |
|    clip_fraction        | 0.265       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.17       |
|    explained_variance   | 0.215       |
|    learning_rate        | 0.0001      |
|    loss                 | 0.0669      |
|    n_updates            | 1956        |
|    policy_gradient_loss | -0.0173     |
|    value_loss           | 0.61        |
-----------------------------------------
New best mean reward!
--------------------------------
| time/              |         |
|    fps             | 21      |
|    iterations      | 7       |
|

In [6]:
# Release the training and evaluation environments, in that order.
for vec_env in (env, evaluation_env):
    vec_env.close()

In [None]:
# EVALUATE ACTIVATIONS
from utils.initialisation import initialise_network_weights

# Fresh pair of training environments and an untrained PPO agent for a short
# run that only collects layer activations.
env = create_vectorised_environment(**env_params, n_envs=2)

model = PPO(policy=policies.ActorCriticCnnPolicy, env=env, **agent_params)

register_hooks(model)
initialise_network_weights(model.policy)

# 1024 timesteps requested, with the activation-monitoring callback attached.
activation_monitor = LayerActivationMonitoring()
model.learn(total_timesteps=1024, callback=[activation_monitor])

In [None]:
# Plot whatever the hooks on the policy's feature extractor have recorded.
activation_hooks = model.policy.features_extractor.hooks
plot_activations(activation_hooks)

In [None]:

# Continue training from the checkpoint file named `model_125000`
# (AgentCallback names its checkpoints after the callback call count).
checkpoint_dir = f"{EnvConfig.AGENT_MODEL_PATH_PREFIX}{EnvConfig.configurations[EnvConfig.CURRENT_CONFIGURATION_INDEX]['name']}"
model = PPO.load(f"{checkpoint_dir}/model_125000")
model.set_env(env)

# reset_num_timesteps=False keeps the timestep counter running instead of
# restarting it from zero.
model.learn(
    total_timesteps=3000000,
    callback=agentCallback,
    reset_num_timesteps=False,
)

In [None]:
# Close the vectorised environment currently bound to `env`.
env.close()