```
Copyright 2024 The HIVEX Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or  implied.
See the License for the specific language governing permissions and
limitations under the License.
```

# Example Training using Stable Baselines3 and VecEnv

Note:
1. Install the dependencies as described in the README.md.
2. Select the correct kernel for this Jupyter notebook using the kernel picker at the top right.

## Import Libraries

In [1]:
"""HIVEX stablebaselines3 VecEnv training example."""
from torch import nn

# Stable Baselines3
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.vec_env import VecNormalize

# hivex
from hivex.training.examples.stable_baselines3_vec_env_train.vec_env_callback import (
    EvaluationCallback,
)
from hivex.training.framework_wrappers.unity_stable_baselines3.unity_vec_env_wrapper import (
    HivexVecEnvWrapper,
)
from hivex.training.framework_wrappers.wrapper_utils import initialize_unity_environment

## Define the `initialize_model` function

In [2]:
def initialize_model(
    vec_env: HivexVecEnvWrapper,
    algorithm: str = "ppo",
    *,
    device: str = "cuda",
    tensorboard_log_path: str = "/tensorboard_logs",
):
    """Build a Stable Baselines3 PPO or A2C model with this example's hyperparameters.

    Both algorithms use the ``"MlpPolicy"`` policy with separate policy and
    value networks of three 128-unit layers each, ``gamma=0.9`` and
    ``learning_rate=5e-5``. The model is only constructed here; training is
    started by the caller.

    Args:
        vec_env: Vectorized environment handed to the model as ``env``. The
            notebook passes the ``VecNormalize``-wrapped HIVEX environment.
        algorithm: Which algorithm to instantiate, ``"ppo"`` or ``"a2c"``
            (exact, lower-case match).
        device: Forwarded to the model's ``device`` argument. Defaults to
            ``"cuda"``. NOTE(review): the recorded run in this notebook printed
            "Using cpu device" with this default, so SB3 appears to fall back
            to CPU when CUDA is unavailable - confirm.
        tensorboard_log_path: Directory forwarded to ``tensorboard_log``.
            The default is a path at the filesystem root; pass a
            project-relative directory to keep logs next to the notebook.

    Returns:
        The constructed (untrained) ``PPO`` or ``A2C`` model.

    Raises:
        ValueError: If ``algorithm`` is neither ``"ppo"`` nor ``"a2c"``.
    """

    def remove_none_entries(d):
        """Return a copy of ``d`` without the keys whose value is ``None``."""
        return {k: v for k, v in d.items() if v is not None}

    # Reject unsupported algorithms before doing any other work.
    if algorithm not in ("ppo", "a2c"):
        raise ValueError(f"Unknown algorithm: {algorithm}")

    # Layer widths are written as comma-separated strings so they are easy to
    # tweak (or to feed in from a command line) and parsed into int lists.
    policy_layers_comma_sep: str = "128,128,128"
    value_layers_comma_sep: str = "128,128,128"

    policy_layers = [
        int(layer_width) for layer_width in policy_layers_comma_sep.split(",")
    ]
    value_layers = [
        int(layer_width) for layer_width in value_layers_comma_sep.split(",")
    ]

    # NOTE(review): newer Stable Baselines3 releases reportedly deprecate this
    # list-wrapped form in favour of a plain dict(pi=..., vf=...) - confirm
    # against the pinned SB3 version before changing it.
    net_arch = [dict(vf=value_layers, pi=policy_layers)]

    # Optional policy settings. Entries left as None are stripped below so
    # that the library defaults apply.
    activation_function = None
    log_std_init = None
    ppo_a2c_ortho_init = None
    policy_kwargs = remove_none_entries(
        dict(
            activation_fn=nn.ReLU if activation_function == "ReLU" else None,
            net_arch=net_arch,
            log_std_init=log_std_init,
            ortho_init=ppo_a2c_ortho_init,
        )
    )

    # Constructor arguments shared by both algorithms, defined once so the
    # PPO and A2C configurations cannot drift apart.
    common_parameters = dict(
        policy="MlpPolicy",
        env=vec_env,
        verbose=2,
        tensorboard_log=str(tensorboard_log_path),
        device=device,
        gamma=0.9,
        policy_kwargs=policy_kwargs,
        learning_rate=5e-5,
    )

    if algorithm == "ppo":
        algorithm_specific_parameters = dict(
            batch_size=10,
            n_steps=100,
            target_kl=0.1,
            gae_lambda=0.95,
            n_epochs=1,
            clip_range=0.2,
        )
        return PPO(**common_parameters, **algorithm_specific_parameters)

    # Only "a2c" can reach this point because of the check at the top.
    algorithm_specific_parameters = dict(n_steps=100, gae_lambda=0.95)
    return A2C(**common_parameters, **algorithm_specific_parameters)

## Initialize Environment

In [3]:
# Start the "WindFarmControl" HIVEX Unity environment (worker 0, no_graphics=True).
unity_env = initialize_unity_environment(
    worker_id=0,
    hivex_env_tag="WindFarmControl",
    no_graphics=True,
)

# Wrap the Unity environment in the HIVEX VecEnv wrapper ...
vec_env = HivexVecEnvWrapper(unity_env)

# ... and add VecNormalize on top, with reward normalization requested explicitly.
vec_env_normalized = VecNormalize(
    vec_env,
    norm_reward=True,
)

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


## Initialize Model

In [4]:
# Build the PPO model on top of the normalized vectorized environment.
model = initialize_model(
    vec_env=vec_env_normalized,
    algorithm="ppo",
)



Using cpu device


## Run Training

In [5]:
# Evaluation callback handed to model.learn() below.
# NOTE(review): eval_freq (5000) is larger than total_timesteps (1000); if
# eval_freq is counted in timesteps, no evaluation will run in this short
# example - confirm against EvaluationCallback.
# NOTE(review): normalization=False although eval_env is the VecNormalize
# wrapper - confirm this is the intended combination.
eval_callback = EvaluationCallback(
    eval_env=vec_env_normalized,
    eval_freq=5000,
    n_eval_episodes=1,
    n_agents=8,
    eval_path="/eval",
    normalization=False,
)

try:
    model.learn(total_timesteps=1000, callback=[eval_callback], progress_bar=True)
finally:
    # Close the environment even when training fails or is interrupted, so it
    # is not left open after an error.
    vec_env_normalized.close()
    print("Closed environment")

Logging to /tensorboard_logs\PPO_66
initializeing logger
-----------------------------
| time/              |      |
|    fps             | 6381 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 800  |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 4614          |
|    iterations           | 2             |
|    time_elapsed         | 0             |
|    total_timesteps      | 1600          |
| train/                  |               |
|    approx_kl            | 2.3102686e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | 0.318         |
|    learning_rate        | 5e-05         |
|    loss                 | 0.0574        |
|    n_updates            | 1             |
|    policy_gradient_loss | 0.000292      |
|    std                  | 1