<a href="https://colab.research.google.com/github/iskra3138/stable-baselines/blob/main/git_based_Stable_baselines_A2C_HPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 필요한 Library 및 git repo 설치

In [None]:
# Stable Baselines only supports tensorflow 1.x for now

# System package SWIG (presumably needed to build some of the pip packages below -- TODO confirm)
!apt install swig
# Stable Baselines pinned to 2.10.0, installed with its optional `mpi` extra
!pip install stable-baselines[mpi]==2.10.0
# Optuna supplies the samplers and pruners imported later in this notebook
!pip install optuna

In [None]:
# Clone the RL Baselines Zoo repository together with its git submodules (--recursive)
!git clone --recursive https://github.com/araffin/rl-baselines-zoo.git

# Directory change

In [None]:
# Work from the repository folder so that relative paths used later
# (the `hyperparams/` yaml files and the `utils` package) resolve
%cd /content/rl-baselines-zoo/

# TF 1.X 선택 및 필요 모듈 호출

In [None]:
%tensorflow_version 1.x

import os
import numpy as np
import gym
import argparse
import time
import yaml

import optuna
from optuna.pruners import SuccessiveHalvingPruner, MedianPruner
from optuna.samplers import RandomSampler, TPESampler
from optuna.integration.skopt import SkoptSampler

from stable_baselines import SAC, TD3
from stable_baselines.common.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines.common.vec_env import VecNormalize, DummyVecEnv, VecEnv
from stable_baselines.common import set_global_seeds
from stable_baselines.bench import Monitor

from utils import make_env, ALGOS, linear_schedule, get_latest_run_id, get_wrapper_class
from utils.utils import StoreDict

from utils.callbacks import TrialEvalCallback

# <font color='red'> Arguments 입력 </font>

In [None]:
# Default Values
# The parser mirrors the command-line interface of the zoo's train.py; in this
# notebook it is only used to obtain an `args` namespace filled with defaults.
parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default="CartPole-v1", help='environment ID')
parser.add_argument('-tb', '--tensorboard-log', help='Tensorboard log dir', default='', type=str)
parser.add_argument('-i', '--trained-agent', help='Path to a pretrained agent to continue training',
                    default='', type=str)
parser.add_argument('--algo', help='RL Algorithm', default='ppo2',
                    type=str, required=False, choices=list(ALGOS.keys()))
parser.add_argument('-n', '--n-timesteps', help='Overwrite the number of timesteps', default=-1,
                    type=int)
parser.add_argument('--log-interval', help='Override log interval (default: -1, no change)', default=-1,
                    type=int)
parser.add_argument('--eval-freq', help='Evaluate the agent every n steps (if negative, no evaluation)',
                    default=10000, type=int)
parser.add_argument('--eval-episodes', help='Number of episodes to use for evaluation',
                    default=5, type=int)
parser.add_argument('--save-freq', help='Save the model every n steps (if negative, no checkpoint)',
                    default=-1, type=int)
parser.add_argument('-f', '--log-folder', help='Log folder', type=str, default='logs')
parser.add_argument('--seed', help='Random generator seed', type=int, default=0)
parser.add_argument('--n-trials', help='Number of trials for optimizing hyperparameters', type=int, default=10)
parser.add_argument('-optimize', '--optimize-hyperparameters', action='store_true', default=False,
                    help='Run hyperparameters search')
parser.add_argument('--n-jobs', help='Number of parallel jobs when optimizing hyperparameters', type=int, default=1)
parser.add_argument('--sampler', help='Sampler to use when optimizing hyperparameters', type=str,
                    default='tpe', choices=['random', 'tpe', 'skopt'])
parser.add_argument('--pruner', help='Pruner to use when optimizing hyperparameters', type=str,
                    default='median', choices=['halving', 'median', 'none'])
parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', default=1,
                    type=int)
parser.add_argument('--gym-packages', type=str, nargs='+', default=[],
                    help='Additional external Gym environemnt package modules to import (e.g. gym_minigrid)')
parser.add_argument('-params', '--hyperparams', type=str, nargs='+', action=StoreDict,
                    help='Overwrite hyperparameter (e.g. learning_rate:0.01 train_freq:10)')
parser.add_argument('-uuid', '--uuid', action='store_true', default=False,
                    help='Ensure that the run has a unique ID')
parser.add_argument('--env-kwargs', type=str, nargs='+', action=StoreDict,
                    help='Optional keyword argument to pass to the env constructor')
# Parse an explicit empty argument list instead of sys.argv: inside a notebook
# sys.argv holds the kernel launcher's own options (e.g. `-f <connection file>`),
# which would otherwise be read as `--log-folder` or abort with SystemExit.
args = parser.parse_args(args=[])

In [None]:
# Custom Values
# Notebook-specific settings layered on top of the parsed defaults.

# Algorithm and report folder (stored on the args namespace)
args.algo = 'a2c'
args.log_folder = './log'

# Environment configuration
env_id = 'CartPole-v1'
n_envs = 1
normalize = False
seed = 0

# Output locations
tensorboard_log = './tb_log'
log_dir = './log'
save_path = './save'



In [None]:
# Load hyperparameters from yaml file
with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
    # An empty yaml file loads as None; treat it as "no entries"
    hyperparams_dict = yaml.safe_load(f) or {}

# Select the entry for this environment, falling back to the shared 'atari'
# entry. `is_atari` is only assigned by a later cell, so it is looked up
# defensively: when it does not exist yet it counts as False and the intended
# ValueError is raised instead of a NameError.
if env_id in hyperparams_dict:
    hyperparams = hyperparams_dict[env_id]
elif globals().get('is_atari', False):
    hyperparams = hyperparams_dict['atari']
else:
    raise ValueError("Hyperparameters not found for {}-{}".format(args.algo, env_id))

# Delete keys so the dict can be passed to the model constructor.
# Both keys are optional in the yaml entry, so a missing key is not an error.
hyperparams.pop('n_envs', None)
hyperparams.pop('n_timesteps', None)

# <font color='red'> HPO 탐색공간 입력 </font>

In [None]:
# https://github.com/araffin/rl-baselines-zoo/blob/master/utils/hyperparams_opt.py
# n_steps = 128로 고정
def sample_a2c_params(trial):
    """
    Draw one A2C hyperparameter configuration from the search space.

    ``n_steps`` is offered as a single choice (128), so it is effectively
    fixed; the upstream sampler searched over powers of two from 8 to 2048.

    :param trial: (optuna.trial) object providing the ``suggest_*`` methods
    :return: (dict) sampled values keyed by model constructor argument name
    """
    # TODO: take into account the normalization (also for the test env)
    gamma_choices = [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]
    schedule_choices = ['linear', 'constant']

    # The suggest_* calls are kept in this order so that samplers consume
    # their random stream in the same sequence.
    sampled_gamma = trial.suggest_categorical('gamma', gamma_choices)
    sampled_n_steps = trial.suggest_categorical('n_steps', [128])
    sampled_schedule = trial.suggest_categorical('lr_schedule', schedule_choices)
    sampled_lr = trial.suggest_loguniform('lr', 1e-5, 1)
    sampled_ent_coef = trial.suggest_loguniform('ent_coef', 1e-8, 0.1)
    sampled_vf_coef = trial.suggest_uniform('vf_coef', 0, 1)

    return dict(
        n_steps=sampled_n_steps,
        gamma=sampled_gamma,
        learning_rate=sampled_lr,
        lr_schedule=sampled_schedule,
        ent_coef=sampled_ent_coef,
        vf_coef=sampled_vf_coef,
    )

# Registry mapping an algorithm name to the function that samples its search space
HYPERPARAMS_SAMPLER = dict(a2c=sample_a2c_params)

# 환경 및 모델 함수 정의

In [None]:
# https://github.com/araffin/rl-baselines-zoo/blob/master/utils/utils.py
def make_env(env_id, rank=0, seed=0, log_dir=None, wrapper_class=None, env_kwargs=None):
    """
    Build a zero-argument factory that creates one seeded, monitored env.

    The log directory (when given) is created immediately; the environment
    itself is only created when the returned function is called.

    :param env_id: (str) id passed to ``gym.make``
    :param rank: (int) index of the env; added to ``seed`` and used as the
        monitor file name
    :param seed: (int) base random seed
    :param log_dir: (str) folder for the Monitor file, or None to keep no file
    :param wrapper_class: (type) a subclass of gym.Wrapper to wrap the original
                    env with
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (callable) function taking no argument and returning the env
    """
    if log_dir is not None:
        os.makedirs(log_dir, exist_ok=True)

    constructor_kwargs = {} if env_kwargs is None else env_kwargs

    def _init():
        set_global_seeds(seed + rank)
        created = gym.make(env_id, **constructor_kwargs)

        # Dict observation space is currently not supported.
        # https://github.com/hill-a/stable-baselines/issues/321
        # We allow a Gym env wrapper (a subclass of gym.Wrapper)
        if wrapper_class:
            created = wrapper_class(created)

        created.seed(seed + rank)

        # One monitor file per rank; None when no log folder was requested
        if log_dir is None:
            monitor_file = None
        else:
            monitor_file = os.path.join(log_dir, str(rank))
        return Monitor(created, monitor_file)

    return _init

In [None]:
# Turn the wrapper name stored in the hyperparameters into a class object,
# then drop that entry from the dict (a missing entry is ignored)
env_wrapper = get_wrapper_class(hyperparams)
hyperparams.pop('env_wrapper', None)

In [None]:
# Keyword arguments for the env constructor; fall back to an empty dict when none were given
env_kwargs = args.env_kwargs
if env_kwargs is None:
    env_kwargs = {}

In [None]:
# https://github.com/araffin/rl-baselines-zoo/blob/master/train.py
is_atari = False
# Algorithm name used to choose how the env is built. For HER the wrapped
# algorithm ('model_class') decides, when the hyperparameters name one.
algo_ = args.algo
if args.algo == 'her':
    algo_ = hyperparams.get('model_class', args.algo)
# Keyword arguments for VecNormalize; an empty dict means library defaults
normalize_kwargs = {}


def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary.

    Reads the module-level settings ``is_atari``, ``algo_``, ``normalize``,
    ``normalize_kwargs``, ``hyperparams``, ``env_kwargs``, ``env_wrapper``,
    ``env_id``, ``save_path`` and ``args``.

    :param n_envs: (int) number of environments to create
    :param eval_env: (bool) Whether is it an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else save_path

    if is_atari:
        # Imported here because the notebook's import cell does not provide them
        from stable_baselines.common.cmd_util import make_atari_env
        from stable_baselines.common.vec_env import VecFrameStack

        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif algo_ in ['dqn', 'ddpg']:
        # These algorithms get a plain (non-vectorized) gym env
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id, **env_kwargs)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        # SubprocVecEnv is deliberately not used:
        # on most env, it does not help and is quite memory hungry
        env = DummyVecEnv([make_env(env_id, i, args.seed, log_dir=log_dir,
                                    wrapper_class=env_wrapper, env_kwargs=env_kwargs)
                           for i in range(n_envs)])
        if normalize:
            # Copy to avoid changing default values by reference
            local_normalize_kwargs = dict(normalize_kwargs)
            # Do not normalize reward for env used for evaluation
            if eval_env:
                local_normalize_kwargs['norm_reward'] = False

            if args.verbose > 0:
                if local_normalize_kwargs:
                    print("Normalization activated: {}".format(local_normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **local_normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        from stable_baselines.common.vec_env import VecFrameStack

        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
    if args.algo == 'her':
        from stable_baselines.common.base_class import _UnvecWrapper
        from stable_baselines.her import HERGoalEnvWrapper

        # Wrap the env if need to flatten the dict obs
        if isinstance(env, VecEnv):
            env = _UnvecWrapper(env)
        env = HERGoalEnvWrapper(env)
    return env

In [None]:
# https://github.com/araffin/rl-baselines-zoo/blob/master/train.py

def create_model(*_args, **kwargs):
    """
    Instantiate the selected algorithm on a freshly created training env.

    Positional arguments are accepted and ignored; keyword arguments are
    forwarded to the model constructor as hyperparameters. The training env
    is created with logging disabled and the model is built with verbose=0.
    """
    model_cls = ALGOS[args.algo]
    training_env = create_env(n_envs, no_log=True)
    return model_cls(env=training_env, tensorboard_log=tensorboard_log, verbose=0, **kwargs)

# Optimization 함수 정의

In [None]:
# https://github.com/araffin/rl-baselines-zoo/blob/master/utils/hyperparams_opt.py
def hyperparam_optimization(algo, model_fn, env_fn, n_trials=10, n_timesteps=5000, hyperparams=None,
                            n_jobs=1, sampler_method='random', pruner_method='halving',
                            seed=0, verbose=1):
    """
    Run an Optuna study over the search space registered for ``algo``.

    Each trial builds a model with sampled hyperparameters, trains it for
    ``n_timesteps`` while a ``TrialEvalCallback`` evaluates it periodically,
    and reports the negated last mean evaluation reward as the cost to
    minimize. The best trial is printed when at least one trial completed.

    :param algo: (str) key into ``HYPERPARAMS_SAMPLER``
    :param model_fn: (func) function that is used to instantiate the model
    :param env_fn: (func) function that is used to instantiate the env
    :param n_trials: (int) maximum number of trials for finding the best hyperparams
    :param n_timesteps: (int) maximum number of timesteps per trial
    :param hyperparams: (dict) base hyperparameters, overridden by sampled values
    :param n_jobs: (int) number of parallel jobs
    :param sampler_method: (str) one of 'random', 'tpe', 'skopt'
    :param pruner_method: (str) one of 'halving', 'median', 'none'
    :param seed: (int) seed for the random and TPE samplers
    :param verbose: (int) if > 0, print the chosen sampler and pruner
    :return: (pd.Dataframe) detailed result of the optimization
    :raises ValueError: if ``sampler_method`` or ``pruner_method`` is unknown
    """
    # TODO: eval each hyperparams several times to account for noisy evaluation
    # TODO: take into account the normalization (also for the test env -> sync obs_rms)
    if hyperparams is None:
        hyperparams = {}

    n_startup_trials = 10
    # test during 5 episodes
    n_eval_episodes = 5
    # evaluate every 20th of the maximum budget per iteration
    n_evaluations = 20
    eval_freq = int(n_timesteps / n_evaluations)

    # n_warmup_steps: Disable pruner until the trial reaches the given number of step.
    if sampler_method == 'random':
        sampler = RandomSampler(seed=seed)
    elif sampler_method == 'tpe':
        sampler = TPESampler(n_startup_trials=n_startup_trials, seed=seed)
    elif sampler_method == 'skopt':
        # cf https://scikit-optimize.github.io/#skopt.Optimizer
        # GP: gaussian process
        # Gradient boosted regression: GBRT
        sampler = SkoptSampler(skopt_kwargs={'base_estimator': "GP", 'acq_func': 'gp_hedge'})
    else:
        raise ValueError('Unknown sampler: {}'.format(sampler_method))

    if pruner_method == 'halving':
        pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=4, min_early_stopping_rate=0)
    elif pruner_method == 'median':
        pruner = MedianPruner(n_startup_trials=n_startup_trials, n_warmup_steps=n_evaluations // 3)
    elif pruner_method == 'none':
        # Do not prune
        pruner = MedianPruner(n_startup_trials=n_trials, n_warmup_steps=n_evaluations)
    else:
        raise ValueError('Unknown pruner: {}'.format(pruner_method))

    if verbose > 0:
        print("Sampler: {} - Pruner: {}".format(sampler_method, pruner_method))

    study = optuna.create_study(sampler=sampler, pruner=pruner)
    algo_sampler = HYPERPARAMS_SAMPLER[algo]

    def objective(trial):
        """Train one model with sampled hyperparameters and return its cost."""
        kwargs = hyperparams.copy()

        trial.model_class = None
        if algo == 'her':
            trial.model_class = hyperparams['model_class']

        # Hack to use DDPG/TD3 noise sampler
        if algo in ['ddpg', 'td3'] or trial.model_class in ['ddpg', 'td3']:
            trial.n_actions = env_fn(n_envs=1).action_space.shape[0]
        kwargs.update(algo_sampler(trial))

        model = model_fn(**kwargs)

        eval_env = env_fn(n_envs=1, eval_env=True)
        # Account for parallel envs
        eval_freq_ = eval_freq
        if isinstance(model.get_env(), VecEnv):
            eval_freq_ = max(eval_freq // model.get_env().num_envs, 1)
        # TODO: use non-deterministic eval for Atari?
        eval_callback = TrialEvalCallback(eval_env, trial, n_eval_episodes=n_eval_episodes,
                                          eval_freq=eval_freq_, deterministic=True)

        try:
            model.learn(n_timesteps, callback=eval_callback)
        except AssertionError:
            # Sometimes, random hyperparams can generate NaN
            raise optuna.exceptions.TrialPruned()
        finally:
            # Free memory on every exit path, including unexpected errors,
            # so a failed trial does not leave its environments open
            model.env.close()
            eval_env.close()

        is_pruned = eval_callback.is_pruned
        cost = -1 * eval_callback.last_mean_reward

        del model.env, eval_env
        del model

        if is_pruned:
            raise optuna.exceptions.TrialPruned()

        return cost

    try:
        study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)
    except KeyboardInterrupt:
        # Allow a manual stop; whatever was collected so far is still reported
        pass

    print('Number of finished trials: ', len(study.trials))

    try:
        trial = study.best_trial
    except ValueError:
        # Optuna raises ValueError when no trial has completed (for example
        # all were pruned, or the search was interrupted early). Still return
        # the collected trials instead of losing them to the exception.
        print('No completed trial, no best trial to report')
        return study.trials_dataframe()

    print('Best trial:')

    print('Value: ', trial.value)

    print('Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    return study.trials_dataframe()

# <font color = 'blue' > 탐색 </font>

In [None]:
# 적절한 값 입력 후 탐색
# Store the search settings on `args` before running. The report file name is
# built from args.n_trials, args.n_timesteps, args.sampler and args.pruner, so
# keeping the values here makes that name describe the search that actually
# ran instead of the parser defaults.
args.n_trials = 1000
args.n_timesteps = 50000
args.n_jobs = 2
args.sampler = 'tpe'
args.pruner = 'median'

data_frame = hyperparam_optimization(algo=args.algo,
                                     model_fn=create_model,
                                     env_fn=create_env,
                                     n_trials=args.n_trials,
                                     n_timesteps=args.n_timesteps,
                                     hyperparams=hyperparams,
                                     n_jobs=args.n_jobs,
                                     sampler_method=args.sampler,
                                     pruner_method=args.pruner,
                                     seed=0,
                                     verbose=1)

# 결과 저장

In [None]:
# Timestamped report name built from the env id and the settings stored on args
report_name = "report_{}_{}-trials-{}-{}-{}_{}.csv".format(
    env_id,
    args.n_trials,
    args.n_timesteps,
    args.sampler,
    args.pruner,
    int(time.time()),
)
# The report is written to <log folder>/<algo>/<report name>
log_path = os.path.join(args.log_folder, args.algo, report_name)
print("Writing report to {}".format(log_path))

# Create the destination folder if needed, then write the trials table as CSV
os.makedirs(os.path.dirname(log_path), exist_ok=True)
data_frame.to_csv(log_path)

# 결과 다운로드

In [None]:
# Colab-specific module; requests a download of the report file at log_path
from google.colab import files
files.download(log_path)