In [1]:
from hyperopt import hp
from hyperopt import fmin, tpe
from hyperopt import space_eval

from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
# from stable_baselines3.common.vec_env import SubprocVecEnv
# from stable_baselines3.common.evaluation import evaluate_policy

import PIL

import time

from utils import make_env

pygame 2.4.0 (SDL 2.26.4, Python 3.10.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
def dict_to_str(args):
    """Render a hyperparameter dict as a compact ``key_value`` string.

    Integer values (bools included, because ``bool`` subclasses ``int``) are
    written verbatim. Every other value is formatted with the ``" .5f"`` spec
    (five decimals, a leading space for non-negative numbers). Values that do
    not support that spec -- e.g. strings or ``None`` -- fall back to
    ``str(value)`` instead of raising.

    Args:
        args: Mapping of parameter name to value.

    Returns:
        The ``", "``-joined ``key_value`` pairs in the dict's iteration order;
        an empty string for an empty dict.
    """
    parts = []
    for key, value in args.items():
        if isinstance(value, int):
            parts.append(f"{key}_{value}")
            continue
        try:
            parts.append(f"{key}_{value: .5f}")
        except (TypeError, ValueError):
            # Not a real number (str, None, ...): keep the run name usable.
            parts.append(f"{key}_{value}")
    return ", ".join(parts)

def save_frames_as_gif(frames, file_path):
    """Write a sequence of frames to ``file_path`` as an endlessly looping GIF.

    Each frame is shown for 30 ms (``duration=30``).

    Args:
        frames: Iterable of array-like images accepted by
            ``PIL.Image.fromarray`` (presumably HxWx3 uint8 RGB arrays coming
            from ``env.render`` -- confirm against the environment).
        file_path: Destination path of the GIF. Missing parent directories
            are created.

    Raises:
        ValueError: If ``frames`` is empty, since there is nothing to encode.
    """
    import os

    frames = list(frames)
    if not frames:
        raise ValueError("save_frames_as_gif needs at least one frame")

    # A bare ``import PIL`` does not load the ``PIL.Image`` submodule, so it is
    # imported explicitly instead of relying on another library having done so.
    from PIL import Image

    # Pillow does not create missing directories when saving.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    first_image, *remaining_images = [Image.fromarray(frame) for frame in frames]
    first_image.save(file_path, format='GIF',
                     append_images=remaining_images,
                     save_all=True,
                     duration=30,
                     loop=0)

In [3]:
def rwd_func_1(info, prev_info):
    """Reward equal to the score gained since the previous step.

    Args:
        info: Info dict of the current step; only ``"score"`` is read.
        prev_info: Info dict of the previous step; only ``"score"`` is read.

    Returns:
        ``info["score"] - prev_info["score"]`` (weight 1).
    """
    score_weight = 1
    return score_weight * (info["score"] - prev_info["score"])

def rwd_func_2(info, prev_info):
    """Score-based reward with a per-step cost and a life-loss penalty.

    Components, applied in this order:
      * ``-0.1`` on every step,
      * ``+1.1`` per point of score gained since ``prev_info``,
      * ``-0.9`` when ``info["life"]`` is lower than ``prev_info["life"]``.

    Args:
        info: Info dict of the current step (reads ``"score"`` and ``"life"``).
        prev_info: Info dict of the previous step (same keys).

    Returns:
        The summed reward.
    """
    score_gain = info["score"] - prev_info["score"]
    lost_life = info["life"] < prev_info["life"]

    total = -0.1 + 1.1 * score_gain
    return total - 0.9 if lost_life else total

def rwd_func_3(info, prev_info):
    """Reward like ``rwd_func_2`` plus a penalty for edge-to-edge head jumps.

    Components, applied in this order:
      * ``-0.5`` when a head coordinate (index 0 or 1 of ``"head pos"``) goes
        from ``size - 1`` to ``0`` or from ``0`` to ``size - 1`` between the
        two steps -- presumably the snake wrapping through a wall; confirm
        against the environment,
      * ``-0.1`` on every step,
      * ``+1.1`` per point of score gained,
      * ``-0.9`` when ``info["life"]`` dropped.

    Args:
        info: Info dict of the current step (reads ``"head pos"``, ``"size"``,
            ``"score"`` and ``"life"``).
        prev_info: Info dict of the previous step (reads ``"head pos"``,
            ``"score"`` and ``"life"``).

    Returns:
        The summed reward.
    """
    last = info["size"] - 1
    prev_head = prev_info["head pos"]
    head = info["head pos"]

    # (axis, coordinate before, coordinate after) for each edge-to-edge jump,
    # checked in the same order as a plain or-chain would.
    edge_jumps = ((0, last, 0), (1, last, 0), (0, 0, last), (1, 0, last))
    jumped = any(
        prev_head[axis] == before and head[axis] == after
        for axis, before, after in edge_jumps
    )

    total = -0.5 if jumped else 0
    total -= 0.1
    total += 1.1 * (info["score"] - prev_info["score"])
    if info["life"] < prev_info["life"]:
        total -= 0.9
    return total

def rwd_func_4(info, prev_info):
    """Shaped reward: food-distance term plus the ``rwd_func_3`` components.

    Components, applied in this order:
      * ``+0.2`` when ``"head food dist"`` decreased, otherwise ``-0.1``
        (an unchanged distance counts as "otherwise"),
      * ``-0.5`` when a head coordinate (index 0 or 1 of ``"head pos"``) goes
        from ``size - 1`` to ``0`` or from ``0`` to ``size - 1`` -- presumably
        the snake wrapping through a wall; confirm against the environment,
      * ``-0.1`` on every step,
      * ``+1.1`` per point of score gained,
      * ``-0.9`` when ``info["life"]`` dropped.

    Args:
        info: Info dict of the current step (reads ``"head food dist"``,
            ``"head pos"``, ``"size"``, ``"score"`` and ``"life"``).
        prev_info: Info dict of the previous step (reads ``"head food dist"``,
            ``"head pos"``, ``"score"`` and ``"life"``).

    Returns:
        The summed reward.
    """
    moved_closer = info["head food dist"] < prev_info["head food dist"]
    total = 0.2 if moved_closer else -0.1

    last = info["size"] - 1
    prev_head = prev_info["head pos"]
    head = info["head pos"]
    # (axis, coordinate before, coordinate after) for each edge-to-edge jump,
    # checked in the same order as a plain or-chain would.
    edge_jumps = ((0, last, 0), (1, last, 0), (0, 0, last), (1, 0, last))
    if any(prev_head[axis] == before and head[axis] == after
           for axis, before, after in edge_jumps):
        total -= 0.5

    total -= 0.1
    total += 1.1 * (info["score"] - prev_info["score"])
    if info["life"] < prev_info["life"]:
        total -= 0.9
    return total

In [4]:
def _evaluate_agent(model, env, n_episode):
    """Play ``n_episode`` episodes with ``model`` and collect score statistics.

    Actions are sampled stochastically (``deterministic=False``) and every
    step is rendered as an RGB array so the best episode can be exported.

    Returns:
        ``(avg_score, best_frames)``: the mean of ``info["score"]`` taken at
        the end of each episode, and the rendered frames of the
        highest-scoring episode (the earliest one on ties).
    """
    total_score = 0
    best_score = None
    best_frames = None
    for episode in range(n_episode):
        state = env.reset()
        done = False
        frames = []
        while not done:
            action, _states = model.predict(state, deterministic=False)
            state, _reward, done, info = env.step(action)
            frames.append(env.render("rgb_array"))
        score = info["score"]
        total_score += score
        print(f"Ran agent on environment: {episode+1}/{n_episode} Episodes, score: {score}")
        # ``best_score is None`` guarantees frames are kept even when every
        # episode scores 0; otherwise the GIF export would receive ``None``.
        if best_score is None or score > best_score:
            best_score = score
            best_frames = frames
    return total_score / n_episode, best_frames


def objective(args):
    """Hyperopt objective: train an A2C agent with ``args`` and score it.

    The sampled ``args`` are merged with the optional module-level
    ``fixed_args`` (fixed values win). The keys ``env_rwd_func``,
    ``env_max_step``, ``env_num_stack`` and ``n_envs`` configure the training
    environments; every remaining key is forwarded to ``A2C`` as a keyword
    argument. After training for the module-level ``total_timesteps``, the
    agent plays 10 episodes on a fresh environment with ``max_step=1000``.
    When the average score beats the module-level ``best_avg_rwd``, the model
    and a GIF of its best episode are saved under the ``expr_no`` directory.

    Args:
        args: Hyperparameter dict sampled by hyperopt. It is not modified.

    Returns:
        ``-avg_score / 10``; negated because ``fmin`` minimises the objective.

    Raises:
        ValueError: If ``env_rwd_func`` is not one of the known ids 0-4.
    """
    global best_avg_rwd
    import os

    # Merge into a copy; ``fixed_args`` is optional so experiments that tune
    # every parameter do not have to define it.
    params = {**args, **globals().get("fixed_args", {})}

    start = time.time()

    print("---Chosen Hyperparameters of opt-epoch:")
    print(params)

    # Id 0 passes ``None`` to make_env, i.e. no custom reward function.
    reward_funcs = {0: None, 1: rwd_func_1, 2: rwd_func_2, 3: rwd_func_3, 4: rwd_func_4}
    rwd_func_id = params["env_rwd_func"]
    if rwd_func_id not in reward_funcs:
        raise ValueError(f"unknown env_rwd_func id: {rwd_func_id!r}")
    env_rwd_func = reward_funcs[rwd_func_id]

    env_max_step = params["env_max_step"]
    env_num_stack = params["env_num_stack"]
    vec_env = make_vec_env(
        lambda: make_env(env_rwd_func=env_rwd_func, max_step=env_max_step, num_stack=env_num_stack),
        n_envs=params["n_envs"],
    )

    # The TensorBoard run name lists every parameter, environment ones included.
    args_str = dict_to_str(params)
    env_only_keys = {"env_rwd_func", "env_max_step", "env_num_stack", "n_envs"}
    algo_kwargs = {key: value for key, value in params.items() if key not in env_only_keys}

    try:
        model = A2C(
            "MlpPolicy", vec_env,
            **algo_kwargs,
            verbose=0, tensorboard_log=f"logs/A2C hp-tuning/no.{expr_no}"
        )
        model.learn(total_timesteps=total_timesteps, log_interval=1, tb_log_name=args_str)
    finally:
        # Release the training environments even when training fails.
        vec_env.close()

    env = make_env(max_step=1000, num_stack=env_num_stack)
    try:
        avg_rwd, bestscore_frames = _evaluate_agent(model, env, n_episode=10)

        if avg_rwd > best_avg_rwd:
            best_avg_rwd = avg_rwd

            model_path = f"models/hp/A2C/no.{expr_no}/{avg_rwd}.zip"
            gif_path = f"rl videos/hp/A2C/no.{expr_no}/{avg_rwd}.gif"
            for path in (model_path, gif_path):
                os.makedirs(os.path.dirname(path), exist_ok=True)

            print("Saving new best model.")
            model.save(model_path)

            print("Saving new best model's gif.")
            save_frames_as_gif(bestscore_frames, gif_path)
    finally:
        env.close()

    print(f"---Time taken for opt-epoch: {(time.time() - start)/60} Minutes")
    print(f'---Average Episode Reward of opt-epoch: {avg_rwd}')
    print("\n")
    return -avg_rwd/10

In [6]:
def get_space():
    """Build the hyperopt search space for the first A2C tuning run.

    Every parameter is categorical: it is drawn with ``hp.choice`` from a
    fixed candidate list, and the hyperopt label equals the dict key.
    ``gae_lambda``, ``ent_coef``, ``vf_coef`` and ``max_grad_norm`` are not
    searched in this run.

    Returns:
        Dict mapping parameter name to its ``hp.choice`` expression.
    """
    candidates = {
        # Environment settings.
        'env_rwd_func': [0, 1, 2, 3, 4],
        'env_max_step': [500, 1000, 1500, 2000, 2500, 3000, 5000],
        'env_num_stack': [2, 3, 4, 5, 6, 7, 8],
        'n_envs': [1, 2, 4, 6, 8, 10, 16, 20, 32, 64, 128, 250, 500, 1000],
        # A2C settings.
        'learning_rate': [1e-6, 3e-6, 6e-6, 1e-5, 3e-5, 6e-5, 1e-4, 3e-4, 6e-4, 1e-3, 3e-3, 6e-3, 1e-2],
        'n_steps': [4, 5, 8, 10, 16, 20, 32, 50, 64, 128, 256, 400, 512, 800, 1024, 1500, 2048],
        'gamma': [0.70, 0.80, 0.90, 0.95, 0.99, 0.995],
        'normalize_advantage': [True, False],
        'use_rms_prop': [True, False],
    }
    return {label: hp.choice(label, options) for label, options in candidates.items()}

In [7]:
# Experiment 1: search over every environment and A2C parameter.
best_avg_rwd = float("-inf")
expr_no = 1
total_timesteps = 1_000_000
# objective() merges the module-level ``fixed_args`` into every sampled
# configuration. Nothing is pinned in this run; resetting it also clears
# values left behind by another experiment cell.
fixed_args = {}

best = fmin(objective, get_space(), algo=tpe.suggest, max_evals=100)

---Chosen Hyperparameters of opt-epoch:                
{'env_max_step': 5000, 'env_num_stack': 8, 'env_rwd_func': 4, 'gamma': 0.95, 'learning_rate': 0.0001, 'n_envs': 1, 'n_steps': 20, 'normalize_advantage': False, 'use_rms_prop': True}
  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

  if info["head food dist"] < prev_info["head food dist"]:

  prev_info["head pos"][0] == 0 and info["head pos"][0] == info["size"]-1) or (

  prev_info["head pos"][1] == 0 and info["head pos"][1] == info["size"]-1):

  if (prev_info["head pos"][0] == info["size"]-1 and info["head pos"][0] == 0) or (

  prev_info["head pos"][1] == info["size"]-1 and info["head pos"][1] == 0) or (



\Ran agent on environment: 1/10 Episodes, score: 5     
\Ran agent on environment: 2/10 Episodes, score: 8     
\Ran agent on environment: 3/10 Episodes, score: 0     
\Ran agent on environment: 4/10 Episodes, score: 4     
\Ran agent on environment: 5/10 Episodes, score: 3     
\Ran agent on environment: 6/10 Episodes, score: 6     
\Ran agent on environment: 7/10 Episodes, score: 5     
\Ran agent on environment: 8/10 Episodes, score: 2     
\Ran agent on environment: 9/10 Episodes, score: 6     
\Ran agent on environment: 10/10 Episodes, score: 7    
Saving new best model.                                 
Saving new best model's gif.                           
---Time taken for opt-epoch: 51.15685174067815 Minutes 
---Average Episode Reward of opt-epoch: 4.6            
---Chosen Hyperparameters of opt-epoch:                                 
{'env_max_step': 5000, 'env_num_stack': 8, 'env_rwd_func': 1, 'gamma': 0.99, 'learning_rate': 0.0003, 'n_envs': 4, 'n_steps': 800, 'normalize_a

  prev_info["head pos"][0] == 0 and info["head pos"][0] == info["size"]-1) or (

  if (prev_info["head pos"][0] == info["size"]-1 and info["head pos"][0] == 0) or (

  prev_info["head pos"][1] == info["size"]-1 and info["head pos"][1] == 0) or (

  prev_info["head pos"][1] == 0 and info["head pos"][1] == info["size"]-1):



\Ran agent on environment: 1/10 Episodes, score: 0                         
\Ran agent on environment: 2/10 Episodes, score: 0                         
\Ran agent on environment: 3/10 Episodes, score: 0                         
\Ran agent on environment: 4/10 Episodes, score: 0                         
\Ran agent on environment: 5/10 Episodes, score: 0                         
\Ran agent on environment: 6/10 Episodes, score: 0                         
\Ran agent on environment: 7/10 Episodes, score: 1                         
\Ran agent on environment: 8/10 Episodes, score: 1                         
\Ran agent on environment: 9/10 Episodes, score: 0                         
\Ran agent on environment: 10/10 Episodes, score: 0                        
---Time taken for opt-epoch: 26.524602631727856 Minutes                    
---Average Episode Reward of opt-epoch: 0.2                                
---Chosen Hyperparameters of opt-epoch:                                    
{'env_max_st

In [8]:
# Raw fmin result: for hp.choice parameters this is the index of the chosen option, not its value.
best

{'env_max_step': 5,
 'env_num_stack': 5,
 'env_rwd_func': 0,
 'gamma': 1,
 'learning_rate': 11,
 'n_envs': 13,
 'n_steps': 14,
 'normalize_advantage': 0,
 'use_rms_prop': 0}

In [9]:
# Map the hp.choice indices stored in `best` back to the actual hyperparameter values.
space_eval(get_space(), best)

{'env_max_step': 3000,
 'env_num_stack': 7,
 'env_rwd_func': 0,
 'gamma': 0.8,
 'learning_rate': 0.006,
 'n_envs': 1000,
 'n_steps': 1024,
 'normalize_advantage': True,
 'use_rms_prop': True}

In [5]:
def get_space():
    """Build the hyperopt search space for the second A2C tuning run.

    Discrete parameters are drawn with ``hp.choice`` from candidate lists;
    ``gamma``, ``gae_lambda``, ``ent_coef``, ``vf_coef`` and ``max_grad_norm``
    are drawn with ``hp.uniform`` between the given bounds. The hyperopt label
    of every entry equals its dict key.

    Returns:
        Dict mapping parameter name to its hyperopt expression.
    """
    # (label, sampler, positional arguments after the label)
    spec = [
        ('env_rwd_func', hp.choice, ([0, 1, 2, 3],)),
        ('env_max_step', hp.choice, ([1500, 2000, 2500, 3000, 3500],)),
        ('env_num_stack', hp.choice, ([4, 5, 6, 7],)),
        ('n_envs', hp.choice, ([8, 16, 64, 128, 256, 512, 1024],)),
        ('learning_rate', hp.choice, ([1e-4, 3e-4, 4e-4, 5e-4, 6e-4, 7e-4, 9e-4, 1e-3, 3e-3, 6e-3, 7e-3, 8e-3, 1e-2, 3e-2, 6e-2, 1e-1],)),
        ('n_steps', hp.choice, ([1, 4, 5, 8, 10, 16, 20, 32, 50, 64, 128, 256, 400, 512, 800, 1024, 1500, 2048],)),
        ('gamma', hp.uniform, (0.75, 0.85)),
        ('gae_lambda', hp.uniform, (0.01, 1.0)),
        ('normalize_advantage', hp.choice, ([True, False],)),
        ('ent_coef', hp.uniform, (0.0, 0.5)),
        ('vf_coef', hp.uniform, (0.1, 1)),
        ('max_grad_norm', hp.uniform, (0.3, 30)),
    ]
    return {label: sampler(label, *params) for label, sampler, params in spec}

In [6]:
# Experiment 2: narrower search with continuous gamma / GAE / loss coefficients.
best_avg_rwd = float("-inf")
expr_no = 2
total_timesteps = 2_000_000
# objective() merges the module-level ``fixed_args`` into every sampled
# configuration. Nothing is pinned in this run; resetting it also clears
# values left behind by another experiment cell.
fixed_args = {}

best = fmin(objective, get_space(), algo=tpe.suggest, max_evals=50)

---Chosen Hyperparameters of opt-epoch:               
{'ent_coef': 0.19218002626830366, 'env_max_step': 2000, 'env_num_stack': 7, 'env_rwd_func': 0, 'gae_lambda': 0.678666281055582, 'gamma': 0.845807024412953, 'learning_rate': 0.1, 'max_grad_norm': 15.692779125887494, 'n_envs': 128, 'n_steps': 800, 'normalize_advantage': True, 'vf_coef': 0.40962507638362533}
\Ran agent on environment: 1/10 Episodes, score: 0    
\Ran agent on environment: 2/10 Episodes, score: 0    
\Ran agent on environment: 3/10 Episodes, score: 2    
\Ran agent on environment: 4/10 Episodes, score: 1    
\Ran agent on environment: 5/10 Episodes, score: 0    
\Ran agent on environment: 6/10 Episodes, score: 0    
\Ran agent on environment: 7/10 Episodes, score: 0    
\Ran agent on environment: 8/10 Episodes, score: 0    
\Ran agent on environment: 9/10 Episodes, score: 0    
\Ran agent on environment: 10/10 Episodes, score: 0   
Saving new best model.                                
Saving new best model's gif.     




---Time taken for opt-epoch: 15.94200219710668 Minutes
---Average Episode Reward of opt-epoch: 0.3           
---Chosen Hyperparameters of opt-epoch:                               
{'ent_coef': 0.46834323915969894, 'env_max_step': 2000, 'env_num_stack': 6, 'env_rwd_func': 2, 'gae_lambda': 0.8357702577941016, 'gamma': 0.8445595450270638, 'learning_rate': 0.0007, 'max_grad_norm': 6.655502137794559, 'n_envs': 16, 'n_steps': 2048, 'normalize_advantage': True, 'vf_coef': 0.15079755043808077}
\Ran agent on environment: 1/10 Episodes, score: 3                    
\Ran agent on environment: 2/10 Episodes, score: 3                    
\Ran agent on environment: 3/10 Episodes, score: 4                    
\Ran agent on environment: 4/10 Episodes, score: 3                    
\Ran agent on environment: 5/10 Episodes, score: 4                    
\Ran agent on environment: 6/10 Episodes, score: 3                    
\Ran agent on environment: 7/10 Episodes, score: 4                    
\Ran agent 

  prev_info["head pos"][1] == 0 and info["head pos"][1] == info["size"]-1):

  if (prev_info["head pos"][0] == info["size"]-1 and info["head pos"][0] == 0) or (

  prev_info["head pos"][1] == info["size"]-1 and info["head pos"][1] == 0) or (

  prev_info["head pos"][0] == 0 and info["head pos"][0] == info["size"]-1) or (



\Ran agent on environment: 1/10 Episodes, score: 0                       
\Ran agent on environment: 2/10 Episodes, score: 0                       
\Ran agent on environment: 3/10 Episodes, score: 0                       
\Ran agent on environment: 4/10 Episodes, score: 0                       
\Ran agent on environment: 5/10 Episodes, score: 2                       
\Ran agent on environment: 6/10 Episodes, score: 1                       
\Ran agent on environment: 7/10 Episodes, score: 0                       
\Ran agent on environment: 8/10 Episodes, score: 1                       
\Ran agent on environment: 9/10 Episodes, score: 0                       
\Ran agent on environment: 10/10 Episodes, score: 0                      
---Time taken for opt-epoch: 15.582719135284425 Minutes                  
---Average Episode Reward of opt-epoch: 0.4                              
---Chosen Hyperparameters of opt-epoch:                                  
{'ent_coef': 0.26335964765958647, 'env

In [7]:
# Raw fmin result: hp.choice parameters appear as option indices, hp.uniform parameters as sampled values.
best

{'ent_coef': 0.011874390262348211,
 'env_max_step': 4,
 'env_num_stack': 3,
 'env_rwd_func': 0,
 'gae_lambda': 0.08615770805410888,
 'gamma': 0.815493672875537,
 'learning_rate': 9,
 'max_grad_norm': 21.84386244450146,
 'n_envs': 6,
 'n_steps': 5,
 'normalize_advantage': 0,
 'vf_coef': 0.10264691687178268}

In [8]:
# Map the hp.choice indices stored in `best` back to the actual hyperparameter values.
space_eval(get_space(), best)

{'ent_coef': 0.011874390262348211,
 'env_max_step': 3500,
 'env_num_stack': 7,
 'env_rwd_func': 0,
 'gae_lambda': 0.08615770805410888,
 'gamma': 0.815493672875537,
 'learning_rate': 0.006,
 'max_grad_norm': 21.84386244450146,
 'n_envs': 1024,
 'n_steps': 16,
 'normalize_advantage': True,
 'vf_coef': 0.10264691687178268}

In [5]:
def get_space():
    """Build the hyperopt search space for the third A2C tuning run.

    Only the reward function, the episode step limit, the learning rate,
    ``n_steps``, ``gae_lambda`` and ``ent_coef`` are searched. The hyperopt
    label of every entry equals its dict key.

    Returns:
        Dict mapping parameter name to its hyperopt expression.
    """
    # (label, sampler, positional arguments after the label)
    spec = [
        ('env_rwd_func', hp.choice, ([0, 1, 2, 3],)),
        # NOTE(review): 0 is a candidate step limit -- presumably "no limit"
        # in make_env; confirm.
        ('env_max_step', hp.choice, ([0, 1500, 3500],)),
        ('learning_rate', hp.choice, ([1e-4, 3e-4, 4e-4, 6e-4, 7e-4, 1e-3, 3e-3, 6e-3, 7e-3, 8e-3, 9e-3, 1e-2],)),
        ('n_steps', hp.choice, ([3, 4, 5, 6, 8, 10, 16, 32, 64, 128, 256, 512, 1024],)),
        ('gae_lambda', hp.uniform, (0.001, 1.0)),
        ('ent_coef', hp.uniform, (0.0, 0.2)),
    ]
    return {label: sampler(label, *params) for label, sampler, params in spec}

In [6]:
# Experiment 3: longer training; stack size, env count, gamma and advantage
# normalisation are pinned through ``fixed_args`` instead of being searched.
expr_no = 3
total_timesteps = 5_000_000
fixed_args = {
    "env_num_stack": 4,
    "n_envs": 1000,
    "gamma": 0.80,
    "normalize_advantage": True,
}
best_avg_rwd = float("-inf")

best = fmin(fn=objective, space=get_space(), algo=tpe.suggest, max_evals=100)

---Chosen Hyperparameters of opt-epoch:                
{'ent_coef': 0.06715835167057442, 'env_max_step': 1500, 'env_rwd_func': 1, 'gae_lambda': 0.15806189871721002, 'learning_rate': 0.003, 'n_steps': 16, 'env_num_stack': 4, 'n_envs': 1000, 'gamma': 0.8, 'normalize_advantage': True}
an agent on environment: 1/10 Episodes, score: 3       
an agent on environment: 2/10 Episodes, score: 4       
an agent on environment: 3/10 Episodes, score: 3       
an agent on environment: 4/10 Episodes, score: 4       
an agent on environment: 5/10 Episodes, score: 3       
an agent on environment: 6/10 Episodes, score: 3       
an agent on environment: 7/10 Episodes, score: 4       
an agent on environment: 8/10 Episodes, score: 3       
an agent on environment: 9/10 Episodes, score: 3       
an agent on environment: 10/10 Episodes, score: 7      
Saving new best model.                                 
Saving new best model's gif.                           
  0%|          | 0/100 [37:33<?, ?trial/s, b




---Time taken for opt-epoch: 37.6095109462738 Minutes  
---Average Episode Reward of opt-epoch: 3.7            
---Chosen Hyperparameters of opt-epoch:                                  
{'ent_coef': 0.033317625650326255, 'env_max_step': 3500, 'env_rwd_func': 0, 'gae_lambda': 0.959359039151814, 'learning_rate': 0.0007, 'n_steps': 64, 'env_num_stack': 4, 'n_envs': 1000, 'gamma': 0.8, 'normalize_advantage': True}
an agent on environment: 1/10 Episodes, score: 11                        
an agent on environment: 2/10 Episodes, score: 9                           
an agent on environment: 3/10 Episodes, score: 3                           
an agent on environment: 4/10 Episodes, score: 11                          
an agent on environment: 5/10 Episodes, score: 12                          
an agent on environment: 6/10 Episodes, score: 5                           
an agent on environment: 7/10 Episodes, score: 9                           
an agent on environment: 8/10 Episodes, score: 4        

  prev_info["head pos"][0] == 0 and info["head pos"][0] == info["size"]-1) or (

  prev_info["head pos"][1] == 0 and info["head pos"][1] == info["size"]-1):

  prev_info["head pos"][1] == info["size"]-1 and info["head pos"][1] == 0) or (

  if (prev_info["head pos"][0] == info["size"]-1 and info["head pos"][0] == 0) or (



an agent on environment: 1/10 Episodes, score: 3                           
an agent on environment: 2/10 Episodes, score: 6                           
an agent on environment: 3/10 Episodes, score: 3                           
an agent on environment: 4/10 Episodes, score: 4                           
an agent on environment: 5/10 Episodes, score: 1                           
an agent on environment: 6/10 Episodes, score: 3                           
an agent on environment: 7/10 Episodes, score: 6                           
an agent on environment: 8/10 Episodes, score: 3                           
an agent on environment: 9/10 Episodes, score: 4                           
an agent on environment: 10/10 Episodes, score: 4                          
---Time taken for opt-epoch: 37.87392356793086 Minutes                     
---Average Episode Reward of opt-epoch: 3.7                                
---Chosen Hyperparameters of opt-epoch:                                    
{'ent_coef':

In [7]:
# Raw fmin result: hp.choice parameters appear as option indices, hp.uniform parameters as sampled values.
best

{'ent_coef': 0.014722305311628495,
 'env_max_step': 2,
 'env_rwd_func': 0,
 'gae_lambda': 0.08179432834573155,
 'learning_rate': 6,
 'n_steps': 6}

In [8]:
# Map the hp.choice indices stored in `best` back to the actual hyperparameter values.
space_eval(get_space(), best)

{'ent_coef': 0.014722305311628495,
 'env_max_step': 3500,
 'env_rwd_func': 0,
 'gae_lambda': 0.08179432834573155,
 'learning_rate': 0.003,
 'n_steps': 16}

In [None]:
# {'ent_coef': 0.04728136562858354, 'env_max_step': 3500, 'env_num_stack': 4, 'env_rwd_func': 0, 'gae_lambda': 0.009744980806713911, 'gamma': 0.8606867601900862, 'learning_rate': 0.006, 'max_grad_norm': 13.564322735262257, 'n_steps': 128, 'vf_coef': 0.5746884979668675}
# {'ent_coef': 0.048131320125026966, 'env_max_step': 3500, 'env_num_stack': 4, 'env_rwd_func': 3, 'gae_lambda': 0.9650094161425868, 'gamma': 0.7910967341909643, 'learning_rate': 0.0007, 'max_grad_norm': 7.697800855483016, 'n_steps': 8, 'vf_coef': 0.20991523532588405}  <- TODO: train it for more than 5 million timesteps
# {'ent_coef': 0.013584894198168228, 'env_max_step': 1500, 'env_num_stack': 8, 'env_rwd_func': 2, 'gae_lambda': 0.631485162290132, 'gamma': 0.8164578775162806, 'learning_rate': 0.0009, 'max_grad_norm': 2.8580838035360085, 'n_steps': 5, 'vf_coef': 0.5761365542365204}
# {'ent_coef': 0.0032490067827949387, 'env_max_step': 1500, 'env_num_stack': 8, 'env_rwd_func': 2, 'gae_lambda': 0.30487890025338343, 'gamma': 0.8075603609540954, 'learning_rate': 0.0009, 'max_grad_norm': 4.250884836635666, 'n_steps': 5, 'vf_coef': 0.3819627894410967}