Baseline testing for the newest reward function.

In [1]:
from classes.fighter_envs import StreetFighter
from classes.evaluate2 import eval_plots_all, eval_plotter, eval_subplots, evaluate, get_ep_info, get_eval_stats
from classes.utils import ExperimentRunner, build_all_permutations, add_to_recorder
from classes.constants import*
from classes.dqn_envs import make_dqn_env
from stable_baselines3 import DQN
from classes.utils import cleanup_device
from classes.utils import RandomGridSearch

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from time import time
import traceback

num_eval_episodes = 50

In [4]:
baseline_eval_env = make_dqn_env(StreetFighter, render_mode=None, n_procs=0)

In [5]:
mean, std, total, mean_steps, all_end_ep_info = evaluate(model=None, eval_env=baseline_eval_env, n_eval_episodes=num_eval_episodes, return_episode_rewards=True, get_ep_info=True)

100%|██████████| 50/50 [06:28<00:00,  7.76s/it]


In [6]:
reward_stats = {
    'Mean total episode rewards':mean,
    'Std total episode rewards':std,
    'Reward sum all episodes':total,
    'Mean episode steps':mean_steps,
}

In [7]:
baseline_recorder = {}
add_to_recorder(baseline_recorder, reward_stats)

In [8]:
baseline_df = pd.DataFrame.from_dict(baseline_recorder)
display(baseline_df)

Unnamed: 0,Mean total episode rewards,Std total episode rewards,Reward sum all episodes,Mean episode steps
0,-23.28,17.01886,-1164,12208.08


In [9]:
baseline_df2 = pd.DataFrame.from_dict(all_end_ep_info)
baseline_df2.sort_values('Player rounds won', ascending=False).head(10)

Unnamed: 0,episode,Player rounds won,Mean player health,Enemy rounds won,Mean enemy health
31,32,10,107.51,4,83.23
16,17,10,97.07,6,87.02
40,41,7,94.65,4,103.22
44,45,7,94.6,2,90.91
46,47,6,103.93,3,89.41
26,27,6,94.02,3,93.25
0,1,5,102.14,4,80.39
4,5,5,85.62,4,82.18
36,37,4,86.97,3,93.09
19,20,3,92.54,3,92.25


In [12]:
baseline_df2[['Player rounds won','Mean player health','Enemy rounds won','Mean enemy health']].mean()

Player rounds won      2.5800
Mean player health    88.3244
Enemy rounds won       2.5200
Mean enemy health     94.1192
dtype: float64

In [2]:
h_params = {
    'policy': ['MlpPolicy', 'CnnPolicy'],
    'gamma': [0.9, 0.99],
    'learning_rate': [0.00001],
    'batch_size': [256],
    'buffer_size': [50_000],
    'train_freq': [16, 32],
    'gradient_steps': [1],
    'exploration_fraction': [0.3, 0.5],
    'exploration_final_eps': [0.1, 0.2],
    'target_update_interval': [100, 10000],
    'policy_kwargs': [dict(net_arch=[256, 256])],
    'seed':[2],
}

In [3]:
h_params_list = build_all_permutations(h_params)

In [5]:
h_params_list

[{'policy': 'MlpPolicy',
  'gamma': 0.9,
  'learning_rate': 1e-05,
  'batch_size': 256,
  'buffer_size': 50000,
  'train_freq': 16,
  'gradient_steps': 1,
  'exploration_fraction': 0.3,
  'exploration_final_eps': 0.1,
  'target_update_interval': 100,
  'policy_kwargs': {'net_arch': [256, 256]},
  'seed': 2},
 {'policy': 'MlpPolicy',
  'gamma': 0.9,
  'learning_rate': 1e-05,
  'batch_size': 256,
  'buffer_size': 50000,
  'train_freq': 16,
  'gradient_steps': 1,
  'exploration_fraction': 0.3,
  'exploration_final_eps': 0.1,
  'target_update_interval': 10000,
  'policy_kwargs': {'net_arch': [256, 256]},
  'seed': 2},
 {'policy': 'MlpPolicy',
  'gamma': 0.9,
  'learning_rate': 1e-05,
  'batch_size': 256,
  'buffer_size': 50000,
  'train_freq': 16,
  'gradient_steps': 1,
  'exploration_fraction': 0.3,
  'exploration_final_eps': 0.2,
  'target_update_interval': 100,
  'policy_kwargs': {'net_arch': [256, 256]},
  'seed': 2},
 {'policy': 'MlpPolicy',
  'gamma': 0.9,
  'learning_rate': 1e-05,
 

In [6]:
expr = ExperimentRunner(model_class=DQN, base_env=StreetFighter, verbose=True)

running experiments on cpu


In [7]:
expr.set_env_func(env_func=make_dqn_env)
expr.set_env_func_options(render_mode = None, n_procs = 4)
expr.set_eval_func_options(return_episode_rewards=False, deterministic=False)
expr

expr: model: <class 'stable_baselines3.dqn.dqn.DQN'>,
base_env_class: <class 'common.fighter_envs.StreetFighter'>
h_parameters: None,
model_ops: {'tensorboard_log': './results/board/', 'verbose': 0},
model_train_ops: {'total_timesteps': 50000, 'progress_bar': True, 'tb_log_name': 'DQN'},
env_builder: make_dqn_env,
env_builder opts: {'render_mode': None, 'n_procs': 4},
eval function: evaluate_policy,
eval function opts: {'return_episode_rewards': False, 'deterministic': False},
device: cpu,
model current learning timesteps: 0

In [10]:
from classes.utils import cleanup_device
import traceback

In [11]:
recorder = {}
total = len(h_params_list)

In [12]:
trial = 1   

In [13]:
for hyper_params in h_params_list:
    try:

        print(f'Trial: {trial} out of {total}')
        print(hyper_params)

        start = time()

        expr.set_tb_log_name(f'Trial')
        expr.train_model(hyper_params=hyper_params, total_timesteps=500_000)
        expr.save_model(f'./results/models/{trial}')
        rewards, steps = expr.evaluate_model(n_eval_episodes=10)

        elapsed = round((time() - start) / 60, 3)

        results = {
                    'Mean Episode Rewards':np.mean(rewards),
                    'Reward std' :np.std(rewards),
                    'Total Rewards': np.sum(rewards),
                    'Mean Steps':np.mean(steps),
                    'Step Std': np.std(steps)
                    }

        add_to_recorder(recorder, {'Trial': trial, 'Time in minutes': elapsed})
        add_to_recorder(recorder, results)
        add_to_recorder(recorder, h_params)
        

        if trial % 10 == 0:
            print('saving dataframe!')
            record_df = pd.DataFrame.from_dict(recorder)
            record_df.to_pickle(f'{OPT_DIR}dataframe/trial_{trial}.pkl')

        cleanup_device()
        print(f'Total Trial time: {elapsed}')

        trial += 1

    except Exception as e:
            print(traceback.format_exc(10))



Trial: 1 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.1, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/1
Evaluating...
Total Trial time: 17.983
Trial: 2 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.1, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/2
Evaluating...
Total Trial time: 19.561
Trial: 3 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.2, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/3
Evaluating...
Total Trial time: 17.41
Trial: 4 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.2, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/4
Evaluating...
Total Trial time: 17.244
Trial: 5 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.1, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/5
Evaluating...
Total Trial time: 19.065
Trial: 6 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.1, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/6
Evaluating...
Total Trial time: 16.731
Trial: 7 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.2, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/7
Evaluating...
Total Trial time: 17.253
Trial: 8 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.2, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/8
Evaluating...
Total Trial time: 16.376
Trial: 9 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.1, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/9
Evaluating...
Total Trial time: 13.688
Trial: 10 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.1, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/10
Evaluating...
saving dataframe!
Total Trial time: 12.578
Trial: 11 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.2, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/11
Evaluating...
Total Trial time: 12.682
Trial: 12 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.2, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/12
Evaluating...
Total Trial time: 12.113
Trial: 13 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.1, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/13
Evaluating...
Total Trial time: 12.551
Trial: 14 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.1, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/14
Evaluating...
Total Trial time: 12.045
Trial: 15 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.2, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/15
Evaluating...
Total Trial time: 12.087
Trial: 16 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.2, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/16
Evaluating...
Total Trial time: 11.533
Trial: 17 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.1, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/17
Evaluating...
Total Trial time: 17.625
Trial: 18 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.1, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/18
Evaluating...
Total Trial time: 19.121
Trial: 19 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.2, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/19
Evaluating...
Total Trial time: 17.25
Trial: 20 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.2, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/20
Evaluating...
saving dataframe!
Total Trial time: 18.502
Trial: 21 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.1, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/21
Evaluating...
Total Trial time: 17.445
Trial: 22 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.1, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/22
Evaluating...
Total Trial time: 18.511
Trial: 23 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.2, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/23
Evaluating...
Total Trial time: 17.534
Trial: 24 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.2, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/24
Evaluating...
Total Trial time: 18.254
Trial: 25 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.1, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/25
Evaluating...
Total Trial time: 12.86
Trial: 26 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.1, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/26
Evaluating...
Total Trial time: 13.09
Trial: 27 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.2, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/27
Evaluating...
Total Trial time: 12.722
Trial: 28 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.2, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/28
Evaluating...
Total Trial time: 12.352
Trial: 29 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.1, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/29
Evaluating...
Total Trial time: 13.604
Trial: 30 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.1, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/30
Evaluating...
saving dataframe!
Total Trial time: 11.871
Trial: 31 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.2, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/31
Evaluating...
Total Trial time: 11.84
Trial: 32 out of 64
{'policy': 'MlpPolicy', 'gamma': 0.99, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 32, 'gradient_steps': 1, 'exploration_fraction': 0.5, 'exploration_final_eps': 0.2, 'target_update_interval': 10000, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

Model Saved to: ./results/models/32
Evaluating...
Total Trial time: 11.542
Trial: 33 out of 64
{'policy': 'CnnPolicy', 'gamma': 0.9, 'learning_rate': 1e-05, 'batch_size': 256, 'buffer_size': 50000, 'train_freq': 16, 'gradient_steps': 1, 'exploration_fraction': 0.3, 'exploration_final_eps': 0.1, 'target_update_interval': 100, 'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
Learning...


Output()

KeyboardInterrupt: 

In [2]:
import pandas as pd

In [3]:
results_df = pd.read_pickle('./results/opt/dataframe/trial_30.pkl')

In [5]:
results_df.sort_values('Mean Episode Rewards', ascending = False).head(5)

Unnamed: 0,Trial,Time in minutes,Mean Episode Rewards,Reward std,Total Rewards,Mean Steps,Step Std,policy,gamma,learning_rate,batch_size,buffer_size,train_freq,gradient_steps,exploration_fraction,exploration_final_eps,target_update_interval,policy_kwargs,seed
26,27,12.722,-22.1,0.0,-22.1,2.624881,0.0,"[MlpPolicy, CnnPolicy]","[0.9, 0.99]",[1e-05],[256],[50000],"[16, 32]",[1],"[0.3, 0.5]","[0.1, 0.2]","[100, 10000]","[{'net_arch': [256, 256]}]",[2]
19,20,18.502,-22.3,0.0,-22.3,0.830662,0.0,"[MlpPolicy, CnnPolicy]","[0.9, 0.99]",[1e-05],[256],[50000],"[16, 32]",[1],"[0.3, 0.5]","[0.1, 0.2]","[100, 10000]","[{'net_arch': [256, 256]}]",[2]
0,1,17.983,-22.4,0.0,-22.4,3.919184,0.0,"[MlpPolicy, CnnPolicy]","[0.9, 0.99]",[1e-05],[256],[50000],"[16, 32]",[1],"[0.3, 0.5]","[0.1, 0.2]","[100, 10000]","[{'net_arch': [256, 256]}]",[2]
28,29,13.604,-22.5,0.0,-22.5,3.742993,0.0,"[MlpPolicy, CnnPolicy]","[0.9, 0.99]",[1e-05],[256],[50000],"[16, 32]",[1],"[0.3, 0.5]","[0.1, 0.2]","[100, 10000]","[{'net_arch': [256, 256]}]",[2]
17,18,19.121,-22.6,0.0,-22.6,2.615339,0.0,"[MlpPolicy, CnnPolicy]","[0.9, 0.99]",[1e-05],[256],[50000],"[16, 32]",[1],"[0.3, 0.5]","[0.1, 0.2]","[100, 10000]","[{'net_arch': [256, 256]}]",[2]


In [None]:
'''
Hyperparameters for trial 27
{'policy': 'MlpPolicy', 
'gamma': 0.99, 'learning_rate': 1e-05, 
'batch_size': 256, 'buffer_size': 50000, 
'train_freq': 32, 'gradient_steps': 1, 
'exploration_fraction': 0.3, 
'exploration_final_eps': 0.2, 
'target_update_interval': 100, 
'policy_kwargs': {'net_arch': [256, 256]}, 'seed': 2}
'''
