## Explore

In [54]:
from glob import glob
from explicit_memory.utils import read_yaml
import pandas as pd
import numpy as np

results_all = []
for results_path in glob("./training_results/PPO/explore/LSTM/s/*/results.yaml"):
    train_path = results_path.replace("results.yaml", "train.yaml")
    train = read_yaml(train_path)
    results = read_yaml(results_path)
    results_all.append(
        {
            "num_episodes": train["num_episodes"],
            "num_rollouts": train["num_rollouts"],
            "epoch_per_rollout": train["epoch_per_rollout"],
            "gamma": train["gamma"],
            "batch_size": train["batch_size"],
            "epsilon": train["epsilon"],
            "entropy_weight": train["entropy_weight"],
            "tau": train["tau"],
            "test_score": results["test_score"]["mean"],
            "val_score": max([foo["mean"] for foo in results["validation_score"]]),
            "path": results_path.split("/")[-2],
        }
    )

df = pd.DataFrame(results_all)
df_sorted = df.sort_values(by="test_score", ascending=False)
print(f"number of training results: {len(df_sorted)}")

top_k = 10
df_sorted[:top_k]

number of training results: 48


Unnamed: 0,num_episodes,num_rollouts,epoch_per_rollout,gamma,batch_size,epsilon,entropy_weight,tau,test_score,val_score,path
9,10,10,64,0.920597,8,0.2,0.361795,0.93,659.5,740.6,2024-03-04 13:31:41.203765
13,20,20,16,0.583264,8,0.2,0.013287,0.93,594.1,694.0,2024-03-04 14:32:31.708967
46,20,20,32,0.671473,8,0.2,0.035405,0.93,518.5,682.1,2024-03-04 14:16:05.050180
40,20,20,32,0.852593,8,0.2,0.076833,0.93,518.5,655.1,2024-03-04 13:49:50.256336
39,20,20,64,0.93011,8,0.2,0.485659,0.93,518.5,527.7,2024-03-04 13:51:02.934742
38,20,20,32,0.653656,8,0.2,0.016617,0.93,518.5,445.8,2024-03-04 14:33:44.277205
7,20,20,16,0.536671,8,0.2,0.030837,0.93,518.5,567.9,2024-03-04 14:10:13.828523
36,20,20,32,0.671021,8,0.2,0.045817,0.93,518.5,751.5,2024-03-04 14:13:13.505272
21,10,10,32,0.915594,8,0.2,0.244205,0.93,518.5,594.6,2024-03-04 13:25:16.587624
1,20,20,32,0.665172,8,0.2,0.033631,0.93,518.5,682.1,2024-03-04 14:02:49.866091


In [55]:
df_sorted[:top_k].select_dtypes(include=[np.number]).mean()

num_episodes          18.000000
num_rollouts          18.000000
epoch_per_rollout     35.200000
gamma                  0.740015
batch_size             8.000000
epsilon                0.200000
entropy_weight         0.134409
tau                    0.930000
test_score           540.160000
val_score            634.140000
dtype: float64

In [56]:
df_sorted[-top_k:].select_dtypes(include=[np.number]).mean()

num_episodes          14.000000
num_rollouts          14.000000
epoch_per_rollout     36.800000
gamma                  0.740311
batch_size             8.000000
epsilon                0.200000
entropy_weight         0.145203
tau                    0.930000
test_score           172.700000
val_score            197.000000
dtype: float64

## MM

In [53]:
from glob import glob
from explicit_memory.utils import read_yaml
import pandas as pd
import numpy as np

results_all = []
for results_path in glob("./training_results/PPO/mm/LSTM/s/*/results.yaml"):
    train_path = results_path.replace("results.yaml", "train.yaml")
    train = read_yaml(train_path)
    results = read_yaml(results_path)
    results_all.append(
        {
            "num_episodes": train["num_episodes"],
            "num_rollouts": train["num_rollouts"],
            "epoch_per_rollout": train["epoch_per_rollout"],
            "gamma": train["gamma"],
            "batch_size": train["batch_size"],
            "epsilon": train["epsilon"],
            "entropy_weight": train["entropy_weight"],
            "tau": train["tau"],
            "split_reward_training": train["split_reward_training"],
            "test_score": results["test_score"]["mean"],
            "val_score": max([foo["mean"] for foo in results["validation_score"]]),
            "path": results_path.split("/")[-2],
        }
    )

df = pd.DataFrame(results_all)
df_sorted = df.sort_values(by="test_score", ascending=False)
print(f"number of training results: {len(df_sorted)}")

top_k = 10
df_sorted[:top_k]

number of training results: 90


Unnamed: 0,num_episodes,num_rollouts,epoch_per_rollout,gamma,batch_size,epsilon,entropy_weight,tau,split_reward_training,test_score,val_score,path
42,10,5,16,0.982625,8,0.2,0.094691,0.93,False,830.0,689.1,2024-03-04 10:24:04.727747
38,10,10,16,0.987418,8,0.2,0.032658,0.93,False,794.4,581.2,2024-03-04 12:27:29.829646
10,10,10,32,0.926026,32,0.2,0.01709,0.93,True,788.4,726.9,2024-03-04 10:48:02.554893
16,10,20,16,0.966896,16,0.2,0.006873,0.93,False,774.0,520.0,2024-03-04 12:39:36.176969
48,10,20,64,0.905213,16,0.2,0.00475,0.93,False,767.2,686.0,2024-03-04 10:03:18.880292
1,10,20,8,0.918732,8,0.2,0.013955,0.93,True,767.1,600.9,2024-03-04 10:22:55.084595
9,10,5,16,0.973576,16,0.2,0.032107,0.93,True,752.1,659.0,2024-03-04 11:46:47.327109
79,10,5,32,0.990241,8,0.2,0.081392,0.93,True,727.2,503.0,2024-03-04 12:41:08.378212
14,10,10,16,0.961367,8,0.2,0.03133,0.93,False,711.4,674.8,2024-03-04 11:32:41.550455
64,10,10,32,0.936878,8,0.2,0.028375,0.93,False,709.4,491.6,2024-03-04 10:17:06.416677


In [17]:
df_sorted[:top_k].select_dtypes(include=[np.number]).median()

num_episodes          10.000000
num_rollouts          10.000000
epoch_per_rollout     24.000000
gamma                  0.954536
batch_size             8.000000
epsilon                0.200000
entropy_weight         0.029853
tau                    0.930000
test_score           759.600000
val_score            666.900000
dtype: float64

In [49]:
df_sorted[:top_k].select_dtypes(include=[np.number]).mean()

num_episodes          10.000000
num_rollouts          11.500000
epoch_per_rollout     24.800000
gamma                  0.954897
batch_size            12.800000
epsilon                0.200000
entropy_weight         0.034322
tau                    0.930000
test_score           762.120000
val_score            613.250000
dtype: float64

In [19]:
top_k = 10
df_sorted[-top_k:]

Unnamed: 0,num_episodes,num_rollouts,epoch_per_rollout,gamma,batch_size,epsilon,entropy_weight,tau,split_reward_training,test_score,val_score,path
38,10,10,4,0.973388,32,0.2,0.083553,0.93,False,339.5,446.5,2024-03-04 10:45:49.924939
13,10,50,8,0.922868,8,0.2,0.091613,0.93,True,339.5,409.2,2024-03-04 10:40:11.553908
47,10,5,8,0.953923,64,0.2,0.02015,0.93,False,339.5,520.8,2024-03-04 10:45:07.080757
46,10,5,16,0.935459,64,0.2,0.062494,0.93,False,339.5,520.8,2024-03-04 11:48:41.803476
63,10,5,8,0.965219,32,0.2,0.094691,0.93,True,339.1,484.6,2024-03-04 10:38:33.437266
20,10,10,4,0.944083,16,0.2,0.052377,0.93,True,339.1,443.4,2024-03-04 12:12:52.668551
61,10,20,32,0.943515,16,0.2,0.025201,0.93,False,334.4,507.8,2024-03-04 10:46:35.126642
24,10,5,32,0.973892,32,0.2,0.006537,0.93,False,289.2,355.7,2024-03-04 11:14:50.641405
41,10,5,4,0.959442,8,0.2,0.048885,0.93,False,162.9,164.0,2024-03-04 10:39:25.992493
34,10,5,4,0.990239,8,0.2,0.068885,0.93,True,95.4,304.9,2024-03-04 11:07:57.392812


In [22]:
df_sorted[-top_k:].select_dtypes(include=[np.number]).median()

num_episodes          10.000000
num_rollouts           5.000000
epoch_per_rollout      8.000000
gamma                  0.956683
batch_size            24.000000
epsilon                0.200000
entropy_weight         0.057435
tau                    0.930000
test_score           339.100000
val_score            444.950000
dtype: float64

In [23]:
df_sorted[-top_k:].select_dtypes(include=[np.number]).mean()

num_episodes          10.000000
num_rollouts          12.000000
epoch_per_rollout     12.000000
gamma                  0.956203
batch_size            28.000000
epsilon                0.200000
entropy_weight         0.055439
tau                    0.930000
test_score           291.810000
val_score            415.770000
dtype: float64