In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

# Executive Summary

This notebook aims to recover the meaning of the data logged from `04_26_2024` to `04_28_2024`, as well as on `11_01_2024`.

We start with:
- `04_26_2024/exp_0.py`: Tune PDA on GridWorld and LunarLander with RKHS and NN, then also run PPO and DQN (10 runs)
- `04_26_2024/exp_1.py`: Tune PDA on InvertedPendulum, then also run PPO and DDPG (26 runs)
- `04_27_2024/exp_0.py`: Tune PDA stepsize in LQR (14 runs)
- `11_01_2024/exp_0.py`: Tune PDA for GridWorld-v1 (new version).

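**Note**: All environments are reward-maximization problems by default. The tables below print the *negated* final episode reward, so lower scores are better, and `*` marks the lowest score within the compared runs.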

In [14]:
path = "/Users/calebju/Code/RL-general-action-state/logs"


def _final_rewards(run_csv_pattern, num_runs):
    """Collect the last logged 'episode rewards' value of every run.

    `run_csv_pattern` is a path relative to `path` that contains a single
    `%d` placeholder for the run index. Runs 0 .. num_runs-1 are read in
    order and the final row of each CSV is stored in the returned array.
    """
    finals = np.zeros(num_runs)
    for run_id in range(num_runs):
        run_log = pd.read_csv(os.path.join(path, run_csv_pattern % run_id))
        finals[run_id] = run_log['episode rewards'].iloc[-1]
    return finals


# 04_26_2024/exp_0.py
data_1 = _final_rewards("04_26_2024/exp_0/run_%d/seed=0.csv", 10)

# 04_26_2024/exp_1.py
data_2 = _final_rewards("04_26_2024/exp_1/run_%d/seed=0.csv", 26)

# 04_27_2024/exp_0.py
data_3 = _final_rewards("04_27_2024/exp_0/run_%d/seed=0.csv", 14)

In [30]:
# Each printed score is the negated final episode reward of a run; every row
# whose score equals the minimum over all runs is flagged with "*".
scores_1 = -data_1
best_1 = np.min(scores_1)

print("Experiment 04_26_2024/exp_0.py\n")
print("id|    score\n------------")
for run_id, score in enumerate(scores_1):
    marker = "*" if score == best_1 else ""
    print("%d |%.2e %s" % (run_id, score, marker))

Experiment 04_26_2024/exp_0.py

id|    score
------------
0 |1.13e+02 
1 |9.39e+01 
2 |1.13e+02 
3 |1.11e+01 
4 |3.75e+01 
5 |7.00e+00 *
6 |2.51e+01 
7 |1.91e+02 
8 |1.15e+02 
9 |5.89e+01 


Note that we do not tune stepsizes here; we only try different Bregman divergences and function approximations.

In [16]:
# Each printed score is the negated final episode reward of a run.
# The reference minimum is taken over runs 0-23 only, while all 26 rows are
# printed (presumably runs 24-25 are the baseline algorithms -- TODO confirm).
scores_2 = -data_2
best_2 = np.min(scores_2[:24])

print("Experiment 04_26_2024/exp_1.py\n")
print("id |    score\n------------")
for run_id, score in enumerate(scores_2):
    marker = "*" if score == best_2 else ""
    print("%d |%.2e %s" % (run_id, score, marker))

Experiment 04_26_2024/exp_1.py

id |    score
------------
0 |-2.97e+01 
1 |-9.99e+01 
2 |-6.66e+01 
3 |-6.72e+01 
4 |-4.64e+01 
5 |-6.85e+01 
6 |-3.44e+01 
7 |-8.38e+01 
8 |-1.22e+01 
9 |-1.14e+01 
10 |-3.38e+01 
11 |-2.22e+01 
12 |-1.48e+01 
13 |-3.89e+01 
14 |-1.05e+01 
15 |-4.90e+00 
16 |-5.85e+00 
17 |-1.22e+01 
18 |-1.74e+01 
19 |-4.74e+01 
20 |-5.61e+01 
21 |-1.00e+02 *
22 |-7.70e+01 
23 |-4.58e+01 
24 |-4.58e+01 
25 |-7.48e+01 


The best performing PDA for solving Inverted Pendulum has **no policy noise, 10 PE epochs, PO eta=1, and `pda_2` stepsize**.

In [17]:
# Each printed score is the negated final episode reward of a run.
# The reference minimum is taken over runs 0-11 only, while all 14 rows are
# printed (presumably runs 12-13 are the baseline algorithms -- TODO confirm).
scores_3 = -data_3
best_3 = np.min(scores_3[:12])

print("Experiment 04_27_2024/exp_0.py\n")
print("id |    score\n------------")
for run_id, score in enumerate(scores_3):
    marker = "*" if score == best_3 else ""
    print("%d |%.2e %s" % (run_id, score, marker))

Experiment 04_27_2024/exp_0.py

id |    score
------------
0 |3.49e+06 
1 |1.37e+08 
2 |2.44e+07 
3 |1.31e+08 
4 |3.42e+06 *
5 |4.20e+07 
6 |4.47e+07 
7 |6.98e+07 
8 |8.63e+07 
9 |6.98e+08 
10 |1.96e+07 
11 |4.47e+08 
12 |5.22e+06 
13 |nan 


The best performing PDA for solving LQR has **PDA noise of 1 (large), PO eta = 0.001 (medium), and `pda_2` stepsize**.

## Tuning SB3

In [42]:
path = "/Users/calebju/Code/RL-general-action-state/logs"

# 02_10_2026/exp_0.py
# Keep the last logged 'episode rewards' value of each of the 42 runs.
data_4 = np.zeros(42)
for run_id in range(data_4.size):
    run_csv = os.path.join(path, "02_10_2026/exp_0/run_%d/seed=0.csv" % run_id)
    data_4[run_id] = pd.read_csv(run_csv)['episode rewards'].iloc[-1]

# 02_10_2026/exp_1.py
# Same as above for 16 runs, except that a log with no rows is recorded as a
# very large negative reward instead of raising.
data_5 = np.zeros(16)
for run_id in range(data_5.size):
    run_csv = os.path.join(path, "02_10_2026/exp_1/run_%d/seed=0.csv" % run_id)
    run_log = pd.read_csv(run_csv)
    if len(run_log) == 0:
        data_5[run_id] = -1e30 # in case the algorithm failed
        continue
    data_5[run_id] = run_log['episode rewards'].iloc[-1]

In [48]:
# Runs 0-13 of exp_0: print the negated final episode reward of each run.
# No best-run marker is printed in this table.
print("Experiment 02_10_2026/exp_0.py for LunarLander-v3\n")
print("id|    score\n------------")
for run_id, score in enumerate(-data_4[:14]):
    print("%d |%.2e" % (run_id, score))

Experiment 02_10_2026/exp_0.py for LunarLander-v3

id|    score
------------
0 |5.97e+01
1 |1.15e+02
2 |1.22e+01
3 |4.12e+01
4 |1.22e+01
5 |2.42e+01
6 |2.28e+02
7 |5.89e+01
8 |-6.24e+00
9 |-2.73e+00
10 |2.30e+01
11 |-2.73e+00
12 |7.00e+00
13 |1.92e+02


Best performing:
- PPO [0,6]: lr=0.0003
- DQN [7,13]: lr=0.0003

In [44]:
# Runs 14-27 of exp_0: print the negated final episode reward of each run.
# NOTE(review): this cell prints no "*" marker, yet the saved output of this
# cell shows "*" on some rows -- the output looks stale; re-run to refresh it.
print("Experiment 02_10_2026/exp_0.py for InvertedPendulum-v4\n")
print("id|    score\n------------")
for run_id, score in enumerate(-data_4[14:28], start=14):
    print("%d |%.2e" % (run_id, score))

Experiment 02_10_2026/exp_0.py for InvertedPendulum-v4

id|    score
------------
14 |-5.85e+00 
15 |-4.58e+01 
16 |-4.53e+01 
17 |-5.57e+01 
18 |-4.53e+01 
19 |-6.19e+01 
20 |-2.97e+00 
21 |-4.36e+01 
22 |-5.39e+01 
23 |-7.48e+01 *
24 |-2.97e+00 
25 |-7.48e+01 *
26 |-2.97e+00 
27 |-2.97e+00 


Best performing:
- PPO [14,20]: lr=0.01
- DDPG [21,27]: lr=0.001

In [45]:
# Runs 28-41 of exp_0: print the negated final episode reward of each run.
# The reference minimum is taken over runs 28-34 only; rows 35-41 are still
# printed but cannot influence which score counts as the best.
lqr_scores = -data_4[28:42]
lqr_best = np.min(lqr_scores[:7])

print("Experiment 02_10_2026/exp_0.py for gym_examples/LQREnv-v0\n")
print("id|    score\n------------")
for run_id, score in enumerate(lqr_scores, start=28):
    marker = "*" if score == lqr_best else ""
    print("%d |%.2e %s" % (run_id, score, marker))

Experiment 02_10_2026/exp_0.py for gym_examples/LQREnv-v0

id|    score
------------
28 |6.53e+06 
29 |5.22e+06 *
30 |6.30e+06 
31 |1.07e+07 
32 |6.30e+06 
33 |1.09e+11 
34 |1.80e+12 
35 |nan 
36 |nan 
37 |nan 
38 |nan 
39 |nan 
40 |nan 
41 |nan 


Best performing:
- PPO [28,34]: lr=0.0003
- DDPG [35,41]: none work (every run ends with a `nan` score)

In [46]:
# Runs 0-7 of exp_1: print the negated final episode reward of each run and
# flag every row equal to the minimum over these eight runs with "*".
gridworld_scores = -data_5[0:8]
gridworld_best = np.min(gridworld_scores)

print("Experiment 02_10_2026/exp_1.py for gym_examples/GridWorld-v0\n")
print("id|    score\n------------")
for run_id, score in enumerate(gridworld_scores):
    marker = "*" if score == gridworld_best else ""
    print("%d |%.2e %s" % (run_id, score, marker))

Experiment 02_10_2026/exp_1.py for gym_examples/GridWorld-v0

id|    score
------------
0 |1.62e+02 
1 |2.32e+01 *
2 |1.68e+02 
3 |1.28e+02 
4 |1.91e+02 
5 |1.44e+02 
6 |1.00e+30 
7 |1.00e+30 


Best performing:
- PPO [0,3]: lr=0.001
- DQN [4,7]: lr=0.001

In [47]:
# Runs 8-15 of exp_1: print the negated final episode reward of each run and
# flag every row equal to the minimum over these runs with "*".
lander_scores = -data_5[8:]
lander_best = np.min(lander_scores)

print("Experiment 02_10_2026/exp_1.py for LunarLander-v3\n")
print("id|    score\n------------")
for run_id, score in enumerate(lander_scores, start=8):
    marker = "*" if score == lander_best else ""
    print("%d |%.2e %s" % (run_id, score, marker))

Experiment 02_10_2026/exp_1.py for LunarLander-v3

id|    score
------------
8 |5.97e+01 
9 |1.22e+01 
10 |2.42e+01 
11 |2.28e+02 
12 |5.89e+01 
13 |-2.73e+00 *
14 |7.00e+00 
15 |1.92e+02 


Best performing:
- PPO [8,11]: lr=0.001
- DQN [12,15]: lr=0.001