In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

# Executive Summary

This notebook analyzes the performance of the different algorithms (PDA, PPO, and DDPG), mainly on the *InvertedPendulum* and *LQR* environments.

In [None]:
# NOTE(review): absolute paths on the original author's machine -- point these at your
# local copy of the experiment logs before running.
path = "/Users/calebju/Code/RL-general-action-state/logs"
path2 = "/Users/calebju/Code/RL-general-action-state/logs2"
n_seeds = 10
# Capacity, in episodes, of the zero-padded per-seed buffers allocated below.
max_episodes = 500_000

# Length-5 averaging kernel. Unused in this cell; kept so the name stays defined.
ones_5 = (1./5)*np.ones(5)


def _load_runs(csv_paths, max_episodes):
    """Read per-episode CSV logs into zero-padded arrays.

    Parameters
    ----------
    csv_paths : list of list of str
        ``csv_paths[i][j]`` is the CSV log of run ``i``, seed ``j``. Each CSV must
        have the columns ``'episode rewards'`` and ``'episode len'``.
    max_episodes : int
        Length of the last axis of the returned buffers.

    Returns
    -------
    rewards : float ndarray, shape (n_runs, n_seeds, max_episodes)
        ``'episode rewards'`` column of each CSV, zero-padded past its last row.
    cum_len : float ndarray, shape (n_runs, n_seeds, max_episodes)
        Running sum of the ``'episode len'`` column, zero-padded past its last row.
    n_episodes : int ndarray, shape (n_runs, n_seeds)
        Number of rows in each CSV, i.e. the valid prefix length of the two buffers.

    Raises
    ------
    ValueError
        If a CSV has more than ``max_episodes`` rows.
    """
    n_runs = len(csv_paths)
    seeds_per_run = len(csv_paths[0]) if csv_paths else 0
    rewards = np.zeros((n_runs, seeds_per_run, max_episodes), dtype=float)
    cum_len = np.zeros((n_runs, seeds_per_run, max_episodes), dtype=float)
    n_episodes = np.zeros((n_runs, seeds_per_run), dtype=int)
    for i, run_paths in enumerate(csv_paths):
        for j, csv_path in enumerate(run_paths):
            df = pd.read_csv(csv_path)
            n = len(df)
            if n > max_episodes:
                # Fail with an explicit message instead of an opaque broadcasting error.
                raise ValueError(
                    "%s has %d episodes, more than the buffer capacity %d"
                    % (csv_path, n, max_episodes)
                )
            n_episodes[i, j] = n
            rewards[i, j, :n] = df['episode rewards']
            cum_len[i, j, :n] = np.cumsum(df['episode len'])
    return rewards, cum_len, n_episodes


# 04_28_2024/exp_0.py (Inverted): 3 runs x n_seeds seeds, all under `path`.
exp_0_csvs = [
    [os.path.join(path, "04_28_2024/exp_0/run_%d/seed=%d.csv" % (i, j)) for j in range(n_seeds)]
    for i in range(3)
]
data_1, aux_1, len_1 = _load_runs(exp_0_csvs, max_episodes)

# 04_28_2024/exp_1.py (LQR): 2 runs x n_seeds seeds; run 0 is read from `path2`, run 1 from `path`.
exp_1_csvs = [
    [os.path.join(path2 if i == 0 else path, "04_28_2024/exp_1/run_%d/seed=%d.csv" % (i, j)) for j in range(n_seeds)]
    for i in range(2)
]
data_2, aux_2, len_2 = _load_runs(exp_1_csvs, max_episodes)

### InvertedPendulum
Because episodes end after a different number of samples in every run, the runs cannot be compared episode by episode. Instead, we query the cost of the episode in progress at a fixed grid of sample counts.

Earlier episodes tend to be shorter, so the grid is denser at small sample counts.

In [None]:
# Sample-count checkpoints at which every run is queried: every 10 samples below 1k,
# every 25 in [1k, 10k), and every 100 in [10k, 100k).
xs = np.concatenate([
    np.arange(1_000, step=10),
    np.arange(1_000, 10_000, step=25),
    np.arange(10_000, 100_000, step=100),
])

# clean_1[i, j, k] is the index of the episode that contains checkpoint xs[k] for
# run i, seed j: the first index e with xs[k] <= aux_1[i, j, e].
clean_1 = np.zeros((data_1.shape[0], n_seeds, len(xs)), dtype=int)
for i in range(clean_1.shape[0]):
    for j in range(clean_1.shape[1]):
        n_ep = len_1[i, j]
        # aux_1 is a running sum of episode lengths, so its valid prefix is sorted
        # (assumes episode lengths are non-negative) and can be binary-searched.
        idx = np.searchsorted(aux_1[i, j, :n_ep], xs, side="left")
        # A checkpoint beyond the run's total sample count has no containing episode;
        # use the last recorded episode rather than falling back to episode 0.
        clean_1[i, j] = np.minimum(idx, max(n_ep - 1, 0))

Now we can plot.

In [None]:
plt.style.use('ggplot')
_, ax = plt.subplots(figsize=(6,5))

# Per-run legend label, line style and colour, indexed by run number.
label_arr = ['pda', 'ppo', 'ddpg']
lss_arr = ['solid', 'dashed', 'dotted']
color_arr = ['red', 'green', 'purple']
# Averaging kernel with 10 equal weights (the name is historical).
ones_5 = 0.1*np.ones(10)

n_points = clean_1.shape[-1]
for i, (run_rewards, run_idx) in enumerate(zip(data_1, clean_1)):
    # One row per seed: the reward at each checkpoint, smoothed with a trailing
    # window. The full convolution is cut back to n_points, so the first few
    # entries average over fewer real values.
    per_seed = np.stack([
        np.convolve(run_rewards[j, run_idx[j]], ones_5)[:n_points]
        for j in range(n_seeds)
    ])
    mean_cost = -np.mean(per_seed, axis=0)
    # NOTE(review): the original comment attributes 2.28 to a two-sided t-score at
    # p=0.05; the band is mean +/- 2.28 * std across seeds (std is not divided by sqrt(n)).
    band = 2.28 * np.std(per_seed, axis=0)
    ax.plot(xs, mean_cost, label=label_arr[i], linestyle=lss_arr[i], color=color_arr[i])
    ax.fill_between(xs, mean_cost - band, mean_cost + band, color=color_arr[i], alpha=0.1)

ax.legend(loc="lower left")
ax.set(
    title="Costs in InvertedPendulum over 10 trials",
    ylabel="Cumulative discounted cost\n smoothed over %d periods" % len(ones_5),
    xlabel="Samples",
    xlim=(-5_000, 100_000),
    ylim=(-175, 50)
)

plt.tight_layout()
plt.savefig("invpend_v2.png", dpi=90)

Why does PPO have such a large variance? Let's plot every seed.

In [None]:
plt.style.use('ggplot')
_, ax = plt.subplots(figsize=(6,4))

label_arr = ['pda', 'ppo', 'ddpg']

# Run to inspect: index 1 is 'ppo' in label_arr.
i = 1
for j in range(n_seeds):
    # Plot only the valid prefix of the zero-padded buffers, one line per seed.
    l = len_1[i,j]
    _xs = aux_1[i,j,:l]
    ys = data_1[i,j,:l]
    ax.plot(_xs, -ys, label=label_arr[i])

# No legend: every line carries the same label.
ax.set(
    title="Costs in InvertedPendulum\nfor %s (no bucket)" % label_arr[i],
    # The curves are raw negated episode rewards; no smoothing is applied in this cell.
    ylabel="Cumulative discounted cost (unsmoothed)",
    xlabel="Samples",
    xlim=(-5_000, 100_000),
)

### LQR

In [None]:
plt.style.use('ggplot')
_, ax = plt.subplots(figsize=(8,6))

# Per-run legend label, line style and colour, indexed by run number.
label_arr = ['pda', 'ppo', 'ddpg']
lss_arr = ['solid', 'dashed', 'dotted']
color_arr = ['red', 'green', 'purple']

for i in range(len(data_2)):
    # Restrict to the episodes that every seed of this run has recorded.
    n_common = len_2[i].min()
    seed_rewards = data_2[i, :, :n_common]
    # NOTE(review): both axes take the max over seeds, so the curve is the negated
    # per-episode maximum reward, not a mean -- confirm this is intended.
    sample_axis = aux_2[i, :, :n_common].max(axis=0)
    cost = -seed_rewards.max(axis=0)
    spread = seed_rewards.std(axis=0)
    ax.plot(sample_axis, cost, label=label_arr[i], linestyle=lss_arr[i], color=color_arr[i])
    # NOTE(review): the band is asymmetric (0.25 * std below, 1.96 * std above).
    ax.fill_between(sample_axis, cost - 0.25*spread, cost + 1.96*spread, color=color_arr[i], alpha=0.1)

ax.legend()
# NOTE(review): the y-label says "Smoothed", but no smoothing is applied in this cell.
ax.set(
    title="Convergence of LQR",
    ylabel="Smoothed cumulative reward",
    xlabel="Samples",
    ylim=(1e1,1e9),
    yscale="log",
)

Let's now do the bucketed version. As for InvertedPendulum, we first map a fixed grid of sample counts to the episode that contains each of them.

In [None]:
# Sample-count checkpoints: every 1_000 samples in [0, 500_000).
xs = np.arange(500_000, step=1_000)

# clean_2[i, j, k] is the index of the episode that contains checkpoint xs[k] for
# run i, seed j: the first index e with xs[k] <= aux_2[i, j, e].
clean_2 = np.zeros((data_2.shape[0], n_seeds, len(xs)), dtype=int)
for i in range(clean_2.shape[0]):
    for j in range(clean_2.shape[1]):
        n_ep = len_2[i, j]
        # aux_2 is a running sum of episode lengths, so its valid prefix is sorted
        # (assumes episode lengths are non-negative) and can be binary-searched.
        idx = np.searchsorted(aux_2[i, j, :n_ep], xs, side="left")
        # A checkpoint beyond the run's total sample count has no containing episode;
        # use the last recorded episode rather than falling back to episode 0.
        clean_2[i, j] = np.minimum(idx, max(n_ep - 1, 0))

Now we plot.

In [None]:
plt.style.use('ggplot')
_, ax = plt.subplots(figsize=(6,5))

# Per-run legend label, line style and colour, indexed by run number.
label_arr = ['pda', 'ppo', 'ddpg']
lss_arr = ['solid', 'dashed', 'dotted']
color_arr = ['red', 'green', 'purple']
# Averaging kernel with 5 equal weights.
ones_5 = 0.2*np.ones(5)

n_points = clean_2.shape[-1]
for i, (run_rewards, run_idx) in enumerate(zip(data_2, clean_2)):
    # One row per seed: the reward at each checkpoint, smoothed with a trailing
    # window. The full convolution is cut back to n_points, so the first few
    # entries average over fewer real values.
    per_seed = np.stack([
        np.convolve(run_rewards[j, run_idx[j, :]], ones_5)[:n_points]
        for j in range(n_seeds)
    ])
    mean_cost = -np.mean(per_seed, axis=0)
    # NOTE(review): the original comment attributes 2.28 to a two-sided t-score at
    # p=0.05; the band is mean +/- 2.28 * std across seeds (std is not divided by sqrt(n)).
    band = 2.28 * np.std(per_seed, axis=0)
    ax.plot(xs, mean_cost, label=label_arr[i], linestyle=lss_arr[i], color=color_arr[i])
    ax.fill_between(xs, mean_cost - band, mean_cost + band, color=color_arr[i], alpha=0.1)

# Empty line that only adds a legend entry for the third algorithm, which has no data here.
ax.plot([],[], linestyle=lss_arr[2], color=color_arr[2], label="ddpg (not shown)")

ax.legend(loc="upper left")
ax.set(
    title="Costs in LQR over 10 trials",
    ylabel="Cumulative discounted cost\n smoothed over %d periods" % len(ones_5),
    xlabel="Samples",
    xlim=(-10_000, 500_000),
    ylim=(5e5,1e8),
    yscale="log",
)

plt.tight_layout()
plt.savefig("lqr_v2.png", dpi=90)