In [None]:
from algorithms import td_prediction
from env import get_random_walk_env
from policy import get_equiprobable_policy
import tqdm
import numpy as np
import matplotlib.pyplot as plt

### Random Walk: n-step TD prediction (RL2e, Example 7.1 / Figure 7.2)

In [None]:
NUM_STATES = 21
env = get_random_walk_env(num_states=NUM_STATES)
policy = get_equiprobable_policy(env)

# Roll out ten walks under `policy`, recording each one as a list of
# (state, action, reward) tuples in the order the steps were taken.
episodes = []
for _ in range(10):
    trajectory = []
    state, _ = env.reset()
    terminated = truncated = False
    # env.step reports two stop flags; the walk ends as soon as either is set.
    while not (terminated or truncated):
        action = policy(state)
        successor, reward, terminated, truncated, _ = env.step(action)
        trajectory.append((state, action, reward))
        state = successor
    episodes.append(trajectory)


In [None]:
def rmse(v, v_hat):
    """Return the root-mean-square error between two value estimates.

    Parameters
    ----------
    v, v_hat : array_like
        Values to compare. Anything ``np.asarray`` accepts works (ndarrays,
        lists, tuples); the two inputs must have broadcast-compatible shapes.

    Returns
    -------
    numpy floating scalar
        ``sqrt(mean((v - v_hat) ** 2))`` taken over all elements.
    """
    # Coerce first so plain Python sequences can be subtracted element-wise.
    diff = np.asarray(v) - np.asarray(v_hat)
    return (diff ** 2).mean() ** 0.5

# True values of the NUM_STATES - 2 interior states under the equiprobable
# policy, derived using the Bellman equation: they rise linearly from just above
# -1 to just below +1. Written in terms of NUM_STATES so the vector always has
# the same length as the V[1:NUM_STATES-1] slice it is compared with; for
# NUM_STATES = 21 this is exactly -0.9, -0.8, ..., 0.9.
# NOTE(review): assumes the walk pays -1 at the left end, +1 at the right end and
# is undiscounted — confirm against get_random_walk_env.
true_V = (2 * np.arange(1, NUM_STATES - 1) - (NUM_STATES - 1)) / (NUM_STATES - 1)

In [None]:
N_STEP_VALUES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
# linspace pins the grid to exactly 101 evenly spaced step sizes in [0, 1];
# arange with a fractional step leaves the length to floating-point rounding.
ALPHAS = np.linspace(0, 1, 101)
NUM_EXPERIMENTS = 100
EPISODES_PER_EXPERIMENT = 10  # the "first 10 episodes" named in the y-axis label


def sample_episodes(env, policy, num_episodes):
    """Roll out `num_episodes` episodes of `policy` in `env`.

    Returns a list of episodes; each episode is a list of
    (state, action, reward) tuples in the order the steps were taken.
    """
    sampled = []
    for _ in range(num_episodes):
        episode = []
        state, _ = env.reset()
        done = truncated = False
        while not (done or truncated):
            action = policy(state)
            next_state, reward, done, truncated, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state
        sampled.append(episode)
    return sampled


# Draw one independent set of walks per repetition up front and reuse the same
# sets for every (n, alpha) pair, so parameter settings are compared on equal
# data. Feeding a single fixed episode list to every repetition would make all
# NUM_EXPERIMENTS runs identical (assuming td_prediction is deterministic given
# its episodes — TODO confirm), and the average would carry no information.
episode_sets = [
    sample_episodes(env, policy, EPISODES_PER_EXPERIMENT)
    for _ in range(NUM_EXPERIMENTS)
]

num_env_states = env.observation_space.n

fig, ax = plt.subplots(figsize=(20, 20))
for n in tqdm.tqdm(N_STEP_VALUES):
    v_over_time = np.zeros((len(ALPHAS), NUM_EXPERIMENTS, num_env_states))
    rmse_per_experiment = np.zeros((len(ALPHAS), NUM_EXPERIMENTS))
    for i, alpha in enumerate(ALPHAS):
        for j, experiment_episodes in enumerate(episode_sets):
            # td_prediction hands back a dict keyed by state index; scatter it
            # into a dense array (states missing from the dict stay at 0).
            V = td_prediction(env, 1, experiment_episodes, alpha, n=n)
            V_np = np.zeros(num_env_states)
            for k, v in V.items():
                V_np[k] = v
            v_over_time[i, j] = V_np
            # Score only indices 1 .. NUM_STATES-2, the states true_V covers.
            rmse_per_experiment[i, j] = rmse(V_np[1:NUM_STATES-1], true_V)
    # Average over repetitions: one curve point per alpha.
    average_rmse = rmse_per_experiment.mean(axis=1)
    ax.plot(ALPHAS, average_rmse, label=f'n={n}')

ax.set_title('n-step TD prediction on the random walk')
ax.set_xlabel('$\\alpha$')
ax.set_ylabel('Average RMS error over 19 states and first 10 episodes')

ax.legend()
plt.show()