In [None]:
from env import get_random_walk_env, get_four_rooms_env
from policy import equiprobable_policy, greedy_epsilon_policy
from algo import td0, mc_prediction, generate_episode, on_policy_mc_control_fv, off_policy_mc_prediction, on_policy_mc_Q_prediction
from utils import load_or_generate_data
from visualize import plot_policy_on_grid_world, plot_grid_world_value_function

import matplotlib.pyplot as plt
import numpy as np
from tqdm import trange
import os


# Notebook-wide display settings: compact, non-scientific array printing
# and the bright seaborn-derived matplotlib theme.
np.set_printoptions(
    precision=3,
    suppress=True,
    threshold=1000,
    linewidth=1000,
)
plt.style.use('seaborn-v0_8-bright')

In [None]:
# Output locations for saved figures and cached experiment data.
FIG_BASE_DIR = 'figures/'
CACHE_BASE_DIR = 'cache/'

# exist_ok=True makes this cell idempotent without a separate existence
# check, which avoids the check-then-create race. If a path exists but is a
# regular file, makedirs raises immediately instead of failing later on save.
for _output_dir in (FIG_BASE_DIR, CACHE_BASE_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Random-walk environment and the starting value estimates used by the
# prediction experiments below.
env = get_random_walk_env()

# One entry per observation; every entry except the first and last starts at
# 0.5, the two end entries stay at 0.
n_states = env.observation_space.n
initial_V = np.zeros(n_states)
initial_V[1:-1] = 0.5

In [None]:
V_T0_ESTIMATES_FIG = os.path.join(FIG_BASE_DIR, 'V_T0_estimates.png')
# Reference values 1/6 ... 5/6 for the five plotted states (A-E).
true_V = np.arange(1, 6) / 6

if not os.path.exists(V_T0_ESTIMATES_FIG):
    # No saved figure yet: run TD(0) and plot the estimates after selected episodes.
    V_T0, estimates_over_time_T0 = td0(
        env,
        equiprobable_policy(env),
        alpha=0.1,
        gamma=1,
        n_episodes=101,
        initial_V=initial_V,
        verbose=True,
    )

    episodes_of_interest = [0, 1, 10, 100]
    line_colors = ['black', 'red', 'green', 'blue']
    x = np.arange(1, 6)

    plt.figure(figsize=(10, 6))
    for ep_idx, line_color in zip(episodes_of_interest, line_colors):
        # Entries 1..5 of each snapshot are the ones plotted against true_V.
        snapshot = estimates_over_time_T0[ep_idx][1:6]
        plt.plot(x, snapshot, label=f'Episode {ep_idx}', color=line_color, marker='o', linewidth=1)
    plt.plot(x, true_V, label='True V', color='black', linestyle='--', marker='o', linewidth=1)

    plt.title('TD(0) prediction')
    plt.xlabel('State')
    plt.ylabel('Value')
    plt.xticks(x, ['A', 'B', 'C', 'D', 'E'])
    plt.legend()
    plt.savefig(V_T0_ESTIMATES_FIG)
    plt.show()
else:
    # A figure from a previous run exists: display the saved PNG instead of recomputing.
    plt.figure(figsize=(10, 6))
    saved_image = plt.imread(V_T0_ESTIMATES_FIG)
    plt.imshow(saved_image)
    plt.axis('off')
    plt.show()

In [None]:
def rmse(V1, V2):
    """
    Root mean squared error between two value functions.

    Parameters
    ----------
    V1, V2 : array-like
        Value estimates to compare. Plain lists/tuples are accepted as well
        as numpy arrays; the two inputs must be broadcastable against each
        other.

    Returns
    -------
    numpy floating scalar
        sqrt(mean((V1 - V2) ** 2)) taken over all elements.
    """
    # asarray lets callers pass plain Python sequences (list - list would
    # otherwise raise TypeError) without copying existing arrays.
    diff = np.asarray(V1) - np.asarray(V2)
    return np.sqrt(np.mean(diff ** 2))

In [None]:
MC_VS_TD0_RMSE_FIG = os.path.join(FIG_BASE_DIR, 'MC_vs_TD0_RMSE.png')

linestyles = ['-', '--', '-.', ':']
alphas_MC = [0.04, 0.03, 0.02, 0.01]
alphas_T0 = [0.15, 0.1, 0.05]
NUM_EPISODES = 101
NUM_RUNS = 100


def _mean_rmse_curve(predict, env, alpha, initial_V, true_V, num_runs=NUM_RUNS, num_episodes=NUM_EPISODES):
    """
    Average RMSE-vs-episode curve for one prediction algorithm and step size.

    Parameters
    ----------
    predict : callable
        Prediction routine (`mc_prediction` or `td0`), called as
        `predict(env, policy, alpha=..., gamma=1, n_episodes=..., initial_V=...)`;
        the second element of its return value is stored as the per-episode
        value estimates for that run.
    env : environment passed to `predict` and to `equiprobable_policy`.
    alpha : float
        Step size forwarded to `predict`.
    initial_V : array
        Starting value estimates forwarded to `predict`.
    true_V : array
        Reference values compared against entries 1..5 of every estimate.
    num_runs, num_episodes : int
        Number of independent runs and episodes per run.

    Returns
    -------
    numpy.ndarray of shape (num_episodes,)
        RMSE at each episode index, averaged over the runs.
    """
    V_over_time = np.zeros((num_runs, num_episodes, env.observation_space.n))
    for r in trange(num_runs):
        _, V_over_time[r] = predict(env, equiprobable_policy(env), alpha=alpha, gamma=1, n_episodes=num_episodes, initial_V=initial_V)
    # RMSE of entries 1..5 against true_V for every (run, episode) pair ...
    rmse_per_run = np.apply_along_axis(lambda v: rmse(v[1:6], true_V), 2, V_over_time)
    # ... then averaged across runs.
    return np.mean(rmse_per_run, axis=0)


if os.path.exists(MC_VS_TD0_RMSE_FIG):
    # load and plot png file
    plt.figure(figsize=(10, 6))
    plt.imshow(plt.imread(MC_VS_TD0_RMSE_FIG))
    plt.axis('off')
    plt.show()
else:
    plt.figure(figsize=(10, 6))
    # (legend label, prediction routine, step sizes, line colour) per method;
    # one shared loop replaces the two previously copy-pasted ones.
    experiments = [
        ('MC', mc_prediction, alphas_MC, 'red'),
        ('TD(0)', td0, alphas_T0, 'blue'),
    ]
    for method_label, predict, alphas, color in experiments:
        for linestyle, alpha in zip(linestyles, alphas):
            rmse_over_time = _mean_rmse_curve(predict, env, alpha, initial_V, true_V)
            plt.plot(rmse_over_time, label=f'{method_label} $\\alpha={alpha}$', linewidth=1, color=color, linestyle=linestyle)

    plt.xlabel('Episode')
    plt.ylabel('RMSE')
    plt.legend()
    plt.title('Empirical RMSE averaged over states for TD(0) and MC')
    plt.savefig(MC_VS_TD0_RMSE_FIG)
    plt.show()

### MC Off-policy vs On-policy Comparison

Some setup for future experiments.

1. `random_policy_10k_episodes` - a list of 10k episodes generated by a random equiprobable policy.
2. `mc_fv_Q` - Q state-action value function generated by MC first-visit on-policy control. Policy here is an epsilon-greedy policy with epsilon=0.1.
3. `mc_fv_episodes` - the episodes generated while training `mc_fv_Q`.

**Important Note**: Our discount rate is `0.99`, and our epsilon for the epsilon-greedy policy is `0.1`. With this combination, we can expect the epsilon-greedy policy to act greedily most of the time. Since the discount rate is so high and the environment is so small, the agent may get stuck on a suboptimal path (as long as it ends in a reward).


In [None]:
# Hyperparameters shared by the four-rooms experiments.
GAMMA = 0.99   # discount rate
EPSILON = 0.1  # epsilon used for the epsilon-greedy policy

# Rebinds `env`: from this cell onward it is the four-rooms environment,
# no longer the random-walk one created earlier.
env = get_four_rooms_env()


In [None]:
# Cache files for the two data sets generated in this cell.
RANDOM_POLICY_10k_EPISODES_PATH = os.path.join(
    CACHE_BASE_DIR,
    'random_policy_10k_episodes.pkl',
)
ON_POLICY_MC_Q_AND_EPISODES_PATH = os.path.join(
    CACHE_BASE_DIR,
    'on_policy_mc_q_and_episodes.pkl',
)


def run_10k_episodes_rand_policy(env):
    """
    Generate 10,000 episodes in `env` under the equiprobable policy.

    The policy is built once via `equiprobable_policy(env)` and reused for
    every episode. Returns the list of results of
    `generate_episode(env, policy)`, in generation order.
    """
    policy = equiprobable_policy(env)
    return [generate_episode(env, policy) for _ in range(10_000)]


# Episodes from the equiprobable policy, passed through load_or_generate_data
# together with the cache path.
random_policy_10k_episodes = load_or_generate_data(
    RANDOM_POLICY_10k_EPISODES_PATH,
    run_10k_episodes_rand_policy,
    env,
)

# On-policy first-visit MC control with an epsilon-greedy policy. The
# extractor keeps elements 2 and 3 of the control routine's result, converting
# element 2 to a plain dict; they are unpacked as (Q, episodes).
mc_fv_Q, mc_fv_episodes = load_or_generate_data(
    ON_POLICY_MC_Q_AND_EPISODES_PATH,
    on_policy_mc_control_fv,
    env,
    lambda Q: greedy_epsilon_policy(env, Q, EPSILON),
    gamma=GAMMA,
    n_episodes=10_000,
    verbose=True,
    extractor=lambda x: (dict(x[2]), x[3]),
)

`pi_greedy` is what I will call the greedy policy derived from `mc_fv_Q`. For states that were not visited, we will assume that the policy is equiprobable.

In [None]:
# Same policy constructor as used for training, but with 0 in the position
# where training passed EPSILON — presumably yielding a purely greedy policy
# w.r.t. mc_fv_Q (confirm against greedy_epsilon_policy).
pi_greedy = greedy_epsilon_policy(env, mc_fv_Q, 0)

### Plotting the greedy policy derived from MC first-visit on-policy control

In [None]:
# Draw the policy implied by mc_fv_Q on the grid and save it under figures/.
MC_ON_POLICY_MC_CONTROL_FIG = os.path.join(FIG_BASE_DIR, 'on_policy_mc_control.png')
plot_policy_on_grid_world(
    env,
    mc_fv_Q,
    title='$\\pi_{Greedy}$ from on-policy MC control (10k episodes)',
    save_path=MC_ON_POLICY_MC_CONTROL_FIG,
)

### Off Policy MC Prediction

Using `mc_fv_episodes` and `random_policy_10k_episodes`, we will compute two new $Q_\pi$ estimates. One with $\pi_{greedy}$ and one with $\pi_{random}$.

In [None]:
# signature -> off_policy_mc_prediction(env: Env, gamma: float, n_episodes: int, pre_generated_eps: List, verbose: bool = False):
# NOTE(review): the calls below pass the episode list as the third positional
# argument, which the signature note above names `n_episodes`. Either the note
# is stale or the calls are missing an argument — confirm against the `algo` module.
RANDOM_OFF_POLICY_ESTIMATES_PATH = os.path.join(CACHE_BASE_DIR, 'random_off_policy_mc_estimates.pkl')
SOFT_EPSILON_GREEDY_OFF_POLICY_ESTIMATES_PATH = os.path.join(CACHE_BASE_DIR, 'soft_epsilon_greedy_off_policy_mc_estimates.pkl')

# Cached variants (currently disabled). The two *_PATH constants above are only
# referenced from these commented-out calls, so the estimates are recomputed on every run.
# rand_off_policy_mc_Q, rand_off_policy_mc_V = load_or_generate_data(RANDOM_OFF_POLICY_ESTIMATES_PATH, off_policy_mc_prediction, env, GAMMA, random_policy_10k_episodes, verbose=True)
# greedy_off_policy_mc_Q, greedy_off_policy_mc_V = load_or_generate_data(SOFT_EPSILON_GREEDY_OFF_POLICY_ESTIMATES_PATH, off_policy_mc_prediction, env, GAMMA,  mc_fv_episodes, verbose=True)
# Off-policy estimates from the random-policy episodes ...
rand_off_policy_mc_Q, rand_off_policy_mc_V = off_policy_mc_prediction(env, GAMMA, random_policy_10k_episodes, verbose=True)
# ... and from the episodes collected while training mc_fv_Q.
greedy_off_policy_mc_Q, greedy_off_policy_mc_V = off_policy_mc_prediction(env, GAMMA, mc_fv_episodes, verbose=True)

In [None]:
def convert_V_dict_to_array_grid_world(env, V_dict):
    """
    Lay out a state -> value mapping as a 2-D array oriented like the grid world.

    Parameters
    ----------
    env : environment whose `unwrapped` object exposes integer `cols` and `rows`.
    V_dict : mapping keyed by `(x, y)` tuples. Each value may be a scalar or an
        array-like; its maximum is stored. Missing states default to 0.

    Returns
    -------
    numpy.ndarray of shape (rows, cols)
        Row 0 corresponds to the largest y (the y-axis is flipped), column
        index is x.
    """
    n_cols = env.unwrapped.cols
    n_rows = env.unwrapped.rows
    # The array is indexed [y, x], so it must be (rows, cols). Allocating it as
    # (cols, rows) only works for square grids and raises IndexError otherwise.
    V_map = np.zeros((n_rows, n_cols))
    for x in range(n_cols):
        for y in range(n_rows):
            V_map[y, x] = np.max(V_dict.get((x, y), 0))
    flip_V_map = np.flipud(V_map) # flip y-axis to match grid world
    return flip_V_map

In [None]:
# Heat-map of the off-policy V estimate from the random-policy episodes,
# followed by the same values as a raw array (the cell's displayed output).
RANDOM_OFF_POLICY_Q_FIG = os.path.join(FIG_BASE_DIR, 'random_off_policy_mc_V.png')
plot_grid_world_value_function(
    env,
    rand_off_policy_mc_V,
    title='Off-Policy $V_\\pi$ (Random Behavior Policy)',
    save_path=RANDOM_OFF_POLICY_Q_FIG,
)
convert_V_dict_to_array_grid_world(env, rand_off_policy_mc_V)

In [None]:
# Heat-map of the off-policy V estimate computed from the episodes gathered
# while training mc_fv_Q, followed by the same values as a raw array.
SOFT_EPSILON_GREEDY_OFF_POLICY_Q_FIG = os.path.join(FIG_BASE_DIR, 'soft_epsilon_greedy_off_policy_mc_V.png')
# Title typo fixed ("Epsiodes" -> "Episodes"); it is rendered into the saved figure.
plot_grid_world_value_function(env, greedy_off_policy_mc_V, title='Off-Policy $V_\\pi$ (Episodes to Train $\\pi_{greedy}$ Policy)', save_path=SOFT_EPSILON_GREEDY_OFF_POLICY_Q_FIG)
convert_V_dict_to_array_grid_world(env, greedy_off_policy_mc_V)

### On Policy MC Prediction (using $\pi_{greedy}$)

In [None]:
# Evaluate pi_greedy on-policy, then collapse each state's action values to
# their maximum to obtain a state-value mapping.
on_policy_Q = on_policy_mc_Q_prediction(
    env,
    pi_greedy,
    gamma=GAMMA,
    n_episodes=10_000,
    verbose=True,
)
on_policy_V = {s: np.max(q_values) for s, q_values in on_policy_Q.items()}

In [None]:
# Heat-map of the on-policy V estimate for pi_greedy, followed by the same
# values as a raw array.
ON_POLICY_GREEDY_PI_ESTIMATES_FIG = os.path.join(FIG_BASE_DIR, 'on_policy_greedy_V_pi_estimates.png')
# Save to this cell's own path. The previous save_path was the off-policy
# figure's constant, which silently overwrote that PNG and never wrote this one.
plot_grid_world_value_function(env, on_policy_V, title='On-Policy $V_\\pi$ ($\\pi_{greedy}$ Policy)', save_path=ON_POLICY_GREEDY_PI_ESTIMATES_FIG)
convert_V_dict_to_array_grid_world(env, on_policy_V)