# Test All Control Methods
This notebook demonstrates example usage of the implemented control methods:

1. MC-GLIE
2. SARSA Zero
3. SARSA Lambda
4. Q-learning

We use the grid world environment as the example problem.

In [1]:
import sys
sys.path.insert(0,'../')
from myenvs import gridworld
from solvers import mc
import numpy as np
from collections import defaultdict
import pdb
# Fixed seed so that Restart & Run All reproduces the same random draws.
# Calling np.random.seed() without an argument reseeds from fresh entropy on
# every run, which makes the results below impossible to reproduce.
# NOTE(review): this only seeds NumPy's global generator; confirm the solvers
# do not draw from another random source.
SEED = 0
np.random.seed(SEED)

# Single 4x4 grid world instance shared by every cell in this notebook.
env = gridworld.DetermGridWorld(size=[4,4],plotgrid=False)
env.reset()
def optimal_policy(state):
    """
    Return the hard-coded action stored for ``state``.

    The lookup table holds 16 action indices, one per state index.
    (Presumably this matches the 16 cells of the 4x4 grid; the cell
    ordering is defined by the environment - TODO confirm.)
    An index outside the table raises ``IndexError``.
    """
    action_by_state = [
        0, 3, 3, 0,
        1, 1, 0, 0,
        1, 0, 0, 0,
        1, 2, 2, 0,
    ]
    return action_by_state[state]
def random_policy(state):
    """
    Sample one action index from ``0 .. env.nA - 1`` with NumPy's global RNG.

    ``state`` is accepted so the signature matches ``optimal_policy`` but it
    is not used. The action count is read from the module-level ``env``.
    """
    candidate_actions = np.arange(0, env.nA)
    # Draw a length-1 sample and unwrap it to a scalar
    sampled = np.random.choice(candidate_actions, 1)
    return sampled[0]

In [2]:
# Plotting setup and helpers for this notebook: the helpers below draw the
# policy obtained by acting greedily w.r.t. Q(s,a) and the per-episode
# traces stored on a solver.
from solvers import utils
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rc
# Typeset all figure text with LaTeX.
# NOTE(review): this needs a working LaTeX installation on the machine that
# runs the notebook - confirm before sharing.
rc('text', usetex=True)
import seaborn as sns

# Optional override of the figure resolution, left disabled.
#mpl.rcParams['figure.dpi']=300
# Ask the inline backend to emit figures as SVG.
%config InlineBackend.figure_format = 'svg'

def plot_optimal_policy(Qsa,env):
    """
    Draw the greedy policy implied by Q(s,a) on top of the grid world plot.

    For every state index in ``range(env.observation_space.n)`` the action is
    obtained from ``utils.greedy_policy(env, Qsa, state)``. The environment is
    drawn with ``env._plotenv()`` and, for each entry of ``env.flatgrid``, the
    matching ``env.actions`` label is written at the entry's first two
    coordinates.

    Returns the list of greedy actions, indexed by state.
    """
    action_labels = env.actions
    n_states = env.observation_space.n
    greedy_actions = [utils.greedy_policy(env, Qsa, s) for s in range(n_states)]

    axes = plt.subplot(111)
    env._plotenv()

    # Annotate each grid cell with the label of its greedy action
    for idx, cell in enumerate(env.flatgrid):
        label = action_labels[greedy_actions[idx]]
        axes.text(cell[0], cell[1], label)

    plt.title(r'$\pi^*$')
    plt.tight_layout()

    return greedy_actions

def plot_learning_trace(opc):
    """
    Plot a solver's per-episode learning trace as a 1x3 figure.

    Panels, left to right: episode length, episode reward and epsilon, each
    drawn against the episode index ``np.arange(opc.num_episodes)``.

    Parameters
    ----------
    opc : object
        Solver exposing ``num_episodes`` and a ``trace`` object with
        ``lengths``, ``rewards`` and ``epsilon`` attributes. Each of these is
        plotted against the episode index, so it must hold one value per
        episode.
    """
    # Select the seaborn context before any axes are created. It used to be
    # set after drawing, so it only influenced figures created later.
    sns.set_context('paper')

    episodes = np.arange(opc.num_episodes)
    panels = [
        (opc.trace.lengths, 'Episode length'),
        (opc.trace.rewards, 'Episode Rewards'),
        (opc.trace.epsilon, r'$\epsilon$'),
    ]

    fig = plt.figure(figsize=(12,4))
    for position, (series, ylabel) in enumerate(panels, start=1):
        ax = plt.subplot(1,3,position)
        ax.plot(episodes, series)
        ax.set_ylabel(ylabel)
        # plt.locator_params only acts on the current axes, which limited the
        # tick count on the last panel alone; configure every panel instead.
        ax.locator_params(axis='y', nbins=5)
        ax.locator_params(axis='x', nbins=5)

    # Single x-axis caption shared by the three panels
    fig.text(0.5, 0.04, 'Number of Episodes', ha='center')
    fig.subplots_adjust(wspace = 0.35)
    sns.despine(fig=fig)

In [3]:
# MC-GLIE
# Configure the GLIE solver from solvers.mc on the shared grid world with
# num_episodes=1000, use the value returned by solve() as Q(s,a), then plot
# the greedy policy and the solver's learning trace.
# NOTE(review): the output saved with this cell is
# "ValueError: attempt to get argmax of an empty sequence". The failing call
# cannot be identified from this notebook alone - check the solver and
# utils.greedy_policy before relying on this cell.
from solvers import mc

glie = mc.GLIE(env, num_episodes=1000)
Qsa = glie.solve()
op_mcglie = plot_optimal_policy(Qsa,env)
plot_learning_trace(glie)

ValueError: attempt to get argmax of an empty sequence

In [None]:
# SARSA Zero
# The solver instance gets its own name: binding it to `sarsa` would replace
# the imported module in the notebook namespace, so any later `sarsa.SARSA`
# call would only work after importing the module again.
from solvers import sarsa
sarsa_zero = sarsa.SARSA(env,num_episodes=10000, verbose = True)
# The value returned by Zero() is used as Q(s,a) for the policy plot
Qsa_SarsaZero = sarsa_zero.Zero()
op_sarsazero = plot_optimal_policy(Qsa_SarsaZero,env)
plot_learning_trace(sarsa_zero)

In [None]:
# SARSA Lambda
# The solver instance gets its own name: binding it to `sarsa` would replace
# the imported module in the notebook namespace, so any later `sarsa.SARSA`
# call would only work after importing the module again.
from solvers import sarsa
sarsa_lambda = sarsa.SARSA(env,num_episodes=1000, verbose = True)
# The value returned by Lambda() is used as Q(s,a) for the policy plot
Qsa_SarsaLambda = sarsa_lambda.Lambda()
op_sarsalambda = plot_optimal_policy(Qsa_SarsaLambda,env)
plot_learning_trace(sarsa_lambda)

In [None]:
# Q-learning (tabular)
# Configure the Qlearning solver from solvers.qlearning with
# num_episodes=1000, use the value returned by tabular() as Q(s,a), then plot
# the greedy policy and the solver's learning trace.
# op_qltabular (the per-state greedy actions) is reused by the rollout cell
# further down.
from solvers import qlearning
ql = qlearning.Qlearning(env,num_episodes=1000, verbose = True)
Qsa_Qltabular = ql.tabular()
op_qltabular = plot_optimal_policy(Qsa_Qltabular,env)
plot_learning_trace(ql)

In [None]:
# Roll out the greedy Q-learning policy from a fresh episode and render
# every step on top of the grid plot.
env.reset()
is_done = False
ax = plt.subplot(111)
env._plotenv()

# A learned greedy policy is not guaranteed to reach a terminal state: if it
# contains a cycle, an uncapped loop would never finish. Bound the rollout
# by a generous multiple of the number of states.
max_steps = 10 * env.observation_space.n
steps_taken = 0
while not is_done and steps_taken < max_steps:
    action = op_qltabular[env.s]  # greedy action for the current state
    sp,r,is_done = env.step(action) # take the action
    env.render()
    steps_taken += 1

if not is_done:
    print('Rollout stopped after %d steps without reaching a terminal state; '
          'the greedy policy probably contains a loop.' % max_steps)
plt.show()

In [None]:
def plot_alpha_episodetrace(opc, ax):
    """
    Add one solver's episode-length curve to an existing axes.

    Parameters
    ----------
    opc : object
        Solver exposing ``num_episodes`` and ``trace.lengths``; the lengths
        are plotted against ``np.arange(opc.num_episodes)``.
    ax : matplotlib axes
        Axes that receives the curve, the axis labels and the styling.
    """
    ax.plot(np.arange(opc.num_episodes),opc.trace.lengths)
    ax.set_ylabel('Episode length')
    ax.set_xlabel('Episodes')

    # Style the axes that was passed in rather than whatever pyplot currently
    # treats as the active axes/figure; the two can differ when the caller
    # manages several figures.
    ax.figure.subplots_adjust(wspace = 0.35)
    ax.locator_params(axis='y', nbins=5)
    ax.locator_params(axis='x', nbins=5)
    sns.despine(ax=ax)
    sns.set_context('paper')
    
# Sweep the `alpha` argument of the Q-learning solver over five evenly spaced
# values and overlay the resulting episode-length curves on one axes.
alpha = np.linspace(0.1,1.0,5)
fig = plt.figure(figsize=(4,4))
ax = plt.subplot(1,1,1)
# Iterate the array directly: np.nditer hands out 0-d array views, whereas
# plain iteration yields scalar values.
for a in alpha:
    ql = qlearning.Qlearning(env,num_episodes=100, verbose = False, alpha = a)
    qsa = ql.tabular()
    plot_alpha_episodetrace(ql, ax)
    # Label the curve that was just added so the runs can be told apart
    ax.get_lines()[-1].set_label(r'$\alpha = %.3g$' % a)
ax.legend()
plt.show()