In [1]:
import numpy as np
import torch 
import torch.nn as nn 
from torch.distributions import Categorical
import matplotlib.pyplot as plt 
from matplotlib import rcParams
rcParams['font.size'] = 24
rcParams['figure.figsize'] = (16, 8)
from tqdm import tqdm 

import importlib 
import ipywidgets
from ipywidgets import interact
from IPython.display import Image
import IPython

from rllib.dataset.datatypes import Observation
from rllib.util.utilities import get_entropy_and_log_p

from rllib.util.training.agent_training import train_agent
from rllib.environment import GymEnvironment
from rllib.environment.mdps import EasyGridWorld
from rllib.policy import TabularPolicy
from rllib.value_function import TabularQFunction, TabularValueFunction

import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'numpy'

In [2]:
def extract_policy(q_function):
    """Build the greedy tabular policy of a tabular Q-function.

    For every state the action with the largest Q-value is written into a
    freshly created ``TabularPolicy`` via ``set_value``.

    Parameters
    ----------
    q_function:
        Object exposing ``num_states`` and ``num_actions`` that, when called
        with a scalar long state tensor, returns the Q-values of that state.

    Returns
    -------
    TabularPolicy
        The policy holding the arg-max action of every state.
    """
    greedy_policy = TabularPolicy(
        num_states=q_function.num_states,
        num_actions=q_function.num_actions,
    )
    for state_index in range(greedy_policy.num_states):
        state_tensor = torch.tensor(state_index).long()
        best_action = torch.argmax(q_function(state_tensor))
        greedy_policy.set_value(state_index, best_action)

    return greedy_policy

def integrate_q(q_function, policy):
    """Turn a Q-function into a value function under ``policy``.

    Computes V(s) = sum_a pi(a|s) * Q(s, a) for every state, where the
    output of ``policy(state)`` is interpreted as action logits.

    Parameters
    ----------
    q_function:
        Object exposing ``num_states``; called as ``q_function(state, action)``
        with scalar long tensors.
    policy:
        Object exposing ``num_states`` and ``num_actions``; called with a
        scalar long state tensor.

    Returns
    -------
    TabularValueFunction
        Value function holding the policy-weighted Q-value of each state.
    """
    value_function = TabularValueFunction(num_states=q_function.num_states)
    for state_index in range(policy.num_states):
        state = torch.tensor(state_index).long()
        action_dist = Categorical(logits=policy(state))
        # Expectation of Q(state, .) under the policy's action distribution.
        expected_q = sum(
            action_dist.probs[action] * q_function(state, torch.tensor(action).long())
            for action in range(policy.num_actions)
        )
        value_function.set_value(state, expected_q)

    return value_function


# Grid world shared (as a module-level global) by the tabular demos below.
environment = EasyGridWorld()
# A bare expression is only rendered when it is the last statement of a cell,
# so the illustration has to be displayed explicitly.
IPython.display.display(Image("images/grid_world.png"))

# Plotters
def plot_value_function(value_function, ax):
    """Draw a 2-D value table as a heat map with each cell's value on top.

    NOTE: another ``plot_value_function`` is defined further down in this
    cell and replaces this one once the cell has run.

    Parameters
    ----------
    value_function:
        2-D array-like exposing ``shape`` and ``[row, col]`` indexing.
    ax:
        Axes-like object providing ``imshow`` and ``text``.
    """
    ax.imshow(value_function, vmin=-1, vmax=25)
    n_rows, n_cols = value_function.shape
    cells = ((r, c) for r in range(n_rows) for c in range(n_cols))
    for r, c in cells:
        # Text coordinates are (x, y), i.e. (column, row).
        label = f"{value_function[r, c]:.1f}"
        ax.text(c, r, label, ha="center", va="center", color="w")

def policy2str(policy):
    """Return the arrow glyph for an action index.

    Action 0 maps to a down arrow, 1 to up, 2 to right and 3 to left; any
    other value yields an empty string.
    """
    arrow_by_action = (
        (0, u'\u2193'),  # down
        (1, u'\u2191'),  # up
        (2, u'\u2192'),  # right
        (3, u'\u2190'),  # left
    )
    # Every code is compared with ``==`` so that numpy integers match as well.
    return "".join(arrow for code, arrow in arrow_by_action if code == policy)

def plot_value_function(value_function, ax):
    """Draw a 2-D value table as a heat map with each cell's value on top.

    Parameters
    ----------
    value_function:
        2-D array-like exposing ``shape`` and ``[row, col]`` indexing. It does
        not have to be square.
    ax:
        Axes-like object providing ``imshow`` and ``text``.
    """
    ax.imshow(value_function, vmin=-4, vmax=30)
    rows, cols = value_function.shape
    for row in range(rows):
        for col in range(cols):
            # Text coordinates are (x, y) = (column, row). Indexing the table
            # with [row, col] keeps the loop bounds and the indices aligned,
            # so non-square tables no longer index out of range.
            ax.text(col, row, f"{value_function[row, col]:.1f}",
                    ha="center", va="center", color="w", fontsize=24)

def plot_policy(policy, ax):
    """Draw a grid of arrows showing the action chosen in every cell.

    Parameters
    ----------
    policy:
        2-D array-like of action indices exposing ``shape`` and
        ``[row, col]`` indexing; each entry is rendered with ``policy2str``.
    ax:
        Axes-like object providing ``imshow`` and ``text``.
    """
    rows, cols = policy.shape
    # Blank background with the same extent as the policy grid.
    ax.imshow(np.zeros((rows, cols)))
    # Iterate over the shape of the array that was passed in rather than the
    # size of the global environment, so the two can never disagree.
    for row in range(rows):
        for col in range(cols):
            ax.text(col, row, policy2str(policy[row, col]),
                    ha="center", va="center", color="r", fontsize=24)


def plot_value_and_policy(value_function, policy):
    """Show a value table and a policy grid side by side in a new figure.

    The left panel is drawn by ``plot_value_function`` and the right panel by
    ``plot_policy``.
    """
    _, (value_ax, policy_ax) = plt.subplots(ncols=2, nrows=1, figsize=(20, 8))

    plot_value_function(value_function, value_ax)
    plot_policy(policy, policy_ax)
    
class EpsGreedyPolicy(object):
    """Epsilon-greedy wrapper around a policy that outputs action logits.

    Calling the wrapper returns the logits of the mixture
    ``(1 - epsilon) * pi(.|s) + epsilon * uniform``.

    Parameters
    ----------
    policy:
        Callable mapping a state tensor to action logits; must expose
        ``num_actions``.
    epsilon:
        Probability mass spread uniformly over all actions.
    """

    def __init__(self, policy, epsilon):
        self.policy = policy
        self.epsilon = epsilon

    def __call__(self, state):
        """Return the logits of the epsilon-greedy distribution in ``state``."""
        if not isinstance(state, torch.Tensor):
            # The conversion result must be kept; it was previously computed
            # and thrown away, so raw integers reached the wrapped policy.
            state = torch.tensor(state).long()
        base_probs = Categorical(logits=self.policy(state)).probs
        mixture = base_probs * (1 - self.epsilon) + self.epsilon / self.policy.num_actions
        return Categorical(probs=mixture).logits

def init_value_function(num_states, terminal_states=None):
    """Create a tabular value function and zero out the terminal states.

    Parameters
    ----------
    num_states:
        Number of states of the table.
    terminal_states:
        Optional iterable of state indices whose value is set to 0.

    Returns
    -------
    TabularValueFunction
    """
    value_function = TabularValueFunction(num_states=num_states)
    if terminal_states is not None:
        for terminal_state in terminal_states:
            value_function.set_value(terminal_state, 0)

    return value_function


def build_mrp_matrices(environment, policy):
    """Build the transition kernel and reward of the MRP induced by `policy`.

    For every state, the environment's transitions are averaged over actions
    using the action probabilities of `policy`.

    Parameters
    ----------
    environment:
        Object exposing `num_states` and a `transitions` mapping keyed by
        `(state, action)`; each value is an iterable of dicts with the keys
        "probability", "reward" and "next_state".
    policy:
        Callable that receives a scalar long state tensor; its output is
        interpreted as action logits.

    Returns
    -------
    mrp_kernel:
        numpy array of shape (num_states, 1, num_states); entry [s, 0, s'] is
        the probability of moving from s to s' under the policy.
    mrp_reward:
        numpy array of shape (num_states, 1); entry [s, 0] is the expected
        one-step reward in s under the policy.
    """
    mrp_kernel = np.zeros((environment.num_states, 1, environment.num_states))
    mrp_reward = np.zeros((environment.num_states, 1))

    for state in range(environment.num_states):
        # The policy is queried with a scalar long tensor; the tensor is also
        # used below to index the numpy buffers.
        state = torch.tensor(state).long()
        policy_ = Categorical(logits=policy(state))

        for a, p_action in enumerate(policy_.probs):
            # `transitions` is keyed by plain Python values, hence `.item()`.
            for transition in environment.transitions[(state.item(), a)]:
                # p_action is a torch value; presumably no_grad keeps autograd
                # out of the accumulation into the numpy buffers.
                with torch.no_grad():
                    p_ns = transition["probability"]
                    mrp_reward[state, 0] += p_action * p_ns * transition["reward"]
                    mrp_kernel[state, 0, transition["next_state"]
                               ] += p_action * p_ns

    return mrp_kernel, mrp_reward

def linear_system_policy_evaluation(environment, policy, gamma, value_function=None):
    """Evaluate a policy in an MDP by solving the Bellman system of equations.

    V = r + gamma * P * V
    V = (I - gamma * P)^-1 r

    Here P and r are the kernel and reward returned by `build_mrp_matrices`
    for `policy`.

    Parameters
    ----------
    environment:
        Tabular environment exposing `num_states` (and whatever
        `build_mrp_matrices` reads from it).
    policy:
        Policy to evaluate; forwarded to `build_mrp_matrices`.
    gamma:
        Discount factor.
    value_function:
        Optional value function to fill in; a fresh one is created with
        `init_value_function` when omitted.

    Returns
    -------
    The value function, with one value written per state via `set_value`.
    """

    if value_function is None:
        value_function = init_value_function(environment.num_states)

    kernel, reward = build_mrp_matrices(environment=environment, policy=policy)

    # kernel[:, 0, :] drops the singleton middle axis -> (num_states, num_states).
    # NOTE(review): `kernel` is a numpy array while torch.eye is a tensor; the
    # `.inverse()` call below assumes the difference is a tensor - confirm.
    A = torch.eye(environment.num_states) - gamma * kernel[:, 0, :]
    # Sanity check (disabled):
    # torch.testing.assert_allclose(A.inverse() @ A, torch.eye(environment.num_states))
    vals = A.inverse() @ reward[:, 0]
    for state in range(environment.num_states):
        value_function.set_value(state, vals[state].item())

    return value_function

def plot_all(estimated_value, policy, exploration_value_function, testing_value_function):
    """Draw the four diagnostic panels of the tabular demos in two figures.

    The first figure shows the estimated value next to the policy, the second
    one the exploration value next to the testing value. Value panels are
    drawn with ``plot_value_function`` and the policy with ``plot_policy``.
    """
    # Figure 1: what the agent has learned.
    _, (estimate_ax, policy_ax) = plt.subplots(ncols=2, nrows=1, figsize=(20, 6))
    plot_value_function(estimated_value, estimate_ax)
    estimate_ax.set_title('Estimated Value')
    plot_policy(policy, policy_ax)
    policy_ax.set_title('Policy')

    # Figure 2: values of the learned policy with and without exploration.
    _, (exploration_ax, testing_ax) = plt.subplots(ncols=2, nrows=1, figsize=(20, 6))
    plot_value_function(exploration_value_function, exploration_ax)
    exploration_ax.set_title('Exploration Value')
    plot_value_function(testing_value_function, testing_ax)
    testing_ax.set_title('Testing Value')

# Tabular Q Learning 

In [3]:
def q_learning(gamma=0.9, alpha=0.5, eps=0., optimistic_init=False):
    """Interactive tabular Q-learning demo on the module-level grid world.

    Creates a tabular Q-function with all weights set to one, draws the
    current estimates, and shows two buttons that run 100 or 1000 additional
    epsilon-greedy Q-learning steps before redrawing.

    Parameters
    ----------
    gamma:
        Discount factor used in the TD target and in the exact evaluations.
    alpha:
        Learning rate of the Q-learning update.
    eps:
        Probability of taking a uniformly random action instead of the greedy one.
    optimistic_init:
        When True the all-ones initial weights are scaled by ``10 / (1 - gamma)``.
    """
    output = ipywidgets.Output()

    # Bind the environment once so the button callbacks keep using this grid
    # world even if the module-level `environment` is rebound by a later cell.
    env = environment

    q_function = TabularQFunction(
        num_states=env.num_states, num_actions=env.num_actions)
    nn.init.ones_(q_function.nn.head.weight)

    if optimistic_init:
        q_function.nn.head.weight.data = 10 / \
            (1 - gamma) * q_function.nn.head.weight.data  # Initialization

    # The agent state lives in this closure rather than in a module-level
    # global, so other demos that also track a `state` cannot overwrite it.
    env.reset()
    state = 0  # always start from state 0, whatever reset() returned
    env.state = state

    def step(num_iter):
        """Run `num_iter` Q-learning updates, then redraw."""
        nonlocal state

        for _ in range(num_iter):
            # Epsilon-greedy behaviour policy.
            if np.random.rand() < eps:
                action = np.random.choice(env.num_actions)
            else:
                action = torch.argmax(q_function(
                    torch.tensor(state).long())).item()

            q_val = q_function(torch.tensor(state).long(),
                               torch.tensor(action).long())

            next_state, reward, _done, _info = env.step(action)

            # Off-policy target: bootstrap from the best next action.
            next_q = torch.max(q_function(torch.tensor(next_state).long()))
            reward = torch.tensor(reward).double()
            td = reward + gamma * next_q - q_val

            q_function.set_value(state, action, q_val + alpha * td)

            state = next_state

        plot()

    def plot():
        """Redraw estimates, greedy policy and exact policy evaluations."""
        with output:
            output.clear_output()
            policy = extract_policy(q_function)
            value_function = integrate_q(q_function, policy)
            exploration_vf = linear_system_policy_evaluation(env, EpsGreedyPolicy(policy, eps), gamma)
            testing_vf = linear_system_policy_evaluation(env, policy, gamma)

            # The tables are reshaped to the 5x5 layout of the grid world.
            plot_all(
                value_function.table.reshape(5, 5).detach().numpy(),
                policy.table.argmax(0).reshape(5, 5).detach().numpy(),
                exploration_vf.table.reshape(5, 5).detach().numpy(),
                testing_vf.table.reshape(5, 5).detach().numpy()
            )
            plt.show()

    plot()
    button = ipywidgets.Button(description="Step 100")
    button.on_click(lambda b: step(num_iter=100))
    button2 = ipywidgets.Button(description="Step 1000")
    button2.on_click(lambda b: step(num_iter=1000))
    display(output, button, button2)
        

# Controls of the Q-learning demo. The sliders are created with
# continuous_update=False; the trailing semicolon hides the cell's repr output.
interact(
    q_learning,
    gamma=ipywidgets.FloatSlider(value=0.9, min=0., max=0.99, step=1e-2, continuous_update=False),
    alpha=ipywidgets.FloatSlider(value=0.5, min=0., max=2.0, step=1e-2, continuous_update=False),
    eps=ipywidgets.FloatSlider(value=0., min=0., max=1.0, step=1e-2, continuous_update=False),
    optimistic_init=ipywidgets.Checkbox(value=False),
);

interactive(children=(FloatSlider(value=0.9, continuous_update=False, description='gamma', max=0.99, step=0.01…

## Demo Guide:

#### some explanation
This demo runs tabular Q-learning on the grid-world example.
Q-learning is model-free: it learns the action-value function directly from sampled transitions, without estimating a model of the environment.
- gamma: the discount factor of the environment.
- alpha: the learning rate of the Q-learning update.
- eps: the parameter trading off exploration and exploitation. With probability epsilon a random action is picked, and with probability 1 - epsilon the best (greedy) action is picked.
- optimistic init: use optimistic initialization of the Q-values. For more details, see this [paper](https://papers.nips.cc/paper/2001/file/6f2688a5fce7d48c8d19762b88c32c3b-Paper.pdf).


- step 100: run 100 environment steps, updating the Q-function after each one. The first run starts in the upper-left corner; later runs continue from the current state.
- step 1000: same as above, but with 1000 steps.

- Estimated value: the value function estimated from the learned Q-function, i.e. by interacting with the world.
- Policy: the greedy policy with respect to the learned Q-function.
- Exploration value: the exact value of the epsilon-greedy version of this policy, obtained by solving the Bellman equation as a linear system.
- Testing value: the exact value of the greedy policy itself, obtained by solving the Bellman equation as a linear system.


#### play around
- Set epsilon to 0 and see how Q-learning performs. Think about why it gets stuck.
- Enable the optimistic initialization and compare it with initializing all Q-values to 1.


# Q Learning with function approximation

- Q Learning: approximate Q with a parametric function. 
- DQN: approximate Q with a parametric function and use a target network to compute the TD targets. 
See https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
- DDQN: approximate Q with a parametric function; select the maximizing action with the online network and evaluate it with the target network. See https://arxiv.org/pdf/1509.06461.pdf 

In [7]:
from rllib.policy import EpsGreedy, SoftMax
from rllib.util.parameter_decay import ExponentialDecay

def run(env_name, agent_name, exploration):
    """Train a value-based agent on a Gym environment and plot its returns.

    Parameters
    ----------
    env_name:
        Gym environment id. Episodes are capped at 200 steps when the name
        contains "CartPole" and at 1000 steps otherwise.
    agent_name:
        Prefix of the agent class; ``f"{agent_name}Agent"`` is looked up in
        ``rllib.agent`` and built with its ``default`` constructor.
    exploration:
        Either "eps-greedy" or "softmax".

    Raises
    ------
    ValueError
        If ``exploration`` is not one of the supported strategies.
    """
    max_steps = 200 if "CartPole" in env_name else 1000

    exploration_policies = {"eps-greedy": EpsGreedy, "softmax": SoftMax}
    if exploration not in exploration_policies:
        # Fail early with a clear message instead of hitting an unbound
        # `policy` variable further down.
        raise ValueError(
            f"Unknown exploration {exploration!r}; "
            f"expected one of {sorted(exploration_policies)}."
        )
    policy_cls = exploration_policies[exploration]

    environment = GymEnvironment(env_name)
    try:
        agent = getattr(
            importlib.import_module("rllib.agent"),
            f"{agent_name}Agent"
        ).default(environment)

        # Both strategies are driven by the critic and the same decaying parameter.
        policy = policy_cls(
            agent.algorithm.critic,
            ExponentialDecay(start=1.0, end=0.01, decay=500),
        )
        agent.set_policy(policy)
        try:
            train_agent(environment=environment, agent=agent, num_episodes=40,
                        max_steps=max_steps, render=True, plot_flag=False)
        except KeyboardInterrupt:
            # Interrupting the kernel stops training early; the returns
            # collected so far are still plotted below.
            pass
    finally:
        # Always release the environment, even when training fails.
        environment.close()

    plt.plot(agent.logger.get("train_return-0"), linewidth=16)
    plt.xlabel("Episode")
    plt.ylabel("Return")
    plt.show()
    

# Drop-down controls for the function-approximation demo; the trailing
# semicolon hides the cell's repr output.
interact(
    run,
    env_name=["CartPole-v0", "Acrobot-v1", "MountainCar-v0"],
    agent_name=["QLearning", "DQN", "DDQN"],
    exploration=["softmax", "eps-greedy"],
);

interactive(children=(Dropdown(description='env_name', options=('CartPole-v0', 'Acrobot-v1', 'MountainCar-v0')…

## Demo Guide:

#### some explanation
This demo runs model-free reinforcement learning on different environments with different function approximation techniques.

- env_name: CartPole, Acrobot, MountainCar. Note that Acrobot and MountainCar take longer to converge.
- agent_name: QLearning, DQN, DDQN. Different function approximation techniques.  
- alpha: the learning rate of the Q-learning update (not exposed as a control in this demo; each agent is built with its default settings).
- exploration: 
    - softmax: a soft-max policy is given by $\pi(a|s) \propto \rho(a|s) \exp[q(s, a) / \tau]$,
    where $\rho(a|s)$ is a prior policy, usually selected at random, and $\tau$ is a temperature parameter.
    - eps-greedy: with probability $\epsilon$ a random action is picked, otherwise the greedy action is picked.


#### play around
- Try the different approximation methods on the different environments.
- Compare softmax exploration with eps-greedy exploration.


# Tabular SARSA 

In [10]:
# Fresh grid world for the SARSA demo. This rebinds the module-level
# `environment` name that code defined in earlier cells also reads.
environment = EasyGridWorld()
def sarsa(gamma=0.9, alpha=0.5, eps=0., optimistic_init=False):
    """Interactive tabular SARSA demo on the module-level grid world.

    Creates a tabular Q-function with all weights set to one, draws the
    current estimates, and shows two buttons that run 100 or 1000 additional
    epsilon-greedy SARSA steps before redrawing.

    Parameters
    ----------
    gamma:
        Discount factor used in the TD target and in the exact evaluations.
    alpha:
        Learning rate of the SARSA update.
    eps:
        Probability of taking a uniformly random action instead of the greedy one.
    optimistic_init:
        When True the all-ones initial weights are scaled by ``10 / (1 - gamma)``.
    """
    output = ipywidgets.Output()

    # Bind the environment once so the button callbacks keep using this grid
    # world even if the module-level `environment` is rebound by another cell.
    env = environment

    q_function = TabularQFunction(num_states=env.num_states, num_actions=env.num_actions)
    nn.init.ones_(q_function.nn.head.weight)
    if optimistic_init:
        q_function.nn.head.weight.data = 10 / (1 - gamma) * q_function.nn.head.weight.data

    def select_action(current_state):
        """Epsilon-greedy action for `current_state` under the current Q-function."""
        if np.random.rand() < eps:
            return np.random.choice(env.num_actions)
        return torch.argmax(q_function(torch.tensor(current_state).long())).item()

    # State and action live in this closure rather than in module-level
    # globals, so other demos that also track a `state` cannot overwrite them.
    # NOTE(review): unlike the Q-learning demo, the start state is whatever
    # reset() returns and is not forced to 0 - confirm this is intended.
    state = env.reset()
    action = select_action(state)

    def step(num_iter):
        """Run `num_iter` SARSA updates, then redraw."""
        nonlocal state, action
        for _ in range(num_iter):
            q_val = q_function(torch.tensor(state).long(), torch.tensor(action).long())

            next_state, reward, _done, _info = env.step(action)

            # On-policy target: bootstrap from the action that will actually
            # be executed in the next step.
            next_action = select_action(next_state)
            next_q = q_function(torch.tensor(next_state).long(), torch.tensor(next_action).long())
            reward = torch.tensor(reward).double()
            td = reward + gamma * next_q - q_val

            q_function.set_value(state, action, q_val + alpha * td)
            state, action = next_state, next_action

        plot()

    def plot():
        """Redraw estimates, greedy policy and exact policy evaluations."""
        with output:
            output.clear_output()
            policy = extract_policy(q_function)
            value_function = integrate_q(q_function, policy)
            exploration_vf = linear_system_policy_evaluation(env, EpsGreedyPolicy(policy, eps), gamma)
            testing_vf = linear_system_policy_evaluation(env, policy, gamma)

            # The tables are reshaped to the 5x5 layout of the grid world.
            plot_all(
                value_function.table.reshape(5, 5).detach().numpy(),
                policy.table.argmax(0).reshape(5, 5).detach().numpy(),
                exploration_vf.table.reshape(5, 5).detach().numpy(),
                testing_vf.table.reshape(5, 5).detach().numpy()
            )
            plt.show()

    plot()
    button = ipywidgets.Button(description="Step 100")
    button.on_click(lambda b: step(num_iter=100))
    button2 = ipywidgets.Button(description="Step 1000")
    button2.on_click(lambda b: step(num_iter=1000))
    display(output, button, button2)

# Controls of the SARSA demo. The sliders are created with
# continuous_update=False; the trailing semicolon hides the cell's repr output.
interact(
    sarsa,
    gamma=ipywidgets.FloatSlider(
        value=0.9, min=0., max=0.99, step=1e-2, continuous_update=False),
    alpha=ipywidgets.FloatSlider(
        value=0.5, min=0., max=2.0, step=1e-2, continuous_update=False),
    eps=ipywidgets.FloatSlider(
        value=0., min=0., max=1.0, step=1e-2, continuous_update=False),
    optimistic_init=ipywidgets.Checkbox(value=False),
);

interactive(children=(FloatSlider(value=0.9, continuous_update=False, description='gamma', max=0.99, step=0.01…

## Demo Guide:

#### some explanation
This demo runs the SARSA reinforcement learning algorithm, a slight variation of the popular Q-learning algorithm. The way a reinforcement learning agent learns its policy can be of two types: on-policy or off-policy.
 
Q-learning is an off-policy technique: it uses the greedy action in the next state to update the Q-value. SARSA, on the other hand, is an on-policy technique: it uses the action actually performed by the current policy to update the Q-value.
This difference is visible in the update rules of the two techniques:

Q-Learning: $Q(s_{t},a_{t}) \leftarrow Q(s_{t},a_{t}) + \alpha \left(r_{t+1}+\gamma \max_{a}Q(s_{t+1},a)-Q(s_{t},a_{t})\right)$

SARSA: $Q(s_{t},a_{t}) \leftarrow Q(s_{t},a_{t}) + \alpha \left(r_{t+1}+\gamma Q(s_{t+1},a_{t+1})-Q(s_{t},a_{t})\right)$

Here, the update equation for SARSA depends on the current state, current action, reward obtained, next state and next action. This observation led to the name of the technique: SARSA stands for State-Action-Reward-State-Action, which symbolizes the tuple (s, a, r, s', a').

- gamma: the discount factor of the environment.
- alpha: the learning rate of the SARSA update.
- eps: the parameter trading off exploration and exploitation. With probability epsilon a random action is picked, and with probability 1 - epsilon the best (greedy) action is picked.
- optimistic init: use optimistic initialization of the Q-values. For more details, see this [paper](https://papers.nips.cc/paper/2001/file/6f2688a5fce7d48c8d19762b88c32c3b-Paper.pdf).

- step 100: run 100 environment steps, updating the Q-function after each one. The first run starts in the upper-left corner; later runs continue from the current state.
- step 1000: same as above, but with 1000 steps.

- Estimated value: the value function estimated from the learned Q-function, i.e. by interacting with the world.
- Policy: the greedy policy with respect to the learned Q-function.
- Exploration value: the exact value of the epsilon-greedy version of this policy, obtained by solving the Bellman equation as a linear system.
- Testing value: the exact value of the greedy policy itself, obtained by solving the Bellman equation as a linear system.


#### play around
- As in the Q-learning example, play around with the parameters and compare the results with those of Q-learning.
