## Objetivos

Usaremos Python para implementar varios algoritmos de reinforcement learning.

Ejecutaremos los algoritmos en algunos problemas para entender las propiedades y diferentes comportamientos emergentes. 
En este tutorial nos enfocaremos en los fundamentos de RL usando un gridworld simple.

# Setup

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import namedtuple
import itertools


# Print arrays with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name and fall back
# to the legacy one on older installations.
for _style in ('notebook', 'whitegrid'):
  _versioned = 'seaborn-v0_8-' + _style
  plt.style.use(
      _versioned if _versioned in plt.style.available else 'seaborn-' + _style)


## Environments: Grid-Worlds

**(Simple) Tabular Grid-World**

Puedes visualizar el mundo grilla en donde entrenaremos nuestro agente (ejecutando la celda siguiente).

`S` indica el estado inicial y `G` indica el objetivo.  El agente tiene cuatro acciones posibles: arriba, derecha, abajo e izquierda.  Las recompensas son: `-5` si chocamos contra una pared (el agente permanece en su celda), `+10` si alcanzamos el objetivo, y `0` en otro caso.  El episodio termina cuando el agente alcanza el objetivo. El descuento es $\gamma = 0.9$.


In [None]:
#@title Environment: Gridworld Implementation
class Grid(object):
  """Tabular grid-world with walls, one start cell and one goal cell.

  The agent position is a (row, column) pair into the layout array and the
  observation handed to agents is its row-major flattened index.
  """

  def __init__(self, discount=0.9, penalty_for_walls=-5):
    """Create the fixed 9x10 layout and put the agent on the start cell.

    Args:
      discount: discount returned by `step` for every non-terminal move.
      penalty_for_walls: reward returned by `step` when a move hits a wall.
    """
    # Cell codes:
    #   -1    -> wall
    #    0    -> empty cell, the episode continues
    #   other -> reward of a cell that terminates the episode
    self._layout = np.array([
      [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
      [-1,  0,  0,  0,  0,  0, -1,  0,  0, -1],
      [-1,  0,  0,  0, -1,  0,  0,  0, 10, -1],
      [-1,  0,  0,  0, -1, -1,  0,  0,  0, -1],
      [-1,  0,  0,  0, -1, -1,  0,  0,  0, -1],
      [-1,  0,  0,  0,  0,  0,  0,  0,  0, -1],
      [-1,  0,  0,  0,  0,  0,  0,  0,  0, -1],
      [-1,  0,  0,  0,  0,  0,  0,  0,  0, -1],
      [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
    ])
    self._layout_dims = self._layout.shape
    self._number_of_states = np.prod(self._layout.shape)
    self._discount = discount
    self._penalty_for_walls = penalty_for_walls
    # (row, column) of the cell where every episode begins.
    self._start_state = (2, 2)
    # NOTE: unlike the start state this is stored as (x, y), i.e.
    # (column, row): the reward 10 sits at layout[2, 8]. It is only read by
    # `plot_grid`.
    self._goal_state = (8, 2)
    self._state = self._start_state

  @property
  def number_of_states(self):
    """Total number of layout cells, walls included."""
    return self._number_of_states

  def plot_grid(self):
    """Draw walls, cell borders and the S/G markers on a new figure."""
    plt.figure(figsize=(3, 3))
    wall_mask = self._layout <= -1
    plt.imshow(wall_mask, interpolation="nearest")
    plt.gca().grid(0)
    plt.xticks([])
    plt.yticks([])
    plt.title("The grid")
    markers = (
        (self._start_state, r"$\mathbf{S}$"),
        (self._goal_state, r"$\mathbf{G}$"),
    )
    for position, label in markers:
      # Both tuples are handed to matplotlib as (x, y).
      plt.text(position[0], position[1], label, ha='center', va='center')
    n_rows, n_cols = self._layout.shape
    # Black separators between neighbouring rows, then neighbouring columns.
    for row in range(n_rows - 1):
      plt.plot([-0.5, n_cols - 0.5], [row + 0.5, row + 0.5], '-k', lw=2)
    for col in range(n_cols - 1):
      plt.plot([col + 0.5, col + 0.5], [-0.5, n_rows - 0.5], '-k', lw=2)

  def get_obs(self):
    """Return the current position as a row-major flat index."""
    row, col = self._state
    n_cols = self._layout.shape[1]
    return row * n_cols + col

  def int_to_state(self, int_obs):
    """Convert a flat observation index back into a (row, column) pair."""
    row, col = divmod(int_obs, self._layout.shape[1])
    return row, col

  def step(self, action):
    """Apply one action and return `(reward, discount, observation)`.

    Args:
      action: 0 (up), 1 (right), 2 (down) or 3 (left).

    Returns:
      The reward of the move, the discount (0 when the goal is reached)
      and the observation of the resulting position.

    Raises:
      ValueError: if `action` is not one of the four valid actions.
    """
    row, col = self._state

    if action == 0:  # up
      d_row, d_col = -1, 0
    elif action == 1:  # right
      d_row, d_col = 0, 1
    elif action == 2:  # down
      d_row, d_col = 1, 0
    elif action == 3:  # left
      d_row, d_col = 0, -1
    else:
      raise ValueError("Invalid action: {} is not 0, 1, 2, or 3.".format(action))

    target = (row + d_row, col + d_col)
    cell = self._layout[target]
    if cell == -1:
      # Bumping into a wall costs the penalty and leaves the agent in place.
      outcome = (self._penalty_for_walls, self._discount, (row, col))
    elif cell == 0:
      outcome = (0., self._discount, target)
    else:
      # Goal: pay out the cell's reward, end the episode (discount 0) and
      # teleport the agent back to the start for the next episode.
      outcome = (cell, 0., self._start_state)

    reward, discount, self._state = outcome
    return reward, discount, self.get_obs()

In [None]:
# Visualise the environment

# Instantiate the tabular environment with its default discount and wall penalty
grid = Grid()

# Plot tabular environment (walls, cell borders and the S/G markers)
grid.plot_grid()

In [None]:
#@title Policies (Uniformly random and e-greedy) 
#Expected syntax: `policy(q_values)` 

# uniformly random policy
def random_policy(q):
  """Pick one of the four actions uniformly at random.

  Args:
    q: action values of the current state. Accepted so the function matches
      the `policy(q_values)` call convention; it is not used.

  Returns:
    An integer action in {0, 1, 2, 3}.
  """
  return np.random.randint(low=0, high=4)


## Funciones de ayuda (para visualizar y ejecutar los experimentos)

In [None]:
#@title Helper functions for visualisation

def map_from_action_to_subplot(a):
  """Return the position in a 3x3 subplot grid where action `a` is drawn."""
  return (2, 6, 8, 4)[a]

def map_from_action_to_name(a):
  """Return the lowercase name of action `a` (0=up, 1=right, 2=down, 3=left)."""
  return ("up", "right", "down", "left")[a]

def plot_values(values, colormap='pink', vmin=-1, vmax=10):
  """Show `values` as a heat map on the current axes, with a colorbar.

  Args:
    values: array handed to `plt.imshow` (2-D in the calls visible in this
      file).
    colormap: name of the matplotlib colormap.
    vmin: value mapped to the low end of the colormap.
    vmax: value mapped to the high end of the colormap.
  """
  image_options = dict(
      interpolation="nearest", cmap=colormap, vmin=vmin, vmax=vmax)
  plt.imshow(values, **image_options)
  # Cell indices carry no information here, so hide the ticks on both axes.
  for hide_ticks in (plt.yticks, plt.xticks):
    hide_ticks([])
  plt.colorbar(ticks=[vmin, vmax])

def plot_state_value(action_values):
  """Plot a state-value estimate derived from `action_values` on a new figure.

  The plotted value of each state is
  `0.9 * max_a q(s, a) + 0.1 * mean_a q(s, a)`, taken over the last axis.

  Args:
    action_values: array whose last axis indexes actions.
  """
  plt.figure(figsize=(4, 4))
  # The colour scale spans the full range of the action values.
  lowest, highest = np.min(action_values), np.max(action_values)
  greedy_part = np.max(action_values, axis=-1)
  average_part = np.mean(action_values, axis=-1)
  state_values = 0.9 * greedy_part + 0.1 * average_part
  plot_values(state_values, colormap='summer', vmin=lowest, vmax=highest)
  plt.title("$v(s)$")

def plot_action_values(action_values):
  """Plot q(s, a) for the four actions around a central v(s) panel.

  Each action is drawn in the cell of a 3x3 subplot grid that matches its
  direction (up on top, right on the right, ...). The centre panel shows
  `0.9 * max_a q(s, a) + 0.1 * mean_a q(s, a)`.

  Args:
    action_values: array whose last axis indexes the four actions.
  """
  figure = plt.figure(figsize=(8, 8))
  figure.subplots_adjust(wspace=0.3, hspace=0.3)
  lowest = np.min(action_values)
  highest = np.max(action_values)
  # Widen the colour range of the action panels by 5% on each side.
  margin = 0.05 * (highest - lowest)
  for action in range(4):
    plt.subplot(3, 3, map_from_action_to_subplot(action))
    plot_values(
        action_values[..., action], vmin=lowest - margin, vmax=highest + margin)
    action_name = map_from_action_to_name(action)
    plt.title(r"$q(s, \mathrm{" + action_name + r"})$")

  plt.subplot(3, 3, 5)
  greedy_part = np.max(action_values, axis=-1)
  average_part = np.mean(action_values, axis=-1)
  state_values = 0.9 * greedy_part + 0.1 * average_part
  plot_values(state_values, colormap='summer', vmin=lowest, vmax=highest)
  plt.title("$v(s)$")
      
  
def smooth(x, window=10):
  """Average `x` over consecutive, non-overlapping windows.

  Trailing samples that do not fill a complete window are dropped.

  Args:
    x: one-dimensional sequence of numbers (list, tuple or NumPy array).
    window: number of consecutive samples averaged into one output value.

  Returns:
    A NumPy array with `len(x) // window` window means.
  """
  # Coerce first so plain Python sequences work as well as arrays.
  samples = np.asarray(x)
  n_windows = len(samples) // window
  return samples[:n_windows * window].reshape(n_windows, window).mean(axis=1)
  

def plot_stats(stats, window=10):
  """Plot window-averaged episode lengths and episode returns side by side.

  Args:
    stats: object exposing `episode_lengths` and `episode_rewards`
      (presumably one entry per episode -- TODO confirm against the caller).
    window: number of episodes averaged into each plotted point.
  """
  plt.figure(figsize=(16, 4))
  panels = (
      (121, stats.episode_lengths, 'Episode Length'),
      (122, stats.episode_rewards, 'Episode Return'),
  )
  for position, series, label in panels:
    plt.subplot(position)
    averaged = smooth(series, window=window)
    # `smooth` drops the trailing incomplete window, so derive the x axis
    # from the smoothed length; using len(series) gives one point too many
    # whenever the episode count is not a multiple of `window`.
    first_episodes = range(0, len(averaged) * window, window)
    plt.plot(first_episodes, averaged)
    plt.ylabel(label)
    plt.xlabel('Episode Count')

In [None]:
#@title [IMPORTANT] Ejecuta los experimentos

# Loop de interacción simple con el MDP:
# 1) Interactúa con el entorno.
# 2) El agente obtiene la observación, la recompensa y el descuento del entorno,
#    y produce la siguiente acción.
def run_experiment(env, agent, number_of_steps):
  """Run the agent-environment loop and return the mean reward per step.

  Args:
    env: environment whose `step(action)` returns
      `(reward, discount, next_state)`.
    agent: agent whose `step(reward, discount, next_state)` returns the next
      action. If it also has `initial_action()`, that supplies the first
      action.
    number_of_steps: number of environment steps to execute.

  Returns:
    The average of the rewards observed over all steps (0. for zero steps).
  """
  # Agents without an `initial_action` method start with action 0.
  try:
    action = agent.initial_action()
  except AttributeError:
    action = 0

  # Interaction with the MDP
  running_mean = 0.
  for steps_done in range(1, number_of_steps + 1):
    reward, discount, observation = env.step(action)
    action = agent.step(reward, discount, observation)
    # Incremental mean: move the estimate 1/n of the way towards the sample.
    running_mean += (reward - running_mean) / float(steps_done)

  return running_mean

In [None]:
#@title Funciones para visualizar las políticas
def plot_policy(grid, policy):
  """Draw the grid and overlay one arrow per cell showing the policy's action.

  Args:
    grid: environment providing `plot_grid()`.
    policy: integer array indexed as `policy[row, column]` whose entries are
      actions (0=up, 1=right, 2=down, 3=left).
  """
  action_names = [r"$\uparrow$",r"$\rightarrow$", r"$\downarrow$", r"$\leftarrow$"]
  grid.plot_grid()
  plt.title('Policy Visualization')
  # Walk the policy's own shape instead of a hard-coded 9x10 so the helper
  # keeps working if the layout changes size.
  n_rows, n_cols = np.shape(policy)[:2]
  for i, j in np.ndindex(n_rows, n_cols):
    # matplotlib expects (x, y) = (column, row).
    plt.text(j, i, action_names[policy[i, j]], ha='center', va='center')

def plot_greedy_policy(grid, q):
  """Draw the grid and overlay the greedy action of every cell.

  Args:
    grid: environment providing `plot_grid()`.
    q: action values indexed as `q[row, column, action]`; the greedy action
      is the argmax over the last (third) axis.
  """
  action_names = [r"$\uparrow$",r"$\rightarrow$", r"$\downarrow$", r"$\leftarrow$"]
  greedy_actions = np.argmax(q, axis=2)
  grid.plot_grid()
  plt.title('Greedy Policy')
  # Walk the shape of the value table instead of a hard-coded 9x10 so the
  # helper keeps working if the layout changes size.
  for i, j in np.ndindex(*greedy_actions.shape):
    # matplotlib expects (x, y) = (column, row).
    plt.text(j, i, action_names[greedy_actions[i, j]], ha='center', va='center')

# Entrenar nuestro agente

Cada agente tiene un constructor y una función `step`:

### `__init__(self, number_of_states, number_of_actions, initial_state, ...)`:
El constructor dará al agente el número de estados, el número de acciones y el estado (observación) inicial. 

### `step(self, reward, discount, next_observation, ...)`:
La función `step` deberá actualizar los valores internos del agente y retornar la siguiente acción a tomar.


## Evaluación de la política

Primero vamos a evaluar una política $\pi$


Algoritmo:

**Inicializar** $Q(s, a)$ para todo s ∈ $\mathcal{S}$ y a ∈ $\mathcal{A}(s)$

**Loop forever**:

1. $S \gets{}$ estado actual (no terminal)
 
2. $A \gets{} \text{behaviour_policy}(S)$
 
3. Toma la acción $A$; observa la recompensa $R$, el descuento $\gamma$ y el siguiente estado $S'$

4. $Q(S, A) \gets Q(S, A) + \alpha (R + \gamma Q(S', \pi(S')) - Q(S, A))$

In [None]:
# uniformly random policy
def random_policy(q):
  """Pick one of the four actions uniformly at random.

  Args:
    q: action values of the current state. Accepted so the function matches
      the `policy(q_values)` call convention; it is not used.

  Returns:
    An integer action in {0, 1, 2, 3}.
  """
  return np.random.randint(low=0, high=4)

In [None]:
#@title [Coding Task] Policy Evaluation AGENT
class PolicyEval_AGENT(object):
  """Tabular agent that estimates the action values of a given policy.

  It acts with `behaviour_policy` and, after every transition, moves
  Q(s, a) towards `reward + discount * Q(s', a')`, where `a'` is the action
  `evaluated_policy` picks in the next state.
  """

  def __init__(
      self, number_of_states, number_of_actions, initial_state, evaluated_policy, 
      behaviour_policy=random_policy, step_size=0.1):
    """Store the configuration and create a zero-initialised Q table.

    Args:
      number_of_states: number of rows of the Q table.
      number_of_actions: number of columns of the Q table.
      initial_state: index of the state the agent starts in.
      evaluated_policy: callable mapping a state's action values to the
        action of the policy being evaluated.
      behaviour_policy: callable mapping a state's action values to the
        action actually sent to the environment.
      step_size: learning rate of the update.
    """
    self._number_of_states = number_of_states
    self._number_of_actions = number_of_actions
    self._step_size = step_size
    self._evaluated_policy = evaluated_policy
    self._behaviour_policy = behaviour_policy
    self._state = initial_state
    # Action credited with the first observed transition.
    self._action = 0

    # initialize  q-values
    self._q = np.zeros((number_of_states, number_of_actions))

  @property
  def q_values(self):
    """The (number_of_states, number_of_actions) table of estimates."""
    return self._q

  def step(self, reward, discount, next_state):
    """Learn from the last transition and choose the next action.

    Args:
      reward: reward received for the previously returned action.
      discount: discount of the transition (0 at episode end).
      next_state: index of the state reached.

    Returns:
      The action chosen by the behaviour policy in `next_state`.
    """
    # Bootstrap from the action the evaluated policy takes in the next state.
    next_values = self._q[next_state]
    target_action = self._evaluated_policy(next_values)
    bootstrap = next_values[target_action]

    # Q-value table update
    current = self._q[self._state][self._action]
    td_error = reward + discount * bootstrap - current
    self._q[self._state][self._action] += self._step_size * td_error

    # Advance to the new state and pick the action to execute there.
    self._state = next_state
    self._action = self._behaviour_policy(self._q[next_state])
    return self._action
    

**Pruébalo!** Ejecuta la evaluación de la política del agente, evaluando una política uniformemente aleatoria sobre el entorno Grid, probar con $\texttt{num_steps} = 1e3, 1e5, 1e7$. 


In [None]:
num_steps = int(1e5) # @param

# Fresh environment for this experiment.
grid = Grid()

# Evaluate the uniformly random policy while also acting uniformly at random.
# The state count comes from the environment's public property rather than
# from its private layout array.
agent = PolicyEval_AGENT(
    number_of_states=grid.number_of_states, 
    number_of_actions=4, 
    initial_state=grid.get_obs(),
    evaluated_policy=random_policy,
    behaviour_policy=random_policy,
    step_size=0.1)

# run experiment and get the value functions from agent
run_experiment(grid, agent, num_steps)
# Unflatten the table to (rows, columns, actions) so it lines up with the grid.
q = agent.q_values.reshape(grid._layout.shape + (4,))

# visualise value functions
plot_action_values(q)

## Obteniendo política Greedy (mejoramiento)

In [None]:
# visualise the greedy policy
plot_greedy_policy(grid, q)

# Show start action: name of the highest-valued action in the start cell
# (0=Up, 1=Right, 2=Down, 3=Left).
start_action = ("Up", "Right", "Down", "Left")[np.argmax(q[grid._start_state])]
print("\nAction from start state: "+start_action)