In [1]:
from enum import Enum
import random 
random.seed(0)
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## MDP ##

In [3]:
CLIFF_WORLD = np.array([[0, 0, 0, 0, 0, 0, 0, 0],
                       [0, 0, 0, 0, 0, 0, 0, 0],
                       [0, 0, 0, 0, 0, 0, 0, 0],
                       [0, 1, 1, 1, 1, 1, 1, 0]])

In [6]:
OPEN = 0
CLIFF = 1

ROWS = range(len(CLIFF_WORLD))
COLS = range(len(CLIFF_WORLD[0]))
TOP_EDGE = 0
BOTTOM_EDGE = len(CLIFF_WORLD)-1
LEFT_EDGE = 0
RIGHT_EDGE = len(CLIFF_WORLD[0])-1


STATE_START = (BOTTOM_EDGE, LEFT_EDGE)
STATE_GOAL = (BOTTOM_EDGE, RIGHT_EDGE)

REWARD_BASIC = -1
REWARD_CLIFF = -100

class Action(Enum):
    UP = 1 
    DOWN = 2 
    LEFT = 3 
    RIGHT = 4 
    
    
def state_transition(state, action):
    # absorbing state 
    if state == STATE_GOAL: 
        return state, 0
    
    row = state[0]
    col = state[1]
    if action is Action.UP and state[0] != TOP_EDGE:
        row = state[0] - 1
    elif action is Action.DOWN and state[0] != BOTTOM_EDGE:
        row = state[0] + 1
    elif action is Action.LEFT and state[1] != LEFT_EDGE:
        col = state[1] - 1
    elif action is Action.RIGHT and state[1] != RIGHT_EDGE:
        col = state[1] + 1
        
    next_state = (row,col)
    
    reward = REWARD_BASIC
    
    if CLIFF_WORLD[row][col] == CLIFF:
        next_state = STATE_START
        reward = REWARD_CLIFF
        
    return next_state, reward 

## learning 

In [7]:
ALPHA = 0.1
GAMMA = 0.9
EPSILON = 0.1 
EPISODE_NUM = 500
EPISODE_LENGTH = 1024

In [23]:
def epsilon_greedy_select(state, q_values, epsilon=0):
    action_selected = random.choice(list(Action))
    if random.random() >= epsilon:
        actions = [action for action in Action]
        qvals = [q_values.get((state, action), float('-inf')) for action in actions]
        # ties : choose the first action 
        action_selected = actions[np.argmax(qvals)]
        
    return action_selected
    
def generate_greedy_policy(q_values):
    policy = np.full_like(CLIFF_WORLD, Action.UP, dtype = Action)
    
    for row in ROWS:
        for col in COLS:
            policy[row][col] = epsilon_greedy_select((row,col), q_values, epsilon=0)
    return policy


def print_policy(policy):
    def name_action(row, col):
        action_name = policy[row][col].name[0]
        if CLIFF_WORLD[row][col] == CLIFF:
            action_name = '_'
        elif (row,col) == STATE_GOAL:
            action_name = 'G'
        return action_name
    
    for row in ROWS:
        row_str = [name_action(row,col) for col in COLS]
        print(''.join(row_str))

## SARSA

In [24]:
def sarsa(epsilon):
    q_values = {}
    for episode in range(EPISODE_NUM):
        state = STATE_START
        actiion = epsilon_greedy_select(state, q_values, epsilon)
        for iteration in range(EPISODE_LENGTH):
            next_state, reward = state_transition(state, action)
            next_action = epsilon_greedy_select(next_state, q_values, epsilon)
            qval = q_values.get((state,action), 0)
            next_qval = q_values.get((next_state,next_action), 0)
            qval = qval + ALPHA * (reward + GAMMA * next_qval - qval)
            
            q_values[(state, action)] = qval
            
            state = next_state
            action = next_action 
    return q_values, generate_greedy_policy(q_values)

In [25]:
q_final_sarsa, policy_sarsa = sarsa(EPSILON)
print_policy(policy_sarsa)

UnboundLocalError: local variable 'action' referenced before assignment

### Q-learning 