In [1]:
# Example 6.6 in Sutton Barto

# number the state space starting from the upper left hand corner.
# there are 48 states (possible positions). state 37 is the start
# and state 48 is the goal. rewards are -1 except states 38 thru
# 47 give a reward of -100 and send the agent back to state 37.

# actions are numbered 1,2,3,4 = up,down,right,left

# Reward received for taking action `a` in state `s`.
#   0     when `s` is already the goal,
#   -100  when the move steps into the cliff (right from the start cell, or
#         down from any of cells 26 through 35),
#   -1    for every other move.
function reward(s, a)
    s == goal && return 0
    intocliff = (s == start && a == 3) || (in(s, 26:35) && a == 2)
    return intocliff ? -100 : -1
end

# Successor state reached by taking action `a` (1,2,3,4 = up,down,right,left)
# from state `s`. Cells are numbered row by row: 1:12, 13:24, 25:36, with the
# start cell, the cliff and the goal on the last row.
#
# Priority of the rules:
#   1. the goal is absorbing, and moving down from cell 36 reaches it;
#   2. stepping into the cliff sends the agent back to `start`;
#   3. ordinary movement, where a move that would leave the grid keeps the
#      agent in place.
# Any other (state, action) pair is reported and trips an assertion.
function nextstate(s, a)
    if (s == goal) || (s == 36 && a == 2)
        return goal
    end
    if (s == start && a == 3) || (in(s, 26:35) && a == 2)
        return start   # stepped into the cliff so back to start
    end

    if a == 4        # left
        if s in (1, 13, 25, 37)
            return s           # already in the leftmost column
        elseif s in 2:12 || s in 14:24 || s in 26:36
            return s - 1
        end
    elseif a == 1    # up
        if s in 1:12
            return s           # already in the top row
        elseif s in 13:36
            return s - 12
        elseif s == start
            return 25
        end
    elseif a == 3    # right
        if s in (12, 24, 36)
            return s           # already in the rightmost column
        elseif s in 1:11 || s in 13:23 || s in 25:35
            return s + 1
        end
    elseif a == 2    # down
        if s in 1:24
            return s + 12
        elseif s == start
            return s           # nothing below the start cell
        elseif s == 25
            return start
        end
    end

    # No rule matched: report the offending pair and fail loudly.
    println("!!! unexpected state $(s) and action $(a) !!!")
    @assert false
end


nextstate (generic function with 1 method)

In [8]:
# Size of the problem: number of grid cells and number of moves per cell.
states, actions = 48, 4

# Cell where every episode begins and cell that ends it.
start, goal = 37, 48

# Learning hyper-parameters.
alpha, gamma, epsilon = 0.1, 1.0, 0.1   # step size, discount, exploration rate
episodes = 500                          # number of training episodes


# Tabular Q-learning on the cliff-walking grid.
#
# Runs `episodes` episodes, each beginning at `start` and lasting until `goal`
# is reached. Actions are picked epsilon-greedily from the current Q table
# (ties go to the lowest-numbered action, as `argmax` returns the first
# maximum). Afterwards the greedy action of every cell is printed as a 4x12
# grid, one row per line. Nothing is returned.
function q_learning()
    nstates, nactions = 48, 4
    Q = zeros(nstates, nactions)

    for _ in 1:episodes
        s = start
        while s != goal
            # Explore with probability `epsilon`, otherwise act greedily.
            explore = rand() < epsilon
            a = explore ? rand(1:nactions) : argmax(view(Q, s, :))

            nxt = nextstate(s, a)
            r = reward(s, a)

            # Move Q[s, a] towards the one-step bootstrapped target.
            target = r + gamma * maximum(view(Q, nxt, :))
            Q[s, a] += alpha * (target - Q[s, a])
            s = nxt
        end
    end

    # Lay the greedy actions out on the grid; cell numbers run row by row.
    policy = zeros(Int, 4, 12)
    for cell in 1:nstates
        r0, c0 = divrem(cell - 1, 12)
        policy[r0 + 1, c0 + 1] = argmax(view(Q, cell, :))
    end

    println("Cliff Directions!!:")
    for r in 1:4
        println(policy[r, :])
    end
end

# Train the agent and print the greedy action found for every grid cell.
q_learning()

Cliff Directions!!:
[3, 2, 2, 2, 2, 3, 1, 3, 2, 1, 3, 2]
[4, 3, 1, 3, 3, 3, 3, 3, 3, 3, 2, 2]
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


The reward and nextstate functions were provided by Prof. Darin England. I implemented the q_learning function. It follows the pseudocode provided on page 131 of Sutton & Barto, and outputs a "cliff walking" grid where each number corresponds to a direction - 1=up, 2=down, 3=right, 4=left. I chose to use the same number of episodes that was used in the Sutton & Barto example, 500, and start and goal were defined in the question. The function loops through each episode, and while the state doesn't equal the goal value, it uses an epsilon-greedy policy to choose its action. It then uses the provided functions to take the action. After that, Q[state, action]