In [1]:
using PyPlot

INFO: Loading help data...


In [18]:
transition_matrix = { 
    
    (1,:north) => [5, 2, 1],
    (1,:east)  => [1, 5, 1],
    (1,:south) => [1, 2, 1],
    (1,:west)  => [2, 5, 1],
    
    (2,:north) => [6, 2, 3],
    (2,:east)  => [1, 6, 2],
    (2,:south) => [2, 3, 1],
    (2,:west)  => [3, 6, 2],

    (3,:north) => [3, 4, 2],
    (3,:east)  => [2, 3, 3],
    (3,:south) => [3, 4, 2],
    (3,:west)  => [4, 3, 3],

    (4,:north) => [7, 3, 4],
    (4,:east)  => [3, 7, 4],
    (4,:south) => [4, 3, 4],
    (4,:west)  => [4, 4, 7],

    (5,:north) => [5, 5, 5],
    (5,:east)  => [5, 5, 5],
    (5,:south) => [5, 5, 5],
    (5,:west)  => [5, 5, 5],

    (6,:north) => [9, 5, 6],
    (6,:east)  => [5, 9, 2],
    (6,:south) => [2, 5, 6],
    (6,:west)  => [6, 9, 2],

    (7,:north) => [11, 7, 7],
    (7,:east)  => [7, 11, 4],
    (7,:south) => [4, 7, 7],
    (7,:west)  => [7, 4, 11],

    (8,:north) => [8, 8, 8],
    (8,:east)  => [8, 8, 8],
    (8,:south) => [8, 8, 8],
    (8,:west)  => [8, 8, 8],

    (9,:north) => [9, 8, 10],
    (9,:east)  => [8, 6, 9],
    (9,:south) => [6, 10, 8],
    (9,:west)  => [10, 9, 6],

    (10,:north) => [10, 11, 9],
    (10,:east)  => [9, 10, 10],
    (10,:south) => [10, 11, 9],
    (10,:west)  => [11, 10, 10],

    (11,:north) => [11, 10, 11],
    (11,:east)  => [10, 11, 7],
    (11,:south) => [7, 11, 10],
    (11,:west)  => [11, 11, 7] 
};


In [161]:
actions = [:north, :east, :south, :west];

In [282]:
reward_function=[0,0,0,0,-1,0,0,0,0,0,0];

In [299]:
probabilities = [0.8, 0.1, 0.1];

In [300]:
function transition(state::Int64, action::Symbol)
    possible_states = transition_matrix[(state,action)]
    p = rand()
    if p < probabilities[1]
        return possible_states[1]
    elseif p < probabilities[1] + probabilities[2]
        return possible_states[2]
    else
        return possible_states[3]
    end
end;

In [301]:
function get_action_from_policy(p::Matrix{Float64}, state::Int64)
    p       = rand()
    prop    = 0.0
    action  = -1
    for i = 1:4
        prop = prop + policy[state, i]
        if p <= prop
            action = i
            break
        end
    end
    return actions[action]
end;

get_action_from_policy (generic function with 1 method)

In [302]:
function monte_carlo_generate_episode(initial_state::Int64, episode_length::Int64, policy::Matrix)
    sum_of_rewards        = 0
    sequence_of_states    = int(zeros(episode_length+1))
    sequence_of_states[1] = initial_state
    state                 = initial_state
    for i = 1:episode_length
        action         = get_action_from_policy(policy, state)
        next_state     = transition(state, action)
        sum_of_rewards = sum_of_rewards + reward_function[next_state]
        state          = next_state
        sequence_of_states[i+1] = state
    end
    return (sum_of_rewards, sequence_of_states)
end

monte_carlo_generate_episode (generic function with 1 method)

In [303]:
function monte_carlo_policy_evalution!(
    nr_of_evaluations::Int64,
    episode_length::Int64,
    policy::Matrix,
    values::Vector{Float64},
    sums::Vector{Float64},
    counts::Vector{Float64})
    for i = 1:nr_of_evaluations
        initial_state = int(ceil(rand() * 11))
        results               = monte_carlo_generate_episode(initial_state, episode_length, policy)
        sums[initial_state]   = sums[initial_state] + results[1]
        counts[initial_state] = counts[initial_state] + 1.0
        values[initial_state] = sums[initial_state] / counts[initial_state]
    end
end

monte_carlo_policy_evalution! (generic function with 1 method)

In [304]:
function get_best_actions(values::Vector{Float64})
    best_actions = int(zeros(11))
    for state = 1:11
        best_action = -1
        best_value  = nothing
        for action = 1:4
            next_states = transition_matrix[(state, actions[action])]
            for index = 1:3
                next_value = probabilities[index] * values[next_states[index]]
                if best_value == nothing || next_value > best_value
                    best_value  = next_value
                    best_action = action
                end
            end            
        end
        best_actions[state] = best_action
    end
    best_actions
end

function monte_carlo_policy_iteration!(values::Vector{Float64}, policy::Matrix{Float64})
    best_actions = get_best_actions(values)
    for state = 1:11
        for action = 1:4
            policy[state,action] = 0.05
        end
        policy[state,best_actions[state]] = 0.85
    end
end

monte_carlo_policy_iteration! (generic function with 2 methods)

In [307]:
policy = ones(11,4) .* 1/4
values = zeros(11);
sums   = zeros(11);
counts = zeros(11);

In [310]:
for i = 1:10000
    monte_carlo_policy_evalution!(10, 10, policy, values, sums, counts)
    monte_carlo_policy_iteration!(values,policy)
end
println("policy:\n$policy")
println("values: $values")

policy:
[0.85 0.05 0.05 0.05
 0.85 0.05 0.05 0.05
 0.85 0.05 0.05 0.05
 0.05 0.85 0.05 0.05
 0.85 0.05 0.05 0.05
 0.05 0.85 0.05 0.05
 0.05 0.85 0.05 0.05
 0.85 0.05 0.05 0.05
 0.85 0.05 0.05 0.05
 0.85 0.05 0.05 0.05
 0.85 0.05 0.05 0.05]
values: [-9.071340176166558,-6.300551004132531,-2.0544924971426464,-1.327946177191122,-10.0,-8.082690228535212,-0.2330920523506617,0.0,-0.9146853657645813,-0.2565728482960656,-0.11113144028103045]


In [309]:
for i = 1:10000
    monte_carlo_policy_evalution!(10, 10, policy, values, sums, counts)
    monte_carlo_policy_iteration!(values,policy)
end
println("policy:\n$policy")
println("values: $values")

policy:
[0.85 0.05 0.05 0.05
 0.85 0.05 0.05 0.05
 0.85 0.05 0.05 0.05
 0.05 0.85 0.05 0.05
 0.85 0.05 0.05 0.05
 0.05 0.85 0.05 0.05
 0.05 0.85 0.05 0.05
 0.85 0.05 0.05 0.05
 0.85 0.05 0.05 0.05
 0.85 0.05 0.05 0.05
 0.85 0.05 0.05 0.05]
values: [-9.069116360454943,-6.264213682202235,-2.0597047929681023,-1.345013918454233,-10.0,-8.101603083282132,-0.21792005235316572,0.0,-0.9203554476211503,-0.25563238902520635,-0.1277783860724844]
