In [None]:
using PyPlot

In [None]:
"""
    draw_action_values(n)

Return a `Vector{Float64}` holding `n` independent standard-normal draws
(one `randn()` call per element).

`n` may be any `Integer`; a non-positive `n` yields an empty vector.
"""
function draw_action_values(n::Integer)
    # The explicit element type keeps the result a Vector{Float64} even when
    # the range 1:n is empty.
    return Float64[randn() for i in 1:n]
end

In [None]:
# Draw ten action values with `draw_action_values`; the following cells read
# this global `Q_star` when they generate noisy observations.
Q_star = draw_action_values(10)

In [None]:
"""
    randomise_action_value(value)

Return `value` plus one standard-normal draw (`randn()`), i.e. a noisy
observation centred on `value`.

Any `Real` is accepted; the result type follows the usual promotion of
`value + Float64`.
"""
randomise_action_value(value::Real) = value + randn()

In [None]:
# One noisy observation for every entry of `Q_star`, in order.
map(randomise_action_value, Q_star)

In [None]:
# Print ten independent noisy observations of the first entry of `Q_star`.
foreach(1:10) do _
    println(randomise_action_value(Q_star[1]))
end

In [None]:
"""
    ϵ_greedy(ϵ, Q)

Choose an action index for the action-value estimates `Q`.

A single `rand()` draw is compared with `ϵ`. If the draw is below `ϵ`, an
index is picked uniformly from all of `Q`; otherwise an index is picked
uniformly among the positions that hold the maximum of `Q`, so ties are
broken at random. With `ϵ = 0.0` the choice is therefore always greedy.

# Arguments
- `ϵ::Real`: exploration probability.
- `Q::AbstractVector{<:Real}`: estimated action values; must be non-empty.

# Returns
An integer index into `Q`.

# Throws
- `ArgumentError` if `Q` is empty.
"""
function ϵ_greedy(ϵ::Real, Q::AbstractVector{<:Real})
    isempty(Q) && throw(ArgumentError("Q must contain at least one action value"))

    # `rand()` lies in [0, 1), so this branch is never taken for ϵ == 0.0 and
    # no separate check for that case is needed.
    if rand() < ϵ
        # explore: choose uniformly among all actions
        return rand(eachindex(Q))
    end

    # exploit: gather every index that attains the highest estimate ...
    best_value   = maximum(Q)
    best_actions = findall(q -> q == best_value, Q)
    # ... and choose uniformly among them. Sampling an index directly always
    # gives a valid integer index, unlike ceil(rand() * n), which is a float
    # and becomes 0 when rand() returns exactly 0.0.
    return rand(best_actions)
end

In [None]:
"""
    greedy(Q)

Select an action for the estimates `Q` by calling `ϵ_greedy` with the
exploration rate fixed at `0.0`.
"""
function greedy(Q::Vector{Float64})
    return ϵ_greedy(0.0, Q)
end

In [None]:
"""
    evaluate_method(method, nr_of_actions, nr_of_plays, nr_of_trials)

Run `nr_of_trials` independent trials of `nr_of_plays` plays each and average
the outcome of every play over the trials.

At the start of each trial, true action values are drawn with
`draw_action_values` and sorted in descending order, so action 1 is always
the best one. On every play `method` is called with the current
sample-average estimates (zero for actions not tried yet) and must return the
index of the action to take; the reward is `randomise_action_value` applied
to the true value of that action.

# Arguments
- `method::Function`: maps the vector of estimates to an action index.
- `nr_of_actions::Integer`: number of actions available in each trial.
- `nr_of_plays::Integer`: number of plays per trial.
- `nr_of_trials::Integer`: number of independent trials to average over.

# Returns
A tuple `(average_reward, best_action_rate)` of `nr_of_plays × 1` matrices.
Row `p` of the first holds the mean reward received on play `p`; row `p` of
the second holds the fraction of trials in which action 1 was chosen on
play `p`.
"""
function evaluate_method(
    method::Function,
    nr_of_actions::Integer,
    nr_of_plays::Integer,
    nr_of_trials::Integer,
)
    # Running totals per play; only the averages over trials are returned, so
    # there is no need to keep a full plays × trials matrix.
    reward_total = zeros(nr_of_plays) # rewards summed over trials
    best_total   = zeros(nr_of_plays) # number of trials that chose action 1
    for t in 1:nr_of_trials
        # draw Q^*(a) and sort it, so that action 1 is always the best
        Q_star = sort(draw_action_values(nr_of_actions), rev=true) # ground truth
        Q_est  = zeros(nr_of_actions)      # estimated action-values
        Q_sum  = zeros(nr_of_actions)      # sum of all rewards for each action
        k_a    = zeros(Int, nr_of_actions) # counts of times action a was chosen
        for p in 1:nr_of_plays
            # Array indices must be integers; convert in case `method` hands
            # back an integral floating-point value.
            a = convert(Int, method(Q_est))
            reward = randomise_action_value(Q_star[a])
            k_a[a]   += 1
            Q_sum[a] += reward
            Q_est[a]  = Q_sum[a] / k_a[a]
            reward_total[p] += reward
            best_total[p]   += (a == 1)
        end
    end
    # Return column matrices (nr_of_plays × 1), one row per play.
    average_reward   = reshape(reward_total ./ nr_of_trials, :, 1)
    best_action_rate = reshape(best_total ./ nr_of_trials, :, 1)
    return average_reward, best_action_rate
end

In [None]:
# ϵ-greedy selection rules with the exploration rate fixed at 0.01 and 0.1.
function ϵ_greedy_with_001(Q::Vector{Float64})
    return ϵ_greedy(0.01, Q)
end

function ϵ_greedy_with_01(Q::Vector{Float64})
    return ϵ_greedy(0.1, Q)
end

# Evaluate each rule with 10 actions, 1000 plays per trial and 2000 trials.
# Every call returns two results; the `a_` variables take the first and the
# `p_` variables the second.
(a_greedy, p_greedy) = evaluate_method(greedy, 10, 1000, 2000)
(a_ϵ_001, p_ϵ_001)   = evaluate_method(ϵ_greedy_with_001, 10, 1000, 2000)
(a_ϵ_01, p_ϵ_01)     = evaluate_method(ϵ_greedy_with_01, 10, 1000, 2000);

In [None]:
# Plot the first result of each evaluation (average reward per play) for the
# three action-selection rules.
pg    = plot(a_greedy)
p_001 = plot(a_ϵ_001)
p_01  = plot(a_ϵ_01)
xlabel("plays")
ylabel("average reward")
# `plot` returns a vector of line objects, so the three results are joined
# with `;` (vcat) into one flat list of handles; a vector of vectors is not
# usable as legend handles. loc=4 places the legend in the lower right.
legend([pg; p_001; p_01], ["greedy", "e = 0.01", "e = 0.1"], loc=4)

In [None]:
# Plot the second result of each evaluation (how often the best action was
# chosen on each play) for the three action-selection rules.
pg    = plot(p_greedy)
p_001 = plot(p_ϵ_001)
p_01  = plot(p_ϵ_01)
xlabel("plays")
ylabel("fraction of trials choosing the best action")
# `plot` returns a vector of line objects, so the three results are joined
# with `;` (vcat) into one flat list of handles; a vector of vectors is not
# usable as legend handles. loc=4 places the legend in the lower right.
legend([pg; p_001; p_01], ["greedy", "e = 0.01", "e = 0.1"], loc=4)