```julia
using Pkg
Pkg.activate(".")
Pkg.add("IJulia Revise Statistics")
Pkg.dev("..")
```

In [1]:
using Statistics

using Revise

##### Track used for tests

In [2]:
track = [
    0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 3
    0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3
    0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3
    0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3
    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3
    1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0
    1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0
    0 0 0 2 2 2 2 2 2 0 0 0 0 0 0 0 0
    ][end:-1:1,:]

nothing

##### Off-policy Monte Carlo optimization

In [3]:
module Opts

using ReinforcementLearningSutton.RaceTrack: Action, Game, PlayerDeterministic, PlayerRandom, State, action_valid, nactions, nstates, play_game

function optimize_policy_mc_off_policy(game::Game, player_b::PlayerRandom, Γ::Float64 = 1., n_games::Int = 10000)::Array{Action, 1}
    
    states = [State(size(game.track)..., s_idx) for s_idx in 1:nstates(game.track)]
    actions = [Action(a_idx) for a_idx in 1:nactions()]

    q = zeros(nstates(game.track), nactions())
    c = zeros(nstates(game.track), nactions())
    w_norm = zeros(nstates(game.track))
    
    for (s_idx, s) in enumerate(states)
        for (a_idx, a) in enumerate(actions)
            v = action_valid(s, a)
            w_norm[s_idx] += v
            if !v
                q[s_idx, a_idx] = -Inf
            else
                q[s_idx, a_idx] = -500.
            end
        end
    end
    
    action_index_range = 1:nactions()
    π = [rand(action_index_range[q_row .== maximum(q_row)]) for q_row in eachrow(q)]
    
    for game_index in 1:n_games
        g = 0.
        w = 1.

        nsteps = play_game(game, player_b)
        
        for step in nsteps-1:-1:1
            s, a, r = game.episode_states[step], game.episode_actions[step], game.episode_rewards[step]
            g = Γ * g + r
            c[s, a] += w
            q[s, a] += (w / c[s, a]) * (g - q[s, a])
            
            m = 
            π[s] = rand(action_index_range[q[s, :] .== maximum(q[s, :])])
            if π[s] != a
                break
            end
            
            w /= w_norm[s]
        end
    end

    return map(Action, π)
end

end

Main.Opts

In [4]:
using ReinforcementLearningSutton.RaceTrack: Game, PlayerDeterministic, PlayerRandom, play_game

for Γ in [0.5, 0.9, 1.0]
    for max_n_iter in [10000, 20000, 50000, 100000, 200000]
        game = Game(track, 0.1)
        player_b = PlayerRandom(size(track)...)
        policy = Opts.optimize_policy_mc_off_policy(game, player_b, Γ, max_n_iter)

        game = Game(track, 0.)
        player_t = PlayerDeterministic(policy)
        println("Γ: ", Γ, ", optimization iterations: ", max_n_iter)
        println("Average episode length: ", mean([play_game(game, player_t) for _ in 1:100000]))
    end
end

Γ: 0.5, optimization iterations: 10000
Average episode length: 9.16296
Γ: 0.5, optimization iterations: 20000
Average episode length: 14.28437
Γ: 0.5, optimization iterations: 50000
Average episode length: 8.83176
Γ: 0.5, optimization iterations: 100000
Average episode length: 13.51971
Γ: 0.5, optimization iterations: 200000
Average episode length: 26.42372
Γ: 0.9, optimization iterations: 10000
Average episode length: 10.39426
Γ: 0.9, optimization iterations: 20000
Average episode length: 17.69856
Γ: 0.9, optimization iterations: 50000
Average episode length: 18.32355
Γ: 0.9, optimization iterations: 100000
Average episode length: 24.97553
Γ: 0.9, optimization iterations: 200000
Average episode length: 11.02055
Γ: 1.0, optimization iterations: 10000
Average episode length: 11.01166
Γ: 1.0, optimization iterations: 20000
Average episode length: 10.60352
Γ: 1.0, optimization iterations: 50000
Average episode length: 26.53477
Γ: 1.0, optimization iterations: 100000
Average episode length