<a href="https://colab.research.google.com/github/kessingtonosazee/GCP_Project_1/blob/master/julia2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
This document is an automatically-generated file that contains all typeset code blocks from
Algorithms for Decision Making by Mykel Kochenderfer, Tim Wheeler, and Kyle Wray. This book
is available from the MIT Press. A PDF version is also available online at algorithmsbook.com.

We share this content in the hopes that it helps you and makes the decision making algorithms
more approachable and accessible. Thank you for reading!

If you encounter any issues or have pressing comments, please file an issue at
github.com/algorithmsbooks/decisionmaking
"""

#################### representation 1
"""
    Variable(name, r)

A discrete variable identified by `name` with `r` possible values.
"""
struct Variable
    name::Symbol # identifier, used as the key in assignments
    r::Int       # number of possible values
end

# An assignment maps variable names to integer values.
const Assignment = Dict{Symbol,Int}
# A factor table maps assignments to real values.
const FactorTable = Dict{Assignment,Float64}

"""
    Factor(vars, table)

A factor over the variables `vars`, stored as a table from assignments to values.
"""
struct Factor
    vars::Vector{Variable}
    table::FactorTable
end

# Names of the variables in the scope of factor `ϕ`, in the order of `ϕ.vars`.
variablenames(ϕ::Factor) = map(v -> v.name, ϕ.vars)

# Restrict assignment `a` to the variables listed in `varnames`.
# Indexing `a` with a name it does not contain raises a `KeyError`.
function select(a::Assignment, varnames::Vector{Symbol})
    sub = Assignment()
    for name in varnames
        sub[name] = a[name]
    end
    return sub
end

# Enumerate every joint assignment of `vars`; variable `v` ranges over `1:v.r`.
# The result is a flat vector with one `Assignment` per combination.
function assignments(vars::AbstractVector{Variable})
    names = map(v -> v.name, vars)
    domains = (1:v.r for v in vars)
    return vec([Assignment(n => val for (n, val) in zip(names, combo))
                for combo in product(domains...)])
end

# Rescale the entries of `ϕ.table` in place so that they sum to one; returns `ϕ`.
function normalize!(ϕ::Factor)
    z = sum(p for p in values(ϕ.table))
    map!(p -> p/z, values(ϕ.table))
    return ϕ
end
####################

#################### representation 2
# NOTE(review): the methods below extend Base functions on Base types
# (type piracy) — confirm this is intended before reusing them in a package.

# Build a `Dict{K,V}` whose keys are the field names of the named tuple `a`.
Dict{K,V}(a::NamedTuple) where {K,V} = Dict{K,V}(pairs(a))

# Conversion of a named tuple to a dictionary goes through the constructor above.
Base.convert(::Type{Dict{K,V}}, a::NamedTuple) where {K,V} = Dict{K,V}(a)

# A dictionary equals a named tuple when the sizes match and every field of the
# named tuple has an equal entry in the dictionary.
function Base.isequal(a::Dict{<:Any,<:Any}, nt::NamedTuple)
    length(a) == length(nt) || return false
    return all(a[name] == value for (name, value) in pairs(nt))
end
####################

#################### representation 3
"""
    BayesianNetwork(vars, factors, graph)

A Bayesian network made of variables, factors and a directed graph.
"""
struct BayesianNetwork
    vars::Vector{Variable}
    factors::Vector{Factor}
    graph::SimpleDiGraph{Int64}
end
####################

#################### representation 4
# Product over all factors of `bn` of the table entry selected by `assignment`
# restricted to that factor's variables; a missing entry counts as 0.0.
function probability(bn::BayesianNetwork, assignment)
    entry(ϕ) = get(ϕ.table, select(assignment, variablenames(ϕ)), 0.0)
    return prod(entry(ϕ) for ϕ in bn.factors)
end
####################

#################### inference 1
# Factor product. For every row of `ϕ` and every assignment of the variables that
# appear only in `ψ`, the merged assignment is stored with the `ϕ` value times the
# matching `ψ` value (0.0 when `ψ` has no such row).
function Base.:*(ϕ::Factor, ψ::Factor)
    ψnames = variablenames(ψ)
    ψonly = setdiff(ψ.vars, ϕ.vars)
    table = FactorTable()
    for (ϕa, ϕp) in ϕ.table, extra in assignments(ψonly)
        joint = merge(ϕa, extra)
        ψa = select(joint, ψnames)
        table[joint] = ϕp * get(ψ.table, ψa, 0.0)
    end
    return Factor(vcat(ϕ.vars, ψonly), table)
end
####################

#################### inference 2
# Sum variable `name` out of factor `ϕ`: rows that agree on all remaining
# variables are merged by adding their values. Returns a new factor.
function marginalize(ϕ::Factor, name)
    table = FactorTable()
    for (a, p) in ϕ.table
        a′ = delete!(copy(a), name) # the row's assignment without `name`
        table[a′] = get(table, a′, 0.0) + p
    end
    vars = filter(v -> v.name != name, ϕ.vars)
    return Factor(vars, table)
end
####################

#################### inference 3
# True when a variable called `name` is among the variables of factor `ϕ`.
function in_scope(name, ϕ)
    for v in ϕ.vars
        name == v.name && return true
    end
    return false
end

# Condition factor `ϕ` on `name == value`: keep only the matching rows and drop
# `name` from both the rows and the variable list. A factor that does not
# mention `name` is returned unchanged (the same object).
function condition(ϕ::Factor, name, value)
    in_scope(name, ϕ) || return ϕ
    table = FactorTable()
    for (a, p) in ϕ.table
        a[name] == value || continue
        table[delete!(copy(a), name)] = p
    end
    remaining = filter(v -> v.name != name, ϕ.vars)
    return Factor(remaining, table)
end

# Condition `ϕ` on every `name => value` pair of `evidence`, one after another.
function condition(ϕ::Factor, evidence)
    return foldl(pairs(evidence); init=ϕ) do factor, (name, value)
        condition(factor, name, value)
    end
end
####################

#################### inference 4
# Marker type selecting the exact inference method below.
struct ExactInference end

# Distribution over the `query` variables given `evidence`: multiply all factors
# of `bn`, condition on the evidence, sum out every variable not in `query`,
# and normalize the result.
function infer(M::ExactInference, bn, query, evidence)
    joint = condition(prod(bn.factors), evidence)
    hidden = setdiff(variablenames(joint), query)
    posterior = foldl(marginalize, hidden; init=joint)
    return normalize!(posterior)
end
####################

#################### inference 5
# Inference method that eliminates variables one at a time.
struct VariableElimination
    ordering # array of variable indices
end

# Distribution over `query` given `evidence`. All factors are first conditioned on
# the evidence. Then, for each index in `M.ordering` whose variable is not queried,
# the factors mentioning that variable are multiplied together, the variable is
# summed out, and the result replaces those factors.
function infer(M::VariableElimination, bn, query, evidence)
    Φ = [condition(ϕ, evidence) for ϕ in bn.factors]
    for i in M.ordering
        name = bn.vars[i].name
        if name ∉ query
            inds = findall(ϕ -> in_scope(name, ϕ), Φ)
            if !isempty(inds)
                ϕ = prod(Φ[inds])
                deleteat!(Φ, inds)
                ϕ = marginalize(ϕ, name)
                push!(Φ, ϕ)
            end
        end
    end
    return normalize!(prod(Φ))
end
####################

#################### inference 6
# Return the vertices of the directed graph `G` in a topological order.
# The work is done on a deep copy, so `G` itself is not modified.
# NOTE(review): vertices that never lose all their incoming edges (e.g. on a
# cycle) are never pushed, so they are missing from the result.
function topological_sort(G)
    G = deepcopy(G)
    ordering = Int[] # typed container instead of `[]` (Vector{Any})
    parentless = filter(i -> isempty(inneighbors(G, i)), 1:nv(G))
    while !isempty(parentless)
        i = pop!(parentless)
        push!(ordering, i)
        # iterate over a copy because edges of `i` are removed inside the loop
        for j in copy(outneighbors(G, i))
            rem_edge!(G, i, j)
            if isempty(inneighbors(G, j))
                push!(parentless, j)
            end
        end
    end
    return ordering
end
####################

#################### inference 7
# Sample an assignment from `ϕ.table`, each row weighted by its value divided by
# the table total. Returns an empty assignment if the loop ends without a pick.
function Base.rand(ϕ::Factor)
    threshold = rand()
    total = sum(values(ϕ.table))
    cumulative = 0.0
    for (a, v) in ϕ.table
        cumulative += v/total
        cumulative >= threshold && return a
    end
    return Assignment()
end

# Draw one joint sample from `bn`: visit the variables in topological order and
# sample each from its factor conditioned on the values sampled so far.
function Base.rand(bn::BayesianNetwork)
    sample = Assignment()
    for i in topological_sort(bn.graph)
        var, ϕ = bn.vars[i], bn.factors[i]
        draw = rand(condition(ϕ, sample))
        sample[var.name] = draw[var.name]
    end
    return sample
end
####################

#################### inference 8
# Inference by sampling directly from the network.
struct DirectSampling
    m # number of samples
end

# Estimate the distribution over `query` given `evidence`: draw `M.m` samples from
# `bn`, count the query values of those that agree with the evidence, normalize.
function infer(M::DirectSampling, bn, query, evidence)
    table = FactorTable()
    for i in 1:(M.m)
        a = rand(bn)
        if all(a[k] == v for (k,v) in pairs(evidence))
            b = select(a, query)
            table[b] = get(table, b, 0) + 1
        end
    end
    vars = filter(v->v.name ∈ query, bn.vars)
    return normalize!(Factor(vars, table))
end
####################

#################### inference 9
# Inference by likelihood-weighted sampling.
struct LikelihoodWeightedSampling
    m # number of samples
end

# Estimate the distribution over `query` given `evidence`. Each of the `M.m`
# samples visits the variables in topological order: evidence variables are fixed
# to their observed value and multiply the sample weight by the corresponding
# factor entry; all other variables are sampled. Weights are accumulated per
# query assignment and normalized.
function infer(M::LikelihoodWeightedSampling, bn, query, evidence)
    table = FactorTable()
    ordering = topological_sort(bn.graph)
    for i in 1:(M.m)
        a, w = Assignment(), 1.0
        for j in ordering
            name, ϕ = bn.vars[j].name, bn.factors[j]
            if haskey(evidence, name)
                a[name] = evidence[name]
                w *= ϕ.table[select(a, variablenames(ϕ))]
            else
                a[name] = rand(condition(ϕ, a))[name]
            end
        end
        b = select(a, query)
        table[b] = get(table, b, 0) + w
    end
    vars = filter(v->v.name ∈ query, bn.vars)
    return normalize!(Factor(vars, table))
end
####################

#################### inference 10
# Normalized factor over variable `i` of `bn` given the values that assignment `a`
# gives to all other variables: the factors mentioning the variable are each
# conditioned on `a` (with the variable itself removed) and multiplied together.
function blanket(bn, a, i)
    name = bn.vars[i].name
    a = delete!(copy(a), name) # work on a copy; the caller's `a` is untouched
    Φ = filter(ϕ -> in_scope(name, ϕ), bn.factors)
    return normalize!(prod(condition(ϕ, a) for ϕ in Φ))
end
####################

#################### inference 11
# One Gibbs sweep: for every index in `ordering` whose variable is not in
# `evidence`, resample that variable in `a` from its `blanket` distribution.
function update_gibbs_sample!(a, bn, evidence, ordering)
    for i in ordering
        name = bn.vars[i].name
        haskey(evidence, name) && continue
        a[name] = rand(blanket(bn, a, i))[name]
    end
end

# Apply `m` Gibbs sweeps to the assignment `a` in place.
function gibbs_sample!(a, bn, evidence, ordering, m)
    foreach(_ -> update_gibbs_sample!(a, bn, evidence, ordering), 1:m)
end

# Inference by Gibbs sampling.
struct GibbsSampling
    m_samples # number of samples to use
    m_burnin  # number of samples to discard during burn-in
    m_skip    # number of samples to skip for thinning
    ordering  # array of variable indices
end

# Estimate the distribution over `query` given `evidence`. The chain starts from
# a sample of `bn` overwritten with the evidence, runs `M.m_burnin` sweeps, and
# then records one query assignment after every `M.m_skip` sweeps.
function infer(M::GibbsSampling, bn, query, evidence)
    table = FactorTable()
    a = merge(rand(bn), evidence)
    gibbs_sample!(a, bn, evidence, M.ordering, M.m_burnin)
    for i in 1:(M.m_samples)
        gibbs_sample!(a, bn, evidence, M.ordering, M.m_skip)
        b = select(a, query)
        table[b] = get(table, b, 0) + 1
    end
    vars = filter(v->v.name ∈ query, bn.vars)
    return normalize!(Factor(vars, table))
end
####################

#################### inference 12
# Condition the multivariate normal `D` on `evidencevars` taking the values
# `evidence`; returns the resulting `MvNormal` over the `query` indices.
function infer(D::MvNormal, query, evidencevars, evidence)
    μ, Σ = D.μ, D.Σ.mat
    b, μa, μb = evidence, μ[query], μ[evidencevars]
    A = Σ[query,query]
    B = Σ[evidencevars,evidencevars]
    C = Σ[query,evidencevars]
    μ = μa + C * (B\(b - μb))
    Σ = A - C * (B \ C')
    return MvNormal(μ, Σ)
end
####################

#################### parameter-learning 1
# Linear index of the 1-based subscripts `x` in an array of size `siz`
# (first dimension varies fastest).
function sub2ind(siz, x)
    strides = cumprod(vcat(1, siz[1:end-1]))
    return 1 + dot(strides, x .- 1)
end

# Count matrices for the data `D` (one observation per column, one variable per
# row). `M[i][j,k]` is the number of observations in which variable `i` takes
# value `k` while its parents in `G` take their `j`-th joint value (`j = 1`
# when the variable has no parents).
function statistics(vars, G, D::Matrix{Int})
    n = size(D, 1)
    r = [vars[i].r for i in 1:n]
    parents = [inneighbors(G, i) for i in 1:n]
    M = [zeros(prod(r[parents[i]]), r[i]) for i in 1:n]
    for o in eachcol(D), i in 1:n
        pa = parents[i]
        j = isempty(pa) ? 1 : sub2ind(r[pa], o[pa])
        M[i][j, o[i]] += 1.0
    end
    return M
end
####################

#################### parameter-learning 2
# All-ones matrices with the same shapes as the count matrices: one row per joint
# parent value and one column per value of each variable.
function prior(vars, G)
    n = length(vars)
    r = [vars[i].r for i in 1:n]
    return [ones(prod(r[inneighbors(G, i)]), r[i]) for i in 1:n]
end
####################

#################### parameter-learning 3
# Return a function evaluating the density of `Normal(0, b)` at its argument.
function gaussian_kernel(b)
    return function (x)
        return pdf(Normal(0, b), x)
    end
end

# Return a function `x -> average of ϕ(x - o) over the observations o in O`.
function kernel_density_estimate(ϕ, O)
    return function (x)
        contributions = [ϕ(x - o) for o in O]
        return sum(contributions)/length(O)
    end
end
####################

#################### structure-learning 1
# Score contribution of one variable, computed from its count matrix `M` and the
# matching pseudocount matrix `α` as a sum and difference of `loggamma` terms
# over the entries and over the row sums.
function bayesian_score_component(M, α)
    p =  sum(loggamma.(α + M))
    p -= sum(loggamma.(α))
    p += sum(loggamma.(sum(α,dims=2)))
    p -= sum(loggamma.(sum(α,dims=2) + sum(M,dims=2)))
    return p
end

# Score of graph `G` for data `D`: the sum over all variables of
# `bayesian_score_component` applied to the counts and the prior pseudocounts.
function bayesian_score(vars, G, D)
    n = length(vars)
    M = statistics(vars, G, D)
    α = prior(vars, G)
    return sum(bayesian_score_component(M[i], α[i]) for i in 1:n)
end
####################

#################### structure-learning 2
# Greedy structure search that follows a fixed variable ordering.
struct K2Search
    ordering::Vector{Int} # variable ordering
end

# Build a graph over `vars` for data `D`. Each variable after the first in the
# ordering repeatedly receives, among the variables preceding it, the parent
# whose edge raises `bayesian_score` the most, until no edge improves the score.
function fit(method::K2Search, vars, D)
    G = SimpleDiGraph(length(vars))
    for (k,i) in enumerate(method.ordering[2:end])
        y = bayesian_score(vars, G, D)
        while true
            y_best, j_best = -Inf, 0
            for j in method.ordering[1:k]
                if !has_edge(G, j, i)
                    # try the edge, score it, and take it back out
                    add_edge!(G, j, i)
                    y′ = bayesian_score(vars, G, D)
                    if y′ > y_best
                        y_best, j_best = y′, j
                    end
                    rem_edge!(G, j, i)
                end
            end
            if y_best > y
                y = y_best
                add_edge!(G, j_best, i)
            else
                break
            end
        end
    end
    return G
end
####################

#################### structure-learning 3
# Local search over directed graphs starting from a given graph.
struct LocalDirectedGraphSearch
    G     # initial graph
    k_max # number of iterations
end

# Copy of `G` in which the edge between a random vertex `i` and a different
# random vertex `j` is toggled: removed if present, added otherwise.
function rand_graph_neighbor(G)
    n = nv(G)
    i = rand(1:n)
    j = mod1(i + rand(2:n)-1, n) # offset of 1..n-1, so j != i
    G′ = copy(G)
    has_edge(G, i, j) ? rem_edge!(G′, i, j) : add_edge!(G′, i, j)
    return G′
end

# Run `method.k_max` iterations; each proposes a random neighbor of the current
# graph and accepts it when its score is strictly higher. Cyclic proposals are
# scored `-Inf` and therefore never accepted.
function fit(method::LocalDirectedGraphSearch, vars, D)
    G = method.G
    y = bayesian_score(vars, G, D)
    for k in 1:method.k_max
        G′ = rand_graph_neighbor(G)
        y′ = is_cyclic(G′) ? -Inf : bayesian_score(vars, G′, D)
        if y′ > y
            y, G = y′, G′
        end
    end
    return G
end
####################

#################### structure-learning 4
# Check whether the directed graphs `G` and `H` have the same number of vertices
# and edges, the same edges up to direction, and the same pairs of non-adjacent
# parents sharing a child.
function are_markov_equivalent(G, H)
    nv(G) == nv(H) || return false
    ne(G) == ne(H) || return false
    # every edge of G must appear in H in one direction or the other
    for e in edges(G)
        (has_edge(H, e) || has_edge(H, reverse(e))) || return false
    end
    # two non-adjacent parents of `c` in one graph must both be parents of `c`
    # in the other graph
    for (I, J) in [(G, H), (H, G)], c in 1:nv(I)
        for (a, b) in subsets(inneighbors(I, c), 2)
            nonadjacent = !has_edge(I, a, b) && !has_edge(I, b, a)
            if nonadjacent && !(has_edge(J, a, c) && has_edge(J, b, c))
                return false
            end
        end
    end
    return true
end
####################

#################### structure-learning 5
# True when every pair of `nodes` is joined in `P` by an edge in some direction.
function is_clique(P, nodes)
    for (a, b) in subsets(nodes, 2)
        (has_edge(P, a, b) || has_edge(P, b, a)) || return false
    end
    return true
end

# Process one not-yet-removed node of the partially directed graph `P`.
# An edge present in both directions is treated as undirected. The first node
# that has no outgoing directed edge, and whose undirected neighbors (if any)
# form a clique together with its directed parents, gets its undirected edges
# added to `G` pointing into it; all its edges are then removed from `P` and it
# is marked in `removed_nodes`. Returns `true` if such a node was found.
function pdag_to_dag_node!(P, G, removed_nodes)
    for i in 1:nv(P)
        if !removed_nodes[i]
            incoming = Set(inneighbors(P, i))
            outgoing = Set(outneighbors(P, i))
            directed_in = setdiff(incoming, outgoing)
            undirected = incoming ∩ outgoing
            directed_out = setdiff(outgoing, incoming)
            if isempty(directed_out) && (isempty(undirected)
                  || is_clique(P, undirected ∪ directed_in))
                for j in undirected
                    add_edge!(G, j, i)
                end
                for j in incoming
                    rem_edge!(P, j, i)
                    rem_edge!(P, i, j)
                end
                removed_nodes[i] = true
                return true
            end
        end
    end
    return false
end

# Build a directed graph from the partially directed graph `P`.
# Edges of `P` without a reverse counterpart are copied first; the remaining
# nodes are then processed by `pdag_to_dag_node!`, which modifies `P`.
# Raises an error when no further node can be processed.
function pdag_to_dag(P)
    G = SimpleDiGraph(nv(P))
    for e in edges(P)
        has_edge(P, reverse(e)) || add_edge!(G, e)
    end
    removed_nodes = falses(nv(P))
    while !all(removed_nodes)
        pdag_to_dag_node!(P, G, removed_nodes) ||
            error("Cannot realize DAG for given PDAG")
    end
    return G
end
####################

#################### simple-decisions 1
# A one-shot decision problem built on a Bayesian network.
struct SimpleProblem
    bn::BayesianNetwork
    chance_vars::Vector{Variable}
    decision_vars::Vector{Variable}
    utility_vars::Vector{Variable}
    utilities::Dict{Symbol, Vector{Float64}}
end

# Try every assignment of the decision variables: add it to the evidence, infer
# the distribution over the utility variables with method `M`, and compute the
# expected utility. Returns `(a, u)` for the best assignment; `a` is `nothing`
# when no assignment scores above `-Inf`.
function solve(𝒫::SimpleProblem, evidence, M)
    query = [var.name for var in 𝒫.utility_vars]
    U(a) = sum(𝒫.utilities[uname][a[uname]] for uname in query)
    best = (a=nothing, u=-Inf)
    for assignment in assignments(𝒫.decision_vars)
        evidence = merge(evidence, assignment)
        ϕ = infer(M, 𝒫.bn, query, evidence)
        u = sum(p*U(a) for (a, p) in ϕ.table)
        if u > best.u
            best = (a=assignment, u=u)
        end
    end
    return best
end
####################

#################### simple-decisions 2
# Expected gain in optimal utility from observing the `query` variables:
# the probability-weighted utility of solving after each possible observation,
# minus the utility of solving with the current `evidence` only.
function value_of_information(𝒫, query, evidence, M)
    ϕ = infer(M, 𝒫.bn, query, evidence)
    voi = -solve(𝒫, evidence, M).u
    query_vars = filter(v->v.name ∈ query, 𝒫.chance_vars)
    for o′ in assignments(query_vars)
        oo′ = merge(evidence, o′)
        p = ϕ.table[o′]
        voi += p*solve(𝒫, oo′, M).u
    end
    return voi
end
####################

#################### exact-solutions 1
# Markov decision process. Fields are untyped; elsewhere in this file `T` is
# called as `T(s, a, s′)` and as `T(s, a)`, `R` as `R(s, a)`, `TR` as `TR(s, a)`.
struct MDP
    γ  # discount factor
    𝒮  # state space
    𝒜  # action space
    T  # transition function
    R  # reward function
    TR # sample transition and reward
end
####################

#################### exact-solutions 2
# One-step lookahead value of taking action `a` in state `s`, with the value
# function `U` given as a callable on states.
function lookahead(𝒫::MDP, U, s, a)
    𝒮, T, R, γ = 𝒫.𝒮, 𝒫.T, 𝒫.R, 𝒫.γ
    return R(s,a) + γ*sum(T(s,a,s′)*U(s′) for s′ in 𝒮)
end
# Same, with `U` given as a vector aligned with the iteration order of `𝒫.𝒮`.
function lookahead(𝒫::MDP, U::Vector, s, a)
    𝒮, T, R, γ = 𝒫.𝒮, 𝒫.T, 𝒫.R, 𝒫.γ
    return R(s,a) + γ*sum(T(s,a,s′)*U[i] for (i,s′) in enumerate(𝒮))
end
####################

#################### exact-solutions 3
# Approximate the value of policy `π` by `k_max` sweeps of one-step lookahead,
# starting from zero. Returns one value per state of `𝒫.𝒮`.
function iterative_policy_evaluation(𝒫::MDP, π, k_max)
    𝒮, T, R, γ = 𝒫.𝒮, 𝒫.T, 𝒫.R, 𝒫.γ
    U = [0.0 for s in 𝒮]
    for k in 1:k_max
        U = [lookahead(𝒫, U, s, π(s)) for s in 𝒮]
    end
    return U
end
####################

#################### exact-solutions 4
# Value of policy `π` obtained by solving the linear system (I - γT′)U = R′,
# where R′ and T′ are the rewards and transitions under `π`.
function policy_evaluation(𝒫::MDP, π)
    𝒮, R, T, γ = 𝒫.𝒮, 𝒫.R, 𝒫.T, 𝒫.γ
    R′ = [R(s, π(s)) for s in 𝒮]
    T′ = [T(s, π(s), s′) for s in 𝒮, s′ in 𝒮]
    return (I - γ*T′)\R′
end
####################

#################### exact-solutions 5
# Policy that acts greedily with respect to a value function.
struct ValueFunctionPolicy
    𝒫 # problem
    U # utility function
end

# Best action in state `s` under value function `U` and its lookahead value,
# returned as the named tuple `(a, u)`.
function greedy(𝒫::MDP, U, s)
    # findmax(f, domain) yields the INDEX of the maximizer within the domain,
    # so map it back to the action itself (identical when 𝒜 is 1:n).
    u, i = findmax(a->lookahead(𝒫, U, s, a), 𝒫.𝒜)
    return (a=𝒫.𝒜[i], u=u)
end

# Calling the policy on a state returns its greedy action.
(π::ValueFunctionPolicy)(s) = greedy(π.𝒫, π.U, s).a
####################

#################### exact-solutions 6
# Policy iteration solver.
struct PolicyIteration
    π # initial policy
    k_max # maximum number of iterations
end

# Alternate exact policy evaluation and greedy improvement for at most
# `M.k_max` iterations, stopping early once the greedy policy picks the same
# action as the current policy in every state.
function solve(M::PolicyIteration, 𝒫::MDP)
    π, 𝒮 = M.π, 𝒫.𝒮
    for k = 1:M.k_max
        U = policy_evaluation(𝒫, π)
        π′ = ValueFunctionPolicy(𝒫, U)
        if all(π(s) == π′(s) for s in 𝒮)
            break
        end
        π = π′
    end
    return π
end
####################

#################### exact-solutions 7
# Largest one-step lookahead value over all actions in state `s`.
function backup(𝒫::MDP, U, s)
    return maximum(lookahead(𝒫, U, s, a) for a in 𝒫.𝒜)
end
####################

#################### exact-solutions 8
# Value iteration solver.
struct ValueIteration
    k_max # maximum number of iterations
end

# Start from a zero value function, apply `M.k_max` synchronous backups over all
# states, and return the greedy policy for the result.
function solve(M::ValueIteration, 𝒫::MDP)
    U = [0.0 for s in 𝒫.𝒮]
    for k = 1:M.k_max
        U = [backup(𝒫, U, s) for s in 𝒫.𝒮]
    end
    return ValueFunctionPolicy(𝒫, U)
end
####################

#################### exact-solutions 9
# Value iteration that updates the value vector in place.
struct GaussSeidelValueIteration
    k_max # maximum number of iterations
end

# Like value iteration, but each backup overwrites `U[i]` immediately, so later
# states in the same sweep already see the updated values.
function solve(M::GaussSeidelValueIteration, 𝒫::MDP)
    U = [0.0 for s in 𝒫.𝒮]
    for k = 1:M.k_max
        for (i, s) in enumerate(𝒫.𝒮)
            U[i] = backup(𝒫, U, s)
        end
    end
    return ValueFunctionPolicy(𝒫, U)
end
####################

#################### exact-solutions 10
# Marker type selecting the linear-program solver below.
struct LinearProgramFormulation end

# Tabulate the problem: index ranges for states and actions, a reward matrix
# `R′[s,a]`, and a transition array `T′[s,a,s′]`.
function tensorform(𝒫::MDP)
    𝒮, 𝒜, R, T = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T
    𝒮′ = eachindex(𝒮)
    𝒜′ = eachindex(𝒜)
    R′ = [R(s,a) for s in 𝒮, a in 𝒜]
    T′ = [T(s,a,s′) for s in 𝒮, a in 𝒜, s′ in 𝒮]
    return 𝒮′, 𝒜′, R′, T′
end

# Default solver for an MDP: the linear-program formulation.
solve(𝒫::MDP) = solve(LinearProgramFormulation(), 𝒫)

# Minimize the sum of state values subject to `U[s] ≥ R[s,a] + γ T[s,a,:]⋅U` for
# every state/action pair, using a model built on `GLPK.Optimizer`; returns the
# greedy policy for the optimized values.
function solve(M::LinearProgramFormulation, 𝒫::MDP)
    𝒮, 𝒜, R, T = tensorform(𝒫)
    model = Model(GLPK.Optimizer)
    @variable(model, U[𝒮])
    @objective(model, Min, sum(U))
    @constraint(model, [s=𝒮,a=𝒜], U[s] ≥ R[s,a] + 𝒫.γ*T[s,a,:]⋅U)
    optimize!(model)
    return ValueFunctionPolicy(𝒫, value.(U))
end
####################

#################### exact-solutions 11
# Finite-horizon problem with linear dynamics and quadratic reward.
struct LinearQuadraticProblem
    Ts # transition matrix with respect to state
    Ta # transition matrix with respect to action
    Rs # reward matrix with respect to state (negative semidefinite)
    Ra # reward matrix with respect to action (negative definite)
    h_max # horizon
end

# Return a vector of `h_max` policies (functions of the state). The first one
# returns a zero action; each later one is the linear map `s -> L*s`, with `L`
# computed from the matrix `V` that is updated once per step.
function solve(𝒫::LinearQuadraticProblem)
    Ts, Ta, Rs, Ra, h_max = 𝒫.Ts, 𝒫.Ta, 𝒫.Rs, 𝒫.Ra, 𝒫.h_max
    V = zeros(size(Rs))
    πs = Any[s -> zeros(size(Ta, 2))]
    for h in 2:h_max
        V = Ts'*(V - V*Ta*((Ta'*V*Ta + Ra) \ Ta'*V))*Ts + Rs
        L = -(Ta'*V*Ta + Ra) \ Ta' * V * Ts
        push!(πs, s -> L*s)
    end
    return πs
end
####################

#################### value-function-approximations 1
# Value iteration on a parameterized value function.
struct ApproximateValueIteration
    Uθ    # initial parameterized value function that supports fit!
    S     # set of discrete states for performing backups
    k_max # maximum number of iterations
end

# Repeat `k_max` times: back up the current `Uθ` at every state in `S` and refit
# `Uθ` to those values. `M.Uθ` is updated in place by `fit!`. Returns the greedy
# policy for the final `Uθ`.
function solve(M::ApproximateValueIteration, 𝒫::MDP)
    Uθ, S, k_max = M.Uθ, M.S, M.k_max
    for k in 1:k_max
        U = [backup(𝒫, Uθ, s) for s in S]
        fit!(Uθ, S, U)
    end
    return ValueFunctionPolicy(𝒫, Uθ)
end
####################

#################### value-function-approximations 2
# Value function that averages the values of the nearest stored states.
mutable struct NearestNeighborValueFunction
    k # number of neighbors
    d # distance function d(s, s′)
    S # set of discrete states
    θ # vector of values at states in S
end

# Mean of `θ` over the `k` states of `S` closest to `s` under `d`.
function (Uθ::NearestNeighborValueFunction)(s)
    dists = [Uθ.d(s,s′) for s′ in Uθ.S]
    ind = sortperm(dists)[1:Uθ.k]
    return mean(Uθ.θ[i] for i in ind)
end

# Store `U` as the new values; the argument `S` is not used.
function fit!(Uθ::NearestNeighborValueFunction, S, U)
    Uθ.θ = U
    return Uθ
end
####################

#################### value-function-approximations 3
# Value function that blends stored values with kernel weights.
mutable struct LocallyWeightedValueFunction
    k # kernel function k(s, s′)
    S # set of discrete states
    θ # vector of values at states in S
end

# Weighted combination of `θ`, with weights `k(s, s′)` over `S` scaled to unit
# 1-norm.
function (Uθ::LocallyWeightedValueFunction)(s)
    w = normalize([Uθ.k(s,s′) for s′ in Uθ.S], 1)
    return Uθ.θ ⋅ w
end

# Store `U` as the new values; the argument `S` is not used.
function fit!(Uθ::LocallyWeightedValueFunction, S, U)
    Uθ.θ = U
    return Uθ
end
####################

#################### value-function-approximations 4
# Value function defined by multilinear interpolation on a regular grid.
mutable struct MultilinearValueFunction
    o # position of lower-left corner
    δ # vector of widths
    θ # vector of values at states in S
end

# Interpolate the grid values `θ` at `s`: the value is a weighted sum over the
# 2^d vertices of the grid cell containing `s`.
function (Uθ::MultilinearValueFunction)(s)
    o, δ, θ = Uθ.o, Uθ.δ, Uθ.θ
    Δ = (s - o)./δ
    # Multidimensional index of lower-left cell
    i = min.(floor.(Int, Δ) .+ 1, size(θ) .- 1)
    vertex_index = similar(i)
    d = length(s)
    u = 0.0
    for vertex in 0:2^d-1
        weight = 1.0
        for j in 1:d
            # Check whether jth bit is set
            if vertex & (1 << (j-1)) > 0
                # upper vertex along dimension j
                vertex_index[j] = i[j] + 1
                weight *= Δ[j] - i[j] + 1
            else
                # lower vertex along dimension j
                vertex_index[j] = i[j]
                weight *= i[j] - Δ[j]
            end
        end
        u += θ[vertex_index...]*weight
    end
    return u
end

# Store `U` as the new grid values; the argument `S` is not used.
function fit!(Uθ::MultilinearValueFunction, S, U)
    Uθ.θ = U
    return Uθ
end
####################

#################### value-function-approximations 5
# Value function defined by simplex interpolation on a regular grid.
mutable struct SimplexValueFunction
    o # position of lower-left corner
    δ # vector of widths
    θ # vector of values at states in S
end

# Interpolate the grid values `θ` at `s`. Starting from the upper-right vertex of
# the cell containing `s`, the index is stepped down one dimension at a time in
# order of increasing local coordinate, adding each visited vertex with the
# weight left over from the previous steps.
function (Uθ::SimplexValueFunction)(s)
    Δ = (s - Uθ.o)./Uθ.δ
    # Multidimensional index of upper-right cell
    i = min.(floor.(Int, Δ) .+ 1, size(Uθ.θ) .- 1) .+ 1
    u = 0.0
    # coordinates of `s` relative to the cell's lower-left vertex, in cell units
    s′ = (s - (Uθ.o + Uθ.δ.*(i.-2))) ./ Uθ.δ
    p = sortperm(s′) # increasing order
    w_tot = 0.0
    for j in p
        w = s′[j] - w_tot
        u += w*Uθ.θ[i...]
        i[j] -= 1
        w_tot += w
    end
    u += (1 - w_tot)*Uθ.θ[i...]
    return u
end

# Store `U` as the new grid values; the argument `S` is not used.
function fit!(Uθ::SimplexValueFunction, S, U)
    Uθ.θ = U
    return Uθ
end
####################

#################### value-function-approximations 6
	using LinearAlgebra
	# Least-squares weights for targets `y`, where the design matrix has one row
	# per input in `X` and one column per basis function in `bases`.
	function regression(X, y, bases::Vector)
		design = [basis(x) for x in X, basis in bases]
		return pinv(design)*y
	end
	# Same, but `bases` is one function returning the whole feature vector of an
	# input; the number of columns is taken from `bases(X[1])`.
	function regression(X, y, bases::Function)
		nrows, ncols = length(X), length(bases(X[1]))
		design = Array{Float64}(undef, nrows, ncols)
		for (row, x) in enumerate(X)
			design[row, :] = bases(x)
		end
		return pinv(design)*y
	end

	# Functions x -> x[i]^p for p = 0, …, k.
	polynomial_bases_1d(i, k) = map(p -> (x -> x[i]^p), 0:k)
	# Products of one-dimensional monomials over `n` input components, keeping
	# only the terms whose total degree is at most `k`.
	function polynomial_bases(n, k)
		bases = [polynomial_bases_1d(i, k) for i in 1:n]
		terms = Function[]
		for ks in product([0:k for i in 1:n]...)
			if sum(ks) ≤ k
				# b[j+1] is the degree-j monomial of the matching component
				push!(terms,
					x->prod(b[j+1](x) for (j,b) in zip(ks,bases)))
			end
		end
		return terms
	end

	# Basis functions for component `j` with period `b[j] - a[j]`: the constant
	# 1/2 followed by a sine and a cosine for each of the frequencies 1, …, k.
	function sinusoidal_bases_1d(j, k, a, b)
		T = b[j] - a[j]
		bases = Function[x->1/2]
		for i in 1:k
			push!(bases, x->sin(2π*i*x[j]/T))
			push!(bases, x->cos(2π*i*x[j]/T))
		end
		return bases
	end
	# Products of one-dimensional sinusoidal bases over all components of `a`.
	# Index `m` into a 1-d basis counts as frequency `div(m+1, 2)`; only terms
	# whose frequencies sum to at most `k` are kept.
	function sinusoidal_bases(k, a, b)
		n = length(a)
		bases = [sinusoidal_bases_1d(i, k, a, b) for i in 1:n]
		terms = Function[]
		for ks in product([0:2k for i in 1:n]...)
			powers = [div(k+1,2) for k in ks]
			if sum(powers) ≤ k
				push!(terms,
					x->prod(b[j+1](x) for (j,b) in zip(ks,bases)))
			end
		end
		return terms
	end

	# Argument-order adapter: forwards to `regression(ss, u, β)`.
	regress(β, ss, u) = regression(ss, u, β)
####################

#################### value-function-approximations 7
# Value function that is linear in a vector of basis features.
mutable struct LinearRegressionValueFunction
    β # basis vector function
    θ # vector of parameters
end

# Value at `s`: dot product of the features `β(s)` with the parameters `θ`.
function (Uθ::LinearRegressionValueFunction)(s)
    return Uθ.β(s) ⋅ Uθ.θ
end

# Least-squares fit of `θ` to the targets `U` at the states `S`; the design
# matrix has one row `β(s)` per state.
function fit!(Uθ::LinearRegressionValueFunction, S, U)
    X = hcat([Uθ.β(s) for s in S]...)'
    Uθ.θ = pinv(X)*U
    return Uθ
end
####################

#################### online-approximations 1
# Online policy that scores actions by lookahead over rollout returns.
struct RolloutLookahead
    𝒫 # problem
    π # rollout policy
    d # depth
end

# Sample a next state and reward through the problem's `TR` function.
randstep(𝒫::MDP, s, a) = 𝒫.TR(s, a)

# Discounted return of following policy `π` from state `s` for `d` steps.
function rollout(𝒫, s, π, d)
    ret = 0.0
    for t in 1:d
        a = π(s)
        s, r = randstep(𝒫, s, a)
        ret += 𝒫.γ^(t-1) * r
    end
    return ret
end

# Greedy action in `s` when state values are estimated by a single rollout.
function (π::RolloutLookahead)(s)
    U(s) = rollout(π.𝒫, s, π.π, π.d)
    return greedy(π.𝒫, U, s).a
end
####################

#################### online-approximations 2
# Online policy based on exhaustive search to a fixed depth.
struct ForwardSearch
    𝒫 # problem
    d # depth
    U # value function at depth d
end

# Best action and value at `s` when looking `d` steps ahead; at depth 0 the value
# is `U(s)` and the action is `nothing`. Returns the named tuple `(a, u)`.
function forward_search(𝒫, s, d, U)
    if d ≤ 0
        return (a=nothing, u=U(s))
    end
    best = (a=nothing, u=-Inf)
    # value of a successor state = result of the search one level deeper
    U′(s) = forward_search(𝒫, s, d-1, U).u
    for a in 𝒫.𝒜
        u = lookahead(𝒫, U′, s, a)
        if u > best.u
            best = (a=a, u=u)
        end
    end
    return best
end

# Calling the policy on a state returns the action found by the search.
(π::ForwardSearch)(s) = forward_search(π.𝒫, s, π.d, π.U).a
####################

#################### online-approximations 3
# Online policy based on depth-limited search with pruning.
struct BranchAndBound
    𝒫   # problem
    d   # depth
    Ulo # lower bound on value function at depth d
    Qhi # upper bound on action value function
end

# Like forward search, but actions are tried in decreasing order of `Qhi(s, a)`
# and the search at `s` stops as soon as an action's upper bound is below the
# best value found so far. Returns the named tuple `(a, u)`.
function branch_and_bound(𝒫, s, d, Ulo, Qhi)
    if d ≤ 0
        return (a=nothing, u=Ulo(s))
    end
    U′(s) = branch_and_bound(𝒫, s, d-1, Ulo, Qhi).u
    best = (a=nothing, u=-Inf)
    for a in sort(𝒫.𝒜, by=a->Qhi(s,a), rev=true)
        if Qhi(s, a) < best.u
            return best # safe to prune
        end
        u = lookahead(𝒫, U′, s, a)
        if u > best.u
            best = (a=a, u=u)
        end
    end
    return best
end

# Calling the policy on a state returns the action found by the search.
(π::BranchAndBound)(s) = branch_and_bound(π.𝒫, s, π.d, π.Ulo, π.Qhi).a
####################

#################### online-approximations 4
# Online policy based on depth-limited search over sampled transitions.
struct SparseSampling
    𝒫 # problem
    d # depth
    m # number of samples
    U # value function at depth d
end

# Best action and value at `s`, estimating each action's value as the average
# over `m` sampled transitions of reward plus discounted value of the recursive
# search from the sampled successor. Returns the named tuple `(a, u)`.
function sparse_sampling(𝒫, s, d, m, U)
    if d ≤ 0
        return (a=nothing, u=U(s))
    end
    best = (a=nothing, u=-Inf)
    for a in 𝒫.𝒜
        u = 0.0
        for i in 1:m
            s′, r = randstep(𝒫, s, a)
            a′, u′ = sparse_sampling(𝒫, s′, d-1, m, U)
            u += (r + 𝒫.γ*u′) / m
        end
        if u > best.u
            best = (a=a, u=u)
        end
    end
    return best
end

# Calling the policy on a state returns the action found by the search.
(π::SparseSampling)(s) = sparse_sampling(π.𝒫, s, π.d, π.m, π.U).a
####################

#################### online-approximations 5
# Online policy based on Monte Carlo tree search. `N` and `Q` are keyed by
# `(s, a)` tuples and are updated by `simulate!`.
struct MonteCarloTreeSearch
    𝒫 # problem
    N # visit counts
    Q # action value estimates
    d # depth
    m # number of simulations
    c # exploration constant
    U # value function estimate
end

# Run `π.m` simulations from `s`, then return the action with the largest
# action-value estimate at `s`.
function (π::MonteCarloTreeSearch)(s)
    for k in 1:π.m
        simulate!(π, s)
    end
    return argmax(a->π.Q[(s,a)], π.𝒫.𝒜)
end
####################

#################### online-approximations 6
# One simulation from state `s` with `d` steps remaining; returns the sampled
# return. A state seen for the first time gets zeroed counts and estimates for
# all actions and is valued with `π.U`. Otherwise an action is chosen by
# `explore`, a transition is sampled, and `N`/`Q` are updated with the running
# mean of the returns.
function simulate!(π::MonteCarloTreeSearch, s, d=π.d)
    if d ≤ 0
        return π.U(s)
    end
    𝒫, N, Q, c = π.𝒫, π.N, π.Q, π.c
    𝒜, TR, γ = 𝒫.𝒜, 𝒫.TR, 𝒫.γ
    if !haskey(N, (s, first(𝒜)))
        for a in 𝒜
            N[(s,a)] = 0
            Q[(s,a)] = 0.0
        end
        return π.U(s)
    end
    a = explore(π, s)
    s′, r = TR(s,a)
    q = r + γ*simulate!(π, s′, d-1)
    N[(s,a)] += 1
    Q[(s,a)] += (q-Q[(s,a)])/N[(s,a)]
    return q
end
####################

#################### online-approximations 7
# Exploration bonus: infinite for a zero count `Nsa`, else sqrt(log(Ns)/Nsa).
function bonus(Nsa, Ns)
    Nsa == 0 && return Inf
    return sqrt(log(Ns)/Nsa)
end

# Action at `s` maximizing the value estimate plus `c` times the exploration
# bonus computed from the action's count and the total count at `s`.
function explore(π::MonteCarloTreeSearch, s)
    𝒜, N, Q, c = π.𝒫.𝒜, π.N, π.Q, π.c
    Ns = sum(N[(s,a)] for a in 𝒜)
    return argmax(a->Q[(s,a)] + c*bonus(N[(s,a)], Ns), 𝒜)
end
####################

#################### online-approximations 8
# Online policy based on heuristic search.
struct HeuristicSearch
    𝒫   # problem
    Uhi # upper bound on value function
    d   # depth
    m   # number of simulations
end

# One simulation of `π.d` steps from `s`: at each step store the greedy value in
# `U[s]` and move to a successor sampled from `𝒫.T(s, a)`.
function simulate!(π::HeuristicSearch, U, s)
    𝒫 = π.𝒫
    for d in 1:π.d
        a, u = greedy(𝒫, U, s)
        U[s] = u
        s = rand(𝒫.T(s, a))
    end
end

# Initialize `U` with `π.Uhi` at every state, run `π.m` simulations from `s`, and
# return the greedy action at `s`.
function (π::HeuristicSearch)(s)
    U = [π.Uhi(s) for s in π.𝒫.𝒮]
    for i in 1:π.m
        simulate!(π, U, s)
    end
    return greedy(π.𝒫, U, s).a
end
####################

#################### online-approximations 9
# Run the same search as calling `π` on `s`, but return the greedy policy for the
# resulting value function instead of a single action.
function extractpolicy(π::HeuristicSearch, s)
    U = [π.Uhi(s) for s in π.𝒫.𝒮]
    for i in 1:π.m
        simulate!(π, U, s)
    end
    return ValueFunctionPolicy(π.𝒫, U)
end
####################

#################### online-approximations 10
# Online policy based on heuristic search with labeling of solved states.
struct LabeledHeuristicSearch
    𝒫     # problem
    Uhi   # upper bound on value function
    d     # depth
    δ     # gap threshold
end

# Initialize `U` with `π.Uhi` at every state, simulate from `s` until `s` is
# labeled solved, and return the greedy action at `s`.
function (π::LabeledHeuristicSearch)(s)
    # the state space is read from the policy's own problem, `π.𝒫`
    U, solved = [π.Uhi(s) for s in π.𝒫.𝒮], Set()
    while s ∉ solved
        simulate!(π, U, solved, s)
    end
    return greedy(π.𝒫, U, s).a
end
####################

#################### online-approximations 11
# One simulation of at most `π.d` steps from `s`, stopping early at a solved
# state. Each visited state gets its greedy value stored in `U`. Afterwards the
# visited states are passed to `label!` in reverse order until one call
# returns `true`.
function simulate!(π::LabeledHeuristicSearch, U, solved, s)
    visited = []
    for d in 1:π.d
        if s ∈ solved
            break
        end
        push!(visited, s)
        a, u = greedy(π.𝒫, U, s)
        U[s] = u
        s = rand(π.𝒫.T(s, a))
    end
    while !isempty(visited)
        if label!(π, U, solved, pop!(visited))
            break
        end
    end
end
####################

#################### online-approximations 12
# Collect the states reachable from `s` under the greedy policy (the envelope),
# skipping solved and already collected states. A state whose stored value
# differs from its greedy value by more than `π.δ` sets `found` and is not
# expanded further. Returns `(found, envelope)`.
function expand(π::LabeledHeuristicSearch, U, solved, s)
    𝒫, δ = π.𝒫, π.δ
    𝒮, 𝒜, T = 𝒫.𝒮, 𝒫.𝒜, 𝒫.T
    # Set([s]) holds the state itself; Set(s) would iterate over `s`
    found, toexpand, envelope = false, Set([s]), []
    while !isempty(toexpand)
        s = pop!(toexpand)
        push!(envelope, s)
        a, u = greedy(𝒫, U, s)
        if abs(U[s] - u) > δ
            found = true
        else
            for s′ in 𝒮
                if T(s,a,s′) > 0 && s′ ∉ (solved ∪ envelope)
                    push!(toexpand, s′)
                end
            end
        end
    end
    return (found, envelope)
end

# Try to label `s` as solved. If `expand` finds a state whose value gap exceeds
# the threshold, the envelope's values are refreshed in reverse order and `true`
# is returned; otherwise the whole envelope is added to `solved` and `false` is
# returned. An already solved `s` returns `false` immediately.
function label!(π::LabeledHeuristicSearch, U, solved, s)
    if s ∈ solved
        return false
    end
    found, envelope = expand(π, U, solved, s)
    if found
        for s ∈ reverse(envelope)
            U[s] = greedy(π.𝒫, U, s).u
        end
    else
        union!(solved, envelope)
    end
    return found
end
####################

#################### online-approximations 13
# Run the labeled search from `s` until `s` is solved and return the greedy
# policy for the resulting value function.
function extractpolicy(π::LabeledHeuristicSearch, s)
    U, solved = [π.Uhi(s) for s ∈ π.𝒫.𝒮], Set()
    while s ∉ solved
        simulate!(π, U, solved, s)
    end
    return ValueFunctionPolicy(π.𝒫, U)
end
####################

#################### policy-search 1
# Policy evaluation by averaging rollout returns.
struct MonteCarloPolicyEvaluation
    𝒫 # problem
    b # initial state distribution
    d # depth
    m # number of samples
end

# Mean return of `U.m` rollouts of depth `U.d` under policy `π`, each starting
# from a state sampled from `U.b`.
function (U::MonteCarloPolicyEvaluation)(π)
    R(π) = rollout(U.𝒫, rand(U.b), π, U.d)
    return mean(R(π) for i = 1:U.m)
end

# Evaluate the parameterized policy `π(θ, s)` at the parameters `θ`.
(U::MonteCarloPolicyEvaluation)(π, θ) = U(s->π(θ, s))
####################

#################### policy-search 2
# Coordinate pattern search over policy parameters.
struct HookeJeevesPolicySearch
    θ # initial parameterization
    α # step size
    c # step size reduction factor
    ϵ # termination step size
end

# Maximize `U(π, θ)` starting from a copy of `M.θ`. While the step `α` exceeds
# `ϵ`, every coordinate is tried at `±α`; the single best improving move is
# taken, or, if none improves, the step is multiplied by `c`.
function optimize(M::HookeJeevesPolicySearch, π, U)
    θ, θ′, α, c, ϵ = copy(M.θ), similar(M.θ), M.α, M.c, M.ϵ
    u, n = U(π, θ), length(θ)
    while α > ϵ
        copyto!(θ′, θ)
        best = (i=0, sgn=0, u=u)
        for i in 1:n
            for sgn in (-1,1)
                θ′[i] = θ[i] + sgn*α
                u′ = U(π, θ′)
                if u′ > best.u
                    best = (i=i, sgn=sgn, u=u′)
                end
            end
            θ′[i] = θ[i] # restore the coordinate before trying the next one
        end
        if best.i != 0
            θ[best.i] += best.sgn*α
            u = best.u
        else
            α *= c
        end
    end
    return θ
end
####################

#################### policy-search 3
struct GeneticPolicySearch
    Œ∏s      # initial population
    œÉ       # initial standard deviation
    m_elite # number of elite samples
    k_max   # number of iterations
end

# Evolve the population for `M.k_max` generations.
# Each generation scores every member with `U(œÄ, member)`, keeps the best one
# unchanged as the last element, and fills the remaining slots with randomly
# chosen elite members perturbed by Gaussian noise. Returns the last element of
# the final population.
function optimize(M::GeneticPolicySearch, œÄ, U)
    population = M.Œ∏s
    dim, popsize = length(first(population)), length(population)
    for _ in 1:M.k_max
        scores = [U(œÄ, candidate) for candidate in population]
        ranking = sortperm(scores, rev=true)
        champion = population[ranking[1]]
        function mutated_elite()
            parent = population[ranking[rand(1:M.m_elite)]]
            return parent + M.œÉ .* randn(dim)
        end
        offspring = [mutated_elite() for _ in 1:(popsize-1)]
        push!(offspring, champion)
        population = offspring
    end
    return last(population)
end
####################

#################### policy-search 4
# Settings for cross-entropy policy search; read by `optimize_dist`.
struct CrossEntropyPolicySearch
    p       # initial distribution (sampled with `rand(p, m)` and refit each iteration)
    m       # number of samples
    m_elite # number of elite samples
    k_max   # number of iterations
end

# Cross-entropy search: repeatedly sample `M.m` parameter columns from the
# current distribution, score each with `U(œÄ, column)`, and refit a
# distribution of the same type to the `M.m_elite` highest-scoring columns.
# Returns the distribution after `M.k_max` iterations.
function optimize_dist(M::CrossEntropyPolicySearch, œÄ, U)
    dist = M.p
    n_samples, n_elite = M.m, M.m_elite
    for _ in 1:M.k_max
        samples = rand(dist, n_samples)
        scores = [U(œÄ, samples[:, j]) for j in 1:n_samples]
        ascending = sortperm(scores)
        elite = samples[:, ascending[(n_samples-n_elite+1):n_samples]]
        dist = Distributions.fit(typeof(dist), elite)
    end
    return dist
end

# Untyped fallback: run `optimize_dist` and return the mode of the resulting
# distribution.
optimize(M, œÄ, U) = Distributions.mode(optimize_dist(M, œÄ, U))
####################

#################### policy-search 5
# Settings for evolution strategies over a parameterized search distribution;
# read by `optimize_dist`.
struct EvolutionStrategies
    D       # distribution constructor (called on the parameterization)
    œà       # initial distribution parameterization
    ‚àálogp   # log search likelihood gradient, called with (parameterization, sample)
    m       # number of samples
    Œ±       # step factor
    k_max   # number of iterations
end

# Rank weights for `m` samples ordered best to worst.
# The raw weight of rank i is max(0, log(m/2 + 1) - log(i)); the raw weights
# are normalised to sum to one and then shifted by -1/m.
function evolution_strategy_weights(m)
    cap = log(m/2+1)
    raw = [max(0, cap - log(rank)) for rank in 1:m]
    total = sum(raw)
    return raw ./ total .- 1/m
end

# Evolution strategies: each iteration samples `M.m` columns from the current
# search distribution, ranks them by `U(œÄ, column)` (best first), forms a
# rank-weighted sum of log-likelihood gradients, and takes a step of size
# `M.Œ±` along it. Returns the distribution built from the final parameters.
function optimize_dist(M::EvolutionStrategies, œÄ, U)
    make_dist, params, score = M.D, M.œà, M.‚àálogp
    n_samples = M.m
    weights = evolution_strategy_weights(n_samples)
    for _ in 1:M.k_max
        samples = rand(make_dist(params), n_samples)
        utilities = [U(œÄ, samples[:, j]) for j in 1:n_samples]
        ranking = sortperm(utilities, rev=true)
        direction = sum(w .* score(params, samples[:, j])
                        for (w, j) in zip(weights, ranking))
        params = params + M.Œ± .* direction
    end
    return make_dist(params)
end
####################

#################### policy-search 6
# Settings for evolution strategies with an isotropic Gaussian search
# distribution; read by `optimize_dist`.
struct IsotropicEvolutionStrategies
    œà       # initial mean
    œÉ       # standard deviation (held fixed by `optimize_dist`)
    m       # number of samples (`optimize_dist` uses 2*div(m,2) mirrored samples)
    Œ±       # step factor
    k_max   # number of iterations
end

# Isotropic evolution strategies with mirrored sampling.
# Each iteration draws div(m,2) standard-normal perturbations, appends their
# negations, ranks the perturbed means by utility (best first), and moves the
# mean along the rank-weighted perturbation sum divided by the standard
# deviation. Returns `MvNormal` built from the final mean and the deviation.
function optimize_dist(M::IsotropicEvolutionStrategies, œÄ, U)
    center, spread = M.œà, M.œÉ
    dim = length(center)
    half = div(M.m, 2)
    weights = evolution_strategy_weights(2*half)
    for _ in 1:M.k_max
        noise = [randn(dim) for _ in 1:half]
        append!(noise, -noise) # weight mirroring
        utilities = [U(œÄ, center + spread .* e) for e in noise]
        ranking = sortperm(utilities, rev=true)
        direction = sum(w .* noise[j] for (w, j) in zip(weights, ranking)) / spread
        center = center + M.Œ± .* direction
    end
    return MvNormal(center, spread)
end
####################

#################### policy-gradient-estimation 1
# Roll the policy forward for `d` steps from state `s`, using the problem's
# `TR` field to obtain the next state and reward. Returns the visited
# (state, action, reward) tuples in order.
function simulate(ùí´::MDP, s, œÄ, d)
    trajectory = []
    for _ in 1:d
        action = œÄ(s)
        next_state, reward = ùí´.TR(s, action)
        push!(trajectory, (s, action, reward))
        s = next_state
    end
    return trajectory
end
####################

#################### policy-gradient-estimation 2
# Settings for the finite-difference policy gradient estimate; read by the
# `gradient` method defined for this type.
struct FiniteDifferenceGradient
    ùí´ # problem
    b # initial state distribution
    d # depth
    m # number of samples
    Œ¥ # step size
end

# Forward finite-difference gradient estimate.
# For each coordinate, the mean discounted return over `M.m` simulated
# trajectories is estimated at the perturbed and at the unperturbed
# parameters (in that order), and the difference is divided by the step size.
function gradient(M::FiniteDifferenceGradient, œÄ, Œ∏)
    problem, discount, step = M.ùí´, M.ùí´.Œ≥, M.Œ¥
    dim = length(Œ∏)
    perturbation(i) = [k == i ? step : 0.0 for k in 1:dim]
    discounted(traj) = sum(r*discount^(t-1) for (t, (state, action, r)) in enumerate(traj))
    function utility(params)
        policy(s) = œÄ(params, s)
        return mean(discounted(simulate(problem, rand(M.b), policy, M.d)) for _ in 1:M.m)
    end
    differences = [utility(Œ∏ + perturbation(i)) - utility(Œ∏) for i in 1:dim]
    return differences ./ step
end
####################

#################### policy-gradient-estimation 3
# Settings for the regression-based policy gradient estimate; read by the
# `gradient` method defined for this type.
struct RegressionGradient
    ùí´ # problem
    b # initial state distribution
    d # depth
    m # number of samples (random perturbation directions)
    Œ¥ # step size (length of each perturbation)
end

# Regression gradient estimate.
# Draws `M.m` random directions scaled to length `M.Œ¥`, measures the change in
# a single-trajectory discounted return for each direction, and solves the
# resulting linear system with a pseudoinverse.
function gradient(M::RegressionGradient, œÄ, Œ∏)
    problem, discount = M.ùí´, M.ùí´.Œ≥
    directions = [M.Œ¥ .* normalize(randn(length(Œ∏)), 2) for _ in 1:M.m]
    function sampled_return(params)
        policy(s) = œÄ(params, s)
        traj = simulate(problem, rand(M.b), policy, M.d)
        return sum(r*discount^(t-1) for (t, (state, action, r)) in enumerate(traj))
    end
    changes = [sampled_return(Œ∏ + step) - sampled_return(Œ∏) for step in directions]
    return pinv(adjoint(reduce(hcat, directions))) * changes
end
####################

#################### policy-gradient-estimation 4
# Settings for the likelihood-ratio policy gradient estimate; read by the
# `gradient` method defined for this type.
struct LikelihoodRatioGradient
    ùí´ # problem
    b # initial state distribution
    d # depth
    m # number of samples
    ‚àálogœÄ # gradient of log likelihood, called with (parameters, action, state)
end

# Likelihood-ratio gradient estimate: the mean, over `M.m` simulated
# trajectories, of the summed log-likelihood gradients multiplied by the
# trajectory's discounted return.
function gradient(M::LikelihoodRatioGradient, œÄ, Œ∏)
    problem, discount, score = M.ùí´, M.ùí´.Œ≥, M.‚àálogœÄ
    policy(s) = œÄ(Œ∏, s)
    discounted(traj) = sum(r*discount^(t-1) for (t, (state, action, r)) in enumerate(traj))
    total_score(traj) = sum(score(Œ∏, a, s) for (s, a) in traj)
    function estimate()
        traj = simulate(problem, rand(M.b), policy, M.d)
        return total_score(traj)*discounted(traj)
    end
    return mean(estimate() for _ in 1:M.m)
end
####################

#################### policy-gradient-estimation 5
# Settings for the reward-to-go policy gradient estimate; read by the
# `gradient` method defined for this type.
struct RewardToGoGradient
    ùí´ # problem
    b # initial state distribution
    d # depth
    m # number of samples
    ‚àálogœÄ # gradient of log likelihood, called with (parameters, action, state)
end

# Reward-to-go gradient estimate: each step's log-likelihood gradient is
# weighted by the discounted reward collected from that step onward, summed
# over the trajectory and averaged over `M.m` simulated trajectories.
function gradient(M::RewardToGoGradient, œÄ, Œ∏)
    problem, discount, score = M.ùí´, M.ùí´.Œ≥, M.‚àálogœÄ
    horizon = M.d
    policy(s) = œÄ(Œ∏, s)
    togo(traj, j) = sum(r*discount^(k-1)
                        for (k, (state, action, r)) in zip(j:horizon, traj[j:end]))
    estimate(traj) = sum(score(Œ∏, a, s)*togo(traj, j)
                         for (j, (s, a, r)) in enumerate(traj))
    return mean(estimate(simulate(problem, rand(M.b), policy, horizon)) for _ in 1:M.m)
end
####################

#################### policy-gradient-estimation 6
# Settings for the baseline-subtracted policy gradient estimate; read by the
# `gradient` method defined for this type.
struct BaselineSubtractionGradient
    ùí´ # problem
    b # initial state distribution
    d # depth
    m # number of samples
    ‚àálogœÄ # gradient of log likelihood, called with (parameters, action, state)
end

# Policy gradient estimate with a per-component baseline.
# The baseline is the mean over trajectories of a ratio of squared,
# discounted score terms weighted by the return from each step; it is
# subtracted from the return before weighting each score term.
function gradient(M::BaselineSubtractionGradient, œÄ, Œ∏)
    problem, discount, score = M.ùí´, M.ùí´.Œ≥, M.‚àálogœÄ
    policy(s) = œÄ(Œ∏, s)
    weighted_score(a, s, k) = score(Œ∏, a, s)*discount^(k-1)
    tail_return(traj, k) = sum(r*discount^(j-1)
                               for (j, (state, action, r)) in enumerate(traj[k:end]))
    numerator(traj) = sum(weighted_score(a, s, k).^2*tail_return(traj, k)
                          for (k, (s, a, r)) in enumerate(traj))
    denominator(traj) = sum(weighted_score(a, s, k).^2
                            for (k, (s, a)) in enumerate(traj))
    trajs = [simulate(problem, rand(M.b), policy, M.d) for _ in 1:M.m]
    baseline = mean(numerator(traj) ./ denominator(traj) for traj in trajs)
    estimate(traj) = sum(weighted_score(a, s, k).*(tail_return(traj, k) .- baseline)
                         for (k, (s, a, r)) in enumerate(traj))
    return mean(estimate(traj) for traj in trajs)
end
####################

#################### policy-gradient-optimization 1
# Plain gradient-ascent update rule.
struct PolicyGradientUpdate
    ‚àáU # policy gradient estimate, called with the parameters
    Œ±  # step factor
end

# Return the parameters moved along the estimated gradient, scaled by the
# step factor.
function update(M::PolicyGradientUpdate, Œ∏)
    ascent_step = M.Œ± * M.‚àáU(Œ∏)
    return Œ∏ + ascent_step
end
####################

#################### policy-gradient-optimization 2
# Shrink gradient `g` so that its norm does not exceed `L2_max`; a gradient
# already within the limit is returned with a scale factor of 1.
# (The parameter uses an ASCII name so the definition parses regardless of
# file encoding; it is positional, so callers are unaffected.)
scale_gradient(g, L2_max) = min(L2_max/norm(g), 1)*g
# Clamp every component of gradient `g` to the interval [a, b].
clip_gradient(g, a, b) = clamp.(g, a, b)
####################

#################### policy-gradient-optimization 3
# Settings for the restricted-step policy update; read by the `update` method
# defined for this type.
struct RestrictedPolicyUpdate
    ùí´     # problem
    b     # initial state distribution
    d     # depth
    m     # number of samples
    ‚àálogœÄ # gradient of log likelihood, called with (parameters, action, state)
    œÄ     # policy, called with (parameters, state)
    œµ     # divergence bound
end

# Restricted gradient step: estimate the likelihood-ratio gradient from `M.m`
# simulated trajectories and move along it with a length determined by the
# divergence bound.
function update(M::RestrictedPolicyUpdate, Œ∏)
    problem, discount, score = M.ùí´, M.ùí´.Œ≥, M.‚àálogœÄ
    policy(s) = M.œÄ(Œ∏, s)
    discounted(traj) = sum(r*discount^(t-1) for (t, (state, action, r)) in enumerate(traj))
    trajs = [simulate(problem, rand(M.b), policy, M.d) for _ in 1:M.m]
    total_score(traj) = sum(score(Œ∏, a, s) for (s, a) in traj)
    direction = mean(total_score(traj)*discounted(traj) for traj in trajs)
    scale = sqrt(2*M.œµ/dot(direction, direction))
    return Œ∏ + direction*scale
end
####################

#################### policy-gradient-optimization 4
# Settings for the natural-gradient policy update; read by the `update` method
# defined for this type.
struct NaturalPolicyUpdate
    ùí´     # problem
    b     # initial state distribution
    d     # depth
    m     # number of samples
    ‚àálogœÄ # gradient of log likelihood, called with (parameters, action, state)
    œÄ     # policy, called with (parameters, state)
    œµ     # divergence bound
end

# Natural-gradient step.
# Arguments (positional):
#   theta  - current parameter vector
#   grad_f - function mapping a trajectory to a gradient vector
#   F      - function mapping a trajectory to a matrix
#   bound  - divergence bound used to size the step
#   trajs  - collection of trajectories
# The mean matrix is used to solve for a direction `u` from the mean gradient,
# and the result is `theta + u*sqrt(2*bound/dot(mean_gradient, u))`.
# (Parameters use ASCII names so the definition parses regardless of file
# encoding; they are positional, so callers are unaffected.)
function natural_update(theta, grad_f, F, bound, trajs)
    mean_gradient = mean(grad_f(traj) for traj in trajs)
    u = mean(F(traj) for traj in trajs) \ mean_gradient
    return theta + u*sqrt(2*bound/dot(mean_gradient, u))
end

# Natural policy update: simulate `M.m` trajectories under the current
# parameters and hand per-trajectory gradient and outer-product functions to
# `natural_update`.
function update(M::NaturalPolicyUpdate, Œ∏)
    problem, discount, score = M.ùí´, M.ùí´.Œ≥, M.‚àálogœÄ
    policy(s) = M.œÄ(Œ∏, s)
    discounted(traj) = sum(r*discount^(t-1) for (t, (state, action, r)) in enumerate(traj))
    total_score(traj) = sum(score(Œ∏, a, s) for (s, a) in traj)
    traj_gradient(traj) = total_score(traj)*discounted(traj)
    outer(traj) = total_score(traj)*total_score(traj)'
    trajs = [simulate(problem, rand(M.b), policy, M.d) for _ in 1:M.m]
    return natural_update(Œ∏, traj_gradient, outer, M.œµ, trajs)
end
####################

#################### policy-gradient-optimization 5
# Settings for the trust-region policy update; read by `update`,
# `surrogate_objective`, `surrogate_constraint` and `linesearch`.
struct TrustRegionUpdate
    ùí´     # problem
    b     # initial state distribution
    d     # depth
    m     # number of samples
    œÄ     # policy, called by `update` with (parameters, state)
    p     # policy likelihood p(Œ∏, a, s)
    ‚àálogœÄ # log likelihood gradient
    KL    # KL divergence KL(Œ∏, Œ∏‚Ä≤, s)
    œµ     # divergence bound
    Œ±     # line search reduction factor (e.g., 0.5)
end

# Surrogate objective: for each trajectory, average the likelihood ratio
# (new parameters over old) times the discounted reward-to-go of every step,
# then average over trajectories.
function surrogate_objective(M::TrustRegionUpdate, Œ∏, Œ∏‚Ä≤, œÑs)
    horizon, likelihood, discount = M.d, M.p, M.ùí´.Œ≥
    togo(traj, j) = sum(r*discount^(k-1)
                        for (k, (state, action, r)) in zip(j:horizon, traj[j:end]))
    ratio(a, s) = likelihood(Œ∏‚Ä≤, a, s) / likelihood(Œ∏, a, s)
    per_traj(traj) = mean(ratio(a, s)*togo(traj, k) for (k, (s, a, r)) in enumerate(traj))
    return mean(per_traj(traj) for traj in œÑs)
end

# Surrogate constraint: mean over trajectories of the per-step discounted
# divergence (via `M.KL`) between the two parameterizations.
function surrogate_constraint(M::TrustRegionUpdate, Œ∏, Œ∏‚Ä≤, œÑs)
    discount = M.ùí´.Œ≥
    function per_traj(traj)
        return mean(M.KL(Œ∏, Œ∏‚Ä≤, s)*discount^(k-1) for (k, (s, a, r)) in enumerate(traj))
    end
    return mean(per_traj(traj) for traj in œÑs)
end

# Backtracking line search between the current parameters and a candidate.
# The candidate is pulled toward the current parameters by the factor `M.Œ±`
# until the constraint `g` is within `M.œµ` and the objective `f` strictly
# improves on its value at the current parameters.
# If the candidate collapses onto the current parameters (no improving step
# exists at representable precision), the objective test can never succeed,
# so the search stops and returns that point instead of looping forever.
function linesearch(M::TrustRegionUpdate, f, g, Œ∏, Œ∏‚Ä≤)
    fŒ∏ = f(Œ∏)
    while g(Œ∏‚Ä≤) > M.œµ || f(Œ∏‚Ä≤) ‚â§ fŒ∏
        Œ∏‚Ä≤ = Œ∏ + M.Œ±*(Œ∏‚Ä≤ - Œ∏)
        if Œ∏‚Ä≤ == Œ∏
            return Œ∏‚Ä≤
        end
    end
    return Œ∏‚Ä≤
end

# Trust-region update: take a natural-gradient step from simulated
# trajectories, then run `linesearch` from that candidate using the surrogate
# objective and constraint evaluated on the same trajectories.
function update(M::TrustRegionUpdate, Œ∏)
    problem, discount, score = M.ùí´, M.ùí´.Œ≥, M.‚àálogœÄ
    policy(s) = M.œÄ(Œ∏, s)
    discounted(traj) = sum(r*discount^(t-1) for (t, (state, action, r)) in enumerate(traj))
    total_score(traj) = sum(score(Œ∏, a, s) for (s, a) in traj)
    traj_gradient(traj) = total_score(traj)*discounted(traj)
    outer(traj) = total_score(traj)*total_score(traj)'
    trajs = [simulate(problem, rand(M.b), policy, M.d) for _ in 1:M.m]
    candidate = natural_update(Œ∏, traj_gradient, outer, M.œµ, trajs)
    objective(params) = surrogate_objective(M, Œ∏, params, trajs)
    constraint(params) = surrogate_constraint(M, Œ∏, params, trajs)
    return linesearch(M, objective, constraint, Œ∏, candidate)
end
####################

#################### policy-gradient-optimization 6
# Settings for the clamped-surrogate policy update; read by `clamped_gradient`
# and the `update` method defined for this type.
struct ClampedSurrogateUpdate
    ùí´     # problem
    b     # initial state distribution
    d     # depth
    m     # number of trajectories
    œÄ     # policy, called with (parameters, state)
    p     # policy likelihood, called with (parameters, action, state)
    ‚àáœÄ    # policy likelihood gradient, called with (parameters, action, state)
    œµ     # divergence bound (likelihood ratio is clamped to 1 plus/minus this)
    Œ±     # step size
    k_max # number of iterations per update
end

# Gradient of the clamped surrogate objective with respect to the candidate
# parameters. A step contributes a zero vector when its likelihood ratio has
# already moved past the clamp in the direction favoured by the sign of its
# reward-to-go; otherwise it contributes the likelihood gradient scaled by the
# reward-to-go over the old likelihood.
function clamped_gradient(M::ClampedSurrogateUpdate, Œ∏, Œ∏‚Ä≤, œÑs)
    horizon, likelihood, likelihood_grad = M.d, M.p, M.‚àáœÄ
    bound, discount = M.œµ, M.ùí´.Œ≥
    togo(traj, j) = sum(r*discount^(k-1)
                        for (k, (state, action, r)) in zip(j:horizon, traj[j:end]))
    function step_grad(a, s, r_togo)
        old_likelihood = likelihood(Œ∏, a, s)
        ratio = likelihood(Œ∏‚Ä≤, a, s) / old_likelihood
        above = r_togo > 0 && ratio > 1 + bound
        below = r_togo < 0 && ratio < 1 - bound
        if above || below
            return zeros(length(Œ∏))
        end
        return likelihood_grad(Œ∏‚Ä≤, a, s) * r_togo / old_likelihood
    end
    traj_grad(traj) = mean(step_grad(a, s, togo(traj, k))
                           for (k, (s, a, r)) in enumerate(traj))
    return mean(traj_grad(traj) for traj in œÑs)
end

# Clamped-surrogate update: simulate trajectories once under the current
# parameters, then take `M.k_max` gradient-ascent steps of size `M.Œ±` on the
# clamped surrogate, starting from a copy of the current parameters.
function update(M::ClampedSurrogateUpdate, Œ∏)
    policy(s) = M.œÄ(Œ∏, s)
    trajs = [simulate(M.ùí´, rand(M.b), policy, M.d) for _ in 1:M.m]
    candidate = copy(Œ∏)
    for _ in 1:M.k_max
        candidate = candidate + M.Œ±*clamped_gradient(M, Œ∏, candidate, trajs)
    end
    return candidate
end
####################

#################### actor-critic 1
# Settings for the basic actor-critic gradient estimate; read by the
# `gradient` method defined for this type.
struct ActorCritic
    ùí´     # problem
    b     # initial state distribution
    d     # depth
    m     # number of samples
    ‚àálogœÄ # gradient of log likelihood ‚àálogœÄ(Œ∏,a,s)
    U     # parameterized value function U(œï, s)
    ‚àáU    # gradient of value function ‚àáU(œï,s)
end

# Actor-critic gradients from `M.m` simulated trajectories.
# Returns a pair: the actor gradient (log-likelihood gradients weighted by a
# one-step temporal-difference advantage and a discount factor) and the critic
# gradient (value error against the discounted tail return times the value
# gradient).
function gradient(M::ActorCritic, œÄ, Œ∏, œï)
    problem, discount = M.ùí´, M.ùí´.Œ≥
    score, value, value_grad = M.‚àálogœÄ, M.U, M.‚àáU
    policy(s) = œÄ(Œ∏, s)
    tail_return(traj, j) = sum(r*discount^(k-1)
                               for (k, (state, action, r)) in enumerate(traj[j:end]))
    advantage(traj, j) = traj[j][3] + discount*value(œï, traj[j+1][1]) - value(œï, traj[j][1])
    actor(traj) = sum(score(Œ∏, a, s)*advantage(traj, j)*discount^(j-1)
                      for (j, (s, a, r)) in enumerate(traj[1:end-1]))
    critic(traj) = sum((value(œï, s) - tail_return(traj, j))*value_grad(œï, s)
                       for (j, (s, a, r)) in enumerate(traj))
    trajs = [simulate(problem, rand(M.b), policy, M.d) for _ in 1:M.m]
    return mean(actor(traj) for traj in trajs), mean(critic(traj) for traj in trajs)
end
####################

#################### actor-critic 2
# Settings for the generalized-advantage actor-critic gradient estimate; read
# by the `gradient` method defined for this type.
struct GeneralizedAdvantageEstimation
    ùí´     # problem
    b     # initial state distribution
    d     # depth
    m     # number of samples
    ‚àálogœÄ # gradient of log likelihood ‚àálogœÄ(Œ∏,a,s)
    U     # parameterized value function U(œï, s)
    ‚àáU    # gradient of value function ‚àáU(œï,s)
    Œª     # weight ‚àà [0,1]
end

# Actor-critic gradients using an exponentially weighted sum of
# temporal-difference residuals as the advantage. Returns the pair
# (actor gradient, critic gradient), each averaged over `M.m` trajectories.
function gradient(M::GeneralizedAdvantageEstimation, œÄ, Œ∏, œï)
    problem, discount, decay = M.ùí´, M.ùí´.Œ≥, M.Œª
    score, value, value_grad = M.‚àálogœÄ, M.U, M.‚àáU
    horizon = M.d
    policy(s) = œÄ(Œ∏, s)
    tail_return(traj, j) = sum(r*discount^(k-1)
                               for (k, (state, action, r)) in enumerate(traj[j:end]))
    residual(traj, j) = traj[j][3] + discount*value(œï, traj[j+1][1]) - value(œï, traj[j][1])
    advantage(traj, j) = sum((discount*decay)^(n-1)*residual(traj, j+n-1)
                             for n in 1:horizon-j)
    actor(traj) = sum(score(Œ∏, a, s)*advantage(traj, j)*discount^(j-1)
                      for (j, (s, a, r)) in enumerate(traj[1:end-1]))
    critic(traj) = sum((value(œï, s) - tail_return(traj, j))*value_grad(œï, s)
                       for (j, (s, a, r)) in enumerate(traj))
    trajs = [simulate(problem, rand(M.b), policy, horizon) for _ in 1:M.m]
    return mean(actor(traj) for traj in trajs), mean(critic(traj) for traj in trajs)
end
####################

#################### actor-critic 3
# Settings for the deterministic policy gradient estimate; read by the
# `gradient` method defined for this type.
struct DeterministicPolicyGradient
    ùí´     # problem
    b     # initial state distribution
    d     # depth
    m     # number of samples
    ‚àáœÄ    # gradient of deterministic policy œÄ(Œ∏, s)
    Q     # parameterized value function Q(œï,s,a)
    ‚àáQœï   # gradient of value function with respect to œï
    ‚àáQa   # gradient of value function with respect to a
    œÉ     # policy noise (scales the random term added to actions during simulation)
end

# Deterministic policy gradient from `m` simulated trajectories.
# Returns a pair: the mean policy-parameter gradient and the mean
# value-parameter gradient.
function gradient(M::DeterministicPolicyGradient, œÄ, Œ∏, œï)
    ùí´, b, d, m, ‚àáœÄ = M.ùí´, M.b, M.d, M.m, M.‚àáœÄ
    Q, ‚àáQœï, ‚àáQa, œÉ, Œ≥ = M.Q, M.‚àáQœï, M.‚àáQa, M.œÉ, M.ùí´.Œ≥
    # exploration policy used only for simulation: deterministic action plus noise
    # NOTE(review): relies on `+` between the action and a scaled `I`; confirm
    # this is defined for the action type in use.
    œÄ_rand(s) = œÄ(Œ∏, s) + œÉ*randn()*I
    # policy gradient term: evaluated at the deterministic action œÄ(Œ∏,s),
    # not at the (noisy) action stored in the trajectory
    ‚àáUŒ∏(œÑ) = sum(‚àáœÄ(Œ∏,s)*‚àáQa(œï,s,œÄ(Œ∏,s))*Œ≥^(j-1) for (j,(s,a,r))
                in enumerate(œÑ))
    # per-step value-parameter gradient built from the temporal-difference
    # residual between step j and step j+1
    ‚àá‚Ñìœï(œÑ,j) = begin
        s, a, r = œÑ[j]
        s‚Ä≤ = œÑ[j+1][1]
        a‚Ä≤ = œÄ(Œ∏,s‚Ä≤)
        Œ¥ = r + Œ≥*Q(œï,s‚Ä≤,a‚Ä≤) - Q(œï,s,a)
        return Œ¥*(Œ≥*‚àáQœï(œï,s‚Ä≤,a‚Ä≤) - ‚àáQœï(œï,s,a))
    end
    # the last step has no successor, so it is excluded from the sum
    ‚àá‚Ñìœï(œÑ) = sum(‚àá‚Ñìœï(œÑ,j) for j in 1:length(œÑ)-1)
    trajs = [simulate(ùí´, rand(b), œÄ_rand, d) for i in 1:m]
    return mean(‚àáUŒ∏(œÑ) for œÑ in trajs), mean(‚àá‚Ñìœï(œÑ) for œÑ in trajs)
end
####################

#################### validation 1
# Build an MDP in which the "action" is the choice of next state.
# States double as actions. The reward for choosing next state `a` in state
# `s` is the negated original reward under the policy plus `Œª` times the log
# of the original transition value for that next state, and the chosen state
# is reached with probability one.
function adversarial(ùí´::MDP, œÄ, Œª)
    states, T, R = ùí´.ùíÆ, ùí´.T, ùí´.R
    count = length(states)
    rewards = zeros(count, count)
    transitions = zeros(count, count, count)
    for s in states, a in states
        rewards[s,a] = -R(s, œÄ(s)) + Œª*log(T(s, œÄ(s), a))
        transitions[s,a,a] = 1
    end
    return MDP(transitions, rewards, ùí´.Œ≥)
end
####################

#################### exploration-and-exploitation 1
# Bandit problem with one payoff probability per arm.
struct BanditProblem
    Œ∏ # vector of payoff probabilities
    R # reward sampler
end

# Construct a bandit whose reward sampler returns 1 with the arm's payoff
# probability and 0 otherwise.
function BanditProblem(Œ∏)
    function pull(a)
        if rand() < Œ∏[a]
            return 1
        else
            return 0
        end
    end
    return BanditProblem(Œ∏, pull)
end

# Play the bandit for `h` steps: the policy picks an arm from the model, the
# problem samples a reward, and the model is updated with the outcome.
function simulate(ùí´::BanditProblem, model, œÄ, h)
    for _ in 1:h
        arm = œÄ(model)
        update!(model, arm, ùí´.R(arm))
    end
end
####################

#################### exploration-and-exploitation 2
# Belief over arm payoffs.
struct BanditModel
    B # vector of beta distributions
end

# Replace arm `a`'s distribution with a `Beta` whose first parameter is
# increased by `r` and whose second is increased by `1 - r`.
function update!(model::BanditModel, a, r)
    first_param, second_param = StatsBase.params(model.B[a])
    model.B[a] = Beta(first_param + r, second_param + (1 - r))
    return model
end
####################

#################### exploration-and-exploitation 3
# Exploration strategy that picks a random arm with a fixed probability.
mutable struct EpsilonGreedyExploration
    œµ # probability of random arm
end

# With probability `œµ` return a uniformly random arm index; otherwise return
# the arm with the largest mean.
function (œÄ::EpsilonGreedyExploration)(model::BanditModel)
    explore = rand() < œÄ.œµ
    return explore ? rand(eachindex(model.B)) : argmax(mean.(model.B))
end
####################

#################### exploration-and-exploitation 4
# Exploration strategy that pulls random arms for a fixed budget and then
# always picks the arm with the largest mean.
mutable struct ExploreThenCommitExploration
    k # pulls remaining until commitment
end

# While the budget is positive, spend one pull on a uniformly random arm;
# afterwards return the arm with the largest mean.
function (œÄ::ExploreThenCommitExploration)(model::BanditModel)
    if !(œÄ.k > 0)
        return argmax(mean.(model.B))
    end
    œÄ.k -= 1
    return rand(eachindex(model.B))
end
####################

#################### exploration-and-exploitation 5
# Exploration strategy that samples arms in proportion to the exponential of
# their scaled means.
mutable struct SoftmaxExploration
    Œª # precision parameter
    Œ± # precision factor
end

# Sample an arm from the softmax of the arm means at the current precision.
# The precision is multiplied by the precision factor on every call, after
# the weights have been computed.
function (œÄ::SoftmaxExploration)(model::BanditModel)
    arm_means = mean.(model.B)
    weights = exp.(œÄ.Œª * arm_means)
    œÄ.Œª = œÄ.Œª * œÄ.Œ±
    probabilities = normalize(weights, 1)
    return rand(Categorical(probabilities))
end
####################

#################### exploration-and-exploitation 6
# Exploration strategy that ranks arms by a fixed quantile of their
# distributions.
mutable struct QuantileExploration
    Œ± # quantile (e.g., 0.95)
end

# Return the arm whose distribution has the largest quantile at level `Œ±`.
function (œÄ::QuantileExploration)(model::BanditModel)
    level = œÄ.Œ±
    arm_quantiles = [quantile(dist, level) for dist in model.B]
    return argmax(arm_quantiles)
end
####################

#################### exploration-and-exploitation 7
# Exploration strategy that adds a count-based bonus to each arm's mean.
mutable struct UCB1Exploration
    c # exploration constant
end

# Bonus for arm `a`: the exploration constant times the square root of the log
# of the summed distribution parameters over all arms, divided by the summed
# parameters of arm `a`.
function bonus(œÄ::UCB1Exploration, B, a)
    total = sum(dist.Œ± + dist.Œ≤ for dist in B)
    arm_total = B[a].Œ± + B[a].Œ≤
    return œÄ.c * sqrt(log(total)/arm_total)
end

# Return the arm maximising mean plus bonus.
function (œÄ::UCB1Exploration)(model::BanditModel)
    B = model.B
    bonuses = [bonus(œÄ, B, a) for a in eachindex(B)]
    upper_bounds = mean.(B) .+ bonuses
    return argmax(upper_bounds)
end
####################

#################### exploration-and-exploitation 8
# Exploration strategy with no settings: act greedily on one random draw per arm.
struct PosteriorSamplingExploration end

# Draw one sample from each arm's distribution and return the arm with the
# largest draw.
function (œÄ::PosteriorSamplingExploration)(model::BanditModel)
    draws = rand.(model.B)
    return argmax(draws)
end
####################

#################### exploration-and-exploitation 9
# Interact with the problem for `h` steps starting from `s`: the policy picks
# an action from (model, state), the problem's `TR` field yields the next
# state and reward, and the model is updated with the transition.
function simulate(ùí´::MDP, model, œÄ, h, s)
    state = s
    for _ in 1:h
        action = œÄ(model, state)
        next_state, reward = ùí´.TR(state, action)
        update!(model, state, action, reward, next_state)
        state = next_state
    end
end
####################

#################### model-based-methods 1
# Model that estimates transitions and rewards from visit counts; used by the
# `lookahead`, `backup`, `update!` and `MDP` methods defined for this type.
mutable struct MaximumLikelihoodMDP
    ùíÆ # state space (assumes 1:nstates)
    ùíú # action space (assumes 1:nactions)
    N # transition count N(s,a,s‚Ä≤)
    œÅ # reward sum œÅ(s, a)
    Œ≥ # discount
    U # value function
    planner # object passed to `update!(planner, model, s, a, r, s‚Ä≤)` after each observation
end

# One-step lookahead value of taking `a` in `s` under the count-based
# estimates. Returns 0.0 when the pair has never been observed.
function lookahead(model::MaximumLikelihoodMDP, s, a)
    visits = sum(model.N[s,a,:])
    visits == 0 && return 0.0
    mean_reward = model.œÅ[s,a] / visits
    expected_value = sum(model.N[s,a,next] / visits * model.U[next] for next in model.ùíÆ)
    return mean_reward + model.Œ≥ * expected_value
end

# Largest lookahead value over the model's actions at state `s`.
# The `U` argument is accepted but not read; `lookahead` uses `model.U`.
backup(model::MaximumLikelihoodMDP, U, s) =
    maximum(action -> lookahead(model, s, action), model.ùíú)

# Record one observed transition: bump the visit count, accumulate the
# reward, and let the planner refresh the value function.
function update!(model::MaximumLikelihoodMDP, s, a, r, s‚Ä≤)
    counts, reward_sums = model.N, model.œÅ
    counts[s,a,s‚Ä≤] += 1
    reward_sums[s,a] += r
    update!(model.planner, model, s, a, r, s‚Ä≤)
    return model
end
####################

#################### model-based-methods 2
# Convert the count-based model into an MDP with explicit transition and
# reward arrays. Unvisited state-action pairs get all-zero transitions and
# zero reward; visited pairs get counts and reward sums divided by the visit
# total.
function MDP(model::MaximumLikelihoodMDP)
    N, œÅ, ùíÆ, ùíú, Œ≥ = model.N, model.œÅ, model.ùíÆ, model.ùíú, model.Œ≥
    # Integer count/sum arrays cannot store the fractional estimates written
    # below (assignment would throw InexactError), so integer element types are
    # promoted to floating point; other element types are kept as they are.
    ratio_eltype(A) = eltype(A) <: Integer ? float(eltype(A)) : eltype(A)
    T, R = similar(N, ratio_eltype(N)), similar(œÅ, ratio_eltype(œÅ))
    for s in ùíÆ
        for a in ùíú
            n = sum(N[s,a,:])
            if n == 0
                T[s,a,:] .= 0.0
                R[s,a] = 0.0
            else
                T[s,a,:] = N[s,a,:] / n
                R[s,a] = œÅ[s,a] / n
            end
        end
    end
    return MDP(T, R, Œ≥)
end
####################

#################### model-based-methods 3
# Planner with no settings that re-solves the whole estimated problem.
struct FullUpdate end

# Convert the model to an MDP, solve it, and copy the solution's value
# function into the model. The transition arguments are not read.
function update!(planner::FullUpdate, model, s, a, r, s‚Ä≤)
    solution = solve(MDP(model))
    copy!(model.U, solution.U)
    return planner
end
####################

#################### model-based-methods 4
# Planner that backs up the visited state plus a number of random states.
struct RandomizedUpdate
    m # number of updates
end

# Back up the value at `s`, then at `planner.m` states drawn at random from
# the model's state space.
function update!(planner::RandomizedUpdate, model, s, a, r, s‚Ä≤)
    utility = model.U
    utility[s] = backup(model, utility, s)
    for _ in 1:planner.m
        sampled = rand(model.ùíÆ)
        utility[sampled] = backup(model, utility, sampled)
    end
    return planner
end
####################

#################### model-based-methods 5
# Planner that backs up states in priority order; read by the two `update!`
# methods defined for this type.
struct PrioritizedUpdate
    m  # number of updates (upper limit on dequeues per observed transition)
    pq # priority queue (keyed by state; supports indexing, `get`, `isempty`, `dequeue!`)
end

# Back up the value at `s` and raise the queue priority of every state that
# has been observed to lead to `s`. A predecessor's priority is its estimated
# transition probability into `s` times the size of the value change; only
# strictly positive priorities are stored, and an existing larger priority is
# kept.
function update!(planner::PrioritizedUpdate, model, s)
    counts, utility, queue = model.N, model.U, planner.pq
    states, actions = model.ùíÆ, model.ùíú
    previous = utility[s]
    utility[s] = backup(model, utility, s)
    for pred in states, act in actions
        visits = sum(counts[pred,act,next] for next in states)
        visits > 0 || continue
        probability = counts[pred,act,s] / visits
        priority = probability * abs(utility[s] - previous)
        priority > 0 || continue
        queue[pred] = max(get(queue, pred, 0.0), priority)
    end
    return planner
end

# Give the visited state top priority, then process up to `planner.m` queued
# states, stopping early if the queue runs empty.
function update!(planner::PrioritizedUpdate, model, s, a, r, s‚Ä≤)
    queue = planner.pq
    queue[s] = Inf
    for _ in 1:planner.m
        isempty(queue) && break
        update!(planner, model, dequeue!(queue))
    end
    return planner
end
####################

#################### model-based-methods 6
# State-dependent epsilon-greedy action selection: with probability `œµ` pick
# a random action from the model's action space, otherwise pick the action
# with the largest `lookahead` value at `s`.
function (œÄ::EpsilonGreedyExploration)(model, s)
    actions, explore_prob = model.ùíú, œÄ.œµ
    rand() < explore_prob && return rand(actions)
    return argmax(action -> lookahead(model, s, action), actions)
end
####################

#################### model-based-methods 7
# Count-based model that treats under-visited state-action pairs
# optimistically; used by the `lookahead`, `backup`, `update!` and `MDP`
# methods defined for this type.
mutable struct RmaxMDP
    ùíÆ # state space (assumes 1:nstates)
    ùíú # action space (assumes 1:nactions)
    N # transition count N(s,a,s‚Ä≤)
    œÅ # reward sum œÅ(s, a)
    Œ≥ # discount
    U # value function
    planner # object passed to `update!(planner, model, s, a, r, s‚Ä≤)` after each observation
    m    # count threshold (pairs with fewer visits are treated optimistically)
    rmax # maximum reward
end

# One-step lookahead value of taking `a` in `s`. Pairs visited fewer than
# `model.m` times return the optimistic value rmax / (1 - discount); otherwise
# the count-based estimate is used.
function lookahead(model::RmaxMDP, s, a)
    discount = model.Œ≥
    visits = sum(model.N[s,a,:])
    visits < model.m && return model.rmax / (1-discount)
    mean_reward = model.œÅ[s,a] / visits
    expected_value = sum(model.N[s,a,next] / visits * model.U[next] for next in model.ùíÆ)
    return mean_reward + discount * expected_value
end

# Largest lookahead value over the model's actions at state `s`.
# The `U` argument is accepted but not read; `lookahead` uses `model.U`.
backup(model::RmaxMDP, U, s) =
    maximum(action -> lookahead(model, s, action), model.ùíú)

# Record one observed transition: bump the visit count, accumulate the
# reward, and let the planner refresh the value function.
function update!(model::RmaxMDP, s, a, r, s‚Ä≤)
    counts, reward_sums = model.N, model.œÅ
    counts[s,a,s‚Ä≤] += 1
    reward_sums[s,a] += r
    update!(model.planner, model, s, a, r, s‚Ä≤)
    return model
end

# Convert the model into an MDP with explicit transition and reward arrays.
# Pairs visited fewer than `model.m` times become self-loops that pay `rmax`;
# other pairs get counts and reward sums divided by the visit total.
function MDP(model::RmaxMDP)
    N, œÅ, ùíÆ, ùíú, Œ≥ = model.N, model.œÅ, model.ùíÆ, model.ùíú, model.Œ≥
    m, rmax = model.m, model.rmax
    # Integer count/sum arrays cannot store the fractional estimates written
    # below (assignment would throw InexactError), so integer element types are
    # promoted to floating point; other element types are kept as they are.
    ratio_eltype(A) = eltype(A) <: Integer ? float(eltype(A)) : eltype(A)
    T, R = similar(N, ratio_eltype(N)), similar(œÅ, ratio_eltype(œÅ))
    for s in ùíÆ
        for a in ùíú
            n = sum(N[s,a,:])
            if n < m
                T[s,a,:] .= 0.0
                T[s,a,s] = 1.0
                R[s,a] = rmax
            else
                T[s,a,:] = N[s,a,:] / n
                R[s,a] = œÅ[s,a] / n
            end
        end
    end
    return MDP(T, R, Œ≥)
end
####################

#################### model-based-methods 8
# Model that keeps a Dirichlet distribution over next states for every
# state-action pair; used by the `lookahead`, `update!` and `rand` methods
# defined for this type.
mutable struct BayesianMDP
    ùíÆ # state space (assumes 1:nstates)
    ùíú # action space (assumes 1:nactions)
    D # Dirichlet distributions D[s,a]
    R # reward function as matrix (not estimated)
      # NOTE(review): described as a matrix here; confirm whether callers are
      # expected to index it or call it.
    Œ≥ # discount
    U # value function
    planner # object passed to `update!(planner, model, s, a, r, s‚Ä≤)` after each observation
end

# One-step lookahead value of taking `a` in `s`, using the normalised
# Dirichlet parameters as transition probabilities. Returns 0.0 when the
# parameters sum to zero.
# The reward is read by indexing when `model.R` is an array (the struct
# documents it as a matrix, and `rand` forwards it to `MDP` unchanged) and by
# calling it otherwise, so both representations work.
function lookahead(model::BayesianMDP, s, a)
    ùíÆ, U, Œ≥ = model.ùíÆ, model.U, model.Œ≥
    counts = model.D[s,a].alpha
    n = sum(counts)
    if n == 0
        return 0.0
    end
    R = model.R
    r = R isa AbstractArray ? R[s,a] : R(s,a)
    return r + Œ≥ * sum(counts[s‚Ä≤] / n * U[s‚Ä≤] for s‚Ä≤ in ùíÆ)
end

# Record one observed transition by incrementing the Dirichlet parameter of
# the observed next state, then let the planner refresh the value function.
# The parameter vector is copied before it is changed: the existing
# distribution object (and any other distribution built from the same vector)
# is left untouched, and a fresh distribution replaces the entry.
function update!(model::BayesianMDP, s, a, r, s‚Ä≤)
    counts = copy(model.D[s,a].alpha)
    counts[s‚Ä≤] += 1
    model.D[s,a] = Dirichlet(counts)
    update!(model.planner, model, s, a, r, s‚Ä≤)
    return model
end
####################

#################### model-based-methods 9
# Planner with no settings that solves one sampled problem per update.
struct PosteriorSamplingUpdate end

# Sample an MDP from the model: every state-action pair's next-state
# probabilities are one draw from its Dirichlet distribution; the reward and
# discount are taken from the model unchanged.
function Base.rand(model::BayesianMDP)
    states, actions = model.ùíÆ, model.ùíú
    transitions = zeros(length(states), length(actions), length(states))
    for s in states, a in actions
        transitions[s,a,:] = rand(model.D[s,a])
    end
    return MDP(transitions, model.R, model.Œ≥)
end

# Sample a problem from the model, solve it, and copy the solution's value
# function into the model. The transition arguments are not read.
function update!(planner::PosteriorSamplingUpdate, model, s, a, r, s‚Ä≤)
    sampled_problem = rand(model)
    solution = solve(sampled_problem)
    return copy!(model.U, solution.U)
end
####################

#################### model-free-methods 1
# Running estimate of a mean with a configurable learning-rate schedule.
mutable struct IncrementalEstimate
    Œº # mean estimate
    Œ± # learning rate function
    m # number of updates
end

# Fold observation `x` into the estimate: increment the update count, then
# move the mean toward `x` by the learning rate evaluated at the new count.
function update!(model::IncrementalEstimate, x)
    count = model.m + 1
    model.m = count
    model.Œº = model.Œº + model.Œ±(count) * (x - model.Œº)
    return model
end
####################

#################### model-free-methods 2
# Tabular action-value learner updated toward the best next-state value.
mutable struct QLearning
    ùíÆ # state space (assumes 1:nstates)
    ùíú # action space (assumes 1:nactions)
    Œ≥ # discount
    Q # action value function
    Œ± # learning rate
end

# Current action-value estimate for (s, a).
function lookahead(model::QLearning, s, a)
    return model.Q[s,a]
end

# Move Q[s,a] toward the reward plus the discounted maximum over the next
# state's row, by the learning rate.
function update!(model::QLearning, s, a, r, s‚Ä≤)
    table = model.Q
    target = r + model.Œ≥*maximum(table[s‚Ä≤,:])
    table[s,a] += model.Œ±*(target - table[s,a])
    return model
end
####################

#################### model-free-methods 3
# Tabular action-value learner that updates using the action actually taken
# next, so it remembers the previous experience between calls.
mutable struct Sarsa
    ùíÆ # state space (assumes 1:nstates)
    ùíú # action space (assumes 1:nactions)
    Œ≥ # discount
    Q # action value function
    Œ± # learning rate
    ‚Ñì # most recent experience tuple (s,a,r)
end

# Current action-value estimate for (s, a).
function lookahead(model::Sarsa, s, a)
    return model.Q[s,a]
end

# Update the value of the previously stored experience toward its reward plus
# the discounted value of the current (s, a), then store the current
# experience for the next call. Nothing is updated on the first call, when no
# previous experience exists. The `s‚Ä≤` argument is not read.
function update!(model::Sarsa, s, a, r, s‚Ä≤)
    prev = model.‚Ñì
    # identity comparison is the reliable way to test for `nothing`
    if prev !== nothing
        Œ≥, Q, Œ± = model.Œ≥, model.Q, model.Œ±
        Q[prev.s,prev.a] += Œ±*(prev.r + Œ≥*Q[s,a] - Q[prev.s,prev.a])
    end
    model.‚Ñì = (s=s, a=a, r=r)
    return model
end
####################

#################### model-free-methods 4
# Tabular action-value learner with decaying traces over state-action pairs.
mutable struct SarsaLambda
    ùíÆ # state space (assumes 1:nstates)
    ùíú # action space (assumes 1:nactions)
    Œ≥ # discount
    Q # action value function
    N # trace
    Œ± # learning rate
    Œª # trace decay rate
    ‚Ñì # most recent experience tuple (s,a,r)
end

# Current action-value estimate for (s, a).
function lookahead(model::SarsaLambda, s, a)
    return model.Q[s,a]
end

# Trace-based update. When a previous experience is stored, its trace is
# incremented, the temporal-difference error against the current (s, a) is
# computed, and every state-action value is moved by that error in proportion
# to its trace, after which the trace decays. On the first call the traces
# are reset to zero instead. The current experience is then stored. The `s‚Ä≤`
# argument is not read.
function update!(model::SarsaLambda, s, a, r, s‚Ä≤)
    prev = model.‚Ñì
    # identity comparison is the reliable way to test for `nothing`
    if prev !== nothing
        Œ≥, Œª, Q, Œ±, N = model.Œ≥, model.Œª, model.Q, model.Œ±, model.N
        N[prev.s,prev.a] += 1
        td_error = prev.r + Œ≥*Q[s,a] - Q[prev.s,prev.a]
        # distinct loop names so the arguments `s` and `a` are not shadowed
        for state in model.ùíÆ
            for action in model.ùíú
                Q[state,action] += Œ±*td_error*N[state,action]
                N[state,action] *= Œ≥*Œª
            end
        end
    else
        model.N[:,:] .= 0.0
    end
    model.‚Ñì = (s=s, a=a, r=r)
    return model
end
####################

#################### model-free-methods 5
# Action-value learner with a parameterized value function whose parameter
# vector is updated in place.
struct GradientQLearning
    ùíú  # action space (assumes 1:nactions)
    Œ≥  # discount
    Q  # parameterized action value function Q(Œ∏,s,a)
    ‚àáQ # gradient of action value function
    Œ∏  # action value function parameter
    Œ±  # learning rate
end

# Action-value estimate for (s, a) at the current parameters.
lookahead(model::GradientQLearning, s, a) = model.Q(model.Œ∏, s, a)

# One gradient step: the temporal-difference error against the best next
# action scales the value gradient; the step is passed through
# `scale_gradient` with limit 1 and added to the parameters in place.
function update!(model::GradientQLearning, s, a, r, s‚Ä≤)
    Q, params = model.Q, model.Œ∏
    best_next = maximum(Q(params, s‚Ä≤, next_action) for next_action in model.ùíú)
    td_error = r + model.Œ≥*best_next - Q(params, s, a)
    step = td_error*model.‚àáQ(params, s, a)
    params[:] += model.Œ±*scale_gradient(step, 1)
    return model
end
####################

#################### model-free-methods 6
# Action-value learner with a parameterized value function that learns from
# batches drawn out of a stored buffer of experiences.
struct ReplayGradientQLearning
    ùíú      # action space (assumes 1:nactions)
    Œ≥      # discount
    Q      # parameterized action value function Q(Œ∏,s,a)
    ‚àáQ     # gradient of action value function
    Œ∏      # action value function parameter
    Œ±      # learning rate
    buffer # circular memory buffer
    m      # number of steps between gradient updates
    m_grad # batch size
end

# Action-value estimate for (s, a) at the current parameters.
lookahead(model::ReplayGradientQLearning, s, a) = model.Q(model.Œ∏, s, a)

# While the buffer is not full, just store the experience. Once it is full,
# take one gradient step on the mean gradient of a random batch of
# `model.m_grad` stored experiences, then drop the `model.m` oldest entries.
# The experience passed in on a full-buffer call is not stored.
function update!(model::ReplayGradientQLearning, s, a, r, s‚Ä≤)
    buffer = model.buffer
    if !isfull(buffer)
        push!(buffer, (s, a, r, s‚Ä≤))
        return model
    end
    Q, params, discount = model.Q, model.Œ∏, model.Œ≥
    best_value(state) = maximum(Q(params, state, action) for action in model.ùíú)
    function sample_gradient(experience)
        s0, a0, r0, s1 = experience
        return (r0 + discount*best_value(s1) - Q(params, s0, a0))*model.‚àáQ(params, s0, a0)
    end
    step = mean(sample_gradient(experience) for experience in rand(buffer, model.m_grad))
    params[:] += model.Œ±*scale_gradient(step, 1)
    for _ in 1:model.m # discard oldest experiences
        popfirst!(buffer)
    end
    return model
end
####################

#################### imitation-learning 1
# Settings for fitting policy parameters to demonstration data by gradient
# ascent on the log likelihood.
struct BehavioralCloning
    Œ±     # step size
    k_max # number of iterations
    ‚àálogœÄ # log likelihood gradient
end

# Take `M.k_max` ascent steps of size `M.Œ±` along the mean log-likelihood
# gradient over the (state, action) pairs in `D`; returns the final parameters.
function optimize(M::BehavioralCloning, D, Œ∏)
    score = M.‚àálogœÄ
    for _ in 1:M.k_max
        direction = mean(score(Œ∏, a, s) for (s, a) in D)
        Œ∏ = Œ∏ + M.Œ±*direction
    end
    return Œ∏
end
####################

#################### imitation-learning 2
# Settings for fitting policy parameters by gradient descent on an expected
# misclassification cost.
struct CostSensitiveMultiClassifier
    ùíú     # action space
    Œ±     # step size
    C     # cost function
    k_max # number of iterations
    ‚àáœÄ    # policy likelihood gradient
end

# Take `M.k_max` descent steps of size `M.Œ±`. Each step averages, over the
# pairs in `D`, the cost-weighted sum of policy likelihood gradients across
# all candidate actions; returns the final parameters.
function optimize(M::CostSensitiveMultiClassifier, D, Œ∏)
    actions, cost, likelihood_grad = M.ùíú, M.C, M.‚àáœÄ
    weighted_grad(params, s, a) =
        sum(cost(s, a, predicted)*likelihood_grad(params, predicted, s)
            for predicted in actions)
    for _ in 1:M.k_max
        direction = mean(weighted_grad(Œ∏, s, a) for (s, a) in D)
        Œ∏ = Œ∏ - M.Œ±*direction
    end
    return Œ∏
end
####################

#################### imitation-learning 3
# Settings for iterative imitation learning that grows the data set with
# expert labels for states visited by the learned policy.
struct DataSetAggregation
    ùí´     # problem with unknown reward function
    bc    # behavioral cloning struct
    k_max # number of iterations
    m     # number of rollouts per iteration
    d     # rollout depth
    b     # initial state distribution
    œÄE    # expert
    œÄŒ∏    # parameterized policy
end

# Fit the parameters to `D`, then for iterations 2 through `M.k_max`: roll out
# the current learned policy `M.m` times for `M.d` steps, append the expert's
# action for every visited state to `D` (mutating it), and refit.
function optimize(M::DataSetAggregation, D, Œ∏)
    problem, cloner = M.ùí´, M.bc
    expert, learner = M.œÄE, M.œÄŒ∏
    Œ∏ = optimize(cloner, D, Œ∏)
    for _ in 2:M.k_max
        for _ in 1:M.m
            state = rand(M.b)
            for _ in 1:M.d
                push!(D, (state, expert(state)))
                action = rand(learner(Œ∏, state))
                state = rand(problem.T(state, action))
            end
        end
        Œ∏ = optimize(cloner, D, Œ∏)
    end
    return Œ∏
end
####################

#################### imitation-learning 4
# Settings for iterative imitation learning that mixes each newly trained
# policy into the data-collection policy.
struct SEARN
    ùí´     # problem with unknown reward
    mc    # cost-sensitive multiclass classifier struct
    k_max # number of iterations
    m     # number of rollouts per iteration
    d     # rollout depth
    b     # initial state distribution
    Œ≤     # mixing scalar
    œÄE    # expert policy
    œÄŒ∏    # parameterized policy
end

# Each iteration collects (state, cost-vector) pairs by rolling out the
# current mixture policy, trains new parameters on them, and forms the next
# mixture: with probability `Œ≤` act with the newly trained policy, otherwise
# fall back to the previous mixture.
# Returns a stochastic policy that samples one of the trained parameter sets
# (without the expert) and acts with it.
function optimize(M::SEARN, Œ∏)
    problem, classifier = M.ùí´, M.mc
    k_max, mix, expert, learner = M.k_max, M.Œ≤, M.œÄE, M.œÄŒ∏
    transition, actions = problem.T, problem.ùíú
    trained = Vector{Float64}[]
    policy = s -> expert(s)
    for k in 1:k_max
        data = []
        for i in 1:M.m
            s = rand(M.b)
            for j in 1:M.d
                costs = [rollout(problem, rand(transition(s, a)), policy, M.d-j) for a in actions]
                costs = maximum(costs) .- costs
                push!(data, (s, costs))
                s = rand(transition(s, policy(s)))
            end
        end

        Œ∏ = optimize(classifier, data, Œ∏)
        push!(trained, Œ∏)

        # Bind the previous mixture and the new parameters by value. Referring
        # to the reassigned `policy` variable inside the closure would make the
        # fallback branch call the closure itself instead of the previous
        # mixture.
        policy = let previous = policy, params = Œ∏
            s -> rand() < mix ? rand(Categorical(learner(params, s))) : previous(s)
        end
    end

    # Compute a policy that does not contain the expert
    selector = Categorical(normalize([(1-mix)^(k_max-i) for i in 1:k_max], 1))
    return s -> rand(Categorical(learner(trained[rand(selector)], s)))
end
####################

#################### imitation-learning 5
struct SMILe
	ùí´     # problem with unknown reward
	bc    # Behavioral cloning struct
	k_max # number of iterations
	m     # number of rollouts per iteration
	d     # rollout depth
	b     # initial state distribution
	Œ≤     # mixing scalar (e.g., d^-3)
	œÄE    # expert policy
	œÄŒ∏    # parameterized policy
end

# SMILe training loop.
#
# Each iteration rolls out the current mixture policy, labels the visited
# states with the expert's actions, trains a new parameter vector on that
# fresh data set, and rebuilds the mixture over the expert and all parameter
# vectors trained so far.
# Returns `(Ps, params)`: the normalized mixture weights over the trained
# parameter vectors and the vectors themselves.
function optimize(M::SMILe, Œ∏)
    ùí´, bc, k_max, m = M.ùí´, M.bc, M.k_max, M.m
    d, b, Œ≤, œÄE, œÄŒ∏ = M.d, M.b, M.Œ≤, M.œÄE, M.œÄŒ∏
    ùíú, T = ùí´.ùíú, ùí´.T
    # Unnormalized mixture weight of the i-th trained policy.
    weight(i) = (1-Œ≤)^(i-1)
    trained = []
    mixture = s -> œÄE(s)
    for k in 1:k_max
        # Collect expert-labelled states visited by the current mixture.
        D = []
        for _ in 1:m
            s = rand(b)
            for _ in 1:d
                push!(D, (s, œÄE(s)))
                s = rand(T(s, mixture(s)))
            end
        end
        # Train a new classifier on this iteration's data only.
        Œ∏ = optimize(bc, D, Œ∏)
        push!(trained, Œ∏)
        # Rebuild the mixture: expert with probability p_expert, otherwise a
        # trained policy drawn according to the normalized weights.
        picker = Categorical(normalize([weight(i) for i in 1:k], 1))
        p_expert = weight(k)
        mixture = s -> rand() < p_expert ? œÄE(s) :
            rand(Categorical(œÄŒ∏(trained[rand(picker)], s)))
    end
    Ps = normalize([weight(i) for i in 1:k_max], 1)
    return Ps, trained
end
####################

#################### imitation-learning 6
# Settings shared by `feature_expectations`, `calc_weighting`,
# `calc_policy_mixture` and `optimize` for this method.
struct InverseReinforcementLearning
    ùí´  # problem
    b  # initial state distribution
    d  # depth
    m  # number of samples
    œÄ  # parameterized policy
    Œ≤  # binary feature mapping
    ŒºE # expert feature expectations
    RL # reinforcement learning method
    œµ  # tolerance
end

# Monte Carlo estimate of the discounted feature expectations of a policy:
# simulate `M.m` trajectories of depth `M.d` from sampled initial states and
# average the discounted sum of feature vectors along each trajectory.
function feature_expectations(M::InverseReinforcementLearning, œÄ)
    problem, features, discount = M.ùí´, M.Œ≤, M.ùí´.Œ≥
    # Discounted feature sum along one trajectory of (state, action) pairs.
    function discounted_features(trajectory)
        return sum(discount^(t-1)*features(s, a)
                   for (t, (s, a)) in enumerate(trajectory))
    end
    trajectories = [simulate(problem, rand(M.b), œÄ, M.d) for _ in 1:M.m]
    return mean(discounted_features(trajectory) for trajectory in trajectories)
end
####################

#################### imitation-learning 7
# Solve for the non-negative weight vector (squared norm at most 1) that
# maximizes the margin `t` by which the expert's weighted feature expectations
# exceed those of every entry in the second argument.
# Returns `(t, weights)` as solved values.
function calc_weighting(M::InverseReinforcementLearning, Œºs)
    expert = M.ŒºE
    model = Model(Ipopt.Optimizer)
    @variable(model, t)
    @variable(model, œï[1:length(expert)] >= 0)
    @objective(model, Max, t)
    # One margin constraint per previously found policy.
    for Œº in Œºs
        @constraint(model, œï‚ãÖexpert >= œï‚ãÖŒº + t)
    end
    @constraint(model, œï‚ãÖœï <= 1)
    optimize!(model)
    margin, weights = value(t), value.(œï)
    return (margin, weights)
end

# Find mixture weights over the given feature expectations whose weighted sum
# is closest (in squared Euclidean distance) to the expert's feature
# expectations. Weights are constrained to be non-negative and to sum to 1.
# Returns the solved weight vector, one entry per element of the second
# argument.
function calc_policy_mixture(M::InverseReinforcementLearning, Œºs)
    ŒºE = M.ŒºE
    k = length(Œºs)
    model = Model(Ipopt.Optimizer)
    @variable(model, Œª[1:k] ‚â• 0)
    # Objective: squared norm of (expert - mixture), written as a dot product
    # of the residual with itself.
    @objective(model, Min, (ŒºE - sum(Œª[i]*Œºs[i] for i in 1:k))‚ãÖ
                            (ŒºE - sum(Œª[i]*Œºs[i] for i in 1:k)))
    @constraint(model, sum(Œª) == 1)
    optimize!(model)
    return value.(Œª)
end

# Iterative inverse reinforcement learning.
#
# Alternates between solving for reward weights that separate the expert from
# all policies found so far (`calc_weighting`) and training a new policy for
# those weights with `M.RL`. Stops once the separating margin is no larger
# than the tolerance.
# Returns `(mixture_weights, params)`: weights from `calc_policy_mixture` and
# every parameter vector visited, starting with the initial one.
function optimize(M::InverseReinforcementLearning, Œ∏)
    policy, tolerance, learner = M.œÄ, M.œµ, M.RL
    # Feature expectations of the policy induced by a parameter vector.
    expectations(params) = feature_expectations(M, s -> policy(params, s))
    visited = [Œ∏]
    Œºs = [expectations(Œ∏)]
    margin, weights = calc_weighting(M, Œºs)
    while !(margin <= tolerance)
        # The learner's reward is linear in the features with these weights.
        copyto!(learner.œï, weights)
        Œ∏ = optimize(learner, policy, Œ∏)
        push!(visited, Œ∏)
        push!(Œºs, expectations(Œ∏))
        margin, weights = calc_weighting(M, Œºs)
    end
    mixture = calc_policy_mixture(M, Œºs)
    return mixture, visited
end
####################

#################### imitation-learning 8
# Settings read by `discounted_state_visitations` and
# `optimize(M::MaximumEntropyIRL, ...)`.
struct MaximumEntropyIRL
    ùí´     # problem
    b     # initial state distribution
    d     # depth
    œÄ     # parameterized policy œÄ(Œ∏,s)
    PœÄ    # parameterized policy likelihood œÄ(Œ∏, a, s)
    ‚àáR    # reward function gradient
    RL    # reinforcement learning method
    Œ±     # step size
    k_max # number of iterations
end

# Discounted state visitation frequencies under the policy with the given
# parameters, propagated forward for `M.d` steps from the initial distribution
# and then averaged over steps and L1-normalized.
# Returns a vector with one entry per state, in the order of the state space.
function discounted_state_visitations(M::MaximumEntropyIRL, Œ∏)
    problem, likelihood = M.ùí´, M.PœÄ
    states, actions = problem.ùíÆ, problem.ùíú
    T, discount = problem.T, problem.Œ≥
    # visits[i, k] is the discounted mass on state i at step k.
    visits = zeros(length(states), M.d)
    visits[:, 1] = [pdf(M.b, s) for s in states]
    for k in 2:M.d, (j, s_next) in enumerate(states)
        # Mass flowing into s_next through a given action.
        inflow(a) = sum(visits[i, k-1]*likelihood(Œ∏, a, s)*T(s, a, s_next)
                        for (i, s) in enumerate(states))
        visits[j, k] = discount*sum(inflow(a) for a in actions)
    end
    return normalize!(vec(mean(visits, dims=2)), 1)
end

# Maximum entropy inverse reinforcement learning by gradient ascent.
#
# Each iteration copies the current reward parameters into the learner,
# trains the policy, computes its discounted state visitations, and steps the
# reward parameters along (expert gradient sum) minus (number of expert
# trajectories times the expected gradient under the current policy).
# `D` is the collection of expert trajectories of (state, action) pairs.
# Returns the final reward parameters and policy parameters.
function optimize(M::MaximumEntropyIRL, D, œï, Œ∏)
    problem, policy, likelihood = M.ùí´, M.œÄ, M.PœÄ
    gradient, learner, step = M.‚àáR, M.RL, M.Œ±
    states, actions, discount = problem.ùíÆ, problem.ùíú, problem.Œ≥
    n_expert = length(D)
    for _ in 1:M.k_max
        copyto!(learner.œï, œï) # update parameters
        Œ∏ = optimize(learner, policy, Œ∏)
        visits = discounted_state_visitations(M, Œ∏)
        # Discounted reward gradient summed over all expert trajectories.
        expert_term = sum(sum(discount^(t-1)*gradient(œï, s, a)
                              for (t, (s, a)) in enumerate(trajectory))
                          for trajectory in D)
        # Expected reward gradient under the current policy and visitations.
        expected(s) = sum(likelihood(Œ∏, a, s)*gradient(œï, s, a) for a in actions)
        policy_term = sum(visits[i]*expected(s) for (i, s) in enumerate(states))
        œï = œï + step*(expert_term - n_expert*policy_term)
    end
    return œï, Œ∏
end
####################

#################### beliefs 1
# Container for a partially observable problem. The fields are untyped; the
# functions in this file call `T`, `R` and `O` as functions and iterate over
# the three spaces.
struct POMDP
    Œ≥   # discount factor
    ùíÆ   # state space
    ùíú   # action space
    ùí™   # observation space
    T   # transition function
    R   # reward function
    O   # observation function
    TRO # sample transition, reward, and observation
end
####################

#################### beliefs 2
# Discrete belief update: given a belief vector over the state space (same
# order as the state space), an action and an observation, return the
# normalized posterior belief. If the unnormalized posterior sums to roughly
# zero, a uniform belief is returned instead.
function update(b::Vector{Float64}, ùí´, a, o)
    states, T, O = ùí´.ùíÆ, ùí´.T, ùí´.O
    posterior = similar(b)
    for (j, s_next) in enumerate(states)
        likelihood = O(a, s_next, o)
        predicted = sum(T(s, a, s_next) * b[i] for (i, s) in enumerate(states))
        posterior[j] = likelihood * predicted
    end
    # The observation was (numerically) impossible: fall back to uniform.
    if isapprox(sum(posterior), 0.0)
        fill!(posterior, 1)
    end
    return normalize!(posterior, 1)
end
####################

#################### beliefs 3
# Gaussian belief used by the linear-Gaussian `update` method below.
struct KalmanFilter
	Œºb # mean vector
	Œ£b # covariance matrix
end

# Kalman filter step: predict the belief through the linear dynamics
# (`Ts`, `Ta`, process noise), then correct it with the observation `o`
# (`Os`, observation noise). Returns a new `KalmanFilter`.
function update(b::KalmanFilter, ùí´, a, o)
    Ts, Ta, Os = ùí´.Ts, ùí´.Ta, ùí´.Os
    process_noise, obs_noise = ùí´.Œ£s, ùí´.Œ£o
    # predict
    mean_pred = Ts*b.Œºb + Ta*a
    cov_pred = Ts*b.Œ£b*Ts' + process_noise
    # update
    cross = cov_pred*Os'
    gain = cross/(Os*cov_pred*Os' + obs_noise)
    mean_post = mean_pred + gain*(o - Os*mean_pred)
    cov_post = (I - gain*Os)*cov_pred
    return KalmanFilter(mean_post, cov_post)
end
####################

#################### beliefs 4
# Gaussian belief used by the `update` method that linearizes the dynamics
# and observation functions with Jacobians.
struct ExtendedKalmanFilter
	Œºb # mean vector
	Œ£b # covariance matrix
end

import ForwardDiff: jacobian
# Extended Kalman filter step: propagate the mean through the dynamics
# function, linearize dynamics and observation functions with Jacobians
# (at the prior mean and the predicted mean respectively), then apply the
# Kalman correction. Returns a new `ExtendedKalmanFilter`.
function update(b::ExtendedKalmanFilter, ùí´, a, o)
    fT, fO = ùí´.fT, ùí´.fO
    process_noise, obs_noise = ùí´.Œ£s, ùí´.Œ£o
    # predict
    mean_pred = fT(b.Œºb, a)
    Ts = jacobian(s -> fT(s, a), b.Œºb)
    Os = jacobian(fO, mean_pred)
    cov_pred = Ts*b.Œ£b*Ts' + process_noise
    # update
    cross = cov_pred*Os'
    gain = cross/(Os*cov_pred*Os' + obs_noise)
    mean_post = mean_pred + gain*(o - fO(mean_pred))
    cov_post = (I - gain*Os)*cov_pred
    return ExtendedKalmanFilter(mean_post, cov_post)
end
####################

#################### beliefs 5
# Gaussian belief plus the spread parameter used to place sigma points in
# `unscented_transform`.
struct UnscentedKalmanFilter
	Œºb # mean vector
	Œ£b # covariance matrix
	Œª  # spread parameter
end

# Unscented transform: build 2n+1 sigma points around the mean from the
# Cholesky factor of the scaled covariance, push each through `f`, and return
# the weighted mean and covariance of the transformed points together with
# the original and transformed sigma points.
function unscented_transform(Œº, Œ£, f, Œª, ws)
    n = length(Œº)
    spread = cholesky((n + Œª) * Œ£).L
    points = [Œº]
    for i in 1:n
        # One point on each side of the mean along the i-th factor column.
        offset = spread[:, i]
        push!(points, Œº + offset)
        push!(points, Œº - offset)
    end
    mapped = f.(points)
    mean_out = sum(w*p for (w, p) in zip(ws, mapped))
    cov_out = sum(w*(p - mean_out)*(p - mean_out)' for (w, p) in zip(ws, mapped))
    return (mean_out, cov_out, points, mapped)
end

# Unscented Kalman filter step: run the unscented transform through the
# dynamics (adding process noise), then through the observation function
# (adding observation noise), and correct with the cross covariance between
# predicted states and predicted observations.
# Returns a new `UnscentedKalmanFilter` with the same spread parameter.
function update(b::UnscentedKalmanFilter, ùí´, a, o)
    fT, fO = ùí´.fT, ùí´.fO
    spread = b.Œª
    n = length(b.Œºb)
    # Weight of the central point first, then equal weights for the 2n others.
    ws = vcat(spread / (n + spread), fill(1/(2(n + spread)), 2n))
    # predict
    mean_pred, cov_pred, _, _ =
        unscented_transform(b.Œºb, b.Œ£b, s -> fT(s, a), spread, ws)
    cov_pred = cov_pred + ùí´.Œ£s
    # update
    mean_obs, cov_obs, pts, obs_pts =
        unscented_transform(mean_pred, cov_pred, fO, spread, ws)
    cov_obs = cov_obs + ùí´.Œ£o
    cross = sum(w*(p - mean_pred)*(z - mean_obs)'
                for (w, p, z) in zip(ws, pts, obs_pts))
    gain = cross / cov_obs
    mean_post = mean_pred + gain*(o - mean_obs)
    cov_post = cov_pred - gain*cov_obs*gain'
    return UnscentedKalmanFilter(mean_post, cov_post, spread)
end
####################

#################### beliefs 6
# Belief represented by a collection of sampled states.
struct ParticleFilter
	states # vector of state samples
end

# Particle filter step: propagate every particle through the transition
# model, weight each by the observation likelihood, and resample the same
# number of particles in proportion to those weights.
function update(b::ParticleFilter, ùí´, a, o)
    transition, likelihood = ùí´.T, ùí´.O
    propagated = [rand(transition(s, a)) for s in b.states]
    weights = [likelihood(a, s, o) for s in propagated]
    resampler = SetCategorical(propagated, weights)
    resampled = rand(resampler, length(propagated))
    return ParticleFilter(resampled)
end
####################

#################### beliefs 7
# Belief represented by sampled states; updated by rejection sampling.
struct RejectionParticleFilter
	states # vector of state samples
end

# Rejection particle filter step: repeatedly pick a random particle, sample a
# successor and a simulated observation, and keep the successor only when the
# simulated observation equals `o`. Loops until as many particles have been
# accepted as the belief originally held.
function update(b::RejectionParticleFilter, ùí´, a, o)
    T, O = ùí´.T, ùí´.O
    accepted = similar(b.states)
    filled = 0
    while filled < length(accepted)
        parent = rand(b.states)
        candidate = rand(T(parent, a))
        if rand(O(a, candidate)) == o
            filled += 1
            accepted[filled] = candidate
        end
    end
    return RejectionParticleFilter(accepted)
end
####################

#################### beliefs 8
# Particle belief that replaces a fixed number of resampled particles with
# draws from a separate injection distribution on every update.
struct InjectionParticleFilter
	states # vector of state samples
	m_inject # number of samples to inject
	D_inject # injection distribution
end

# Particle filter step with injection: propagate and weight the particles as
# usual, resample all but `m_inject` of them, and fill the remainder with
# draws from the injection distribution.
function update(b::InjectionParticleFilter, ùí´, a, o)
    T, O, m_inject, D_inject = ùí´.T, ùí´.O, b.m_inject, b.D_inject
    propagated = [rand(T(s, a)) for s in b.states]
    weights = [O(a, s, o) for s in propagated]
    resampler = SetCategorical(propagated, weights)
    n_keep = length(propagated) - m_inject
    resampled = rand(resampler, n_keep)
    injected = rand(D_inject, m_inject)
    return InjectionParticleFilter(vcat(resampled, injected), m_inject, D_inject)
end
####################

#################### beliefs 9
# Particle belief whose number of injected particles is chosen on each update
# from the ratio of a fast and a slow moving average of the mean particle
# weight. Mutable because `update` writes the two averages back.
mutable struct AdaptiveInjectionParticleFilter
	states   # vector of state samples
	w_slow   # slow moving average
	w_fast   # fast moving average
	Œ±_slow   # slow moving average parameter
	Œ±_fast   # fast moving average parameter
	ŒΩ        # injection parameter
	D_inject # injection distribution
end

# Adaptive injection particle filter step.
#
# Propagates and weights the particles, updates the slow and fast moving
# averages of the mean weight, and injects a fraction
# max(0, 1 - rate * fast / slow) of fresh particles from the injection
# distribution. The averages are written back into `b` and a new filter
# carrying the same values is returned.
function update(b::AdaptiveInjectionParticleFilter, ùí´, a, o)
    T, O = ùí´.T, ùí´.O
    rate, D_inject = b.ŒΩ, b.D_inject
    rate_slow, rate_fast = b.Œ±_slow, b.Œ±_fast
    propagated = [rand(T(s, a)) for s in b.states]
    weights = [O(a, s, o) for s in propagated]
    w_mean = mean(weights)
    w_slow = b.w_slow + rate_slow*(w_mean - b.w_slow)
    w_fast = b.w_fast + rate_fast*(w_mean - b.w_fast)
    m = length(propagated)
    m_inject = round(Int, m * max(0, 1.0 - rate*w_fast / w_slow))
    resampler = SetCategorical(propagated, weights)
    particles = vcat(rand(resampler, m - m_inject), rand(D_inject, m_inject))
    b.w_slow, b.w_fast = w_slow, w_fast
    return AdaptiveInjectionParticleFilter(particles,
        w_slow, w_fast, rate_slow, rate_fast, rate, D_inject)
end
####################

#################### exact-solutions 1
# Tree-shaped plan: an action at the root and, per observation, the plan to
# follow next.
struct ConditionalPlan
    a        # action to take at root
    subplans # dictionary mapping observations to subplans
end

# Single-step plan: an action with no subplans.
function ConditionalPlan(a)
    return ConditionalPlan(a, Dict())
end

# Calling a plan with no arguments yields its root action.
function (œÄ::ConditionalPlan)()
    return œÄ.a
end

# Calling a plan with an observation yields the subplan for that observation.
function (œÄ::ConditionalPlan)(o)
    return œÄ.subplans[o]
end
####################

#################### exact-solutions 2
# Build a plan from a tuple whose first element is the root action and whose
# second element iterates as (observation, sub-specification) pairs; each
# sub-specification is converted recursively.
function ConditionalPlan(œÄ::Tuple)
    root_action, branches = œÄ[1], œÄ[2]
    subplans = Dict(obs => ConditionalPlan(spec) for (obs, spec) in branches)
    return ConditionalPlan(root_action, subplans)
end
####################

#################### exact-solutions 3
function lookahead(ùí´::POMDP, U, s, a)
    ùíÆ, ùí™, T, O, R, Œ≥ = ùí´.ùíÆ, ùí´.ùí™, ùí´.T, ùí´.O, ùí´.R, ùí´.Œ≥
    u‚Ä≤ = sum(T(s,a,s‚Ä≤)*sum(O(a,s‚Ä≤,o)*U(o,s‚Ä≤) for o in ùí™) for s‚Ä≤ in ùíÆ)
    return R(s,a) + Œ≥*u‚Ä≤
end

# Expected utility of executing a conditional plan from state `s`: the reward
# of the root action for a leaf plan, otherwise a lookahead that recurses
# into the subplan selected by each observation.
function evaluate_plan(ùí´::POMDP, œÄ::ConditionalPlan, s)
    if isempty(œÄ.subplans)
        return ùí´.R(s, œÄ())
    end
    subplan_value(o, s_next) = evaluate_plan(ùí´, œÄ(o), s_next)
    return lookahead(ùí´, subplan_value, s, œÄ())
end
####################

#################### exact-solutions 4
# Alpha vector of a conditional plan: its expected utility from every state,
# listed in the order of the state space.
function alphavector(ùí´::POMDP, œÄ::ConditionalPlan)
    states = ùí´.ùíÆ
    return [evaluate_plan(ùí´, œÄ, s) for s in states]
end
####################

#################### exact-solutions 5
# Policy defined by a set of alpha vectors, each paired (by position) with
# the action to take when that vector is the maximizer at the belief.
struct AlphaVectorPolicy
    ùí´ # POMDP problem
    Œì # alpha vectors
    a # actions associated with alpha vectors
end

# Utility of belief `b`: the largest dot product between `b` and any alpha
# vector of the policy.
function utility(œÄ::AlphaVectorPolicy, b)
    value_at_b(alpha) = alpha‚ãÖb
    return maximum(value_at_b, œÄ.Œì)
end

# Action for belief `b`: the action stored at the position of the alpha
# vector with the largest dot product with `b`.
function (œÄ::AlphaVectorPolicy)(b)
    scores = [alpha‚ãÖb for alpha in œÄ.Œì]
    _, best = findmax(scores)
    return œÄ.a[best]
end
####################

#################### exact-solutions 6
function lookahead(ùí´::POMDP, U, b::Vector, a)
    ùíÆ, ùí™, T, O, R, Œ≥ = ùí´.ùíÆ, ùí´.ùí™, ùí´.T, ùí´.O, ùí´.R, ùí´.Œ≥
    r = sum(R(s,a)*b[i] for (i,s) in enumerate(ùíÆ))
    Posa(o,s,a) = sum(O(a,s‚Ä≤,o)*T(s,a,s‚Ä≤) for s‚Ä≤ in ùíÆ)
    Poba(o,b,a) = sum(b[i]*Posa(o,s,a) for (i,s) in enumerate(ùíÆ))
    return r + Œ≥*sum(Poba(o,b,a)*U(update(b, ùí´, a, o)) for o in ùí™)
end

function greedy(ùí´::POMDP, U, b::Vector)
    u, a = findmax(a->lookahead(ùí´, U, b, a), ùí´.ùíú)
    return (a=a, u=u)
end

# Policy that scores beliefs with a set of alpha vectors and picks actions by
# one-step lookahead (see `greedy`).
struct LookaheadAlphaVectorPolicy
    ùí´ # POMDP problem
    Œì # alpha vectors
end

# Utility of belief `b`: the largest dot product between `b` and any alpha
# vector of the policy.
function utility(œÄ::LookaheadAlphaVectorPolicy, b)
    value_at_b(alpha) = alpha‚ãÖb
    return maximum(value_at_b, œÄ.Œì)
end

# Greedy one-step choice at belief `b` for any policy object that has a
# problem field and a `utility` method.
function greedy(œÄ, b)
    return greedy(œÄ.ùí´, belief -> utility(œÄ, belief), b)
end

# Action for belief `b`: the greedy one-step lookahead action.
function (œÄ::LookaheadAlphaVectorPolicy)(b)
    choice = greedy(œÄ, b)
    return choice.a
end
####################

#################### exact-solutions 7
# Search for a belief at which the first argument (an alpha vector) beats
# every vector in the second argument by the largest possible margin, using a
# linear program. Returns that belief when the margin is strictly positive,
# `nothing` otherwise. With an empty comparison set a uniform belief is
# returned.
function find_maximal_belief(Œ±, Œì)
    m = length(Œ±)
    if isempty(Œì)
        return fill(1/m, m) # arbitrary belief
    end
    model = Model(GLPK.Optimizer)
    @variable(model, Œ¥)
    @variable(model, b[i=1:m] >= 0)
    @constraint(model, sum(b) == 1.0)
    # The margin must hold against every competing vector.
    for a in Œì
        @constraint(model, (Œ±-a)‚ãÖb >= Œ¥)
    end
    @objective(model, Max, Œ¥)
    optimize!(model)
    if value(Œ¥) > 0
        return value.(b)
    else
        return nothing
    end
end
####################

#################### exact-solutions 8
# Identify the alpha vectors that are not dominated by the others.
# Returns a Boolean mask with one entry per input vector.
function find_dominating(Œì)
    n = length(Œì)
    pending, kept = trues(n), falses(n)
    while true
        i = findfirst(pending)
        i === nothing && break
        witness = find_maximal_belief(Œì[i], Œì[kept])
        if witness === nothing
            # No belief where this vector beats the kept set: discard it.
            pending[i] = false
            continue
        end
        # Keep whichever pending vector is best at the witness belief.
        scores = [pending[j] ? witness‚ãÖŒì[j] : -Inf for j in 1:n]
        winner = argmax(scores)
        pending[winner] = false
        kept[winner] = true
    end
    return kept
end

# Drop dominated alpha vectors together with their plans; the two inputs are
# indexed in parallel.
function prune(plans, Œì)
    keep = find_dominating(Œì)
    return (plans[keep], Œì[keep])
end
####################

#################### exact-solutions 9
# Exact value iteration over conditional plans: start from the one-step plans
# (one per action), then alternately expand and prune for the remaining
# iterations. Returns the surviving plans and their alpha vectors.
function value_iteration(ùí´::POMDP, k_max)
    states, actions, R = ùí´.ùíÆ, ùí´.ùíú, ùí´.R
    one_step_plans = [ConditionalPlan(a) for a in actions]
    one_step_alphas = [[R(s, a) for s in states] for a in actions]
    plans, alphas = prune(one_step_plans, one_step_alphas)
    for _ in 2:k_max
        plans, alphas = prune(expand(plans, alphas, ùí´)...)
    end
    return (plans, alphas)
end

function solve(M::ValueIteration, ùí´::POMDP)
    plans, Œì = value_iteration(ùí´, M.k_max)
    return LookaheadAlphaVectorPolicy(ùí´, Œì)
end
####################

#################### exact-solutions 10
function ConditionalPlan(ùí´::POMDP, a, plans)
    subplans = Dict(o=>œÄ for (o, œÄ) in zip(ùí´.ùí™, plans))
    return ConditionalPlan(a, subplans)
end

# Utility from state `s` of taking action `a` and then following, for each
# observation, the alpha vector at the matching position of the last argument.
function combine_lookahead(ùí´::POMDP, s, a, Œìo)
    states, observations = ùí´.ùíÆ, ùí´.ùí™
    T, O, R, discount = ùí´.T, ùí´.O, ùí´.R, ùí´.Œ≥
    # Expected continuation value at next state number i.
    continuation(s_next, i) =
        sum(O(a, s_next, o)*alpha[i] for (o, alpha) in zip(observations, Œìo))
    future = sum(T(s, a, s_next)*continuation(s_next, i)
                 for (i, s_next) in enumerate(states))
    return R(s, a) + discount*future
end

# Alpha vector of the plan "take `a`, then follow the per-observation alpha
# vectors": one `combine_lookahead` value per state.
function combine_alphavector(ùí´::POMDP, a, Œìo)
    states = ùí´.ùíÆ
    return [combine_lookahead(ùí´, s, a, Œìo) for s in states]
end

# Expand a set of plans by one step: for every action and every assignment of
# an existing plan to each observation, create the deeper plan and its alpha
# vector. Returns the new plans and alpha vectors as parallel collections.
function expand(plans, Œì, ùí´)
    actions, observations = ùí´.ùíú, ùí´.ùí™
    new_plans, new_alphas = [], []
    # One index range per observation; their product enumerates every mapping
    # from observations to existing plans.
    choices = [eachindex(plans) for _ in observations]
    for a in actions, inds in product(choices...)
        picked = [inds...]
        push!(new_plans, ConditionalPlan(ùí´, a, plans[picked]))
        push!(new_alphas, combine_alphavector(ùí´, a, Œì[picked]))
    end
    return (new_plans, new_alphas)
end
####################

#################### offline-approximations 1
# Apply the method-specific `update` to the alpha vector set `M.k_max` times
# and return the final set.
function alphavector_iteration(ùí´::POMDP, M, Œì)
    return foldl((alphas, _) -> update(ùí´, M, alphas), 1:M.k_max; init=Œì)
end
####################

#################### offline-approximations 2
# Settings for the QMDP alpha vector iteration.
struct QMDP
    k_max # maximum number of iterations
end

function update(ùí´::POMDP, M::QMDP, Œì)
    ùíÆ, ùíú, R, T, Œ≥ = ùí´.ùíÆ, ùí´.ùíú, ùí´.R, ùí´.T, ùí´.Œ≥
    Œì‚Ä≤ = [[R(s,a) + Œ≥*sum(T(s,a,s‚Ä≤)*maximum(Œ±‚Ä≤[j] for Œ±‚Ä≤ in Œì)
        for (j,s‚Ä≤) in enumerate(ùíÆ)) for s in ùíÆ] for a in ùíú]
    return Œì‚Ä≤
end

# Solve with QMDP: start from one all-zero alpha vector per action, iterate,
# and pair the resulting vectors with the actions.
function solve(M::QMDP, ùí´::POMDP)
    n_states = length(ùí´.ùíÆ)
    initial = [zeros(n_states) for _ in ùí´.ùíú]
    alphas = alphavector_iteration(ùí´, M, initial)
    return AlphaVectorPolicy(ùí´, alphas, ùí´.ùíú)
end
####################

#################### offline-approximations 3
# Settings for the fast informed bound alpha vector iteration.
struct FastInformedBound
    k_max # maximum number of iterations
end

function update(ùí´::POMDP, M::FastInformedBound, Œì)
    ùíÆ, ùíú, ùí™, R, T, O, Œ≥ = ùí´.ùíÆ, ùí´.ùíú, ùí´.ùí™, ùí´.R, ùí´.T, ùí´.O, ùí´.Œ≥
    Œì‚Ä≤ = [[R(s, a) + Œ≥*sum(maximum(sum(O(a,s‚Ä≤,o)*T(s,a,s‚Ä≤)*Œ±‚Ä≤[j]
        for (j,s‚Ä≤) in enumerate(ùíÆ)) for Œ±‚Ä≤ in Œì) for o in ùí™)
        for s in ùíÆ] for a in ùíú]
    return Œì‚Ä≤
end

# Solve with the fast informed bound: start from one all-zero alpha vector
# per action, iterate, and pair the resulting vectors with the actions.
function solve(M::FastInformedBound, ùí´::POMDP)
    n_states = length(ùí´.ùíÆ)
    initial = [zeros(n_states) for _ in ùí´.ùíú]
    alphas = alphavector_iteration(ùí´, M, initial)
    return AlphaVectorPolicy(ùí´, alphas, ùí´.ùíú)
end
####################

#################### offline-approximations 4
# Best-action worst-state lower bound: the best, over actions, of the worst
# reward over states, divided by one minus the discount; returned as a
# constant alpha vector with one entry per state.
function baws_lowerbound(ùí´::POMDP)
    states, actions, R, discount = ùí´.ùíÆ, ùí´.ùíú, ùí´.R, ùí´.Œ≥
    worst_reward(a) = minimum(R(s, a) for s in states)
    bound = maximum(worst_reward(a) for a in actions) / (1-discount)
    return fill(bound, length(states))
end
####################

#################### offline-approximations 5
# Blind lower bound: one alpha vector per action, each started at the
# best-action worst-state bound and backed up `k_max` times under the policy
# that always repeats that action.
function blind_lowerbound(ùí´, k_max)
    states, actions = ùí´.ùíÆ, ùí´.ùíú
    T, R, discount = ùí´.T, ùí´.R, ùí´.Œ≥
    # Value of taking `a` in `s` and then following the given alpha vector.
    action_value(s, a, alpha) = R(s, a) + discount*sum(T(s, a, s_next)*alpha[j]
                                                       for (j, s_next) in enumerate(states))
    alphas = [baws_lowerbound(ùí´) for _ in actions]
    for _ in 1:k_max
        alphas = [[action_value(s, a, alpha) for s in states]
                  for (alpha, a) in zip(alphas, actions)]
    end
    return alphas
end
####################

#################### offline-approximations 6
# Point-based backup at belief `b`: for each action, pick the best current
# alpha vector at every successor belief, assemble the corresponding
# one-step-deeper alpha vector, and return the assembled vector that scores
# highest at `b`.
function backup(ùí´::POMDP, Œì, b)
    states, actions, observations, discount = ùí´.ùíÆ, ùí´.ùíú, ùí´.ùí™, ùí´.Œ≥
    R, T, O = ùí´.R, ùí´.T, ùí´.O
    candidates = []
    for a in actions
        # Best existing alpha vector after each observation.
        best_after = []
        for o in observations
            b_next = update(b, ùí´, a, o)
            push!(best_after, argmax(alpha -> alpha‚ãÖb_next, Œì))
        end
        future(s) = sum(sum(T(s, a, s_next)*O(a, s_next, o)*best_after[i][j]
                            for (j, s_next) in enumerate(states))
                        for (i, o) in enumerate(observations))
        push!(candidates, [R(s, a) + discount*future(s) for s in states])
    end
    return argmax(alpha -> alpha‚ãÖb, candidates)
end
####################

#################### offline-approximations 7
# Settings for point-based value iteration: the beliefs to back up at and the
# iteration count.
struct PointBasedValueIteration
    B     # set of belief points
    k_max # maximum number of iterations
end

function update(ùí´::POMDP, M::PointBasedValueIteration, Œì)
    return [backup(ùí´, Œì, b) for b in M.B]
end

# Solve with point-based value iteration, starting from one copy of the
# best-action worst-state bound per action (all entries share one vector).
function solve(M::PointBasedValueIteration, ùí´)
    initial = fill(baws_lowerbound(ùí´), length(ùí´.ùíú))
    alphas = alphavector_iteration(ùí´, M, initial)
    return LookaheadAlphaVectorPolicy(ùí´, alphas)
end
####################

#################### offline-approximations 8
# Settings for randomized point-based value iteration: the beliefs to improve
# at and the iteration count.
struct RandomizedPointBasedValueIteration
    B     # set of belief points
    k_max # maximum number of iterations
end

function update(ùí´::POMDP, M::RandomizedPointBasedValueIteration, Œì)
    Œì‚Ä≤, B‚Ä≤ = [], copy(M.B)
    while !isempty(B‚Ä≤)
        b = rand(B‚Ä≤)
        Œ± = argmax(Œ±->Œ±‚ãÖb, Œì)
        Œ±‚Ä≤ = backup(ùí´, Œì, b)
        if Œ±‚Ä≤‚ãÖb ‚â• Œ±‚ãÖb
            push!(Œì‚Ä≤, Œ±‚Ä≤)
        else
            push!(Œì‚Ä≤, Œ±)
        end
        filter!(b->maximum(Œ±‚ãÖb for Œ± in Œì‚Ä≤) <
            maximum(Œ±‚ãÖb for Œ± in Œì), B‚Ä≤)
    end
    return Œì‚Ä≤
end

# Solve with randomized point-based value iteration, starting from the single
# best-action worst-state alpha vector.
function solve(M::RandomizedPointBasedValueIteration, ùí´)
    initial = [baws_lowerbound(ùí´)]
    alphas = alphavector_iteration(ùí´, M, initial)
    return LookaheadAlphaVectorPolicy(ùí´, alphas)
end
####################

#################### offline-approximations 9
# Policy whose utility is interpolated from utilities stored at selected
# beliefs; `utility` indexes `V` at every standard basis belief, so those
# keys must be present.
struct SawtoothPolicy
    ùí´ # POMDP problem
    V # dictionary mapping beliefs to utilities
end

# Standard basis beliefs: one vector per state with a 1.0 in that state's
# position and 0.0 elsewhere.
function basis(ùí´)
    n = length(ùí´.ùíÆ)
    return [Float64.(1:n .== i) for i in 1:n]
end

# Sawtooth utility at belief `b`.
#
# Returns the stored value when `b` is a key of `V`. Otherwise starts from
# the linear interpolation of the basis-belief values and takes the minimum
# with the interpolation obtained by swapping, for each stored non-basis
# belief, one basis corner for that belief.
function utility(œÄ::SawtoothPolicy, b)
    problem, V = œÄ.ùí´, œÄ.V
    haskey(V, b) && return V[b]
    n = length(problem.ùíÆ)
    corners = basis(problem)
    u = sum(V[corners[i]] * b[i] for i in 1:n)
    for (interior, u_interior) in V
        interior in corners && continue
        # Corner to replace by the interior belief.
        i = argmax([norm(b-e, 1) - norm(interior-e, 1) for e in corners])
        w = [norm(b - e, 1) for e in corners]
        w[i] = norm(b - interior, 1)
        # Turn normalized distances into interpolation weights.
        w = 1 .- w ./ sum(w)
        values_at = [V[e] for e in corners]
        values_at[i] = u_interior
        u = min(u, w‚ãÖvalues_at)
    end
    return u
end

# Action for belief `b`: the greedy one-step lookahead action.
function (œÄ::SawtoothPolicy)(b)
    choice = greedy(œÄ, b)
    return choice.a
end
####################

#################### offline-approximations 10
# Settings for sawtooth iteration. `solve` looks up `V` at every basis belief
# contained in `B`.
struct SawtoothIteration
    V     # initial mapping from beliefs to utilities
    B     # beliefs to compute values including those in V map
    k_max # maximum number of iterations
end

# Sawtooth iteration: repeatedly rebuild the belief-to-utility map over
# `M.B`, keeping the initial values at basis beliefs and using the greedy
# lookahead value of the current policy elsewhere.
function solve(M::SawtoothIteration, ùí´::POMDP)
    corners = basis(ùí´)
    policy = SawtoothPolicy(ùí´, M.V)
    for _ in 1:M.k_max
        refreshed = Dict(b => (b in corners ? M.V[b] : greedy(policy, b).u)
                         for b in M.B)
        policy = SawtoothPolicy(ùí´, refreshed)
    end
    return policy
end
####################

#################### offline-approximations 11
# Sample one step from belief `b` under action `a`: draw a state from the
# belief, sample a reward and observation, and return the updated belief and
# the reward.
function randstep(ùí´::POMDP, b, a)
    s = rand(SetCategorical(ùí´.ùíÆ, b))
    _, r, o = ùí´.TRO(s, a)
    b_next = update(b, ùí´, a, o)
    return b_next, r
end
####################

#################### offline-approximations 12
# Grow a belief set by taking one random action from every belief and adding
# the resulting belief. Returns a new collection without duplicates; `B` is
# left unchanged.
function random_belief_expansion(ùí´, B)
    expanded = copy(B)
    for b in B
        action = rand(ùí´.ùíú)
        b_next, _ = randstep(ùí´, b, action)
        push!(expanded, b_next)
    end
    return unique!(expanded)
end
####################

#################### offline-approximations 13
# Grow a belief set by trying every action from each belief and adding the
# sampled successor that lies farthest (L1 distance) from everything already
# in the growing set. If no successor has positive distance, a copy of the
# original belief is added and later removed as a duplicate.
function exploratory_belief_expansion(ùí´, B)
    expanded = copy(B)
    for b in B
        farthest, farthest_dist = copy(b), 0.0
        for a in ùí´.ùíú
            candidate, _ = randstep(ùí´, b, a)
            dist = minimum(norm(existing - candidate, 1) for existing in expanded)
            if dist > farthest_dist
                farthest, farthest_dist = candidate, dist
            end
        end
        push!(expanded, farthest)
    end
    return unique!(expanded)
end
####################

#################### offline-approximations 14
# For an alpha-vector policy and a belief `b`, solve one optimization problem
# per (action, observation) pair and collect the solutions whose objective is
# strictly positive. Returns the collected beliefs without duplicates.
#
# NOTE(review): the successor belief below is computed with the policy's
# action at `b`, not with the looped action `a` - confirm this is intended.
# NOTE(review): the solver status and objective are printed to stdout on
# every iteration; presumably debugging output.
function directed_belief_expansion(œÄ, b)
    ùí´, ùíÆ, ùíú, ùí™ = œÄ.ùí´, œÄ.ùí´.ùíÆ, œÄ.ùí´.ùíú, œÄ.ùí´.ùí™
    T, O, R, Œ≥ = œÄ.ùí´.T, œÄ.ùí´.O, œÄ.ùí´.R, œÄ.ùí´.Œ≥
    # Alpha vector of the policy that is maximal at b.
    n, Œì, Œ± = length(ùíÆ), œÄ.Œì, argmax(Œ± -> Œ±‚ãÖb, œÄ.Œì)

    B = []
    for (a, o) in Iterators.product(ùíú, ùí™)
        b‚Ä≤ = update(b, ùí´, œÄ(b), o)
        Œ±‚Ä≤ = argmax(Œ± -> Œ±‚ãÖb‚Ä≤, Œì)
        # Per-state expected value of the successor's best alpha vector when
        # taking a given action and observing o.
        backup(a‚Ä≤) = [sum(T(s, a‚Ä≤, s‚Ä≤) * O(a‚Ä≤, s‚Ä≤, o) * Œ±‚Ä≤[s‚Ä≤Index]
                      for (s‚Ä≤Index, s‚Ä≤) in enumerate(ùíÆ)) for s in ùíÆ]
        # Difference between taking `a` and taking the policy's action at the
        # successor belief.
        Œ≤ = backup(a) - backup(œÄ(b‚Ä≤))

        # Maximize that difference over beliefs (non-negative, summing to 1)
        # at which the alpha vector chosen at b is still maximal.
        model = Model(Ipopt.Optimizer)
        @variable(model, belief[1:n] >= 0.0)
        @objective(model, Max, belief‚ãÖŒ≤)
        for Œ±‚Ä≤‚Ä≤ in Œì
            @constraint(model, belief‚ãÖŒ± >= belief‚ãÖŒ±‚Ä≤‚Ä≤)
        end
        @constraint(model, sum(belief[s] for s in 1:n) == 1.0)
        optimize!(model)

        println(termination_status(model))
        println(objective_value(model))
        # Keep the solution only if the solver reports an (almost) optimal
        # status and the objective is strictly positive.
        if (termination_status(model) in [MOI.OPTIMAL,MOI.LOCALLY_SOLVED,
                    MOI.ALMOST_OPTIMAL, MOI.ALMOST_LOCALLY_SOLVED] &&
                objective_value(model) > 0.0)
            push!(B, value.(belief))
        end
    end

    return unique(B)
end
####################

#################### offline-approximations 15
# Settings for sawtooth heuristic search, read by `explore!` and `solve`.
struct SawtoothHeuristicSearch
    b     # initial belief
    Œ¥     # gap threshold
    d     # depth
    k_max # maximum number of iterations
    k_fib # number of iterations for fast informed bound
end

# Recursive exploration step of sawtooth heuristic search.
#
# Stops at the depth limit or when the gap between the upper and lower bound
# at `b` is at most the threshold divided by the discount raised to the
# current depth. Otherwise follows the upper-bound policy's action and the
# observation with the largest gap, recurses, then tightens both bounds at
# the successor belief (upper bound map and lower bound alpha vectors are
# mutated in place).
function explore!(M::SawtoothHeuristicSearch, ùí´, œÄhi, œÄlo, b, d=0)
    observations, discount = ùí´.ùí™, ùí´.Œ≥
    gap(belief) = utility(œÄhi, belief) - utility(œÄlo, belief)
    if d >= M.d || gap(b) <= M.Œ¥ / discount^d
        return
    end
    a = œÄhi(b)
    o = argmax(obs -> gap(update(b, ùí´, a, obs)), observations)
    b_next = update(b, ùí´, a, o)
    explore!(M, ùí´, œÄhi, œÄlo, b_next, d+1)
    # Basis beliefs keep their initial upper-bound values.
    if !(b_next in basis(ùí´))
        œÄhi.V[b_next] = greedy(œÄhi, b_next).u
    end
    push!(œÄlo.Œì, backup(ùí´, œÄlo.Œì, b_next))
end

# Sawtooth heuristic search: initialize the upper bound from the fast
# informed bound at the basis beliefs and the lower bound from the
# best-action worst-state vector, then explore from the initial belief until
# the gap there drops below the threshold or the iteration budget runs out.
# Returns the lower-bound policy.
function solve(M::SawtoothHeuristicSearch, ùí´::POMDP)
    fib_policy = solve(FastInformedBound(M.k_fib), ùí´)
    upper_values = Dict(e => utility(fib_policy, e) for e in basis(ùí´))
    upper = SawtoothPolicy(ùí´, upper_values)
    lower = LookaheadAlphaVectorPolicy(ùí´, [baws_lowerbound(ùí´)])
    for _ in 1:M.k_max
        explore!(M, ùí´, upper, lower, M.b)
        gap = utility(upper, M.b) - utility(lower, M.b)
        gap < M.Œ¥ && break
    end
    return lower
end
####################

#################### offline-approximations 16
# Parameters of a Freudenthal triangulation grid: `vertices` produces
# `n`-dimensional integer vectors whose first coordinate is `m`.
struct FreudenthalTriangulation
    n::Int # dimensionality
    m::Int # granularity
end

"""
    vertices(T::FreudenthalTriangulation)
Enumerate the Freudenthal vertices of `T` as a list of `T.n`-dimensional integer vectors.
Every vertex starts with `T.m`; the remaining coordinates are filled in recursively.
"""
function vertices(T::FreudenthalTriangulation)
    found = Vector{Int}[]
    scratch = Vector{Int}(undef, T.n)
    scratch[1] = T.m
    _vertices!(found, scratch, 2)
    return found
end

# Recursive helper for `vertices`: fill coordinate `i` of the scratch vector
# `v` with every value from 0 up to the previous coordinate, and append a
# copy of `v` to `V` once all coordinates are set. Returns `nothing`.
function _vertices!(V::Vector{Vector{Int}}, v::Vector{Int}, i::Int)
    if i <= length(v)
        for value in 0:v[i-1]
            v[i] = value
            _vertices!(V, v, i+1)
        end
    else
        push!(V, copy(v))
    end
    return nothing
end

"""
    _freudenthal_simplex(x::Vector{Float64})
Return the `length(x) + 1` vertices of the Freudenthal-grid simplex containing `x`.
The first vertex is `floor.(x)`; each following vertex adds one to the coordinate with
the next-largest fractional part.
"""
function _freudenthal_simplex(x::Vector{Float64})
    base = floor.(Int, x)
    # Coordinates ordered by decreasing fractional part.
    order = sortperm(x - base, rev=true)
    corners = [base]
    for coordinate in order
        next_corner = copy(last(corners))
        next_corner[coordinate] += 1
        push!(corners, next_corner)
    end
    return corners
end

"""
    _barycentric_coordinates(x::Vector{Float64}, V::Vector{Vector{Int}})
Return the barycentric coordinates of `x` with respect to its Freudenthal simplex `V`.
Only `V[1]` is read; the coordinates are ordered to match the vertex order produced by
`_freudenthal_simplex`.
"""
function _barycentric_coordinates(x::Vector{Float64}, V::Vector{Vector{Int}})
    # Fractional offsets from the first vertex, largest first.
    offsets = sort(x - V[1], rev=true)
    n = length(x)
    coords = Vector{Float64}(undef, n+1)
    coords[n+1] = offsets[n]
    for i in 2:n
        coords[i] = offsets[i-1] - offsets[i]
    end
    # The first coordinate takes whatever weight is left.
    coords[1] = 1.0 - sum(coords[2:end])
    return coords
end

"""
    simplex(T::FreudenthalTriangulation, x::Vector{Float64})
Return the vertices of the Freudenthal simplex containing `x` together with the
barycentric coordinates of `x` in that simplex.
"""
function simplex(T::FreudenthalTriangulation, x::Vector{Float64})
    corners = _freudenthal_simplex(x)
    weights = _barycentric_coordinates(x, corners)
    return corners, weights
end

"""
    _to_belief(x)
Transform a point `x` in the Freudenthal space to a point in the belief space:
consecutive differences of `x`, followed by its last entry, all divided by `x[1]`.
"""
function _to_belief(x)
    differences = x[1:end-1] - x[2:end]
    push!(differences, x[end])
    return differences ./ x[1]
end

"""
    _to_freudenthal(b, m::Int64)
Transform a point `b` in the belief space to a point in the Freudenthal space.
Entry `i` of the result is `m` times the sum of `b[i:end]`; `m` is the resolution of
the Freudenthal grid.
"""
function _to_freudenthal(b, m::Int64)
    n = length(b)
    tail_mass(i) = sum(b[k] for k in i:n)
    return [tail_mass(i)*m for i in 1:n]
end

# All grid vertices of the triangulation, converted to belief space.
function belief_vertices(T::FreudenthalTriangulation)
    grid = vertices(T)
    return _to_belief.(grid)
end

# Beliefs at the corners of the grid simplex containing `b`, with their
# interpolation weights. Corners whose weight is not above the square root of
# machine epsilon are dropped.
function belief_simplex(T::FreudenthalTriangulation, b)
    grid_point = _to_freudenthal(b, T.m)
    corners, weights = simplex(T, grid_point)
    corner_beliefs = _to_belief.(corners)
    keep = weights .> sqrt(eps())
    return corner_beliefs[keep], weights[keep]
end
####################

#################### offline-approximations 17
# Policy whose utility is interpolated over a Freudenthal triangulation of
# the belief space from values stored at the grid beliefs.
struct TriangulatedPolicy
    ùí´ # POMDP problem
    V # dictionary mapping beliefs to utilities
    B # beliefs
    T # Freudenthal triangulation
end

# Build a triangulated policy with granularity `m`: one grid belief per
# triangulation vertex, every utility initialized to zero.
function TriangulatedPolicy(ùí´::POMDP, m)
    triangulation = FreudenthalTriangulation(length(ùí´.ùíÆ), m)
    grid_beliefs = belief_vertices(triangulation)
    utilities = Dict(b => 0.0 for b in grid_beliefs)
    return TriangulatedPolicy(ùí´, utilities, grid_beliefs, triangulation)
end

# Utility at belief `b`: the weighted sum of the stored utilities at the
# corners of the grid simplex containing `b`.
function utility(œÄ::TriangulatedPolicy, b)
    corners, weights = belief_simplex(œÄ.T, b)
    return sum(weight*œÄ.V[corner] for (weight, corner) in zip(weights, corners))
end

# Action for belief `b`: the greedy one-step lookahead action.
function (œÄ::TriangulatedPolicy)(b)
    choice = greedy(œÄ, b)
    return choice.a
end
####################

#################### offline-approximations 18
# Settings for value iteration over a triangulated belief grid.
struct TriangulatedIteration
    m     # granularity
    k_max # maximum number of iterations
end

# Triangulated value iteration: on each sweep compute the greedy lookahead
# value at every grid belief using the current interpolated utilities, then
# write all new values back at once.
function solve(M::TriangulatedIteration, ùí´)
    policy = TriangulatedPolicy(ùí´, M.m)
    current_utility(b) = utility(policy, b)
    for _ in 1:M.k_max
        # Compute every backup before storing any, so a sweep only sees the
        # previous sweep's values.
        backed_up = [greedy(ùí´, current_utility, b).u for b in policy.B]
        foreach(policy.B, backed_up) do b, u
            policy.V[b] = u
        end
    end
    return policy
end
####################

#################### online-approximations 1
# Exploration policy over histories. `N` and `Q` are indexed with
# (history, action) tuples when the policy is called.
struct BeliefExplorationPolicy
	ùí´ # problem; only its action space is read when the policy is called
	N::Dict # count per (history, action) pair
	Q::Dict # value estimate per (history, action) pair
	c::Float64 # multiplier on the exploration term
end

# Choose the action for history `h` that maximizes the value estimate plus
# `c` times an exploration term: infinite for a zero count, otherwise
# sqrt(log(total count at h) / count). Returns `nothing` if no action scores
# above negative infinity.
function (œÄ::BeliefExplorationPolicy)(h)
    actions, N, Q, c = œÄ.ùí´.ùíú, œÄ.N, œÄ.Q, œÄ.c
    total = sum(get(N, (h, a), 0) for a in actions)
    chosen, chosen_score = nothing, -Inf
    for a in actions
        estimate = Q[(h, a)]
        exploration = N[(h, a)] == 0 ? Inf : sqrt(log(total)/N[(h, a)])
        score = estimate + c*exploration
        if score > chosen_score
            chosen, chosen_score = a, score
        end
    end
    return chosen
end
####################

#################### online-approximations 2
# Monte Carlo tree search over action-observation histories. `N` and `Q` are
# indexed with (history, action) tuples and are filled in by `simulate`.
struct HistoryMonteCarloTreeSearch
	ùí´ # problem
	N # visit counts
	Q # action value estimates
	d # depth
	m # number of simulations
	c # exploration constant
	U # value function estimate
end

# Pick the action at history `h` that maximizes the value estimate plus the
# exploration constant times `bonus(count, total count at h)`.
function explore(œÄ::HistoryMonteCarloTreeSearch, h)
    actions, N, Q, c = œÄ.ùí´.ùíú, œÄ.N, œÄ.Q, œÄ.c
    total = sum(get(N, (h, a), 0) for a in actions)
    return argmax(actions) do a
        Q[(h, a)] + c*bonus(N[(h, a)], total)
    end
end

# Run one simulation from state `s` with history `h` and remaining depth `d`.
#
# At depth zero, or the first time a history is seen (its counts and values
# are then initialized to zero), the value function estimate is returned.
# Otherwise an action is chosen by `explore`, one step is sampled, the search
# recurses on the extended history, and the running average for the
# (history, action) pair is updated. Returns the sampled return.
function simulate(œÄ::HistoryMonteCarloTreeSearch, s, h, d)
    d <= 0 && return œÄ.U(s)
    problem, N, Q = œÄ.ùí´, œÄ.N, œÄ.Q
    actions, TRO, discount = problem.ùíú, problem.TRO, problem.Œ≥
    if !haskey(N, (h, first(actions)))
        # Unseen history: create its entries and fall back to the estimate.
        for a in actions
            N[(h, a)] = 0
            Q[(h, a)] = 0.0
        end
        return œÄ.U(s)
    end
    a = explore(œÄ, h)
    s_next, r, o = TRO(s, a)
    q = r + discount*simulate(œÄ, s_next, vcat(h, (a, o)), d-1)
    key = (h, a)
    N[key] += 1
    # Incremental mean of the sampled returns.
    Q[key] += (q - Q[key])/N[key]
    return q
end

# Plan from belief `b` (and optional history `h`): run the configured number
# of simulations from states sampled from the belief, then return the action
# with the highest value estimate at `h`.
function (œÄ::HistoryMonteCarloTreeSearch)(b, h=[])
    problem = œÄ.ùí´
    for _ in 1:œÄ.m
        start = rand(SetCategorical(problem.ùíÆ, b))
        simulate(œÄ, start, h, œÄ.d)
    end
    return argmax(a -> œÄ.Q[(h, a)], problem.ùíú)
end
####################

#################### online-approximations 3
# A state tagged with the row (`i`) and column (`j`) used to look up its
# random number in the determinizing matrix (see `successor`).
struct DeterminizedParticle
    s # state
    i # scenario index
    j # depth index
end

# Deterministic successor of a particle under action `a`: walk the
# (next state, observation) pairs, accumulating their joint probability, and
# return the first pair at which the running total reaches the particle's
# entry in the determinizing matrix. Falls back to the last state and last
# observation if the total never reaches it.
function successor(ùí´, Œ¶, œï, a)
    states, observations, T, O = ùí´.ùíÆ, ùí´.ùí™, ùí´.T, ùí´.O
    cumulative = 0.0
    for (s_next, o) in product(states, observations)
        cumulative += T(œï.s, a, s_next) * O(a, s_next, o)
        cumulative >= Œ¶[œï.i, œï.j] && return (s_next, o)
    end
    return last(states), last(observations)
end

# Distinct observations produced by the particles of belief `b` under action
# `a`, in order of first appearance.
function possible_observations(ùí´, Œ¶, b, a)
    seen = Any[successor(ùí´, Œ¶, particle, a)[2] for particle in b]
    return unique(seen)
end

function update(b, Œ¶, ùí´, a, o)
    b‚Ä≤ = []
    for œï in b
        s‚Ä≤, o‚Ä≤ = successor(ùí´, Œ¶, œï, a)
        if o == o‚Ä≤
            push!(b‚Ä≤, DeterminizedParticle(s‚Ä≤, œï.i, œï.j + 1))
        end
    end
    return b‚Ä≤
end
####################

#################### online-approximations 4
# Settings for determinized sparse tree search. The number of rows of the
# determinizing matrix sets how many particles are sampled per decision.
struct DeterminizedSparseTreeSearch
	ùí´ # problem
    d # depth
    Œ¶ # m-by-d determinizing matrix
    U # value function to use at leaf nodes
end

# Depth-limited search over a particle belief `b` using the determinized
# successors. At depth zero the leaf value `U(b)` is returned with no action;
# otherwise every action is scored by its mean reward over the particles plus
# the discounted, probability-weighted value of each reachable observation
# branch. Returns a named tuple `(a, u)` for the best action.
function determinized_sparse_tree_search(ùí´, b, d, Œ¶, U)
    states, actions = ùí´.ùíÆ, ùí´.ùíú
    T, R, O, discount = ùí´.T, ùí´.R, ùí´.O, ùí´.Œ≥
    if d == 0
        return (a=nothing, u=U(b))
    end
    best_a, best_u = nothing, -Inf
    for a in actions
        u = sum(R(particle.s, a) for particle in b) / length(b)
        for o in possible_observations(ùí´, Œ¶, b, a)
            # Probability of the observation, averaged over the particles.
            p_obs = sum(sum(O(a, s_next, o)*T(particle.s, a, s_next) for s_next in states)
                        for particle in b) / length(b)
            b_next = update(b, Œ¶, ùí´, a, o)
            u_next = determinized_sparse_tree_search(ùí´, b_next, d-1, Œ¶, U).u
            u += discount*p_obs*u_next
        end
        if u > best_u
            best_a, best_u = a, u
        end
    end
    return (a=best_a, u=best_u)
end

# Sample `m` particles from belief `b`; particle `i` uses scenario index `i`
# and starts at depth index 1.
function determinized_belief(b, ùí´, m)
    return Any[DeterminizedParticle(rand(SetCategorical(ùí´.ùíÆ, b)), i, 1) for i in 1:m]
end

# Plan from belief `b`: sample one particle per row of the determinizing
# matrix and return the best action found by the tree search.
function (œÄ::DeterminizedSparseTreeSearch)(b)
    n_scenarios = size(œÄ.Œ¶, 1)
    particles = determinized_belief(b, œÄ.ùí´, n_scenarios)
    result = determinized_sparse_tree_search(œÄ.ùí´, particles, œÄ.d, œÄ.Œ¶, œÄ.U)
    return result.a
end
####################

#################### online-approximations 5
# Settings for gap heuristic search. `Ulo` and `Uhi` are called on beliefs to
# seed the lower/upper bound tables; the search stops early once the gap
# between the two bounds at the root belief drops below the threshold.
struct GapHeuristicSearch
    ùí´     # problem
    Ulo   # lower bound on value function
    Uhi   # upper bound on value function
    Œ¥     # gap threshold
    k_max # maximum number of simulations
    d_max # maximum depth
end

# One simulation of gap heuristic search from belief `b` with `d` steps left.
# `Ulo` and `Uhi` are dictionaries keyed by belief and are updated in place.
function heuristic_search(œÄ::GapHeuristicSearch, Ulo, Uhi, b, d)
    ùí´, Œ¥ = œÄ.ùí´, œÄ.Œ¥
    ùíú, ùí™ = ùí´.ùíú, ùí´.ùí™
    # Successor belief for every (action, observation) pair, plus a copy of the
    # current belief stored under the empty-tuple key.
    B = Dict((a, o) => update(b, ùí´, a, o) for (a, o) in product(ùíú, ùí™))
    B = merge(B, Dict(() => copy(b)))
    # Seed the bounds of any belief not seen before.
    for belief in values(B)
        haskey(Uhi, belief) && continue
        Ulo[belief], Uhi[belief] = œÄ.Ulo(belief), œÄ.Uhi(belief)
    end
    if d == 0 || Uhi[b] - Ulo[b] <= Œ¥
        return
    end
    upper(x) = Uhi[x]
    lower(x) = Ulo[x]
    # Explore the action with the best upper bound and, under it, the
    # observation whose successor belief has the widest gap.
    a = argmax(act -> lookahead(ùí´, upper, b, act), ùíú)
    o = argmax(obs -> Uhi[B[(a, obs)]] - Ulo[B[(a, obs)]], ùí™)
    heuristic_search(œÄ, Ulo, Uhi, update(b, ùí´, a, o), d - 1)
    # Back up both bounds at the current belief.
    Ulo[b] = maximum(lookahead(ùí´, lower, b, act) for act in ùíú)
    Uhi[b] = maximum(lookahead(ùí´, upper, b, act) for act in ùíú)
end

# Policy call: run up to `k_max` simulations from belief `b`, stopping early
# when the bound gap at `b` is below the threshold, then return the action
# with the best one-step lookahead under the lower bound.
function (œÄ::GapHeuristicSearch)(b)
    ùí´, Œ¥ = œÄ.ùí´, œÄ.Œ¥
    Ulo = Dict{Vector{Float64}, Float64}()
    Uhi = Dict{Vector{Float64}, Float64}()
    for _ in 1:œÄ.k_max
        heuristic_search(œÄ, Ulo, Uhi, b, œÄ.d_max)
        Uhi[b] - Ulo[b] < Œ¥ && break
    end
    lower(x) = Ulo[x]
    return argmax(a -> lookahead(ùí´, lower, b, a), ùí´.ùíú)
end
####################

#################### controller-abstractions 1
# Finite-state controller policy. The action selection table is indexed by
# (node, action) and the successor selection table by
# (node, action, observation, next node). Mutable because the solvers in this
# file reassign the node set and both tables.
mutable struct ControllerPolicy
    ùí´ # problem
    X # set of controller nodes
    œà # action selection distribution
    Œ∑ # successor selection distribution
end

# Sample an action for controller node `x` from its action selection weights.
function (œÄ::ControllerPolicy)(x)
    actions = œÄ.ùí´.ùíú
    weights = [œÄ.œà[x, a] for a in actions]
    return rand(SetCategorical(actions, weights))
end

# Sample the next controller node after taking action `a` in node `x` and
# observing `o`.
function update(œÄ::ControllerPolicy, x, a, o)
    nodes = œÄ.X
    weights = [œÄ.Œ∑[x, a, o, nxt] for nxt in nodes]
    return rand(SetCategorical(nodes, weights))
end
####################

#################### controller-abstractions 2
# One-step backup of the controller value table `U` (indexed by node and state)
# for node `x` and state `s`.
function utility(œÄ::ControllerPolicy, U, x, s)
    ùíÆ, ùíú, ùí™ = œÄ.ùí´.ùíÆ, œÄ.ùí´.ùíú, œÄ.ùí´.ùí™
    T, O, R, Œ≥ = œÄ.ùí´.T, œÄ.ùí´.O, œÄ.ùí´.R, œÄ.ùí´.Œ≥
    X, œà, Œ∑ = œÄ.X, œÄ.œà, œÄ.Œ∑
    # Expected value over successor nodes, given action, next state and observation.
    node_value(a, s2, o) = sum(Œ∑[x, a, o, x2] * U[x2, s2] for x2 in X)
    # Transition-weighted expectation over observations for one next state.
    state_value(a, s2) =
        T(s, a, s2) * sum(O(a, s2, o) * node_value(a, s2, o) for o in ùí™)
    # Immediate reward plus discounted expectation over next states.
    action_value(a) = R(s, a) + Œ≥ * sum(state_value(a, s2) for s2 in ùíÆ)
    return sum(œà[x, a] * action_value(a) for a in ùíú)
end

# Evaluate a controller by applying `utility` to every (node, state) pair for
# `k_max` sweeps, starting from an all-zero table. Each sweep builds a new
# table from the previous one.
function iterative_policy_evaluation(œÄ::ControllerPolicy, k_max)
    ùíÆ, X = œÄ.ùí´.ùíÆ, œÄ.X
    sweep(prev) = Dict((x, s) => utility(œÄ, prev, x, s) for x in X, s in ùíÆ)
    U = Dict((x, s) => 0.0 for x in X, s in ùíÆ)
    for _ in 1:k_max
        U = sweep(U)
    end
    return U
end
####################

#################### controller-abstractions 3
# Settings for controller policy iteration: how many improve/prune rounds to
# run and how many evaluation sweeps to perform before each round.
struct ControllerPolicyIteration
    k_max    # number of iterations
    eval_max # number of evaluation iterations
end

# Policy iteration over controllers. Starts from a single node that picks
# actions uniformly and always transitions to itself, then alternates
# evaluation, improvement and pruning for `k_max` rounds.
function solve(M::ControllerPolicyIteration, ùí´::POMDP)
    ùíú, ùí™ = ùí´.ùíú, ùí´.ùí™
    X = [1]
    œà = Dict((x, a) => 1.0 / length(ùíú) for x in X, a in ùíú)
    Œ∑ = Dict((x, a, o, x2) => 1.0 for x in X, a in ùíú, o in ùí™, x2 in X)
    controller = ControllerPolicy(ùí´, X, œà, Œ∑)
    for _ in 1:M.k_max
        # Snapshot of the nodes before improvement adds new ones.
        previous = copy(controller.X)
        U = iterative_policy_evaluation(controller, M.eval_max)
        policy_improvement!(controller, U, previous)
        prune!(controller, U, previous)
    end
    return controller
end

# Grow the controller in place: add one new deterministic node for every
# combination of an action and a per-observation successor node, and extend
# `U` with the one-step lookahead value of each new node.
# `prevX` is the node list as it was before this call.
function policy_improvement!(œÄ::ControllerPolicy, U, prevX)
    ùíÆ, ùíú, ùí™ = œÄ.ùí´.ùíÆ, œÄ.ùí´.ùíú, œÄ.ùí´.ùí™
    X, œà, Œ∑ = œÄ.X, œÄ.œà, œÄ.Œ∑
    repeatXùí™ = fill(X, length(ùí™))
    # Materialized up front (collect), so the assignments range over the nodes
    # present at entry even though X grows inside the loop below.
    # Each entry is (action, successor for 1st observation, successor for 2nd, ...).
    assignùíúX‚Ä≤ = vec(collect(product(ùíú, repeatXùí™...)))
    for ax‚Ä≤ in assignùíúX‚Ä≤
        # New node id is one past the current maximum; its action is the first entry.
        x, a = maximum(X) + 1, ax‚Ä≤[1]
        push!(X, x)
        # Successor chosen for observation o: offset by one because slot 1 holds the action.
        successor(o) = ax‚Ä≤[findfirst(isequal(o), ùí™) + 1]
        U‚Ä≤(o,s‚Ä≤) = U[successor(o), s‚Ä≤]
        for s in ùíÆ
            U[x, s] = lookahead(œÄ.ùí´, U‚Ä≤, s, a)
        end
        for a‚Ä≤ in ùíú
            # Deterministic action choice for the new node.
            œà[x, a‚Ä≤] = a‚Ä≤ == a ? 1.0 : 0.0
            for (o, x‚Ä≤) in product(ùí™, prevX)
                # Deterministic successor choice among the previous nodes.
                Œ∑[x, a‚Ä≤, o, x‚Ä≤] = x‚Ä≤ == successor(o) ? 1.0 : 0.0
            end
        end
    end
    # Fill every still-missing successor entry (e.g. transitions into the new
    # nodes) with probability zero so the table is complete.
    for (x, a, o, x‚Ä≤) in product(X, ùíú, ùí™, X)
        if !haskey(Œ∑, (x, a, o, x‚Ä≤))
            Œ∑[x, a, o, x‚Ä≤] = 0.0
        end
    end
end
####################

#################### controller-abstractions 4
# Remove redundant controller nodes after an improvement step. `prevX` holds
# the nodes that existed before improvement; every other node in the
# controller is treated as new. Runs three passes, then rebuilds the node set
# and both tables without the removed nodes.
function prune!(œÄ::ControllerPolicy, U, prevX)
    ùíÆ, ùíú, ùí™, X, œà, Œ∑ = œÄ.ùí´.ùíÆ, œÄ.ùí´.ùíú, œÄ.ùí´.ùí™, œÄ.X, œÄ.œà, œÄ.Œ∑
    newX, removeX = setdiff(X, prevX), []
    # prune dominated from previous nodes
    # dominated(x, x') holds when x is no better than x' in every state.
    dominated(x,x‚Ä≤) = all(U[x,s] ‚â§ U[x‚Ä≤,s] for s in ùíÆ)
    for (x,x‚Ä≤) in product(prevX, newX)
        if x‚Ä≤ ‚àâ removeX && dominated(x, x‚Ä≤)
            # Overwrite the old node with the dominating new node's values and
            # tables (keeping the old id), then drop the new node.
            for s in ùíÆ
                U[x,s] = U[x‚Ä≤,s]
            end
            for a in ùíú
                œà[x,a] = œà[x‚Ä≤,a]
                for (o,x‚Ä≤‚Ä≤) in product(ùí™, X)
                    Œ∑[x,a,o,x‚Ä≤‚Ä≤] = Œ∑[x‚Ä≤,a,o,x‚Ä≤‚Ä≤]
                end
            end
            push!(removeX, x‚Ä≤)
        end
    end
    # prune identical from previous nodes
    # Two nodes are identical when their action and successor tables agree
    # (approximately) on every entry.
    identical_action(x,x‚Ä≤) = all(œà[x,a] ‚âà œà[x‚Ä≤,a] for a in ùíú)
    identical_successor(x,x‚Ä≤) = all(Œ∑[x,a,o,x‚Ä≤‚Ä≤] ‚âà Œ∑[x‚Ä≤,a,o,x‚Ä≤‚Ä≤]
            for a in ùíú, o in ùí™, x‚Ä≤‚Ä≤ in X)
    identical(x,x‚Ä≤) = identical_action(x,x‚Ä≤) && identical_successor(x,x‚Ä≤)
    for (x,x‚Ä≤) in product(prevX, newX)
        if x‚Ä≤ ‚àâ removeX && identical(x,x‚Ä≤)
            push!(removeX, x‚Ä≤)
        end
    end
    # prune dominated from new nodes
    # A new node is dropped when some other node is at least as good everywhere.
    for (x,x‚Ä≤) in product(X, newX)
        if x‚Ä≤ ‚àâ removeX && dominated(x‚Ä≤,x) && x ‚â† x‚Ä≤
            push!(removeX, x‚Ä≤)
        end
    end
    # update controller
    # Table entries are filtered on their first key component (the source node).
    œÄ.X = setdiff(X, removeX)
    œÄ.œà = Dict(k => v for (k,v) in œà if k[1] ‚àâ removeX)
    œÄ.Œ∑ = Dict(k => v for (k,v) in Œ∑ if k[1] ‚àâ removeX)
end
####################

#################### controller-abstractions 5
# Settings for the nonlinear-programming controller solver: the belief whose
# expected value at the first node is maximized, and the fixed controller size.
struct NonlinearProgramming
    b # initial belief
    ‚Ñì # number of nodes
end

function tensorform(ùí´::POMDP)
    ùíÆ, ùíú, ùí™, R, T, O = ùí´.ùíÆ, ùí´.ùíú, ùí´.ùí™, ùí´.R, ùí´.T, ùí´.O
    ùíÆ‚Ä≤ = eachindex(ùíÆ)
    ùíú‚Ä≤ = eachindex(ùíú)
    ùí™‚Ä≤ = eachindex(ùí™)
    R‚Ä≤ = [R(s,a) for s in ùíÆ, a in ùíú]
    T‚Ä≤ = [T(s,a,s‚Ä≤) for s in ùíÆ, a in ùíú, s‚Ä≤ in ùíÆ]
    O‚Ä≤ = [O(a,s‚Ä≤,o) for a in ùíú, s‚Ä≤ in ùíÆ, o in ùí™]
    return ùíÆ‚Ä≤, ùíú‚Ä≤, ùí™‚Ä≤, R‚Ä≤, T‚Ä≤, O‚Ä≤
end

# Optimize a fixed-size controller with a nonlinear program solved by Ipopt.
# Returns a ControllerPolicy whose tables are keyed by the problem's actual
# actions and observations rather than by their indices.
function solve(M::NonlinearProgramming, ùí´::POMDP)
    # x1 is the node whose expected value under b is maximized.
    x1, X = 1, collect(1:M.‚Ñì)
    ùí´, Œ≥, b = ùí´, ùí´.Œ≥, M.b
    # From here on the script letters are index ranges and R, T, O are arrays.
    ùíÆ, ùíú, ùí™, R, T, O = tensorform(ùí´)
    model = Model(Ipopt.Optimizer)
    @variable(model, U[X,ùíÆ])
    @variable(model, œà[X,ùíú] ‚â• 0)
    @variable(model, Œ∑[X,ùíú,ùí™,X] ‚â• 0)
    @objective(model, Max, b‚ãÖU[x1,:])
    # Value consistency: each U[x,s] must equal its one-step backup under the
    # controller tables.
    @NLconstraint(model, [x=X,s=ùíÆ],
        U[x,s] == (sum(œà[x,a]*(R[s,a] + Œ≥*sum(T[s,a,s‚Ä≤]*sum(O[a,s‚Ä≤,o]
        *sum(Œ∑[x,a,o,x‚Ä≤]*U[x‚Ä≤,s‚Ä≤] for x‚Ä≤ in X)
        for o in ùí™) for s‚Ä≤ in ùíÆ)) for a in ùíú)))
    # Both tables must be probability distributions (non-negativity is set on
    # the variables above).
    @constraint(model, [x=X], sum(œà[x,:]) == 1)
    @constraint(model, [x=X,a=ùíú,o=ùí™], sum(Œ∑[x,a,o,:]) == 1)
    optimize!(model)
    œà‚Ä≤, Œ∑‚Ä≤ = value.(œà), value.(Œ∑)
    # Map indices back to the original actions/observations for the result.
    return ControllerPolicy(ùí´, X,
        Dict((x, ùí´.ùíú[a]) => œà‚Ä≤[x, a] for x in X, a in ùíú),
        Dict((x, ùí´.ùíú[a], ùí´.ùí™[o], x‚Ä≤) => Œ∑‚Ä≤[x, a, o, x‚Ä≤]
             for x in X, a in ùíú, o in ùí™, x‚Ä≤ in X))
end
####################

#################### controller-abstractions 6
# Settings for gradient ascent over controller parameters: the belief used to
# weight the value of the first node, the fixed controller size, the step
# size, and the number of improvement steps.
struct ControllerGradient
    b       # initial belief
    ‚Ñì       # number of nodes
    Œ±       # gradient step
    k_max   # maximum iterations
end

# Gradient-ascent controller optimization: start from tables filled with
# `rand()` values and apply `improve!` `k_max` times.
function solve(M::ControllerGradient, ùí´::POMDP)
    ùíú, ùí™ = ùí´.ùíú, ùí´.ùí™
    X = collect(1:M.‚Ñì)
    œà = Dict((x, a) => rand() for x in X, a in ùíú)
    Œ∑ = Dict((x, a, o, x2) => rand() for x in X, a in ùíú, o in ùí™, x2 in X)
    controller = ControllerPolicy(ùí´, X, œà, Œ∑)
    foreach(_ -> improve!(controller, M, ùí´), 1:M.k_max)
    return controller
end

# One projected-gradient step on both controller tables. Every row (the action
# weights of a node, or the successor weights of a node/action/observation
# triple) is moved along the gradient of the belief-weighted value of node 1
# and then projected back onto the probability simplex.
function improve!(œÄ::ControllerPolicy, M::ControllerGradient, ùí´::POMDP)
    ùíÆ, ùíú, ùí™ = ùí´.ùíÆ, ùí´.ùíú, ùí´.ùí™
    X, x1, œà, Œ∑ = œÄ.X, 1, œÄ.œà, œÄ.Œ∑
    n, b, ‚Ñì, Œ± = length(ùíÆ), M.b, M.‚Ñì, M.Œ±
    dU_dpsi, dU_deta = gradient(œÄ, M, ùí´)
    # Position of (node, state index) in the flattened utility vector.
    flat(x, s) = (s - 1) * ‚Ñì + (x - 1) + 1
    # Belief-weighted value of the first node for a flattened utility vector.
    expected(U) = sum(b[s] * U[flat(x1, s)] for s in 1:n)
    new_psi = Dict((x, a) => 0.0 for x in X, a in ùíú)
    new_eta = Dict((x, a, o, x2) => 0.0 for x in X, a in ùíú, o in ùí™, x2 in X)
    for x in X
        psi_row = [œà[x, a] + Œ± * expected(dU_dpsi(x, a)) for a in ùíú]
        for (a, p) in zip(ùíú, project_to_simplex(psi_row))
            new_psi[x, a] = p
        end
        for (a, o) in product(ùíú, ùí™)
            eta_row = [(Œ∑[x, a, o, x2] +
                        Œ± * expected(dU_deta(x, a, o, x2))) for x2 in X]
            for (x2, p) in zip(X, project_to_simplex(eta_row))
                new_eta[x, a, o, x2] = p
            end
        end
    end
    œÄ.œà, œÄ.Œ∑ = new_psi, new_eta
end

# Euclidean projection of the vector `y` onto the probability simplex
# (non-negative entries summing to one).
#
# Sorts the entries in decreasing order, finds the largest prefix length `i`
# for which the shifted i-th entry stays positive, and shifts every entry of
# `y` by the same amount before clipping at zero.
#
# Throws `ArgumentError` when `y` is empty or when no valid prefix exists
# (for example when `y` contains NaN).
function project_to_simplex(y)
    isempty(y) && throw(ArgumentError("cannot project an empty vector onto the simplex"))
    u = sort(y, rev=true)
    # Running prefix sums, computed once instead of re-summing u[1:j] for every j.
    prefix = cumsum(u)
    i = findlast(j -> u[j] + (1 - prefix[j]) / j > 0.0, eachindex(u))
    i === nothing &&
        throw(ArgumentError("no valid simplex projection (does the input contain NaN?)"))
    shift = (1 - prefix[i]) / i
    return [max(yj + shift, 0.0) for yj in y]
end
####################

#################### controller-abstractions 7
# Build closures that return the derivative of the flattened controller
# utility vector with respect to one action-table entry (hx, ha) or one
# successor-table entry (hx, ha, ho, hx'). Rows/columns of the matrices below
# are ordered by the (node, state) pairs in the flattened product list.
function gradient(œÄ::ControllerPolicy, M::ControllerGradient, ùí´::POMDP)
    ùíÆ, ùíú, ùí™, T, O, R, Œ≥ = ùí´.ùíÆ, ùí´.ùíú, ùí´.ùí™, ùí´.T, ùí´.O, ùí´.R, ùí´.Œ≥
    X, x1, œà, Œ∑ = œÄ.X, 1, œÄ.œà, œÄ.Œ∑
    n, m, z = length(ùíÆ), length(ùíú), length(ùí™)
    # Flattened list of (node, state) pairs; the node varies fastest.
    XùíÆ = vec(collect(product(X, ùíÆ)))
    # Transition matrix and reward vector over (node, state) pairs induced by
    # the controller tables.
    T‚Ä≤ = [sum(œà[x, a] * T(s, a, s‚Ä≤) * sum(O(a, s‚Ä≤, o) * Œ∑[x, a, o, x‚Ä≤]
          for o in ùí™) for a in ùíú) for (x, s) in XùíÆ, (x‚Ä≤, s‚Ä≤) in XùíÆ]
    R‚Ä≤ = [sum(œà[x, a] * R(s, a) for a in ùíú) for (x, s) in XùíÆ]
    # Z is identity minus the discounted transition matrix; it is inverted
    # once here and the inverse is shared by all returned closures.
    Z = 1.0I(length(XùíÆ)) - Œ≥ * T‚Ä≤
    invZ = inv(Z)
    # Derivatives of Z: only rows belonging to node hx (and, for the successor
    # table, columns belonging to node hx') are non-zero.
    ‚àÇZ‚àÇœà(hx, ha) = [x == hx ? (-Œ≥ * T(s, ha, s‚Ä≤)
                    * sum(O(ha, s‚Ä≤, o) * Œ∑[hx, ha, o, x‚Ä≤]
                          for o in ùí™)) : 0.0
                    for (x, s) in XùíÆ, (x‚Ä≤, s‚Ä≤) in XùíÆ]
    ‚àÇZ‚àÇŒ∑(hx, ha, ho, hx‚Ä≤) = [x == hx && x‚Ä≤ == hx‚Ä≤ ? (-Œ≥ * œà[hx, ha]
                    * T(s, ha, s‚Ä≤) * O(ha, s‚Ä≤, ho)) : 0.0
                 for (x, s) in XùíÆ, (x‚Ä≤, s‚Ä≤) in XùíÆ]
    # Derivatives of the reward vector; it does not depend on the successor
    # table, so that derivative is all zeros.
    ‚àÇR‚Ä≤‚àÇœà(hx, ha) = [x == hx ? R(s, ha) : 0.0 for (x, s) in XùíÆ]
    ‚àÇR‚Ä≤‚àÇŒ∑(hx, ha, ho, hx‚Ä≤) = [0.0 for (x, s) in XùíÆ]
    # Derivative of invZ * R' by the product rule, using d(inv Z) = -inv Z * dZ * inv Z.
    ‚àÇU‚Ä≤‚àÇœà(hx, ha) = invZ * (‚àÇR‚Ä≤‚àÇœà(hx, ha) - ‚àÇZ‚àÇœà(hx, ha) * invZ * R‚Ä≤)
    ‚àÇU‚Ä≤‚àÇŒ∑(hx, ha, ho, hx‚Ä≤) = invZ * (‚àÇR‚Ä≤‚àÇŒ∑(hx, ha, ho, hx‚Ä≤)
                                - ‚àÇZ‚àÇŒ∑(hx, ha, ho, hx‚Ä≤) * invZ * R‚Ä≤)
    return ‚àÇU‚Ä≤‚àÇœà, ‚àÇU‚Ä≤‚àÇŒ∑
end
####################

#################### multiagent_reasoning 1
# A single-shot game. In this file the joint action space is used as one
# collection of actions per agent, and the reward function is called on a
# joint action and indexed by agent.
struct SimpleGame
    Œ≥  # discount factor
    ‚Ñê  # agents
    ùíú  # joint action space
    R  # joint reward function
end
####################

#################### multiagent_reasoning 2
# Stochastic policy of one agent in a simple game, stored as a dictionary from
# actions to probabilities.
struct SimpleGamePolicy
    p # dictionary mapping actions to probabilities

    # Build from a generator of `action => weight` pairs; weights are normalized.
    function SimpleGamePolicy(p::Base.Generator)
        return SimpleGamePolicy(Dict(p))
    end

    # Build from a dictionary of non-normalized weights. Each weight is divided
    # by the total into a fresh value rather than normalized in place, so
    # integer weights (e.g. raw counts) work instead of raising InexactError.
    function SimpleGamePolicy(p::Dict)
        vs = collect(values(p))
        total = sum(vs)
        return new(Dict(k => v / total for (k, v) in zip(keys(p), vs)))
    end

    # Deterministic policy that always plays `ai`.
    SimpleGamePolicy(ai) = new(Dict(ai => 1.0))
end

# Probability that the policy plays `ai`; actions without an entry get 0.0.
function (œÄi::SimpleGamePolicy)(ai)
    return get(() -> 0.0, œÄi.p, ai)
end

# Sample an action according to the stored probabilities.
function (œÄi::SimpleGamePolicy)()
    actions = collect(keys(œÄi.p))
    weights = collect(values(œÄi.p))
    return rand(SetCategorical(actions, weights))
end

# All combinations that take one element from each collection in `X`, as a
# flat vector of tuples (the first collection varies fastest).
function joint(X)
    combos = collect(product(X...))
    return reshape(combos, :)
end

# Copy of the joint policy `œÄ` in which the entry at position `i` is replaced
# by `œÄi`.
function joint(œÄ, œÄi, i)
    return [j == i ? œÄi : other for (j, other) in enumerate(œÄ)]
end

function utility(ùí´::SimpleGame, œÄ, i)
    ùíú, R = ùí´.ùíú, ùí´.R
    p(a) = prod(œÄj(aj) for (œÄj, aj) in zip(œÄ, a))
    return sum(R(a)[i]*p(a) for a in joint(ùíú))
end
####################

#################### multiagent_reasoning 3
function best_response(ùí´::SimpleGame, œÄ, i)
    U(ai) = utility(ùí´, joint(œÄ, SimpleGamePolicy(ai), i), i)
    ai = argmax(U, ùí´.ùíú[i])
    return SimpleGamePolicy(ai)
end
####################

#################### multiagent_reasoning 4
function softmax_response(ùí´::SimpleGame, œÄ, i, Œª)
    ùíúi = ùí´.ùíú[i]
    U(ai) = utility(ùí´, joint(œÄ, SimpleGamePolicy(ai), i), i)
    return SimpleGamePolicy(ai => exp(Œª*U(ai)) for ai in ùíúi)
end
####################

#################### multiagent_reasoning 5
# Marker type that selects the Nash equilibrium `solve` methods; carries no data.
struct NashEquilibrium end

function tensorform(ùí´::SimpleGame)
    ‚Ñê, ùíú, R = ùí´.‚Ñê, ùí´.ùíú, ùí´.R
    ‚Ñê‚Ä≤ = eachindex(‚Ñê)
    ùíú‚Ä≤ = [eachindex(ùíú[i]) for i in ‚Ñê]
    R‚Ä≤ = [R(a) for a in joint(ùíú)]
    return ‚Ñê‚Ä≤, ùíú‚Ä≤, R‚Ä≤
end

function solve(M::NashEquilibrium, ùí´::SimpleGame)
    ‚Ñê, ùíú, R = tensorform(ùí´)
    model = Model(Ipopt.Optimizer)
    @variable(model, U[‚Ñê])
    @variable(model, œÄ[i=‚Ñê, ùíú[i]] ‚â• 0)
    @NLobjective(model, Min,
        sum(U[i] - sum(prod(œÄ[j,a[j]] for j in ‚Ñê) * R[y][i]
            for (y,a) in enumerate(joint(ùíú))) for i in ‚Ñê))
    @NLconstraint(model, [i=‚Ñê, ai=ùíú[i]],
        U[i] ‚â• sum(
            prod(j==i ? (a[j]==ai ? 1.0 : 0.0) : œÄ[j,a[j]] for j in ‚Ñê)
            * R[y][i] for (y,a) in enumerate(joint(ùíú))))
    @constraint(model, [i=‚Ñê], sum(œÄ[i,ai] for ai in ùíú[i]) == 1)
    optimize!(model)
    œÄi‚Ä≤(i) = SimpleGamePolicy(ùí´.ùíú[i][ai] => value(œÄ[i,ai]) for ai in ùíú[i])
    return [œÄi‚Ä≤(i) for i in ‚Ñê]
end
####################

#################### multiagent_reasoning 6
# Policy over joint actions. The only constructor takes a generator of
# `joint action => probability` pairs and stores them as a dictionary without
# normalizing.
mutable struct JointCorrelatedPolicy
    p # dictionary mapping from joint actions to probabilities
    JointCorrelatedPolicy(p::Base.Generator) = new(Dict(p))
end

# Probability of the joint action `a`; joint actions without an entry get 0.0.
function (œÄ::JointCorrelatedPolicy)(a)
    return get(() -> 0.0, œÄ.p, a)
end

# Sample a joint action according to the stored probabilities.
function (œÄ::JointCorrelatedPolicy)()
    joint_actions = collect(keys(œÄ.p))
    weights = collect(values(œÄ.p))
    return rand(SetCategorical(joint_actions, weights))
end
####################

#################### multiagent_reasoning 7
# Marker type that selects the correlated equilibrium `solve` method; carries no data.
struct CorrelatedEquilibrium end

# Compute a correlated equilibrium of a simple game by optimizing a
# distribution over joint actions with Ipopt. Returns a JointCorrelatedPolicy.
function solve(M::CorrelatedEquilibrium, ùí´::SimpleGame)
    ‚Ñê, ùíú, R = ùí´.‚Ñê, ùí´.ùíú, ùí´.R
    model = Model(Ipopt.Optimizer)
    # One non-negative probability variable per joint action.
    @variable(model, œÄ[joint(ùíú)] ‚â• 0)
    # Maximize the expected reward summed over all agents.
    @objective(model, Max, sum(sum(œÄ[a]*R(a) for a in joint(ùíú))))
    # For every agent i and pair (ai, ai'): over the joint actions in which
    # agent i plays ai, keeping ai must be worth at least as much as switching
    # to ai'.
    @constraint(model, [i=‚Ñê, ai=ùíú[i], ai‚Ä≤=ùíú[i]],
        sum(R(a)[i]*œÄ[a] for a in joint(ùíú) if a[i]==ai)
        ‚â• sum(R(joint(a,ai‚Ä≤,i))[i]*œÄ[a] for a in joint(ùíú) if a[i]==ai))
    @constraint(model, sum(œÄ) == 1)
    optimize!(model)
    return JointCorrelatedPolicy(a => value(œÄ[a]) for a in joint(ùíú))
end
####################

#################### multiagent_reasoning 8
# Settings for iterated best response: how many rounds to run and the joint
# policy (one policy per agent) to start from.
struct IteratedBestResponse
    k_max # number of iterations
    œÄ     # initial policy
end

function IteratedBestResponse(ùí´::SimpleGame, k_max)
    œÄ = [SimpleGamePolicy(ai => 1.0 for ai in ùíúi) for ùíúi in ùí´.ùíú]
    return IteratedBestResponse(k_max, œÄ)
end

function solve(M::IteratedBestResponse, ùí´)
    œÄ = M.œÄ
    for k in 1:M.k_max
        œÄ = [best_response(ùí´, œÄ, i) for i in ùí´.‚Ñê]
    end
    return œÄ
end
####################

#################### multiagent_reasoning 9
# Settings for hierarchical softmax: the softmax precision, the number of
# response levels to apply, and the joint policy to start from.
struct HierarchicalSoftmax
    Œª # precision parameter
    k # level
    œÄ # initial policy
end

function HierarchicalSoftmax(ùí´::SimpleGame, Œª, k)
    œÄ = [SimpleGamePolicy(ai => 1.0 for ai in ùíúi) for ùíúi in ùí´.ùíú]
    return HierarchicalSoftmax(Œª, k, œÄ)
end

function solve(M::HierarchicalSoftmax, ùí´)
    œÄ = M.œÄ
    for k in 1:M.k
        œÄ = [softmax_response(ùí´, œÄ, i, M.Œª) for i in ùí´.‚Ñê]
    end
    return œÄ
end
####################

#################### multiagent_reasoning 10
function simulate(ùí´::SimpleGame, œÄ, k_max)
    for k = 1:k_max
        a = [œÄi() for œÄi in œÄ]
        for œÄi in œÄ
            update!(œÄi, a)
        end
    end
    return œÄ
end
####################

#################### multiagent_reasoning 11
# Fictitious-play learner for one agent of a simple game. Keeps, for every
# agent, a dictionary counting how often each of its actions was played, and
# the learner's own current policy (replaced on every update).
mutable struct FictitiousPlay
    ùí´  # simple game
    i  # agent index
    N  # array of action count dictionaries
    œÄi # current policy
end

function FictitiousPlay(ùí´::SimpleGame, i)
    N = [Dict(aj => 1 for aj in ùí´.ùíú[j]) for j in ùí´.‚Ñê]
    œÄi = SimpleGamePolicy(ai => 1.0 for ai in ùí´.ùíú[i])
    return FictitiousPlay(ùí´, i, N, œÄi)
end

# Sample an action from the learner's current policy.
function (œÄi::FictitiousPlay)()
    current = œÄi.œÄi
    return current()
end

# Probability that the learner's current policy plays `ai`.
function (œÄi::FictitiousPlay)(ai)
    current = œÄi.œÄi
    return current(ai)
end

# Record the joint action `a` in the counts, rebuild every agent's empirical
# policy from the counts, and set the learner's policy to its best response.
function update!(œÄi::FictitiousPlay, a)
    N, ùí´, ‚Ñê, i = œÄi.N, œÄi.ùí´, œÄi.ùí´.‚Ñê, œÄi.i
    for (j, aj) in enumerate(a)
        N[j][aj] += 1
    end
    # Relative frequency of each action of agent j.
    function empirical(j)
        counts = N[j]
        total = sum(values(counts))
        return SimpleGamePolicy(aj => c / total for (aj, c) in counts)
    end
    estimate = [empirical(j) for j in ‚Ñê]
    œÄi.œÄi = best_response(ùí´, estimate, i)
end
####################

#################### multiagent_reasoning 12
# Gradient-ascent learner for one agent of a simple game. The time step is
# incremented on every update and scales the step size there; the current
# policy is replaced on every update.
mutable struct GradientAscent
    ùí´  # simple game
    i  # agent index
    t  # time step
    œÄi # current policy
end

function GradientAscent(ùí´::SimpleGame, i)
    uniform() = SimpleGamePolicy(ai => 1.0 for ai in ùí´.ùíú[i])
    return GradientAscent(ùí´, i, 1, uniform())
end

# Sample an action from the learner's current policy.
function (œÄi::GradientAscent)()
    current = œÄi.œÄi
    return current()
end

# Probability that the learner's current policy plays `ai`.
function (œÄi::GradientAscent)(ai)
    current = œÄi.œÄi
    return current(ai)
end

# Gradient step after observing the joint action `a`: for each own action,
# compute the utility when the other agents repeat what they just played, move
# the current probabilities along those utilities with step 1/sqrt(t), project
# onto the simplex, and advance the time step.
function update!(œÄi::GradientAscent, a)
    ùí´, i, t = œÄi.ùí´, œÄi.i, œÄi.t
    ‚Ñê, own_actions = ùí´.‚Ñê, ùí´.ùíú[i]
    # Deterministic joint policy: own action ai, everyone else as observed.
    deviation(ai) = [SimpleGamePolicy(j == i ? ai : a[j]) for j in ‚Ñê]
    payoffs = [utility(ùí´, deviation(ai), i) for ai in own_actions]
    current = [œÄi.œÄi(ai) for ai in own_actions]
    stepped = project_to_simplex(current + payoffs / sqrt(t))
    œÄi.t = t + 1
    œÄi.œÄi = SimpleGamePolicy(ai => p for (ai, p) in zip(own_actions, stepped))
end
####################

#################### sequential_problems 1
# A Markov game. In this file the transition function is called as
# T(s, a, s') with a joint action `a`, and the reward function as R(s, a),
# whose result is indexed by agent.
struct MG
    Œ≥  # discount factor
    ‚Ñê  # agents
    ùíÆ  # state space
    ùíú  # joint action space
    T  # transition function
    R  # joint reward function
end
####################

#################### sequential_problems 2
# Policy of one agent in a Markov game: one simple-game policy per state.
# The only constructor takes a generator of `state => policy` pairs.
struct MGPolicy
    p # dictionary mapping states to simple game policies
    MGPolicy(p::Base.Generator) = new(Dict(p))
end

# Probability that the policy plays `ai` in state `s`.
function (œÄi::MGPolicy)(s, ai)
    state_policy = œÄi.p[s]
    return state_policy(ai)
end
# State-aware call for a simple-game policy: the state is ignored, so such a
# policy can stand in wherever a per-state policy is expected.
function (œÄi::SimpleGamePolicy)(s, ai)
    return œÄi(ai)
end

probability(ùí´::MG, s, œÄ, a) = prod(œÄj(s, aj) for (œÄj, aj) in zip(œÄ, a))
reward(ùí´::MG, s, œÄ, i) =
    sum(ùí´.R(s,a)[i]*probability(ùí´,s,œÄ,a) for a in joint(ùí´.ùíú))
transition(ùí´::MG, s, œÄ, s‚Ä≤) =
    sum(ùí´.T(s,a,s‚Ä≤)*probability(ùí´,s,œÄ,a) for a in joint(ùí´.ùíú))

function policy_evaluation(ùí´::MG, œÄ, i)
    ùíÆ, ùíú, R, T, Œ≥ = ùí´.ùíÆ, ùí´.ùíú, ùí´.R, ùí´.T, ùí´.Œ≥
    p(s,a) = prod(œÄj(s, aj) for (œÄj, aj) in zip(œÄ, a))
    R‚Ä≤ = [sum(R(s,a)[i]*p(s,a) for a in joint(ùíú)) for s in ùíÆ]
    T‚Ä≤ = [sum(T(s,a,s‚Ä≤)*p(s,a) for a in joint(ùíú)) for s in ùíÆ, s‚Ä≤ in ùíÆ]
    return (I - Œ≥*T‚Ä≤)\R‚Ä≤
end
####################

#################### sequential_problems 3
function best_response(ùí´::MG, œÄ, i)
    ùíÆ, ùíú, R, T, Œ≥ = ùí´.ùíÆ, ùí´.ùíú, ùí´.R, ùí´.T, ùí´.Œ≥
    T‚Ä≤(s,ai,s‚Ä≤) = transition(ùí´, s, joint(œÄ, SimpleGamePolicy(ai), i), s‚Ä≤)
    R‚Ä≤(s,ai) = reward(ùí´, s, joint(œÄ, SimpleGamePolicy(ai), i), i)
    œÄi = solve(MDP(Œ≥, ùíÆ, ùíú[i], T‚Ä≤, R‚Ä≤))
    return MGPolicy(s => SimpleGamePolicy(œÄi(s)) for s in ùíÆ)
end
####################

#################### sequential_problems 4
function softmax_response(ùí´::MG, œÄ, i, Œª)
    ùíÆ, ùíú, R, T, Œ≥ = ùí´.ùíÆ, ùí´.ùíú, ùí´.R, ùí´.T, ùí´.Œ≥
    T‚Ä≤(s,ai,s‚Ä≤) = transition(ùí´, s, joint(œÄ, SimpleGamePolicy(ai), i), s‚Ä≤)
    R‚Ä≤(s,ai) = reward(ùí´, s, joint(œÄ, SimpleGamePolicy(ai), i), i)
    mdp = MDP(Œ≥, ùíÆ, joint(ùíú), T‚Ä≤, R‚Ä≤)
    œÄi = solve(mdp)
    Q(s,a) = lookahead(mdp, œÄi.U, s, a)
    p(s) = SimpleGamePolicy(a => exp(Œª*Q(s,a)) for a in ùíú[i])
    return MGPolicy(s => p(s) for s in ùíÆ)
end
####################

#################### sequential_problems 5
function tensorform(ùí´::MG)
    ‚Ñê, ùíÆ, ùíú, R, T = ùí´.‚Ñê, ùí´.ùíÆ, ùí´.ùíú, ùí´.R, ùí´.T
    ‚Ñê‚Ä≤ = eachindex(‚Ñê)
    ùíÆ‚Ä≤ = eachindex(ùíÆ)
    ùíú‚Ä≤ = [eachindex(ùíú[i]) for i in ‚Ñê]
    R‚Ä≤ = [R(s,a) for s in ùíÆ, a in joint(ùíú)]
    T‚Ä≤ = [T(s,a,s‚Ä≤) for s in ùíÆ, a in joint(ùíú), s‚Ä≤ in ùíÆ]
    return ‚Ñê‚Ä≤, ùíÆ‚Ä≤, ùíú‚Ä≤, R‚Ä≤, T‚Ä≤
end

function solve(M::NashEquilibrium, ùí´::MG)
    ‚Ñê, ùíÆ, ùíú, R, T = tensorform(ùí´)
    ùíÆ‚Ä≤, ùíú‚Ä≤, Œ≥ = ùí´.ùíÆ, ùí´.ùíú, ùí´.Œ≥
    model = Model(Ipopt.Optimizer)
    @variable(model, U[‚Ñê, ùíÆ])
    @variable(model, œÄ[i=‚Ñê, ùíÆ, ai=ùíú[i]] ‚â• 0)
    @NLobjective(model, Min,
        sum(U[i,s] - sum(prod(œÄ[j,s,a[j]] for j in ‚Ñê)
            * (R[s,y][i] + Œ≥*sum(T[s,y,s‚Ä≤]*U[i,s‚Ä≤] for s‚Ä≤ in ùíÆ))
            for (y,a) in enumerate(joint(ùíú))) for i in ‚Ñê, s in ùíÆ))
    @NLconstraint(model, [i=‚Ñê, s=ùíÆ, ai=ùíú[i]],
        U[i,s] ‚â• sum(
            prod(j==i ? (a[j]==ai ? 1.0 : 0.0) : œÄ[j,s,a[j]] for j in ‚Ñê)
            * (R[s,y][i] + Œ≥*sum(T[s,y,s‚Ä≤]*U[i,s‚Ä≤] for s‚Ä≤ in ùíÆ))
            for (y,a) in enumerate(joint(ùíú))))
    @constraint(model, [i=‚Ñê, s=ùíÆ], sum(œÄ[i,s,ai] for ai in ùíú[i]) == 1)
    optimize!(model)
    œÄ‚Ä≤ = value.(œÄ)
    œÄi‚Ä≤(i,s) = SimpleGamePolicy(ùíú‚Ä≤[i][ai] => œÄ‚Ä≤[i,s,ai] for ai in ùíú[i])
    œÄi‚Ä≤(i) = MGPolicy(ùíÆ‚Ä≤[s] => œÄi‚Ä≤(i,s) for s in ùíÆ)
    return [œÄi‚Ä≤(i) for i in ‚Ñê]
end
####################

#################### sequential_problems 6
function randstep(ùí´::MG, s, a)
    s‚Ä≤ = rand(SetCategorical(ùí´.ùíÆ, [ùí´.T(s, a, s‚Ä≤) for s‚Ä≤ in ùí´.ùíÆ]))
    r = ùí´.R(s,a)
    return s‚Ä≤, r
end

function simulate(ùí´::MG, œÄ, k_max, b)
    s = rand(b)
    for k = 1:k_max
        a = Tuple(œÄi(s)() for œÄi in œÄ)
        s‚Ä≤, r = randstep(ùí´, s, a)
        for œÄi in œÄ
            update!(œÄi, s, a, s‚Ä≤)
        end
        s = s‚Ä≤
    end
    return œÄ
end
####################

#################### sequential_problems 7
# Fictitious-play learner for one agent of a Markov game. `Qi` is keyed by
# (state, joint action); `Ni` is keyed by (agent, state, individual action).
mutable struct MGFictitiousPlay
    ùí´  # Markov game
    i  # agent index
    Qi # state-action value estimates
    Ni # state-action counts
end

function MGFictitiousPlay(ùí´::MG, i)
    ‚Ñê, ùíÆ, ùíú, R = ùí´.‚Ñê, ùí´.ùíÆ, ùí´.ùíú, ùí´.R
    Qi = Dict((s, a) => R(s, a)[i] for s in ùíÆ for a in joint(ùíú))
    Ni = Dict((j, s, aj) => 1.0 for j in ‚Ñê for s in ùíÆ for aj in ùíú[j])
    return MGFictitiousPlay(ùí´, i, Qi, Ni)
end

# Policy for state `s`: model every agent by its count-weighted policy, score
# each own action by immediate reward plus discounted expected stored value of
# the next state, and return a deterministic policy on the best action.
function (œÄi::MGFictitiousPlay)(s)
    ùí´, i = œÄi.ùí´, œÄi.i
    ‚Ñê, ùíÆ, ùíú, Œ≥ = ùí´.‚Ñê, ùí´.ùíÆ, ùí´.ùíú, ùí´.Œ≥
    # Count-weighted policy of agent j, per state and over all states.
    empirical(j, st) = SimpleGamePolicy(aj => œÄi.Ni[j, st, aj] for aj in ùíú[j])
    empirical(j) = MGPolicy(st => empirical(j, st) for st in ùíÆ)
    model = [empirical(j) for j in ‚Ñê]
    # Expected stored value of state st under the joint policy pol.
    expected_q(st, pol) =
        sum(œÄi.Qi[st, a] * probability(ùí´, st, pol, a) for a in joint(ùíú))
    backup(st, pol) = reward(ùí´, st, pol, i) +
        Œ≥ * sum(transition(ùí´, st, pol, nxt) * expected_q(nxt, pol) for nxt in ùíÆ)
    # Value of playing ai for sure while the others follow the model.
    score(ai) = backup(s, joint(model, SimpleGamePolicy(ai), i))
    return SimpleGamePolicy(argmax(score, ùí´.ùíú[œÄi.i]))
end

# Learn from one transition: count the action every agent took in state `s`,
# rebuild the count-weighted model of all agents, and recompute the stored
# value of every joint action in state `s`.
function update!(œÄi::MGFictitiousPlay, s, a, s‚Ä≤)
    ùí´, i = œÄi.ùí´, œÄi.i
    ‚Ñê, ùíÆ, ùíú, T, R, Œ≥ = ùí´.‚Ñê, ùí´.ùíÆ, ùí´.ùíú, ùí´.T, ùí´.R, ùí´.Œ≥
    for (j, aj) in enumerate(a)
        œÄi.Ni[j, s, aj] += 1
    end
    empirical(j, st) = SimpleGamePolicy(aj => œÄi.Ni[j, st, aj] for aj in ùíú[j])
    empirical(j) = MGPolicy(st => empirical(j, st) for st in ùíÆ)
    model = [empirical(j) for j in ‚Ñê]
    # Expected stored value of state st under the modeled joint policy.
    expected_q(st) =
        sum(œÄi.Qi[st, act] * probability(ùí´, st, model, act) for act in joint(ùíú))
    backup(act) =
        R(s, act)[i] + Œ≥ * sum(T(s, act, nxt) * expected_q(nxt) for nxt in ùíÆ)
    # Entries are overwritten one at a time, so later backups read the values
    # already written in this sweep.
    for act in joint(ùíú)
        œÄi.Qi[s, act] = backup(act)
    end
end
####################

#################### sequential_problems 8
# Gradient-ascent learner for one agent of a Markov game. `Qi` is keyed by
# (state, joint action); the current policy is a dictionary from state to a
# simple-game policy; the time step is incremented on every update.
mutable struct MGGradientAscent
    ùí´  # Markov game
    i  # agent index
    t  # time step
    Qi # state-action value estimates
    œÄi # current policy
end

function MGGradientAscent(ùí´::MG, i)
    ‚Ñê, ùíÆ, ùíú = ùí´.‚Ñê, ùí´.ùíÆ, ùí´.ùíú
    Qi = Dict((s, a) => 0.0 for s in ùíÆ, a in joint(ùíú))
    uniform() = Dict(s => SimpleGamePolicy(ai => 1.0 for ai in ùí´.ùíú[i])
                     for s in ùíÆ)
    return MGGradientAscent(ùí´, i, 1, Qi, uniform())
end

# Policy for state `s`: mix the stored policy with a uniform distribution over
# the agent's actions, using mixing weight 1/sqrt(t) for the uniform part.
function (œÄi::MGGradientAscent)(s)
    own_actions = œÄi.ùí´.ùíú[œÄi.i]
    explore_weight = 1 / sqrt(œÄi.t)
    mixed(ai) = explore_weight / length(own_actions) +
        (1 - explore_weight) * œÄi.œÄi[s](ai)
    return SimpleGamePolicy(ai => mixed(ai) for ai in own_actions)
end

# Learn from one transition: move the stored value of (s, a) toward reward
# plus discounted best next-state value (own action varied, others as
# observed), then take a projected gradient step on the policy for state `s`
# and advance the time step. Both steps use step size 1/sqrt(t).
function update!(œÄi::MGGradientAscent, s, a, s‚Ä≤)
    ùí´, i, t, Qi = œÄi.ùí´, œÄi.i, œÄi.t, œÄi.Qi
    ‚Ñê, own_actions, R, Œ≥ = ùí´.‚Ñê, ùí´.ùíú[œÄi.i], ùí´.R, ùí´.Œ≥
    # Joint action in which only the learner's own entry is replaced by ai.
    with_own(ai) = Tuple(j == i ? ai : a[j] for j in ‚Ñê)
    step = 1 / sqrt(t)
    best_next = maximum(Qi[s‚Ä≤, with_own(ai)] for ai in own_actions)
    œÄi.Qi[s, a] += step * (R(s, a)[i] + Œ≥ * best_next - Qi[s, a])
    action_values = [Qi[s, with_own(ai)] for ai in own_actions]
    current = [œÄi.œÄi[s](ai) for ai in own_actions]
    stepped = project_to_simplex(current + action_values / sqrt(t))
    œÄi.t = t + 1
    œÄi.œÄi[s] = SimpleGamePolicy(ai => p for (ai, p) in zip(own_actions, stepped))
end
####################

#################### sequential_problems 9
# Nash Q-learning agent for a Markov game. `Q` is keyed by
# (agent, state, joint action) and `N` by (state, joint action).
mutable struct NashQLearning
    ùí´ # Markov game
    i # agent index
    Q # state-action value estimates
    N # history of actions performed
end

function NashQLearning(ùí´::MG, i)
    ‚Ñê, ùíÆ, ùíú = ùí´.‚Ñê, ùí´.ùíÆ, ùí´.ùíú
    Q = Dict((j, s, a) => 0.0 for j in ‚Ñê, s in ùíÆ, a in joint(ùíú))
    N = Dict((s, a) => 1.0 for s in ùíÆ, a in joint(ùíú))
    return NashQLearning(ùí´, i, Q, N)
end

# Policy for state `s`: solve the simple game whose rewards are the stored
# values at `s`, then mix the agent's equilibrium policy with a uniform
# distribution. The uniform weight is one over the total count stored for `s`.
function (œÄi::NashQLearning)(s)
    ùí´, i, Q, N = œÄi.ùí´, œÄi.i, œÄi.Q, œÄi.N
    ‚Ñê, ùíú, own_actions, Œ≥ = ùí´.‚Ñê, ùí´.ùíú, ùí´.ùíú[œÄi.i], ùí´.Œ≥
    stage_game = SimpleGame(Œ≥, ‚Ñê, ùíú, a -> [Q[j, s, a] for j in ‚Ñê])
    equilibrium = solve(NashEquilibrium(), stage_game)
    explore_weight = 1 / sum(N[s, a] for a in joint(ùíú))
    mixed(ai) = explore_weight / length(own_actions) +
        (1 - explore_weight) * equilibrium[i](ai)
    return SimpleGamePolicy(ai => mixed(ai) for ai in own_actions)
end

# Learn from one transition: solve the simple game defined by the stored
# values at the next state, increment the count of (s, a), and move every
# agent's value of (s, a) toward reward plus discounted equilibrium utility.
# The step size is one over the square root of the updated count.
function update!(œÄi::NashQLearning, s, a, s‚Ä≤)
    ùí´, Q, N = œÄi.ùí´, œÄi.Q, œÄi.N
    ‚Ñê, ùíú, R, Œ≥ = ùí´.‚Ñê, ùí´.ùíú, ùí´.R, ùí´.Œ≥
    next_game = SimpleGame(Œ≥, ‚Ñê, ùíú, nxt -> [Q[j, s‚Ä≤, nxt] for j in ‚Ñê])
    equilibrium = solve(NashEquilibrium(), next_game)
    œÄi.N[s, a] += 1
    step = 1 / sqrt(N[s, a])
    for j in ‚Ñê
        target = R(s, a)[j] + Œ≥ * utility(next_game, equilibrium, j)
        œÄi.Q[j, s, a] += step * (target - Q[j, s, a])
    end
end
####################

#################### state_uncertainty 1
# A partially observable Markov game. In this file the observation function
# is called as O(a, s', o) with joint action and joint observation, and the
# joint observation space is used as one collection of observations per agent.
struct POMG
    Œ≥  # discount factor
    ‚Ñê  # agents
    ùíÆ  # state space
    ùíú  # joint action space
    ùí™  # joint observation space
    T  # transition function
    O  # joint observation function
    R  # joint reward function
end
####################

#################### state_uncertainty 2
function lookahead(ùí´::POMG, U, s, a)
    ùíÆ, ùí™, T, O, R, Œ≥ = ùí´.ùíÆ, joint(ùí´.ùí™), ùí´.T, ùí´.O, ùí´.R, ùí´.Œ≥
    u‚Ä≤ = sum(T(s,a,s‚Ä≤)*sum(O(a,s‚Ä≤,o)*U(o,s‚Ä≤) for o in ùí™) for s‚Ä≤ in ùíÆ)
    return R(s,a) + Œ≥*u‚Ä≤
end

function evaluate_plan(ùí´::POMG, œÄ, s)
    a = Tuple(œÄi() for œÄi in œÄ)
    U(o,s‚Ä≤) = evaluate_plan(ùí´, [œÄi(oi) for (œÄi, oi) in zip(œÄ,o)], s‚Ä≤)
    return isempty(first(œÄ).subplans) ? ùí´.R(s,a) : lookahead(ùí´, U, s, a)
end

function utility(ùí´::POMG, b, œÄ)
    u = [evaluate_plan(ùí´, œÄ, s) for s in ùí´.ùíÆ]
    return sum(bs * us for (bs, us) in zip(b, u))
end
####################

#################### state_uncertainty 3
# Settings for the POMG Nash equilibrium solver: the belief used to score
# joint plans and the number of expansion steps applied to the plans.
struct POMGNashEquilibrium
    b # initial belief
    d # depth of conditional plans
end

function create_conditional_plans(ùí´, d)
    ‚Ñê, ùíú, ùí™ = ùí´.‚Ñê, ùí´.ùíú, ùí´.ùí™
    Œ† = [[ConditionalPlan(ai) for ai in ùíú[i]] for i in ‚Ñê]
    for t in 1:d
        Œ† = expand_conditional_plans(ùí´, Œ†)
    end
    return Œ†
end

function expand_conditional_plans(ùí´, Œ†)
    ‚Ñê, ùíú, ùí™ = ùí´.‚Ñê, ùí´.ùíú, ùí´.ùí™
    return [[ConditionalPlan(ai, Dict(oi => œÄi for oi in ùí™[i]))
        for œÄi in Œ†[i] for ai in ùíú[i]] for i in ‚Ñê]
end

function solve(M::POMGNashEquilibrium, ùí´::POMG)
    ‚Ñê, Œ≥, b, d = ùí´.‚Ñê, ùí´.Œ≥, M.b, M.d
    Œ† = create_conditional_plans(ùí´, d)
    U = Dict(œÄ => utility(ùí´, b, œÄ) for œÄ in joint(Œ†))
    ùí¢ = SimpleGame(Œ≥, ‚Ñê, Œ†, œÄ -> U[œÄ])
    œÄ = solve(NashEquilibrium(), ùí¢)
    return Tuple(argmax(œÄi.p) for œÄi in œÄ)
end
####################

#################### state_uncertainty 4
# Settings for POMG dynamic programming: the belief used to score joint plans
# and the number of expand-then-prune steps.
struct POMGDynamicProgramming
    b   # initial belief
    d   # depth of conditional plans
end

function solve(M::POMGDynamicProgramming, ùí´::POMG)
    ‚Ñê, ùíÆ, ùíú, R, Œ≥, b, d = ùí´.‚Ñê, ùí´.ùíÆ, ùí´.ùíú, ùí´.R, ùí´.Œ≥, M.b, M.d
    Œ† = [[ConditionalPlan(ai) for ai in ùíú[i]] for i in ‚Ñê]
    for t in 1:d
        Œ† = expand_conditional_plans(ùí´, Œ†)
        prune_dominated!(Œ†, ùí´)
    end
    ùí¢ = SimpleGame(Œ≥, ‚Ñê, Œ†, œÄ -> utility(ùí´, b, œÄ))
    œÄ = solve(NashEquilibrium(), ùí¢)
    return Tuple(argmax(œÄi.p) for œÄi in œÄ)
end

function prune_dominated!(Œ†, ùí´::POMG)
    done = false
    while !done
        done = true
        for i in shuffle(ùí´.‚Ñê)
            for œÄi in shuffle(Œ†[i])
                if length(Œ†[i]) > 1 && is_dominated(ùí´, Œ†, i, œÄi)
                    filter!(œÄi‚Ä≤ -> œÄi‚Ä≤ ‚â† œÄi, Œ†[i])
                    done = false
                    break
                end
            end
        end
    end
end

function is_dominated(ùí´::POMG, Œ†, i, œÄi)
    ‚Ñê, ùíÆ = ùí´.‚Ñê, ùí´.ùíÆ
    jointŒ†noti = joint([Œ†[j] for j in ‚Ñê if j ‚â† i])
    œÄ(œÄi‚Ä≤, œÄnoti) = [j==i ? œÄi‚Ä≤ : œÄnoti[j>i ? j-1 : j] for j in ‚Ñê]
    Ui = Dict((œÄi‚Ä≤, œÄnoti, s) => evaluate_plan(ùí´, œÄ(œÄi‚Ä≤, œÄnoti), s)[i]
              for œÄi‚Ä≤ in Œ†[i], œÄnoti in jointŒ†noti, s in ùíÆ)
    model = Model(Ipopt.Optimizer)
    @variable(model, Œ¥)
    @variable(model, b[jointŒ†noti, ùíÆ] ‚â• 0)
    @objective(model, Max, Œ¥)
    @constraint(model, [œÄi‚Ä≤=Œ†[i]],
        sum(b[œÄnoti, s] * (Ui[œÄi‚Ä≤, œÄnoti, s] - Ui[œÄi, œÄnoti, s])
        for œÄnoti in jointŒ†noti for s in ùíÆ) ‚â• Œ¥)
    @constraint(model, sum(b) == 1)
    optimize!(model)
    return value(Œ¥) ‚â• 0
end
####################

#################### collaborative_agents 1
# A decentralized POMDP. Same fields as POMG, but the reward function returns
# a single value; the solvers in this file copy that value once per agent
# when they convert the problem to a POMG.
struct DecPOMDP
    Œ≥  # discount factor
    ‚Ñê  # agents
    ùíÆ  # state space
    ùíú  # joint action space
    ùí™  # joint observation space
    T  # transition function
    O  # joint observation function
    R  # reward function
end
####################

#################### collaborative_agents 2
# Settings for Dec-POMDP dynamic programming; both fields are forwarded
# unchanged to POMGDynamicProgramming by the matching `solve` method.
struct DecPOMDPDynamicProgramming
    b   # initial belief
    d   # depth of conditional plans
end

# Solve a Dec-POMDP by converting it to a POMG in which every agent receives
# the same shared reward, then running POMG dynamic programming.
function solve(M::DecPOMDPDynamicProgramming, ùí´::DecPOMDP)
    ‚Ñê, R = ùí´.‚Ñê, ùí´.R
    # One copy of the shared reward per agent.
    shared_reward(s, a) = [R(s, a) for i in ‚Ñê]
    game = POMG(ùí´.Œ≥, ‚Ñê, ùí´.ùíÆ, ùí´.ùíú, ùí´.ùí™, ùí´.T, ùí´.O, shared_reward)
    return solve(POMGDynamicProgramming(M.b, M.d), game)
end
####################

#################### collaborative_agents 3
# Settings for Dec-POMDP iterated best response: the belief used to score
# joint plans, the plan depth, and the number of rounds over the agents.
struct DecPOMDPIteratedBestResponse
    b     # initial belief
    d     # depth of conditional plans
    k_max # number of iterations
end

# Iterated best response for a Dec-POMDP.
#
# Starts from a random conditional plan per agent, then for `M.k_max` sweeps
# visits the agents in shuffled order and replaces each agent's plan with the
# one that maximizes that agent's utility at belief `M.b` while the other
# agents' plans are held fixed. Returns the final joint plan as a tuple.
function solve(M::DecPOMDPIteratedBestResponse, ùí´::DecPOMDP)
    agents, reward = ùí´.‚Ñê, ùí´.R
    # POMG view of the problem in which every agent receives the same reward.
    shared_reward = (s, a) -> [reward(s, a) for _ in agents]
    game = POMG(ùí´.Œ≥, agents, ùí´.ùíÆ, ùí´.ùíú, ùí´.ùí™, ùí´.T, ùí´.O, shared_reward)
    plans = create_conditional_plans(ùí´, M.d)
    current = [rand(plans[i]) for i in agents]
    for _ in 1:M.k_max
        for i in shuffle(agents)
            current[i] = argmax(plans[i]) do candidate
                # Joint plan with agent i's entry swapped for the candidate.
                profile = Tuple(j == i ? candidate : current[j] for j in agents)
                utility(game, M.b, profile)[i]
            end
        end
    end
    return Tuple(current)
end
####################

#################### collaborative_agents 4
# Settings for the heuristic-search Dec-POMDP solver. In the matching `solve`
# method the last field bounds how many joint policies are selected (and hence
# how many plans per agent are kept) at each depth step; the initial belief is
# used for the final selection and as the starting point of `explore`.
struct DecPOMDPHeuristicSearch
    b     # initial belief
    d     # depth of conditional plans
    œÄ_max # number of policies
end

# Heuristic search for a Dec-POMDP.
#
# Starting from one-step plans (one per individual action), repeats `M.d`
# times: expand every agent's plans by one step, then select up to the policy
# budget of joint plans, each being the best joint plan (by the first entry of
# `utility`) at a belief sampled by `explore`. The selected per-agent plans are
# kept and removed from the candidate pool. Finally returns the best joint plan
# over the kept plans at the initial belief `M.b`.
#
# If the candidate pool of any agent runs out before the budget is reached, the
# selection for that depth stops early instead of calling `argmax` on an empty
# joint set (which throws).
function solve(M::DecPOMDPHeuristicSearch, ùí´::DecPOMDP)
    agents, actions, reward = ùí´.‚Ñê, ùí´.ùíú, ùí´.R
    b, d, budget = M.b, M.d, M.œÄ_max
    # POMG view of the problem in which every agent receives the same reward.
    shared_reward = (s, a) -> [reward(s, a) for _ in agents]
    game = POMG(ùí´.Œ≥, agents, ùí´.ùíÆ, actions, ùí´.ùí™, ùí´.T, ùí´.O, shared_reward)
    plans = [[ConditionalPlan(ai) for ai in actions[i]] for i in agents]
    for t in 1:d
        candidates = expand_conditional_plans(ùí´, plans)
        plans = [[] for _ in agents]
        for _ in 1:budget
            # Each selection removes one candidate per agent, so the pool can
            # be exhausted before the budget is spent.
            any(isempty, candidates) && break
            belief = explore(M, ùí´, t)
            best = argmax(joint(candidates)) do joint_plan
                first(utility(game, belief, joint_plan))
            end
            for i in agents
                push!(plans[i], best[i])
                filter!(plan -> plan != best[i], candidates[i])
            end
        end
    end
    return argmax(joint_plan -> first(utility(game, b, joint_plan)), joint(plans))
end

# Sample a belief reached after `t` random steps from the initial belief `M.b`.
#
# A state is drawn from the initial belief. Each step draws a uniformly random
# action per agent, samples a successor state and a joint observation, and
# applies the belief update
#     b_next[s_next] = O(a, s_next, o) * sum over s of T(s, a, s_next) * b[s]
# followed by 1-norm normalization. Returns the belief after the last step;
# for `t <= 0` that is an unmodified copy of `M.b`.
#
# Each step writes into a freshly allocated vector. Reusing one output buffer
# that also becomes the input after the first step would make later steps read
# entries that were already overwritten.
function explore(M::DecPOMDPHeuristicSearch, ùí´::DecPOMDP, t)
    states, actions, T, O = ùí´.ùíÆ, ùí´.ùíú, ùí´.T, ùí´.O
    joint_obs = joint(ùí´.ùí™)
    b = copy(M.b)
    s = rand(SetCategorical(states, b))
    for _ in 1:t
        a = Tuple(rand(agent_actions) for agent_actions in actions)
        s_next = rand(SetCategorical(states, [T(s, a, sp) for sp in states]))
        o = rand(SetCategorical(joint_obs, [O(a, s_next, z) for z in joint_obs]))
        b_next = similar(b)
        for (j, sp) in enumerate(states)
            po = O(a, sp, o)
            b_next[j] = po * sum(T(si, a, sp) * b[i] for (i, si) in enumerate(states))
        end
        normalize!(b_next, 1)
        b, s = b_next, s_next
    end
    return b
end
####################

#################### collaborative_agents 5
# Settings for the nonlinear-programming Dec-POMDP solver. The matching `solve`
# method gives every agent the node set 1 through the second field.
struct DecPOMDPNonlinearProgramming
    b # initial belief
    ‚Ñì # number of nodes for each agent
end

# Convert a Dec-POMDP into index ranges and dense arrays.
#
# Returns, in order: indices of the agents, indices of the states, a vector of
# per-agent action indices, a vector of per-agent observation indices, the
# reward array over (state, joint action), the transition array over
# (state, joint action, next state), and the observation array over
# (joint action, next state, joint observation).
function tensorform(ùí´::DecPOMDP)
    states, actions, observations = ùí´.ùíÆ, ùí´.ùíú, ùí´.ùí™
    reward, transition, observe = ùí´.R, ùí´.T, ùí´.O
    joint_actions = joint(actions)
    joint_observations = joint(observations)

    agent_ids = eachindex(ùí´.‚Ñê)
    state_ids = eachindex(states)
    action_ids = [eachindex(agent_actions) for agent_actions in actions]
    observation_ids = [eachindex(agent_obs) for agent_obs in observations]

    reward_table = [reward(s, a) for s in states, a in joint_actions]
    transition_table = [transition(s, a, sp)
                        for s in states, a in joint_actions, sp in states]
    observation_table = [observe(a, sp, o)
                         for a in joint_actions, sp in states, o in joint_observations]
    return (agent_ids, state_ids, action_ids, observation_ids,
            reward_table, transition_table, observation_table)
end

# Solve a Dec-POMDP for fixed-size controllers with a nonlinear program.
#
# Builds a JuMP model (Ipopt optimizer) with three families of variables: a
# utility per (joint node, state), nonnegative action-selection weights per
# (agent, node, action), and nonnegative node-transition weights per
# (agent, node, action, observation, next node). The objective maximizes the
# dot product of the initial belief `M.b` with the utilities of the first joint
# node. After optimizing, returns one `ControllerPolicy` per agent built from
# the solved weights.
# NOTE(review): the solver's termination status is not inspected before the
# values are read.
function solve(M::DecPOMDPNonlinearProgramming, ùí´::DecPOMDP)
    # The first element of this tuple assignment rebinds the problem to itself
    # (a no-op); the other two read the discount factor and initial belief.
    ùí´, Œ≥, b = ùí´, ùí´.Œ≥, M.b
    # From here on the names on the left are the index ranges and dense arrays
    # produced by `tensorform`, not the original spaces and functions; the
    # original actions/observations are still reached through the problem
    # object when the result is built.
    ‚Ñê, ùíÆ, ùíú, ùí™, R, T, O = tensorform(ùí´)
    # Node indices for each agent.
    X = [collect(1:M.‚Ñì) for i in ‚Ñê]
    jointX, jointùíú, jointùí™ = joint(X), joint(ùíú), joint(ùí™)
    # First joint node; the objective is evaluated there.
    x1 = jointX[1]
    model = Model(Ipopt.Optimizer)
    @variable(model, U[jointX,ùíÆ])
    @variable(model, œà[i=‚Ñê,X[i],ùíú[i]] ‚â• 0)
    @variable(model, Œ∑[i=‚Ñê,X[i],ùíú[i],ùí™[i],X[i]] ‚â• 0)
    @objective(model, Max, b‚ãÖU[x1,:])
    # Defines the utility of each (joint node, state): a sum over joint actions
    # (weighted by the product of per-agent action weights) of the immediate
    # reward plus the discounted sum over next states, joint observations and
    # next joint nodes (weighted by the product of per-agent node-transition
    # weights) of the successor utility. `y` and `z` are the positions of the
    # joint action and joint observation in their enumerations and index the
    # dense arrays.
    @NLconstraint(model, [x=jointX,s=ùíÆ],
        U[x,s] == (sum(prod(œà[i,x[i],a[i]] for i in ‚Ñê)
                   *(R[s,y] + Œ≥*sum(T[s,y,s‚Ä≤]*sum(O[y,s‚Ä≤,z]
                       *sum(prod(Œ∑[i,x[i],a[i],o[i],x‚Ä≤[i]] for i in ‚Ñê)
                               *U[x‚Ä≤,s‚Ä≤] for x‚Ä≤ in jointX)
                       for (z, o) in enumerate(jointùí™)) for s‚Ä≤ in ùíÆ))
                   for (y, a) in enumerate(jointùíú))))
    # Action weights of every (agent, node) sum to one.
    @constraint(model, [i=‚Ñê,xi=X[i]],
                sum(œà[i,xi,ai] for ai in ùíú[i]) == 1)
    # Next-node weights of every (agent, node, action, observation) sum to one.
    @constraint(model, [i=‚Ñê,xi=X[i],ai=ùíú[i],oi=ùí™[i]],
                sum(Œ∑[i,xi,ai,oi,xi‚Ä≤] for xi‚Ä≤ in X[i]) == 1)
    optimize!(model)
    œà‚Ä≤, Œ∑‚Ä≤ = value.(œà), value.(Œ∑)
    # Key the solved weights by the original actions and observations rather
    # than by their indices.
    return [ControllerPolicy(ùí´, X[i],
            Dict((xi,ùí´.ùíú[i][ai]) => œà‚Ä≤[i,xi,ai]
                 for xi in X[i], ai in ùíú[i]),
            Dict((xi,ùí´.ùíú[i][ai],ùí´.ùí™[i][oi],xi‚Ä≤) => Œ∑‚Ä≤[i,xi,ai,oi,xi‚Ä≤]
                 for xi in X[i], ai in ùíú[i], oi in ùí™[i], xi‚Ä≤ in X[i]))
        for i in ‚Ñê]
end
####################

#################### search 1
# Deterministic search problem. All fields are untyped.
# As used by the search functions in this file: the valid-action function is
# called with a state and returns a collection of actions, the transition
# function is called as T(s, a) and returns the single successor state, and the
# reward function is called as R(s, a) and its result is added to utilities.
struct Search
    ùíÆ  # state space
    ùíú  # valid action function
    T  # transition function
    R  # reward function
end
####################

#################### search 2
# Exhaustive depth-limited search from state `s`.
#
# Returns a named tuple `(a, u)`: the best first action and the best total
# reward found within `d` steps. When `s` has no valid actions or `d` is not
# positive, the result is `(a=nothing, u=U(s))`.
function forward_search(ùí´::Search, s, d, U)
    actions = ùí´.ùíú(s)
    if isempty(actions) || d <= 0
        return (a=nothing, u=U(s))
    end
    transition, reward = ùí´.T, ùí´.R
    best_action, best_u = nothing, -Inf
    for action in actions
        successor = transition(s, action)
        q = reward(s, action) + forward_search(ùí´, successor, d - 1, U).u
        # Strict comparison: the first action reaching the maximum is kept.
        if q > best_u
            best_action, best_u = action, q
        end
    end
    return (a=best_action, u=best_u)
end
####################

#################### search 3
# Depth-limited search with pruning.
#
# `Ulo(s)` is used as the value at leaves and `Qhi(s, a)` as an upper bound on
# the value of taking `a` in `s`. Actions are tried in decreasing order of that
# bound; once the bound of the next action does not exceed the best value found
# so far, the remaining actions are skipped. Returns a named tuple `(a, u)`.
function branch_and_bound(ùí´::Search, s, d, Ulo, Qhi)
    actions = ùí´.ùíú(s)
    if isempty(actions) || d <= 0
        return (a=nothing, u=Ulo(s))
    end
    transition, reward = ùí´.T, ùí´.R
    upper_bound = action -> Qhi(s, action)
    best_action, best_u = nothing, -Inf
    for action in sort(actions; by=upper_bound, rev=true)
        # Safe to prune: later actions have bounds that are no larger.
        Qhi(s, action) <= best_u && break
        q = reward(s, action) +
            branch_and_bound(ùí´, transition(s, action), d - 1, Ulo, Qhi).u
        if q > best_u
            best_action, best_u = action, q
        end
    end
    return (a=best_action, u=best_u)
end
####################

#################### search 4
# Depth-limited search with memoization.
#
# Results are cached in `M` under the key `(d, s)` and reused on later visits.
# Returns a named tuple `(a, u)`. At a leaf (no valid actions or non-positive
# `d`) the result is `(a=nothing, u=U(s))`; otherwise `a` defaults to the first
# valid action and is replaced whenever a strictly better value is found.
function dynamic_programming(ùí´::Search, s, d, U, M=Dict())
    key = (d, s)
    haskey(M, key) && return M[key]
    actions = ùí´.ùíú(s)
    transition, reward = ùí´.T, ùí´.R
    if isempty(actions) || d <= 0
        result = (a=nothing, u=U(s))
    else
        chosen, chosen_u = first(actions), -Inf
        for action in actions
            successor = transition(s, action)
            q = reward(s, action) +
                dynamic_programming(ùí´, successor, d - 1, U, M).u
            if q > chosen_u
                chosen, chosen_u = action, q
            end
        end
        result = (a=chosen, u=chosen_u)
    end
    M[key] = result
    return result
end
####################

#################### search 5
# Depth-limited search with memoization and bound-based pruning.
#
# Results are cached in `M` under the key `(d, s)`. Actions are tried in
# decreasing order of `R(s, a) + Uhi(T(s, a))`; once that quantity for the next
# action does not exceed the best value found so far, the remaining actions are
# skipped. `U(s)` is the value used at leaves. Returns a named tuple `(a, u)`.
function heuristic_search(ùí´::Search, s, d, Uhi, U, M)
    key = (d, s)
    haskey(M, key) && return M[key]
    actions = ùí´.ùíú(s)
    transition, reward = ùí´.T, ùí´.R
    if isempty(actions) || d <= 0
        result = (a=nothing, u=U(s))
    else
        bound = action -> reward(s, action) + Uhi(transition(s, action))
        chosen, chosen_u = first(actions), -Inf
        for action in sort(actions; by=bound, rev=true)
            # Later actions have bounds that are no larger, so stop here.
            bound(action) <= chosen_u && break
            successor = transition(s, action)
            q = reward(s, action) +
                heuristic_search(ùí´, successor, d - 1, Uhi, U, M).u
            if q > chosen_u
                chosen, chosen_u = action, q
            end
        end
        result = (a=chosen, u=chosen_u)
    end
    M[key] = result
    return result
end
####################