In [2]:
from rl.monte_carlo import mc_control
from typing import Iterable, Iterator, Tuple, TypeVar, Callable, Optional, Dict,Mapping,Sequence

from rl.distribution import Distribution
from rl.function_approx import FunctionApprox
import rl.markov_process as mp
import rl.markov_decision_process as markov_decision_process
from rl.markov_decision_process import (MarkovDecisionProcess)

from rl.markov_decision_process import Policy, FinitePolicy, TransitionStep
from rl.distribution import (Bernoulli, Constant, Categorical, Choose,
                             Distribution, FiniteDistribution, Gaussian, SampledDistribution)
from IPython.display import clear_output
from pprint import pprint
import numpy as np
from operator import itemgetter
import rl.iterate as iterate



## Import AssetAllocDiscrete from rl.chapter7.asset_alloc_discrete

In [3]:
from rl.chapter7.asset_alloc_discrete import AssetAllocDiscrete
from rl.function_approx import DNNSpec, AdamGradient, DNNApprox

steps: int = 4
μ: float = 0.13
σ: float = 0.2
r: float = 0.07
a: float = 1.0
init_wealth: float = 1.0
init_wealth_var: float = 0.1
excess: float = μ - r
var: float = σ * σ
base_alloc: float = excess / (a * var)
risky_ret: Sequence[Gaussian] = [Gaussian(μ=μ, σ=σ) for _ in range(steps)]
riskless_ret: Sequence[float] = [r for _ in range(steps)]
utility_function: Callable[[float], float] = lambda x: - np.exp(-a * x) / a
alloc_choices: Sequence[float] = np.linspace(
    2 / 3 * base_alloc,
    4 / 3 * base_alloc,
    11
)
feature_funcs: Sequence[Callable[[Tuple[float, float]], float]] = \
    [
        lambda _: 1.,
        lambda w_x: w_x[0],
        lambda w_x: w_x[1],
        lambda w_x: w_x[1] * w_x[1]
    ]
dnn: DNNSpec = DNNSpec(
    neurons=[],
    bias=False,
    hidden_activation=lambda x: x,
    hidden_activation_deriv=lambda y: np.ones_like(y),
    output_activation=lambda x: - np.sign(a) * np.exp(-x),
    output_activation_deriv=lambda y: -y
)
init_wealth_distr: Gaussian = Gaussian(μ=init_wealth, σ=init_wealth_var)
aad: AssetAllocDiscrete = AssetAllocDiscrete(
    risky_return_distributions=risky_ret,
    riskless_returns=riskless_ret,
    utility_func=utility_function,
    risky_alloc_choices=alloc_choices,
    feature_functions=feature_funcs,
    dnn_spec=dnn,
    initial_wealth_distribution=init_wealth_distr
)


## Define Function SARSA for the function approximation SARSA

In [20]:
S = TypeVar('S')
A = TypeVar('A')

def policy_from_q(
        q: FunctionApprox[Tuple[S, A]],
        actions: Iterable[A],
        ϵ: float = 0.0
) -> Policy[S, A]:

    explore = Bernoulli(ϵ)

    class QPolicy(Policy[S, A]):
        def act(self, s: S) -> Optional[Distribution[A]]:
            #terminal state?

            if explore.sample():
                return Choose(set(actions))

            ind = np.argmax(q.evaluate([(s, a) for a in actions]))
            return Constant(actions[ind])

    return QPolicy()


def SARSA(
        simulator: Callable[[S,A,int], TransitionStep[S, A]],
        state_distribution: Distribution[S],
        n_steps: int,      # max number of steps in an episode
        q_0: Sequence[FunctionApprox[Tuple[S, A]]], # a sequence of length n_steps
        actions: Sequence[A],  # Assume same action space at each time step, as in AssetAllocDiscrete
        gamma: float = 1.0,
) -> Iterator[Sequence[FunctionApprox[Tuple[S, A]]]]:

    # initialize the q and p for each time step
    q = q_0
    p = []


    trace_count = 0
    while True:
        trace_count += 1
        epsilon = 1/trace_count

        state = state_distribution.sample()      # initialize S
        p = policy_from_q(q[0],actions,epsilon)
        action = p.act(state).sample()

        # update q_t,p_t given the (state,action),return pair.
        for step_count in range(n_steps-1):
            next_state,reward = simulator(state,action,step_count)
            p = policy_from_q(q[step_count],actions,epsilon)
            next_action = p.act(next_state).sample()
            #update Q
            q[step_count] = q[step_count].update(
                [((state, action), reward + gamma*q[step_count+1].evaluate([(next_state,next_action)])[0])])
            state = next_state
            action = next_action

        q[n_steps-1] = q[n_steps-1].update([((state, action), reward)])
        yield q


n_steps = steps
states = init_wealth_distr
actions = alloc_choices

# Here define the simulator to simulate  Reward steps
def AssetAlloc_simulator(state,action,step_count, mdp = aad):

    next_distribution = mdp.get_mdp(step_count).step(state, action)
    if next_distribution is None:
        return
    next_state, reward = next_distribution.sample()
    return next_state,reward

# Here initialize the Q with DNN
adam_gradient: AdamGradient = AdamGradient(
          learning_rate=0.01,
          decay1=0.9,
          decay2=0.999
      )

f0 = DNNApprox.create(
            feature_functions=feature_funcs,
            dnn_spec=dnn,
            adam_gradient=adam_gradient
        )

q_0 = [f0]*steps

x = SARSA(
        simulator = AssetAlloc_simulator,
        state_distribution = states,
        n_steps = n_steps,      # max number of steps in an episode
        q_0 = q_0, # a sequence of length n_steps
        actions = actions  # Assume same action space at each time step, as in AssetAllocDiscrete
)


## Run Simulation for Discrete Asset Allocation

In [21]:
import itertools
*_,q_final = itertools.islice(x,10000)



for t, q in enumerate(q_final):
    print(f"Time {t:d}")
    print()
    opt_alloc: float = max(
        ((q.evaluate([(init_wealth, ac)])[0], ac) for ac in alloc_choices),
        key=itemgetter(0)
    )[1]
    val: float = max(q.evaluate([(init_wealth, ac)])[0]
                     for ac in alloc_choices)
    print(f"Opt Risky Allocation = {opt_alloc:.3f}, Opt Val = {val:.3f}")
    print("Optimal Weights below:")
    for wts in q.weights:
        pprint(wts.weights)
    print()

Time 0

Opt Risky Allocation = 2.000, Opt Val = -0.000
Optimal Weights below:
array([[0.92668216, 0.80368065, 1.19502037, 1.46901892]])

Time 1

Opt Risky Allocation = 2.000, Opt Val = -0.000
Optimal Weights below:
array([[0.83521631, 0.60700166, 1.14078539, 1.56938303]])

Time 2

Opt Risky Allocation = 2.000, Opt Val = -0.000
Optimal Weights below:
array([[1.58797381, 1.07730445, 1.45525804, 1.12153839]])

Time 3

Opt Risky Allocation = 2.000, Opt Val = -0.000
Optimal Weights below:
array([[1.40528257, 0.9757113 , 1.47585994, 1.40533774]])



Like in MC control there are some unknown problems here. But I think the overall framework should be reasonable...

