In [69]:
from rl.monte_carlo import mc_control
from typing import Iterable, Iterator, Tuple, TypeVar, Callable, Optional, Dict,Mapping,Sequence

from rl.distribution import Distribution
from rl.function_approx import FunctionApprox
import rl.markov_process as mp
import rl.markov_decision_process as markov_decision_process
from rl.markov_decision_process import (MarkovDecisionProcess)

from rl.markov_decision_process import Policy, FinitePolicy, TransitionStep
from rl.distribution import (Bernoulli, Constant, Categorical, Choose,
                             Distribution, FiniteDistribution, Gaussian, SampledDistribution)
from IPython.display import clear_output
from pprint import pprint
import numpy as np
from operator import itemgetter
import rl.iterate as iterate

## Import AssetAllocDiscrete from rl.chapter7.asset_alloc_discrete

In [62]:
from rl.chapter7.asset_alloc_discrete import AssetAllocDiscrete
from rl.function_approx import DNNSpec, AdamGradient, DNNApprox

# ---- Problem parameters -------------------------------------------------
steps: int = 4                # number of time steps
μ: float = 0.13               # mean given to each risky-return Gaussian
σ: float = 0.2                # std-dev given to each risky-return Gaussian
r: float = 0.07               # riskless return applied at every step
a: float = 1.0                # coefficient of the exponential utility below
init_wealth: float = 1.0
init_wealth_var: float = 0.1  # NOTE: passed as σ of the initial-wealth Gaussian below
excess: float = μ - r
var: float = σ * σ
base_alloc: float = excess / (a * var)

# One return model per time step.
risky_ret: Sequence[Gaussian] = [Gaussian(μ=μ, σ=σ) for _step in range(steps)]
riskless_ret: Sequence[float] = [r] * steps


def utility_function(x: float) -> float:
    '''Exponential utility -exp(-a*x)/a, using the module-level `a`.'''
    return - np.exp(-a * x) / a


# 11 evenly spaced risky allocations between 2/3 and 4/3 of base_alloc.
alloc_choices: Sequence[float] = np.linspace(
    start=2 / 3 * base_alloc,
    stop=4 / 3 * base_alloc,
    num=11
)

# Features of a (wealth, allocation) pair: constant, wealth, allocation,
# allocation squared.
feature_funcs: Sequence[Callable[[Tuple[float, float]], float]] = [
    lambda _pair: 1.,
    lambda pair: pair[0],
    lambda pair: pair[1],
    lambda pair: pair[1] * pair[1],
]

# No hidden layers and no bias; the output layer maps z to -sign(a)*exp(-z).
dnn: DNNSpec = DNNSpec(
    neurons=[],
    bias=False,
    hidden_activation=lambda z: z,
    hidden_activation_deriv=lambda out: np.ones_like(out),
    output_activation=lambda z: - np.sign(a) * np.exp(-z),
    output_activation_deriv=lambda out: -out
)

init_wealth_distr: Gaussian = Gaussian(μ=init_wealth, σ=init_wealth_var)

aad: AssetAllocDiscrete = AssetAllocDiscrete(
    risky_return_distributions=risky_ret,
    riskless_returns=riskless_ret,
    utility_func=utility_function,
    risky_alloc_choices=alloc_choices,
    feature_functions=feature_funcs,
    dnn_spec=dnn,
    initial_wealth_distribution=init_wealth_distr
)



In [90]:
# Backward induction yields one Q-value function approximation per time step.
it_qvf: Iterator[DNNApprox[Tuple[float, float]]] = \
        aad.backward_induction_qvf()
print("Backward Induction on Q-Value Function")
print("--------------------------------------")
print()
for t, q in enumerate(it_qvf):
    print(f"Time {t:d}")
    print()
    # Score every allocation choice at the reference wealth exactly once and
    # keep the best (value, allocation) pair. max() returns the first maximal
    # pair, so ties still resolve to the earliest choice.
    val, opt_alloc = max(
        ((q.evaluate([(init_wealth, ac)])[0], ac) for ac in alloc_choices),
        key=itemgetter(0)
    )
    print(f"Opt Risky Allocation = {opt_alloc:.3f}, Opt Val = {val:.3f}")
    print("Optimal Weights below:")
    for wts in q.weights:
        pprint(wts.weights)
    print()

Backward Induction on Q-Value Function
--------------------------------------

Time 0

Opt Risky Allocation = 1.200, Opt Val = -0.225
Optimal Weights below:
array([[ 0.1372123 ,  1.30858048,  0.0749945 , -0.03075826]])

Time 1

Opt Risky Allocation = 1.300, Opt Val = -0.257
Optimal Weights below:
array([[ 0.09134505,  1.22497502,  0.06730742, -0.0255609 ]])

Time 2

Opt Risky Allocation = 1.400, Opt Val = -0.291
Optimal Weights below:
array([[ 0.04540807,  1.14462205,  0.06450261, -0.02289097]])

Time 3

Opt Risky Allocation = 1.500, Opt Val = -0.328
Optimal Weights below:
array([[-0.0033298 ,  1.07023387,  0.06499304, -0.02184007]])



## Define mc_control for the MC control algorithm on a finite horizon, plus the helper functions policy_from_q and returns

In [91]:
## use a modified version of rl.monte_carlo.mc_control
# Generic placeholders used in the annotations below: S for states, A for actions.
S = TypeVar('S')
A = TypeVar('A')

def policy_from_q(
        q: FunctionApprox[Tuple[S, A]],
        actions: Iterable[A],
        ϵ: float = 0.0
) -> Policy[S, A]:
    '''Return an ϵ-greedy policy derived from the given Q function.

    Arguments:
      q -- approximation of the Q function, evaluated on (state, action) pairs
      actions -- the candidate actions (the same set is used for every state)
      ϵ -- the fraction of the actions where we explore (pick uniformly
      among `actions`) rather than following the greedy policy

    Returns a policy whose `act` returns either a uniform choice over
    `actions` (when exploring) or a constant distribution on the action
    with the highest Q-value for the given state.

    Raises ValueError if `actions` is empty.

    '''
    # `actions` is only promised to be an Iterable, but the policy needs to
    # iterate over it on every call and index into it, so materialise it once.
    # This makes generators, sets etc. safe to pass.
    action_list = list(actions)
    if not action_list:
        raise ValueError("policy_from_q requires at least one action")
    action_set = set(action_list)

    explore = Bernoulli(ϵ)

    class QPolicy(Policy[S, A]):
        def act(self, s: S) -> Optional[Distribution[A]]:
            # NOTE: terminal states are not detected here; a distribution is
            # always returned.
            if explore.sample():
                return Choose(action_set)

            # Greedy branch: pick the action with the highest Q-value at s
            # (np.argmax returns the first index on ties).
            ind = np.argmax(q.evaluate([(s, a) for a in action_list]))
            return Constant(action_list[ind])

    return QPolicy()


def returns(trace, γ):
    '''Given the transition steps of one finite episode, attach to each
    step its discounted return from that step to the end of the episode.

    Arguments:
    trace -- finite iterable of transition steps in time order; each step
             must provide `add_return(γ, future_return)` returning an
             object that exposes the resulting `return_`
    γ -- the discount factor (0 < γ ≤ 1)

    Returns an iterator over the return-augmented steps in the original
    (time) order. An empty trace yields an empty iterator.

    '''
    transitions = list(trace)

    # Walk the episode backwards so each step can reuse the return already
    # computed for its successor; the last step has no future return.
    backward = []
    future_return = 0
    for transition in reversed(transitions):
        return_step = transition.add_return(γ, future_return)
        future_return = return_step.return_
        backward.append(return_step)

    # `backward` is last-to-first; hand it back first-to-last.
    return reversed(backward)


def mc_control(
        simulator: Callable[[Distribution[S],Sequence[Policy[S,A]]], Iterator[TransitionStep[S, A]]],
        states: Distribution[S],
        n_steps: int,      # max number of steps in an episode
        q_0: Sequence[FunctionApprox[Tuple[S, A]]], # a sequence of length n_steps
        actions: Sequence[A],  # Assume same action space at each time step, as in AssetAllocDiscrete
        gamma: float = 1.0,
) -> Iterator[Sequence[FunctionApprox[Tuple[S, A]]]]:
    '''Finite-horizon Monte Carlo control with one Q approximation and one
    ϵ-greedy policy per time step.

    Arguments:
      simulator -- called as simulator(states, policies); must return the
                   transition steps of one finite episode
      states -- distribution of start states, passed through to simulator
      n_steps -- max number of steps in an episode (not consulted here;
                 the episode length is whatever the simulator produces)
      q_0 -- initial Q approximation for each time step; it needs at least
             as many entries as the longest episode has steps
      actions -- the action choices, shared by all time steps
      gamma -- discount factor used when computing returns

    Yields, after every simulated episode, a new list holding the current
    Q approximation for each time step. This generator never terminates on
    its own; the caller decides how many episodes to draw.

    '''
    # Work on a private copy so the caller's q_0 list is never modified.
    q = list(q_0)
    # Start fully exploratory (ϵ = 1) at every time step.
    p = [policy_from_q(q_t, actions, 1.0) for q_t in q]

    trace_count = 0
    while True:
        trace_count += 1
        # Exploration decays as 1 / (episode number).
        epsilon = 1 / trace_count

        # `returns` consumes the whole episode before any update happens,
        # so the episode is generated entirely under the current policies.
        trace: Iterable[TransitionStep[S, A]] = simulator(states, p)
        return_trace = returns(trace, gamma)

        # Step t of the episode updates the Q approximation and the policy
        # of time step t with its (state, action) -> return sample.
        for t, step in enumerate(return_trace):
            q[t] = q[t].update(
                [((step.state, step.action), step.return_)])
            p[t] = policy_from_q(q[t], actions, epsilon)

        # Yield a snapshot: results collected by the caller must not change
        # retroactively when later episodes update `q`.
        yield list(q)


# Episode length, start-state distribution and action choices used by the
# simulator and the mc_control call further down.
n_steps = steps
states = init_wealth_distr
actions = alloc_choices

# Here define the simulator to simulate the trace of Reward steps
def AssetAlloc_simulator(start_states, policy, n=n_steps, mdp=aad):
    '''Simulate one episode of at most `n` steps and yield its transitions.

    Arguments:
      start_states -- distribution the initial state is sampled from
      policy -- sequence of per-time-step policies; policy[t].act(state)
                chooses the action at step t
      n -- maximum number of steps to simulate
      mdp -- object whose get_mdp(t) gives the process used at step t

    Yields TransitionStep(state, action, next_state, reward) for each
    simulated step. The episode ends early if the policy or the process
    returns None for the current state.

    '''
    state = start_states.sample()

    # Each time step uses its own policy and its own MDP.
    for t in range(n):
        action_distribution = policy[t].act(state)
        if action_distribution is None:
            return
        action = action_distribution.sample()

        next_distribution = mdp.get_mdp(t).step(state, action)
        if next_distribution is None:
            return
        next_state, reward = next_distribution.sample()

        yield TransitionStep(state, action, next_state, reward)
        state = next_state


import itertools

# Number of simulated episodes to run MC control for.
num_episodes = 50000

# Here initialize the Q with DNN
adam_gradient: AdamGradient = AdamGradient(
    learning_rate=0.0001,
    decay1=0.9,
    decay2=0.999
)

f0 = DNNApprox.create(
    feature_functions=feature_funcs,
    dnn_spec=dnn,
    adam_gradient=adam_gradient
)

# Every time step starts from the same initial approximation.
q_0 = [f0] * steps

# Run the MC control algorithm; x iterates over the per-time-step Q
# approximations, one item per simulated episode.
x = mc_control(
    simulator=AssetAlloc_simulator,
    states=states,
    n_steps=n_steps,      # max number of steps in an episode
    q_0=q_0,              # a sequence of length n_steps
    actions=actions,      # Assume same action space at each time step, as in AssetAllocDiscrete
    gamma=1.0
)

# Keep only the result of the last episode instead of collecting every
# intermediate result into a throwaway list.
for q_final in itertools.islice(x, num_episodes):
    pass

## Print the optimal policy and values

In [92]:
# Report, for each time step, the best allocation at the reference wealth
# under the Q approximation learned by MC control.
for t, q in enumerate(q_final):
    print(f"Time {t:d}")
    print()
    # Score every allocation choice exactly once and keep the best
    # (value, allocation) pair. max() returns the first maximal pair, so
    # ties still resolve to the earliest choice.
    val, opt_alloc = max(
        ((q.evaluate([(init_wealth, ac)])[0], ac) for ac in alloc_choices),
        key=itemgetter(0)
    )
    print(f"Opt Risky Allocation = {opt_alloc:.3f}, Opt Val = {val:.3f}")
    print("Optimal Weights below:")
    for wts in q.weights:
        pprint(wts.weights)
    print()


Time 0

Opt Risky Allocation = 1.000, Opt Val = -0.236
Optimal Weights below:
array([[ 0.74646261,  0.67924272,  0.09229573, -0.07429557]])

Time 1

Opt Risky Allocation = 1.000, Opt Val = -0.256
Optimal Weights below:
array([[ 0.66592855,  0.80579313,  0.01925434, -0.12936472]])

Time 2

Opt Risky Allocation = 1.000, Opt Val = -0.286
Optimal Weights below:
array([[ 0.60633376,  0.90173984, -0.0473305 , -0.20874174]])

Time 3

Opt Risky Allocation = 1.000, Opt Val = -0.324
Optimal Weights below:
array([[ 0.54796225,  0.96893698, -0.10966852, -0.28004666]])



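However, the outcome of MC control is not consistent with the approximate DP (backward induction) results above. Not sure why yet. Possible causes to investigate: ϵ = 1/(episode number) decays quickly, so later episodes explore very little; and each time step's Q approximation receives only one single-sample gradient update per episode, with a small Adam learning rate (0.0001).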