# A5
https://github.com/joshkmartinez/RL-book/tree/master/A5

# 1

$$ \text{Goal: Maximize } g(S)=p\cdot \int_S^\infty (x-S) \cdot f(x)dx+h \cdot \int_{-\infty}^S (S-x)\cdot f(x)dx
\newline
g'(S)=-p \int_S^\infty f(x)dx+h + h\int_{-\infty}^S f(x)dx
\newline
\text{Let C be the CDF of }x
\newline
g'(S)=-p \cdot(1-C(S))+h\cdot C(S)
\\
p \cdot C(S^*)+h\cdot F(S^*) = p \\
F(S^*)=\frac{p}{p+h} \\
\therefore  S^* = F^{-1}(\frac{p}{p+h}) \\
$$
This problem would be the same to that of managing a portfolio of $p$ number of calls and $h$ number of puts, with the stike price being $K=S$

# 2

In [106]:
from dataclasses import dataclass
from typing import TypeVar, Iterable, Callable, Sequence, Tuple, List, Iterator
from rl.policy import DeterministicPolicy
from rl.function_approx import FunctionApprox, LinearFunctionApprox
from rl.approximate_dynamic_programming import back_opt_vf_and_policy, ValueFunctionApprox
from rl.distribution import SampledDistribution, Distribution
from rl.markov_decision_process import MarkovDecisionProcess, NonTerminal, State, Terminal
from rl.distribution import Gaussian

In [136]:
A = TypeVar('A')
S = TypeVar('S')

class OptionExecution:
    
    def __init__(
        self,
        time_steps: int,
        price_dynamics: Sequence[Callable[[float], Distribution[float]]],
        utility_f: Callable[[float], float],
        γ: float,
        f_approx: ValueFunctionApprox[float],
        strike_price: float,
        price_distribution: Distribution[float],
        is_call: bool # if put then False
    ):
        
        self.time_steps = time_steps
        self.price_dynamics = price_dynamics
        self.utility_f = utility_f
        self.γ = γ
        self.f_approx = f_approx
        self.price_distribution = price_distribution
        self.strike_price = strike_price
        self.is_call = is_call

    def state_distribution(self, t: int) -> SampledDistribution[NonTerminal[float]]:

        def sampler() -> NonTerminal[float]:
            price = float(self.price_distribution.sample())
            
            # sample
            for i in range(t):
                price = self.price_dynamics[i](price).sample()
                
            return NonTerminal(price)

        return SampledDistribution(sampler)
        

    def mdp(self, t: int) -> MarkovDecisionProcess[float, int]:
        utility_f = self.utility_f
        price_dynamics = self.price_dynamics
        steps = self.time_steps
        strike_price = self.strike_price
        is_call = self.is_call
        
        class BinTree(MarkovDecisionProcess[float, str]):
            def __init__(self):
                self.strike_price = strike_price
            
            def step(
                self,
                state: NonTerminal[float],
                action: bool
            ) -> Distribution[Tuple[State[float], float]]:
                    def sampler(
                            prev = state.state
                        ) -> Tuple[State[float], float]:
                            next_price = price_dynamics[t](prev).sample()
                            
                            if not action:
                                reward = float(utility_f(0))
                                next_state = NonTerminal(next_price)
                                
                            else:
                                if is_call:
                                    reward = float(strike_price - next_price)
                                else:
                                    reward = float(next_price - strike_price)
                                    
                                next_state = Terminal(next_price)
                            
                            return (next_state, reward)
                    
                    return SampledDistribution(
                            sampler = sampler,
                            expectation_samples = 50
                        )
                
            def actions(self, state: NonTerminal[S]) -> Iterable[A]:
                return [True, False] # Exercise or not

        return BinTree()

    
    def get_vf_pi(
        self
    ) -> Iterator[Tuple[ValueFunctionApprox[float],
                        DeterministicPolicy[float, bool]]]:
        
        return back_opt_vf_and_policy(
            mdp_f0_mu_triples = [(
                self.mdp(i),
                self.f_approx,
                self.state_distribution(i)
            ) for i in range(self.time_steps)],
            γ = self.γ,
            num_state_samples = 1000,
            error_tolerance = 1e-2 # reasonable value?
        )


In [148]:
time_steps = 6
mean = 103.0
OptionExecutionTest = OptionExecution(
    time_steps = time_steps,
    
    price_dynamics=[lambda p: Gaussian(
        μ = p,
        σ = 3.0
    ) for i in range(time_steps)],
    
    utility_f = lambda x: x,
    
    γ = 1,
    
    f_approx = LinearFunctionApprox.create(feature_functions=[
        lambda p_s: p_s.state,
    ]),
    
    strike_price = 99.0,
    
    price_distribution=Gaussian(
        # testing
        μ = mean,
        σ = 9.0
    ),
    
    is_call = True #False
)

for step, (v_f, p) in enumerate(OptionExecutionTest.get_vf_pi()):
    print("Time step " + str(step))
    print("Exercise? " + str(p.action_for(mean)))
    print("Value " + str(v_f(NonTerminal(mean))))
    print()

Time step 0
Exercise? False
Value 7.263467825733316

Time step 1
Exercise? False
Value 6.69429927624563

Time step 2
Exercise? False
Value 5.971647358099739

Time step 3
Exercise? False
Value 4.902376307865111

Time step 4
Exercise? False
Value 3.7287896934822187

Time step 5
Exercise? False
Value 1.9278270209978514



# 3

In [161]:
from rl.markov_process import MarkovProcess, NonTerminal, State
from rl.distribution import Distribution, Constant, Categorical, Gamma, Poisson, Gaussian, SampledDistribution
from functools import partial
from dataclasses import dataclass
from numpy.random import poisson
from typing import Sequence
import itertools
from rl.chapter9.order_book import OrderBook

In [207]:
S = OrderBook

class OrderBookDynamics(MarkovProcess[State[S]]):

    def transition(self, state: NonTerminal[S]) -> Distribution[NonTerminal[S]]:
        
        def sampler(state=state):
            
            s = copy.deepcopy(state.state)
            # LO buy
            for _ in range(poisson(3)):
                s = s.buy_limit_order(normal(state.state.bid_price(), 3), int(normal(20, 9)))[1]
            
            # LO sell
            for _ in range(poisson(2)):
                s = s.sell_limit_order(normal(state.state.ask_price(), 6), int(normal(21, 6)))[1]
            
            # MO buy
            for _ in range(poisson(2)):
                s = s.buy_market_order(int(normal(16, 3)))[1]
            
            # MO sell
            for _ in range(poisson(3)):
                s = s.sell_market_order(int(normal(12, 4)))[1]

            return NonTerminal(s)

        return SampledDistribution(sampler=sampler)

In [231]:
bids: PriceSizePairs = [DollarsAndShares(
    dollars = i,
    shares = poisson(100)) for i in range(100, 85, -1)]
asks: PriceSizePairs = [DollarsAndShares(
    dollars = i,
    shares = poisson(100)) for i in range(100, 135, 1)]

list(itertools.islice(OrderBookMP.simulate(Constant(NonTerminal(OrderBook(descending_bids=bids, ascending_asks=asks)))
), 500))[-1].state.pretty_print_order_book()


Bids
[DollarsAndShares(dollars=118.20534454614376, shares=17),
 DollarsAndShares(dollars=115.49351232423284, shares=12),
 DollarsAndShares(dollars=115.19251323975021, shares=17),
 DollarsAndShares(dollars=114.58784918989626, shares=21),
 DollarsAndShares(dollars=114.56109165272171, shares=24),
 DollarsAndShares(dollars=114.45409459020637, shares=31),
 DollarsAndShares(dollars=114.40282748887243, shares=15),
 DollarsAndShares(dollars=114.35910841156421, shares=15),
 DollarsAndShares(dollars=114.35256435803689, shares=10),
 DollarsAndShares(dollars=114.25011088731696, shares=34),
 DollarsAndShares(dollars=114.02103822227878, shares=7),
 DollarsAndShares(dollars=113.76406805697478, shares=8),
 DollarsAndShares(dollars=113.76214250498802, shares=1),
 DollarsAndShares(dollars=113.75025227553353, shares=22),
 DollarsAndShares(dollars=113.69027520093532, shares=4),
 DollarsAndShares(dollars=113.29648830295292, shares=9),
 DollarsAndShares(dollars=113.18374551250552, shares=13),
 DollarsAndSh