# Write-up and code for Feb 20 and 22

## MDP Interface for RL Algorithms with Value Function Approximation

In [2]:
import os
import sys
# Make the parent directory importable so `modules.*` resolves when this
# notebook is run from its own sub-directory.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from typing import Dict, NamedTuple, Callable, Set, Tuple, List
from modules.MDP import MDP, Q, Policy, V
from modules.RL_interface import RL_interface
from modules.state_action_vars import S, A
import random
import numpy as np
import timeit

In [5]:
class RL_interface_FA(NamedTuple):
    """Sampling interface to an environment for RL with function approximation.

    Bundles the callables that the algorithms in this file use to draw
    experience (start states, transitions, rewards) together with the
    discount factor.
    """
    # interface for reinforcement learning with value function approximation, 
    # largely inspired by Professor Ashwin Rao's implementation
    
    # function that takes in a state and returns the set of possible actions
    state_action_func: Callable[[S], Set[A]]
    # function that takes in a state and an action, and returns the next state sp and the reward
    state_reward_func: Callable[[S, A], Tuple[S, float]]
    # initial state generator (takes no arguments, returns a start state)
    init_state_gen: Callable[[], S]
    # initial (state, action) pair generator (takes no arguments)
    init_state_action_gen: Callable[[], Tuple[S, A]]
    # discount factor
    gamma: float

## Monte-Carlo Prediction Algorithm with Value Function Approximation

In [6]:
def mc_vi_approx(polf: Callable[[S], A], alpha: float, v_hat: Callable[[S, np.ndarray], float],
                 sampler: RL_interface_FA, num_epi: int, num_steps: int, d: int,
                 feature_func: Callable[[S], np.ndarray] = None) \
    -> np.ndarray:
    """Monte-Carlo prediction with (linear) value function approximation.

    Args:
        polf: policy, maps a state to the action to take.
        alpha: step size of the gradient update.
        v_hat: approximate value function, called as ``v_hat(state, w)`` with
            ``w`` of shape ``(d, 1)``; must return a single number.
        sampler: environment interface; ``gamma``, ``init_state_action_gen``
            and ``state_reward_func`` are used.
        num_epi: number of simulated episodes.
        num_steps: number of policy steps per episode (see ``get_mc_path``).
        d: number of weights.
        feature_func: optional, maps a state to its ``d`` features, i.e. the
            gradient of a linear ``v_hat`` with respect to ``w``.  When it is
            omitted the gradient is recovered by evaluating ``v_hat`` at the
            unit weight vectors, which is exact only if ``v_hat`` is linear
            in ``w``.

    Returns:
        The learned weight vector, shape ``(d, 1)``.
    """
    # initialize weight vector
    w = np.zeros((d, 1))
    gamma = sampler.gamma

    def value(s, weights):
        # v_hat may hand back a float or a size-1 array; reduce to a scalar
        return np.asarray(v_hat(s, weights)).item()

    def grad_v(s):
        # gradient of v_hat(s, .) with respect to the weights, as a (d, 1) column
        if feature_func is not None:
            return np.asarray(feature_func(s), dtype=float).reshape(d, 1)
        # linear v_hat: d v_hat / d w_k = v_hat(s, e_k) - v_hat(s, 0)
        base = value(s, np.zeros((d, 1)))
        basis = np.eye(d)
        grad = np.empty((d, 1))
        for k in range(d):
            grad[k, 0] = value(s, basis[:, k:k + 1]) - base
        return grad

    for _ in range(num_epi):
        s_list, _, r_list = get_mc_path(polf, sampler, num_steps)

        # returns[j] = sum_k gamma^(k-j) * r_list[k] for k >= j, built backwards
        # in O(n); this also works for whatever length r_list actually has
        returns = np.zeros(len(r_list))
        G = 0.0
        for j in range(len(r_list) - 1, -1, -1):
            G = r_list[j] + gamma * G
            returns[j] = G

        # gradient step towards the sampled return for the first num_steps states
        for s_j, G_j in zip(s_list[:num_steps], returns):
            w += alpha * (G_j - value(s_j, w)) * grad_v(s_j)

    return w


def get_mc_path(polf: Callable[[S], A], sampler: RL_interface_FA, num_steps: int) \
    -> Tuple[List[S], List[A], List[float]]:
    """Simulate one Monte-Carlo path.

    The first (state, action) pair comes from ``sampler.init_state_action_gen``;
    every later action is chosen by ``polf``.

    Returns:
        ``(s_list, a_list, r_list)``, each of length ``num_steps + 1``;
        ``r_list[k]`` is the reward received for taking ``a_list[k]`` in
        ``s_list[k]``.
    """
    s_list = []
    a_list = []
    r_list = []

    s, a = sampler.init_state_action_gen()
    s_list.append(s)
    a_list.append(a)

    for _ in range(num_steps):
        s, r = sampler.state_reward_func(s, a)
        a = polf(s)
        s_list.append(s)
        a_list.append(a)
        r_list.append(r)

    # sample the reward for the last (state, action) pair
    _, r = sampler.state_reward_func(s, a)
    r_list.append(r)

    return s_list, a_list, r_list

## 1-step TD Prediction with Value Function Approximation

In [7]:
def td_0_approx(polf: Callable[[S], A], alpha: float, v_hat: Callable[[S, np.ndarray], float],
                 sampler: RL_interface_FA, num_epi: int, num_steps: int, d: int,
                 feature_func: Callable[[S], np.ndarray] = None) \
    -> np.ndarray:
    """One-step TD prediction with (linear) value function approximation.

    Args:
        polf: policy, maps a state to the action to take.
        alpha: step size of the semi-gradient update.
        v_hat: approximate value function, called as ``v_hat(state, w)`` with
            ``w`` of shape ``(d, 1)``; must return a single number.
        sampler: environment interface; ``gamma``, ``init_state_gen`` and
            ``state_reward_func`` are used.
        num_epi: number of episodes.
        num_steps: number of transitions per episode.
        d: number of weights.
        feature_func: optional, maps a state to its ``d`` features, i.e. the
            gradient of a linear ``v_hat`` with respect to ``w``.  When it is
            omitted the gradient is recovered by evaluating ``v_hat`` at the
            unit weight vectors, which is exact only if ``v_hat`` is linear
            in ``w``.

    Returns:
        The learned weight vector, shape ``(d, 1)``.
    """
    # initialize weight vector
    w = np.zeros((d, 1))
    gamma = sampler.gamma

    def value(s, weights):
        # v_hat may hand back a float or a size-1 array; reduce to a scalar
        return np.asarray(v_hat(s, weights)).item()

    def grad_v(s):
        # gradient of v_hat(s, .) with respect to the weights, as a (d, 1) column
        if feature_func is not None:
            return np.asarray(feature_func(s), dtype=float).reshape(d, 1)
        # linear v_hat: d v_hat / d w_k = v_hat(s, e_k) - v_hat(s, 0)
        base = value(s, np.zeros((d, 1)))
        basis = np.eye(d)
        grad = np.empty((d, 1))
        for k in range(d):
            grad[k, 0] = value(s, basis[:, k:k + 1]) - base
        return grad

    for _ in range(num_epi):
        s = sampler.init_state_gen()
        for _ in range(num_steps):
            # sample an action from the policy
            a = polf(s)
            # observe next state and the reward
            sp, r = sampler.state_reward_func(s, a)

            # TD error; the step follows the gradient of v_hat at s only
            # (semi-gradient), not the weights themselves
            delta = r + gamma * value(sp, w) - value(s, w)
            w += alpha * delta * grad_v(s)
            s = sp

    return w

## Eligibility Traces based TD($\lambda$) with Value Function Approximation

In [8]:
def backward_td_lambda_approx(polf: Callable[[S], A], alpha: float, v_hat: Callable[[S, np.ndarray], float],
                     sampler: RL_interface_FA, num_epi: int, num_steps: int, d: int,
                    feature_func: Callable[[S], np.ndarray], lam: float) \
    -> np.ndarray:
    """Backward-view TD(lambda) prediction with linear value function approximation.

    Args:
        polf: policy, maps a state to the action to take.
        alpha: step size.
        v_hat: approximate value function, called as ``v_hat(state, w)`` with
            ``w`` of shape ``(d, 1)``; must return a single number.
        sampler: environment interface; ``gamma``, ``init_state_gen`` and
            ``state_reward_func`` are used.
        num_epi: number of episodes.
        num_steps: number of transitions per episode.
        d: number of weights / features.
        feature_func: maps a state to its ``d`` features (any shape that
            reshapes to ``(d, 1)``).
        lam: trace-decay parameter lambda.

    Returns:
        The learned weight vector, shape ``(d, 1)``.
    """
    # initialize weight vector
    w = np.zeros((d, 1))
    gamma = sampler.gamma

    def value(s, weights):
        # v_hat may hand back a float or a size-1 array; reduce to a scalar
        return np.asarray(v_hat(s, weights)).item()

    for _ in range(num_epi):
        # eligibility traces are reset at the start of every episode
        E = np.zeros((d, 1))
        s = sampler.init_state_gen()

        for _ in range(num_steps):
            # sample an action from the policy
            a = polf(s)
            # observe next state and the reward
            sp, r = sampler.state_reward_func(s, a)

            delta = r + gamma * value(sp, w) - value(s, w)
            # accumulating trace: E <- gamma*lam*E + phi(s).  The old trace is
            # decayed, not added on top of itself.
            E *= gamma * lam
            E += np.asarray(feature_func(s), dtype=float).reshape(d, 1)
            w += alpha * delta * E

            s = sp

    return w

## SARSA and SARSA($\lambda$) with Value Function Approximation

In [9]:
def sarsa_fa(feature_func: Callable[[S, A], np.ndarray], polf: Callable[[S, np.ndarray, float], A],
             alpha: float, sampler: RL_interface_FA, num_epi: int, num_steps: int, d: int, eps: float) \
    -> np.ndarray:
    """SARSA with linear action-value function approximation.

    Args:
        feature_func: maps a (state, action) pair to its ``d`` features (any
            shape that reshapes to ``(d, 1)``); assumed to be deterministic.
        polf: behaviour policy, called as ``polf(state, w, eps)``.
        alpha: step size.
        sampler: environment interface; ``gamma``, ``init_state_action_gen``
            and ``state_reward_func`` are used.
        num_epi: number of episodes.
        num_steps: number of transitions per episode.
        d: number of weights / features.
        eps: exploration parameter forwarded to ``polf``.

    Returns:
        The learned weight vector, shape ``(d, 1)``.
    """
    # initialize weight vector
    w = np.zeros((d, 1))
    gamma = sampler.gamma

    def features(state, action):
        # normalise to a (d, 1) column so that both the dot product with w and
        # the in-place update of w are well defined for any d
        return np.asarray(feature_func(state, action), dtype=float).reshape(d, 1)

    for _ in range(num_epi):
        s, a = sampler.init_state_action_gen()
        phi = features(s, a)
        for _ in range(num_steps):
            # observe next state and the reward
            sp, r = sampler.state_reward_func(s, a)
            # take next action ap using the (epsilon greedy) policy
            ap = polf(sp, w, eps)
            phi_next = features(sp, ap)
            # Q(s, a; w) and Q(sp, ap; w)
            q = np.dot(phi.T, w).item()
            q_next = np.dot(phi_next.T, w).item()
            # TD error and semi-gradient update
            delta = r + gamma * q_next - q
            w += alpha * delta * phi
            # the features of (sp, ap) are reused as phi on the next step
            s, a, phi = sp, ap, phi_next

    return w

In [10]:
def sarsa_lambda_fa(feature_func: Callable[[S, A], np.ndarray], polf: Callable[[S, np.ndarray, float], A],
            alpha: float, sampler: RL_interface_FA, num_epi: int, num_steps: int, d: int, eps: float,
            lam: float = 0.9) \
    -> np.ndarray:
    """SARSA(lambda) with linear action-value function approximation.

    Args:
        feature_func: maps a (state, action) pair to its ``d`` features (any
            shape that reshapes to ``(d, 1)``); assumed to be deterministic.
        polf: behaviour policy, called as ``polf(state, w, eps)``.
        alpha: step size.
        sampler: environment interface; ``gamma``, ``init_state_action_gen``
            and ``state_reward_func`` are used.
        num_epi: number of episodes.
        num_steps: number of transitions per episode.
        d: number of weights / features.
        eps: exploration parameter forwarded to ``polf``.
        lam: trace-decay parameter lambda.  It used to be read from an
            undefined name; it is now an explicit argument with a default so
            that existing positional calls keep working.

    Returns:
        The learned weight vector, shape ``(d, 1)``.
    """
    # initialize weight vector
    w = np.zeros((d, 1))
    gamma = sampler.gamma

    def features(state, action):
        # normalise to a (d, 1) column so that both the dot product with w and
        # the in-place updates are well defined for any d
        return np.asarray(feature_func(state, action), dtype=float).reshape(d, 1)

    for _ in range(num_epi):
        # eligibility traces are reset at the start of every episode
        E = np.zeros((d, 1))
        s, a = sampler.init_state_action_gen()
        phi = features(s, a)

        for _ in range(num_steps):
            # observe next state and the reward
            sp, r = sampler.state_reward_func(s, a)
            # take next action ap using the (epsilon greedy) policy
            ap = polf(sp, w, eps)
            phi_next = features(sp, ap)
            # Q(s, a; w) and Q(sp, ap; w)
            q = np.dot(phi.T, w).item()
            q_next = np.dot(phi_next.T, w).item()
            # TD error
            delta = r + gamma * q_next - q
            # accumulating trace: E <- gamma*lam*E + phi(s, a).  The old trace
            # is decayed, not added on top of itself.
            E *= gamma * lam
            E += phi
            # update weights
            w += alpha * delta * E

            s, a, phi = sp, ap, phi_next

    return w

## Q-learning with Function Approximation

In [11]:
def qlearning_fa(feature_func: Callable[[S, A], np.ndarray], polf: Callable[[S, np.ndarray, float], A],
             alpha: float, sampler: RL_interface_FA, num_epi: int, num_steps: int, d: int, eps: float,
                q_max_finder: Callable[[S, np.ndarray], float]) \
    -> np.ndarray:
    """Q-learning with linear action-value function approximation.

    Args:
        feature_func: maps a (state, action) pair to its ``d`` features (any
            shape that reshapes to ``(d, 1)``).
        polf: behaviour policy, called as ``polf(state, w, eps)``.
        alpha: step size.
        sampler: environment interface; ``gamma``, ``init_state_gen`` and
            ``state_reward_func`` are used.
        num_epi: number of episodes.
        num_steps: number of transitions per episode.
        d: number of weights / features.
        eps: exploration parameter forwarded to ``polf``.
        q_max_finder: called as ``q_max_finder(state, w)``; must return the
            largest action value of ``state`` under weights ``w`` as a single
            number.

    Returns:
        The learned weight vector, shape ``(d, 1)``.
    """
    # initialize weight vector
    w = np.zeros((d, 1))
    gamma = sampler.gamma

    for _ in range(num_epi):
        s = sampler.init_state_gen()
        for _ in range(num_steps):
            # take action a, sampled from the (epsilon greedy) policy
            a = polf(s, w, eps)
            # observe next state and the reward
            sp, r = sampler.state_reward_func(s, a)
            # features as a (d, 1) column so that both the dot product with w
            # and the in-place update of w are well defined for any d
            phi = np.asarray(feature_func(s, a), dtype=float).reshape(d, 1)
            # Q(s, a; w)
            q = np.dot(phi.T, w).item()
            # best Q for the next state (off-policy target)
            q_max = np.asarray(q_max_finder(sp, w)).item()
            # TD error and semi-gradient update
            delta = r + gamma * q_max - q
            w += alpha * delta * phi
            # update s
            s = sp

    return w

## Least Squares Policy Iteration (LSPI) for American Option Pricing

In [59]:
from modules.Option import Option, monte_carlo_stock, payoff, longstaff_schwartz, Binary_Tree

def lspi(option: Option, m: int, n: int, r: int) -> float:
    """Price an option with batch least-squares policy iteration.

    Simulates ``m`` stock paths of ``n`` time steps, accumulates the
    least-squares system ``A w = B`` over the transitions of each path and
    re-solves for the weights ``w`` after every 100 paths (and after the last
    one), resetting ``A`` and ``B`` each time.  A progress line is printed
    every 10000 paths.

    Args:
        option: contract to price; ``option.tau``, ``option.r`` and
            ``option.S`` are read here, ``feature_func`` and ``payoff``
            receive the whole object.
        m: number of simulated paths.
        n: number of time steps per path.
        r: number of features; must match the width of the row vector
            returned by ``feature_func``.

    Returns:
        ``feature_func(option.S, option, 0)`` dotted with the final weights.
    """
    # simulate stock paths; indexed as SP[path, step] with step in 0..n
    SP = monte_carlo_stock(option, m, n)
    # initialize parameters
    A = np.zeros((r, r))
    B = np.zeros((r, 1))
    w = np.zeros((r, 1))
    delta_t = option.tau / n
    # one-step discount factor, constant over the whole run
    discount = np.exp(-option.r * delta_t)
    # stands in for the next-state features when there is no continuation
    no_continuation = np.zeros((1, r))

    for i in range(m):
        time = 0
        for j in range(n):
            # exercise value at the next state
            Q = payoff(SP[i, j+1], option)
            phi = feature_func(SP[i, j], option, time)

            # Next-state features enter only if the current policy continues
            # there.  They are not evaluated at the last step: they would
            # never be used, and feature_func takes the log of a
            # time-to-maturity that is zero (or slightly off zero) there.
            P = no_continuation
            if j < n-1:
                phi_next = feature_func(SP[i, j+1], option, time + delta_t)
                if Q <= np.matmul(phi_next, w):
                    P = phi_next

            # reward is the exercise value when exercising beats continuing
            R = 0
            if Q > np.matmul(P, w):
                R = Q

            A += np.matmul(phi.T, phi - discount * P)
            B += discount * R * phi.T
            time += delta_t

        if (i+1) % 100 == 0 or (i+1) == m:
            # solve A w = B directly instead of forming the explicit inverse
            w = np.linalg.solve(A, B)
            A = np.zeros((r, r))
            B = np.zeros((r, 1))
        if (i) % 10000 == 0 and i > 0:
            print("Price after {} iterations: ".format(i), np.matmul(feature_func(option.S, option, 0), w)[0][0])

    return np.matmul(feature_func(option.S, option, 0), w)[0][0]


def feature_func(s: float, option: Option, time: float) -> np.ndarray:
    """Return the 1x7 feature row for stock price ``s`` at time ``time``.

    The first four entries are a constant and three terms in the price scaled
    by ``option.K``; the last three depend only on ``time`` and
    ``option.tau``.  The sixth entry is ``log(option.tau - time)``, so it is
    finite only while ``time < option.tau``.
    """
    moneyness = s / option.K
    time_to_maturity = option.tau - time
    decay = np.exp(-moneyness / 2.0)

    values = (
        1.0,
        decay,
        decay * (1.0 - moneyness),
        decay * (1.0 - 2.0*moneyness + np.square(moneyness)/2.0),
        np.sin(-time*np.pi/(2.0*option.tau)+np.pi/2.0),
        np.log(time_to_maturity),
        np.square(time/option.tau),
    )
    return np.array(values).reshape(1, 7)

In [60]:
# The two contracts priced in the cells below (printed there as American puts).
# The positional arguments follow the Option constructor in modules.Option,
# which is not shown in this file.
# NOTE(review): presumably 36./40. and 100./110. are spot and strike — confirm
# the argument order against modules.Option.
am_put = Option(False, True, 36., 40., 0.2, 1.0, 0.06, 0)
am_put_2 = Option(False, True, 100., 110., 0.25, 0.5, 0.05, 0)

In [63]:
# Price am_put with three methods and report the total wall-clock time.
start = timeit.default_timer()

# LSPI with 100000 paths, 100 time steps and 7 features
print("Price for American put using LSPI: ", lspi(am_put, 100000, 100, 7))
print("Price for American put using Longstaff-Schwartz: ", longstaff_schwartz(am_put, 1000, 100))
print("Price for American put using binary tree: ", Binary_Tree(am_put, 20))

stop = timeit.default_timer()

# elapsed time covers all three pricers, not LSPI alone
print('Time: ', stop - start)  



Price after 10000 iterations:  4.471473257563076
Price after 20000 iterations:  4.829348800646807
Price after 30000 iterations:  4.608171980580508
Price after 40000 iterations:  4.88816264779814
Price after 50000 iterations:  4.649290890093857
Price after 60000 iterations:  4.559197012090703
Price after 70000 iterations:  4.772003005452177
Price after 80000 iterations:  4.873095222981682
Price after 90000 iterations:  4.320027732771136
Price for American put using LSPI:  4.575048600610511
Price for American put using Longstaff-Schwartz:  4.642148884469446
Price for American put using binary tree:  4.490003213687527
Time:  395.3375598449493


In [50]:
# Same three-way comparison for am_put_2; LSPI uses 10000 paths here.
print("Price for American put using LSPI: ", lspi(am_put_2, 10000, 100, 7))
print("Price for American put using Longstaff-Schwartz: ", longstaff_schwartz(am_put_2, 1000, 100))
print("Price for American put using binary tree: ", Binary_Tree(am_put_2, 20))

Price after 0 iterations:  0.0




Price after 1000 iterations:  12.6058723869879
Price after 2000 iterations:  11.770321257150504
Price after 3000 iterations:  11.965668510449419
Price after 4000 iterations:  11.592394066477201
Price after 5000 iterations:  13.213612152520172
Price after 6000 iterations:  12.940232198231497
Price after 7000 iterations:  14.346933230949892
Price after 8000 iterations:  12.714963702592684
Price after 9000 iterations:  13.082343755210312
Price for American put using LSPI:  11.950302912033951
Price for American put using Longstaff-Schwartz:  12.403651641369288
Price for American put using binary tree:  12.151536447011493
