In [1]:
import numpy as np
np.set_printoptions(precision=2)

In [2]:
# example environment used in section 9
# description on p203

class RandomWalk():
    
    def __init__(self, ):
        self.init()
    
    def init(self):
        self.s = 500
        return np.array(self.s), 0, False
        
    def step(self):
        
        step = int(np.round(np.random.rand() * 200 - 100))
        self.s += step
        
        if self.s < 0:
            return np.array(self.s), -1, True
        elif self.s >= 1000:
            return np.array(self.s), 1, True
        else:
            return np.array(self.s), 0, False

In [5]:
# Monte Carlo Stochastic Gradient Descent

class MCSGD():
    def __init__(self, func, gamma):
        self.func = func
        self.gamma = gamma
    
    def episode(self, env):
        states = []
        rewards = []
        s,r,t = env.init()
        while True:
            s,r,t = env.step()
            states.append(s)
            rewards.append(r)
            if t:
                break
        self.update(states, rewards)
        
    def update(self, states, rewards):
        
        discount = [1] + list(np.repeat(self.gamma,len(rewards)-1))
        discount = np.cumprod(discount)
        returns = discount * np.cumsum(rewards[::-1])
        
        for s, r in zip(states[::-1][1:], returns[1::]):
            self.func.update(s, r)

In [23]:
# State Aggregation method for random walk
# Example 9.1 on p.204

class StateAggregation():
    def __init__(self, alpha):
        self.alpha = alpha
        self.values = np.repeat(0.0,10)
    
    def get_state(self, state):
        return int(state/100)
    
    def get_value(self, state):
        return self.values[self.get_state(state)]
    
    def update(self, state, reward):
        state = self.get_state(state)
        error = reward - self.values[state]
        self.values[state] = self.values[state] + self.alpha * error
        

env = RandomWalk()
f = StateAggregation(alpha=0.05)
estimator = MCSGD(f, 0.99)
for _ in range(10000):
    estimator.episode(env)
    f.alpha *= 0.999  # learning rate decay
    
print(f.values)

[-0.77 -0.57 -0.37 -0.21 -0.05  0.09  0.24  0.39  0.55  0.77]


In [None]:
# TD(0) Stochastic Gradient Descent *WIP
class TDSGD():
    def __init__(self, func, gamma):
        self.func = func
        self.gamma = gamma
        
    def update(self, state, reward, state_):
        self.func.update(state, reward, state_)