In [1]:
import numpy as np
import matplotlib.pyplot as plt
# Print arrays with two decimal places and in fixed-point (non-scientific) notation.
np.set_printoptions(precision=2, suppress=True)

In [44]:
# Random walk with 100 states
class RandomWalk():
    """Random-walk environment with one-hot, aggregated state features.

    Every episode starts at position 50. Each call to ``step`` moves the
    walker by a random integer offset in [-10, 10]. The episode terminates
    with reward -1 once the position drops below 0, and with reward +1 once
    it reaches 100 or more; every other transition yields reward 0.

    Positions are exposed to the learner as a length-12 one-hot vector
    (see ``form_state``).

    Parameters
    ----------
    rng : object with a ``random()`` method, optional
        Source of uniform samples in [0, 1), e.g. ``np.random.default_rng(seed)``.
        When omitted, NumPy's global generator (``np.random.rand``) is used.
    """

    def __init__(self, rng=None):
        self.rng = rng
        self.init()

    def init(self):
        """Reset the walker to position 50 and return its feature vector."""
        self.s = 50
        return self.form_state(self.s)

    def step(self):
        """Advance the walk by one random move.

        Returns
        -------
        tuple
            ``(features, reward, done)`` where ``features`` is the one-hot
            encoding of the new position, ``reward`` is -1, 0 or 1 and
            ``done`` tells whether the episode has terminated.
        """
        if self.rng is None:
            u = np.random.rand()
        else:
            u = self.rng.random()

        # NOTE(review): rounding a uniform draw on [-10, 10) gives the offsets
        # -10 and +10 half the probability of the other offsets and allows a
        # zero offset. Kept unchanged so existing results are reproduced.
        offset = int(np.round(u * 20 - 10))
        self.s += offset

        if self.s < 0:
            reward, done = -1, True
        elif self.s >= 100:
            reward, done = 1, True
        else:
            reward, done = 0, False

        return self.form_state(self.s), reward, done

    def form_state(self, state):
        """Encode a position as a length-12 one-hot vector (state aggregation).

        The active index is ``int((state + 10) / 10)``:
        positions -10..-1 map to index 0 (left terminal region),
        positions 0..99 map to indices 1..10 in groups of ten, and
        positions 100..109 map to index 11 (right terminal region).
        """
        output = np.zeros(12)
        output[int((state + 10) / 10)] = 1
        return output

In [45]:
# TD(lambda) in section 12.2
class TDlambda():
    """Semi-gradient TD(lambda) prediction with a linear value function.

    The value of a state is the dot product of ``weights`` with the state's
    feature vector; an accumulating eligibility trace spreads each TD error
    over recently visited features.

    Parameters
    ----------
    weights : array-like
        Initial weight vector. A floating-point ndarray is used (and updated)
        in place; anything else (list, integer array) is converted to a new
        float ndarray so the in-place update in ``episode`` is well defined.
    alpha : float
        Step size.
    gamma : float
        Discount factor.
    lambda_ : float
        Trace-decay parameter.
    """

    def __init__(self, weights, alpha, gamma, lambda_):
        is_float_array = (
            isinstance(weights, np.ndarray)
            and np.issubdtype(weights.dtype, np.floating)
        )
        if not is_float_array:
            # `list += ndarray` would extend the list and `int_array += float`
            # raises a casting error, so normalise to a float ndarray.
            weights = np.array(weights, dtype=float)
        self.weights = weights
        self.alpha = alpha
        self.gamma = gamma
        self.lambda_ = lambda_

    def episode(self, env):
        """Run one episode on ``env`` and update ``self.weights`` in place.

        ``env`` must provide ``init()`` returning the initial feature vector
        and ``step()`` returning ``(next_features, reward, termination)``.
        """
        state = env.init()
        trace = np.zeros_like(self.weights)  # eligibility trace Z

        termination = False
        while not termination:
            state_, reward, termination = env.step()

            # For a linear value function the gradient is the feature vector.
            trace = self.gamma * self.lambda_ * trace + state

            # The value of a terminal state is 0 by definition; do not
            # bootstrap from whatever weights its features happen to select.
            next_value = 0.0 if termination else self.value(state_)
            delta = reward + self.gamma * next_value - self.value(state)

            self.weights += self.alpha * delta * trace

            state = state_

    def value(self, state):
        """Return the linear value estimate for the feature vector ``state``."""
        return np.dot(self.weights, state)

In [46]:
# Train a linear TD(lambda) predictor, starting from all-zero weights,
# on 100 episodes of the random walk (a fresh environment per episode).
predictor = TDlambda(
    weights=np.zeros(12),
    alpha=0.4,
    gamma=0.9,
    lambda_=0.2,
)
for _ in range(100):
    predictor.episode(RandomWalk())

In [47]:
# Inspect the learned weight vector (bare last expression so the notebook displays it).
predictor.weights

array([ 0.  , -0.74, -0.52, -0.27, -0.08,  0.01,  0.09,  0.15,  0.25,
        0.35,  0.61,  0.  ])