In [2]:
import numpy as np
import gym

In [205]:
class random_walk:
    
    def __init__(self, size, policy):
        """ size: int, size of random walk
            policy: function, returns -1 (left) or 1 (right)
        """
        self.policy = policy
        self.size = size
        self.pos = int(size / 2) + 1
        self.gamma = .99
        self.features = np.array([
            [1,0,0,0,0],
            [1/np.sqrt(2), 1/np.sqrt(2), 0,0,0],
            [1/np.sqrt(3), 1/np.sqrt(3), 1/np.sqrt(3), 0, 0],
            [0, 1/np.sqrt(3), 1/np.sqrt(3), 1/np.sqrt(3), 0],
            [0, 0, 1/np.sqrt(3), 1/np.sqrt(3), 1/np.sqrt(3)],
            [0,0,0,0,0]
        ]) # these features are coming right out of the paper.  I did not figure out how he came up with them
        
        self.state = self.features[self.pos-1]
    
    def step(self):
        self.pos += self.policy()
#         if self.pos < 1: self.pos = 1          
        reward = 1 if self.pos == self.size else 0
        
        if self.pos == self.size or self.pos == 1:
            end = True
        else: end = False
            
        self.state = self.features[self.pos-1]
    
        return self.state, reward, end
    
    def reset(self):
        self.__init__(self.size, self.policy)
        
    
            
        

In [206]:
class td_pred:
    """ td_lambda for prediction (not control, that has to be a separate class)
    """
    def __init__(self, task, lam):
        self.task = task
        self.weights = np.zeros_like(task.state)
        self.lam = lam  # lambda in the algorithm
        
    
    def learn(self, n, alpha):
        gamma = self.task.gamma
        lam = self.lam
        for _ in range(n):
            e = np.zeros_like(self.task.state)
            s = self.task.state
            vs = self.weights.dot(self.task.state)
            
            while(1):
                s_, r, t = self.task.step()
                vs_ = self.weights.dot(s_)
                d = r + self.task.gamma * vs_ - vs
                e = gamma * lam * e + alpha * (1 - gamma * lam * e.dot(s)) * s
                self.weights = self.weights + d * e + alpha * (vs - self.weights.dot(s)) * s
                vs = vs_
                s = s_
                
                if t:
                    self.task.reset()
                    break
        
        return self.weights, self.task.features.dot(self.weights)
                

In [207]:
p = [.5, .5]
pol = lambda : np.random.choice([-1, 1], p=p)
rw = random_walk(6, pol)


In [225]:
td = td_pred(rw, .5)
td.learn(1000, .1)

(array([ 0.30916685,  0.25048911,  0.34838933,  0.55757514,  0.61625288]),
 array([ 0.30916685,  0.39573652,  0.52426019,  0.66767878,  0.87885259,  0.        ]))