In [1]:
import numpy as np
# import gym

In [2]:
class random_walk:
    
    def __init__(self, size, policy, gamma=.99):
        """ size: int, size of random walk
            policy: function, returns -1 (left) or 1 (right)
            gamma: float between 0 and 1, discounting factor
        """
        self.policy = policy
        self.size = size
        self.pos = int(size / 2) + 1  # start in the middle of the states
        self.gamma = gamma
        # Create features according to pattern in paper
        self.features = np.zeros((size, size))
        self.features[1,0] = 1
        self.features[2,:2] = [1/np.sqrt(2), 1/np.sqrt(2)]
        for i in range(3, size-1):
            self.features[i, i-2:i+1] = [1/np.sqrt(3), 1/np.sqrt(3), 1/np.sqrt(3)]
        
        self.state = self.features[self.pos-1]  # pos describes the position of the walk.  state is the features at that position
    
    
    def step(self):
        """ Take a step according to the policy, and observe new state, reward, and terminality.
            Returns: new state (features), reward, and a bool end which is True if terminal state, False otherwise
        """
        self.pos += self.policy()
#         if self.pos < 1: self.pos = 1       #   In the paper, state 1 was not terminal.  Right now it's terminal with reward 0
        reward = 1 if self.pos == self.size else 0
        
        if self.pos == self.size or self.pos == 1:
            end = True
        else: end = False
            
        self.state = self.features[self.pos-1]
    
        return self.state, reward, end
    
    
    def reset(self):
        """ Reset the random walk, at the end of an episode.
        """
        self.__init__(self.size, self.policy)
        
    
            
        

In [3]:
class td_pred:
    """ td_lambda for prediction (not control, that has to be a separate class)
        with linear function approximation
    """
    
    def __init__(self, task, lam):
        """ Task: hopefully this will be general enough that we can pass in a random walk class, mountain car task,
                    or whatever else, and td_pred can use them all the same way.
            lam: lambda
        """
        self.task = task
        self.weights = np.zeros_like(task.state)  # initialize weights like state features
        self.lam = lam  # lambda in the algorithm
        
    
    def learn(self, n, alpha):
        """ Perform the td algorithm.  This is straight out of the paper
            n: int, number of episodes
            alpha: float from 0 to 1, step size
            
            returns: weights, state value estimates
        """
        
        gamma = self.task.gamma  
        lam = self.lam
        for _ in range(n):
            e = np.zeros_like(self.task.state)
            s = self.task.state
            vs = self.weights.dot(self.task.state)
            
            while(1):
                s_, r, t = self.task.step()
                vs_ = self.weights.dot(s_)
                d = r + self.task.gamma * vs_ - vs
                e = gamma * lam * e + alpha * (1 - gamma * lam * e.dot(s)) * s
                self.weights = self.weights + d * e + alpha * (vs - self.weights.dot(s)) * s
                vs = vs_
                s = s_
                
                if t:
                    self.task.reset()
                    break
        
        return self.weights, self.task.features.dot(self.weights)
                

In [20]:
# make a policy and instantiate random walk task
p = [.5, .5]
pol = lambda : np.random.choice([-1, 1], p=p)
WALK_SIZE = 11
rw = random_walk(WALK_SIZE, pol)

In [5]:
# A couple random walk demos

td = td_pred(rw, .5)  # lambda .5
td.learn(1000, .1)  # learning rate .1

(array([ 0.2447728 ,  0.42377753,  0.4359081 ,  0.4359081 ,  0.73266429,  0.        ]),
 array([ 0.        ,  0.2447728 ,  0.47273647,  0.74801139,  0.92634724,  0.        ]))

In [22]:
import pandas as pd
learning_rates = np.linspace(0, 1.5, 151)
lambdas = [0.5, 0.2]
runs = 100
df = pd.DataFrame(columns=["iteration", "lambda", "learning_rate", "rms"])
episodes = 10
true_state_vals = np.linspace(0, 1, WALK_SIZE)
true_state_vals[-1] = 0.0

In [23]:
for i in range(1, runs + 1):
    for lmb in lambdas:
        for lr in learning_rates:
            td = td_pred(rw, lmb)
            _, state_val_est = td.learn(episodes, lr)
            rms = ((true_state_vals - state_val_est)**2).mean()**0.5
            row = pd.DataFrame({"iteration": i, "lambda": lmb, "learning_rate": lr, "rms": rms}, index=[0])
            df = df.append(row)

In [24]:
df.head()

Unnamed: 0,iteration,lambda,learning_rate,rms
0,1,0.5,0.0,0.50901
0,1,0.5,0.01,0.477907
0,1,0.5,0.02,0.461345
0,1,0.5,0.03,0.40532
0,1,0.5,0.04,0.356917


In [25]:
df.to_csv("./mse.csv", index=False)