# Preconditioned TDL

In this notebook, we implement preconditioned temporal-difference learning (TDL) and empirically verify its results on the Boyan chain environment defined below.

In [None]:
import numpy as np
import math
import random
import plotly.graph_objects as go

In [None]:
class BoyanEnvironment:
    """Chain environment in which each action moves the agent toward state 0.

    State 0 is terminal. Every transition yields `transition_reward`, except
    transitions that start in state 1, which yield `terminal_reward`.
    """

    def __init__(self, env_info):
        """Read `num_states` from the `env_info` dict and set the fixed rewards."""
        self.num_states = env_info['num_states']
        self.terminal_reward = -2
        self.transition_reward = -3

    def transition(self, current_state, action):
        """Move `action` states down the chain, clamping at state 0.

        Returns:
            A `(reward, next_state, terminal)` tuple, where `terminal` is True
            exactly when `next_state` is 0.
        """
        next_state = max(0, current_state - action)

        # Ordinary step: the chain has not been exhausted yet.
        if next_state != 0:
            return (self.transition_reward, next_state, False)

        # Reaching state 0 ends the episode. Only the step out of state 1 earns
        # the terminal reward; jumping from state 2 with action 2 keeps the
        # regular transition reward.
        if current_state == 1:
            reward = self.terminal_reward
        else:
            reward = self.transition_reward
        return (reward, next_state, True)

In [None]:
def get_features(state,num_features=4):
    """Return the linearly interpolated feature vector for `state`.

    Anchor states 0, 4, 8, ... each activate a single feature with weight 1.
    A state between two anchors splits its weight between the two
    neighbouring features in steps of 0.25.

    Args:
        state: Non-negative integer state index.
        num_features: Length of the returned vector. `int(state / 4)` must be
            smaller than this value, otherwise indexing raises IndexError.

    Returns:
        A 1-D float array of length `num_features`.
    """
    # Size the vector from the argument instead of a hard-coded 4 so that the
    # bounds check below and the array length always agree.
    features = np.zeros(num_features)

    lower = int(state/4)          # feature anchored at or below `state`
    upper_weight = 0.25*(state%4) # share of the weight given to the next anchor

    features[lower] = 1 - upper_weight
    # The last anchor has no upper neighbour to interpolate towards.
    if lower + 1 < num_features:
        features[lower + 1] = upper_weight

    return features

In [None]:
# Sanity check: state 0 puts all of its weight on the first feature.
get_features(0)

array([1., 0., 0., 0.])

In [None]:
#Computing RM(lambda) estimates first
#
# For every lambda we sample trajectories and keep running averages, over all
# visited steps, of
#   A ~ z (phi(s') - phi(s))^T,   b ~ z r,   D ~ phi(s) phi(s)^T
# where z is the eligibility trace. A snapshot is stored after every episode.

num_states = 13
num_episodes = 1000
num_iterations = 1
num_features = math.ceil(num_states/4)
lambdas = [0, 0.2, 0.4, 0.6, 0.8, 1]
num_lambdas = len(lambdas)

# Fix the sampler so that Restart & Run All reproduces the same trajectories.
random.seed(0)

As = np.zeros((num_lambdas, num_iterations, num_episodes, num_features, num_features))
bs = np.zeros((num_lambdas, num_iterations, num_episodes, num_features))
Ds = np.zeros((num_lambdas, num_iterations, num_episodes, num_features, num_features))

env = BoyanEnvironment({'num_states': num_states})


for l, lamb in enumerate(lambdas):
    for i in range(num_iterations):
        A = np.zeros((num_features, num_features))
        b = np.zeros(num_features)
        D = np.zeros((num_features, num_features))
        T = 1  # number of the current step; used as the running-mean divisor
        for j in range(num_episodes):
            current_state = num_states - 1
            # The trace starts empty: phi(s_0) is added by the first update
            # inside the loop. Initialising it to phi(s_0) here would count
            # the start state twice, giving (1 + lambda) * phi(s_0).
            z = np.zeros(num_features)
            terminal = False
            while not terminal:
                # Both step sizes are available until state 1, where only a
                # single step remains.
                action = 1
                if current_state > 1:
                    action = random.choice([1,2])
                reward, next_state, terminal = env.transition(current_state, action)
                phi = get_features(current_state, num_features)
                phi_next = get_features(next_state, num_features)
                z = lamb*z + phi
                # Incremental means: x_bar += (x - x_bar) / T
                A += (np.outer(z, phi_next - phi) - A)/T
                b += (z*reward - b)/T
                D += (np.outer(phi, phi) - D)/T
                current_state = next_state
                T += 1
            As[l, i, j, :, :] = A
            bs[l, i, j, :] = b
            Ds[l, i, j, :, :] = D

# Average over independent iterations, dropping that axis.
As = np.sum(As, axis = 1)/num_iterations
bs = np.sum(bs, axis = 1)/num_iterations
Ds = np.sum(Ds, axis = 1)/num_iterations
        
    
# a0 and n0 values for the fixed step-size schedules
# alpha = a0 * (n0 + 1) / (n0 + episode + 1).
# NOTE(review): neither list is read in the visible code; ALGO hard-codes the
# same six (a0, n0) combinations — confirm whether these should be passed in.
alpha_0 = [0.1, 0.01]
n_0 = [100, 1000, 1000000]
            
    
# Method identifiers accepted by ALGO's `method` argument.
LSTD = "LSTD"
iLSTD = "iLSTD"
LSPE = "LSPE"
GRAD = "GRAD"

In [None]:
def RMS_error(y1, y2):
    """Return the sum of squared differences divided by `y1.shape[0]`.

    Note: despite the name, no square root is taken, so for 1-D inputs this
    is the mean squared error rather than its root.
    """
    squared_diff = np.square(y1 - y2)
    return np.sum(squared_diff) / y1.shape[0]

def rho(mat):
    """Return the spectral radius of `mat`: its largest eigenvalue magnitude."""
    eigenvalues = np.linalg.eig(mat)[0]  # eigenvectors are not needed
    return np.abs(eigenvalues).max()

In [None]:
#TODO: modularize this

def ALGO(As, bs, Ds, method):
    """Run one preconditioned update per episode and record diagnostics.

    For each lambda, the weight vector `theta` starts at zero and the episodes
    are swept repeatedly until the error measured after a full sweep is at
    most 0.01. At every episode the update is

        e     = A theta + b
        delta = Binv e
        alpha = <delta, Binv A delta> / <Binv A delta, Binv A delta>
        theta = theta - alpha * delta

    where the preconditioner B (and its inverse Binv) depends on `method`.

    Args:
        As: Array indexed as `As[lamb, episode, :, :]`.
        bs: Array of shape `(num_lambdas, num_episodes, num_features)`.
        Ds: Array indexed as `Ds[lamb, episode, :, :]`; only read for LSPE.
        method: One of the module constants LSTD, iLSTD, LSPE or GRAD.

    Returns:
        `(rms_errors, weights, radius, alpha_radii)`. The first three are
        lists with one array per lambda, indexed `[sweep, episode, ...]`.
        `alpha_radii` is a list of six such lists, one per fixed step-size
        schedule, ordered a0 = 0.1 then 0.01, and within each a0 by
        n0 = 100, 1000, 1000000. For an unknown `method`, prints
        "NO METHOD" and returns None.
    """
    (num_lambdas, num_episodes, num_features) = bs.shape
    # Fixed schedules alpha = a0 * (n0 + 1) / (n0 + episode + 1), compared
    # against the adaptive step size computed in the loop.
    schedules = [(a0, n0) for a0 in (0.1, 0.01) for n0 in (100, 1000, 1000000)]
    identity = np.eye(num_features)

    rms_errors = []
    weights = []
    radius = []
    alpha_radii = [[] for _ in schedules]
    # Reference values for states 0..12: state i is worth -2*i.
    true_y =  np.array([-2*i for i in range(13)])
    for lamb in range(num_lambdas):
        rms_error = 10
        weights.append([])
        rms_errors.append([])
        radius.append([])
        for schedule_radii in alpha_radii:
            schedule_radii.append([])
        theta = np.zeros(num_features)
        while rms_error > 0.01:
            # Start a new sweep over all episodes.
            weights[-1].append([])
            rms_errors[-1].append([])
            radius[-1].append([])
            for schedule_radii in alpha_radii:
                schedule_radii[-1].append([])
            for episode in range(num_episodes):
                A = As[lamb, episode, :, :]
                if method == LSTD:
                    B = A
                    Binv = np.linalg.pinv(B)
                elif method == iLSTD:
                    B = -np.eye(num_features)
                    Binv = -np.eye(num_features)
                elif method == LSPE:
                    B = -Ds[lamb, episode, :, :]
                    Binv = np.linalg.pinv(B)
                elif method == GRAD:
                    Binv = np.transpose(A)
                    B = np.linalg.pinv(Binv)
                else:
                    print("NO METHOD")
                    return
                e = np.dot(A, theta) + bs[lamb, episode, :]
                delta = np.dot(Binv, e)
                alpha_num = np.dot(np.dot(Binv, A), delta)
                alpha = np.dot(delta, alpha_num)/np.dot(alpha_num, alpha_num)
                theta -= alpha * delta
                # Store a snapshot: theta is updated in place, so appending
                # the array itself would make every entry alias the final value.
                weights[-1][-1].append(theta.copy())
                y = np.array([np.dot(theta, get_features(i)) for i in range(13)])
                rms_error = RMS_error(y, true_y)
                rms_errors[-1][-1].append(rms_error)
                C = np.dot(np.transpose(A), B)
                # Shared by the adaptive and all fixed-schedule radii below.
                M = np.dot(np.dot(np.linalg.pinv(C),np.transpose(A)), A)
                radius[-1][-1].append(rho(identity - alpha * M))
                for schedule_radii, (a0, n0) in zip(alpha_radii, schedules):
                    step = a0*(n0 + 1)/(n0 + episode + 1)
                    schedule_radii[-1][-1].append(rho(identity - step * M))
        rms_errors[-1] = np.array(rms_errors[-1])
        weights[-1] = np.array(weights[-1])
        radius[-1] = np.array(radius[-1])
        for schedule_radii in alpha_radii:
            schedule_radii[-1] = np.array(schedule_radii[-1])
    return rms_errors, weights, radius, alpha_radii

In [None]:
# Run each method on the same statistics. Every call returns
# (rms_errors, weights, radius, alpha_radii).
(LSTD_rms_errors,
 LSTD_weights,
 LSTD_radius,
 LSTD_alpha_radii) = ALGO(As, bs, Ds, method=LSTD)

(iLSTD_rms_errors,
 iLSTD_weights,
 iLSTD_radius,
 iLSTD_alpha_radii) = ALGO(As, bs, Ds, method=iLSTD)

(LSPE_rms_errors,
 LSPE_weights,
 LSPE_radius,
 LSPE_alpha_radii) = ALGO(As, bs, Ds, method=LSPE)

(GRAD_rms_errors,
 GRAD_weights,
 GRAD_radius,
 GRAD_alpha_radii) = ALGO(As, bs, Ds, method=GRAD)

In [None]:
fig = go.Figure()

# Plot the last sweep of the last lambda, skipping the first two episodes.
first_index, stop_index = 2, 120
# 1-based episode numbers for exactly the plotted slice, so x and y have the
# same length and every value is labelled with its own episode.
x = list(range(first_index + 1, stop_index + 1))
for method_name, method_errors in (('LSTD', LSTD_rms_errors),
                                   ('iLSTD', iLSTD_rms_errors),
                                   ('LSPE', LSPE_rms_errors),
                                   ('GRAD', GRAD_rms_errors)):
    fig.add_trace(go.Scatter(x=x,
                             y=method_errors[-1][-1,first_index:stop_index],
                             name=method_name))
fig.update_layout(title='RMS error VS #Episodes',
                   xaxis_title='#Episodes',
                   yaxis_title='RMS error')


fig.show()

In [None]:
fig = go.Figure()

# Adaptive-step spectral radius for the last sweep of the last lambda.
# x holds 1-based episode numbers, one per plotted value, so that the first
# episode is labelled 1 and x and y have equal length.
x = list(range(1, LSTD_radius[-1].shape[1] + 1))
for method_name, method_radius in (('LSTD', LSTD_radius),
                                   ('iLSTD', iLSTD_radius),
                                   ('LSPE', LSPE_radius),
                                   ('GRAD', GRAD_radius)):
    fig.add_trace(go.Scatter(x=x, y=method_radius[-1][-1,:], name=method_name))
fig.update_layout(title='Spectral radius vs #Episodes',
                   xaxis_title='#Episode',
                   yaxis_title='Spectral Radius')


fig.show()

In [None]:
# spectral radius using alpha_0 and n_0

fig = go.Figure()

# Legend labels, in the order ALGO returns the fixed-schedule radii.
schedule_names = ['a0 0.1 n0 100', 'a0 0.1 n0 1000', 'a0 0.1 n0 1000000',
                  'a0 0.01 n0 100', 'a0 0.01 n0 1000', 'a0 0.01 n0 1000000']
adaptive = LSTD_radius[-1][-1,:]
# 1-based episode numbers, one per plotted value (the first episode is 1).
x = list(range(1, len(adaptive) + 1))
for schedule_name, schedule_radii in zip(schedule_names, LSTD_alpha_radii):
    # Last lambda, first sweep.
    fig.add_trace(go.Scatter(x=x, y=schedule_radii[-1][:,:][0], name=schedule_name))
fig.add_trace(go.Scatter(x=x, y=adaptive, name='adaptive'))

fig.update_layout(title='LSTD ',
                   xaxis_title='#Episode',
                   yaxis_title='Spectral Radius')

fig.update_layout(legend=dict(
    yanchor="top",
    y=0.85,
    xanchor="left",
    x=0.75
))



fig.show()

In [None]:
fig = go.Figure()

# Legend labels, in the order ALGO returns the fixed-schedule radii.
schedule_names = ['a0 0.1 n0 100', 'a0 0.1 n0 1000', 'a0 0.1 n0 1000000',
                  'a0 0.01 n0 100', 'a0 0.01 n0 1000', 'a0 0.01 n0 1000000']
adaptive = GRAD_radius[-1][-1,:]
# 1-based episode numbers, one per plotted value (the first episode is 1).
x = list(range(1, len(adaptive) + 1))
for schedule_name, schedule_radii in zip(schedule_names, GRAD_alpha_radii):
    # Last lambda, first sweep.
    fig.add_trace(go.Scatter(x=x, y=schedule_radii[-1][:,:][0], name=schedule_name))
fig.add_trace(go.Scatter(x=x, y=adaptive, name='adaptive'))

# Title names the method shown (replaces a leftover title from a plotting example).
fig.update_layout(title='GRAD',
                   xaxis_title='#Episode',
                   yaxis_title='Spectral Radius')
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.85,
    xanchor="left",
    x=0.75
))


fig.show()

In [None]:
fig = go.Figure()

# Legend labels, in the order ALGO returns the fixed-schedule radii.
schedule_names = ['a0 0.1 n0 100', 'a0 0.1 n0 1000', 'a0 0.1 n0 1000000',
                  'a0 0.01 n0 100', 'a0 0.01 n0 1000', 'a0 0.01 n0 1000000']
adaptive = iLSTD_radius[-1][-1,:]
# 1-based episode numbers, one per plotted value (the first episode is 1).
x = list(range(1, len(adaptive) + 1))
for schedule_name, schedule_radii in zip(schedule_names, iLSTD_alpha_radii):
    # Last lambda, first sweep.
    fig.add_trace(go.Scatter(x=x, y=schedule_radii[-1][:,:][0], name=schedule_name))
fig.add_trace(go.Scatter(x=x, y=adaptive, name='adaptive'))

# Title names the method shown (replaces a leftover title from a plotting example).
fig.update_layout(title='iLSTD',
                   xaxis_title='#Episode',
                   yaxis_title='Spectral Radius')

fig.update_layout(legend=dict(
    yanchor="top",
    y=0.85,
    xanchor="left",
    x=0.75
))
fig.show()

In [None]:
fig = go.Figure()

# Legend labels, in the order ALGO returns the fixed-schedule radii.
schedule_names = ['a0 0.1 n0 100', 'a0 0.1 n0 1000', 'a0 0.1 n0 1000000',
                  'a0 0.01 n0 100', 'a0 0.01 n0 1000', 'a0 0.01 n0 1000000']
adaptive = LSPE_radius[-1][-1,:]
# 1-based episode numbers, one per plotted value (the first episode is 1).
x = list(range(1, len(adaptive) + 1))
for schedule_name, schedule_radii in zip(schedule_names, LSPE_alpha_radii):
    # Last lambda, first sweep.
    fig.add_trace(go.Scatter(x=x, y=schedule_radii[-1][:,:][0], name=schedule_name))
fig.add_trace(go.Scatter(x=x, y=adaptive, name='adaptive'))

# Title names the method shown (replaces a leftover title from a plotting example).
fig.update_layout(title='LSPE',
                   xaxis_title='#Episode',
                   yaxis_title='Spectral Radius')

fig.update_layout(legend=dict(
    yanchor="top",
    y=0.85,
    xanchor="left",
    x=0.75
))

fig.show()