# LSPI with PVFs

In [1]:
import copy
import os
import pickle

import gym
import numpy as np

from lspi import LSPI
from basis import *
from utils import *
#from grid import make_grid
#import matplotlib.pyplot as plt
#from matplotlib import cm
#%matplotlib qt

In [2]:
# Passed as the third argument to LSPI(...) in run_off_lspi
# (presumably the discount factor -- confirm against lspi.LSPI).
GAMMA = 0.95

In [52]:
def run_off_lspi(env, tol, horizon, n_episodes, basis_name, covering, custom=None, **kwargs):
    """Collect samples, fit a policy with LSPI, and evaluate the greedy policy.

    Parameters
    ----------
    env : gym environment
        Must expose ``action_space.n``, ``reset()`` and ``step(action)``.
    tol : float
        LSPI iterates until the Euclidean distance between two successive
        weight vectors is no larger than ``tol``.
    horizon, n_episodes : int
        Forwarded to ``collect_episodes`` to control the sampling phase.
    basis_name : {'RBF', 'PVF'}
        Which basis to build.
    covering :
        Forwarded to ``subsample`` when ``basis_name == 'PVF'``; unused for RBF.
    custom : optional
        Forwarded to ``collect_episodes`` as ``custom``. The legacy misspelled
        keyword ``cutsom`` is still honoured when ``custom`` is not given.
    **kwargs
        For 'RBF': ``means`` and ``gamma``.
        For 'PVF': ``num_eigens``, ``variance`` and ``num_neighbors``.

    Returns
    -------
    list of int
        Lengths of 30 evaluation episodes, each capped at 500 steps.

    Raises
    ------
    ValueError
        If ``basis_name`` is neither 'RBF' nor 'PVF'.
    KeyError
        If a basis-specific entry is missing from ``kwargs``.
    """
    # Fail fast, before the (potentially long) sampling phase.
    if basis_name not in ('RBF', 'PVF'):
        raise ValueError("basis_name must be 'RBF' or 'PVF', got %r" % (basis_name,))

    # Backward compatibility: the parameter used to be spelled `cutsom`.
    legacy_custom = kwargs.pop('cutsom', None)
    if custom is None:
        custom = legacy_custom

    # Raw sampling
    data = collect_episodes(env, horizon=horizon, n_episodes=n_episodes, custom=custom)
    print('%s states encountered during the random walk' %len(data))

    nactions = env.action_space.n

    if basis_name == 'RBF':
        means = kwargs['means']
        gamma = kwargs['gamma']
        basis = RadialBasisFunction(means, gamma, nactions)
    else:
        # Subsample to build pvfs
        graph_states = subsample([data[i][0] for i in range(len(data))], covering)
        print('%s states were kept to build the laplacian'%len(graph_states))

        num_eig = kwargs['num_eigens']
        var = kwargs['variance']
        nn = kwargs['num_neighbors']
        print('Computing the Laplacian')
        # Learn state manifold
        basis = ProtoValueBasis(graph_states, var, nn, nactions)
        print('Laplacian computed')
        basis.set_num_features(num_eig)

    print('Start training')
    lspi = LSPI(data, basis, GAMMA, nactions)
    theta = np.random.rand(basis.size())
    dist = np.inf
    while dist > tol:
        # Keep a copy in case lspi.iteration updates theta in place.
        old_theta = copy.copy(theta)
        theta = lspi.iteration(theta)
        dist = np.linalg.norm(old_theta - theta)
    print('Training complete')

    # Evaluate the greedy policy induced by the learned weights
    n_eval_episodes = 30
    max_eval_steps = 500
    episode_lengths = []
    for _ in range(n_eval_episodes):
        s = env.reset()
        term = False
        length = 0
        while not term and length < max_eval_steps:
            q = np.dot(basis.evaluate(s), theta)
            action = np.argmax(q)
            s, _, term, _ = env.step(action)
            length += 1
        episode_lengths.append(length)
    return episode_lengths

In [3]:
# LSPI stops once two successive weight vectors are within this distance
tol = 0.05

# Number of basis features, shared by the PVF and RBF experiments
num_features = 25

# PVF parameters: variance and number of neighbours handed to ProtoValueBasis
var, nn = 0.25, 20

# Keyword arguments forwarded to run_off_lspi
kwargs = dict(num_eigens=num_features, num_neighbors=nn, variance=var)

In [4]:
# Per-environment value passed as `covering` to run_off_lspi, which hands it
# to subsample() when building the PVF graph. Filled in by each environment's
# setup cell. (The name is spelled "espilon" throughout the notebook.)
espilon_covering = {}

### a) Acrobot

In [97]:
# Environment used by the Acrobot experiments below
env = gym.make('Acrobot-v1')
# Covering value used when subsampling Acrobot states for the PVF graph
espilon_covering['acrobot'] = 0.9

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [98]:
# Training parameters
# Overrides a private attribute of the env; presumably lifts gym's per-episode
# step cap so that `horizon` alone bounds an episode -- TODO confirm.
env._max_episode_steps = np.inf
# Forwarded to collect_episodes through run_off_lspi
horizon = 100000
n_episodes = 35

In [7]:
# Five independent PVF runs on Acrobot; each entry is the list of evaluation
# episode lengths returned by run_off_lspi.
res_pvf_acrobot = []
for _ in range(5):
    # The covering dict is named `espilon_covering`; the previous
    # `epsilon_covring` was undefined and raised NameError.
    res_pvf_acrobot.append(run_off_lspi(env, tol, horizon,
                                        n_episodes, 'PVF', covering=espilon_covering['acrobot'], **kwargs))

63417 states encountered during the random walk
Computing the Laplacian
Laplacian computed
Start training
1 iterations completed
Current distance 138.2879858224527
2 iterations completed
Current distance 35.48782232617139
3 iterations completed
Current distance 0.11638567631398805
4 iterations completed
Current distance 5.5104580033949405e-05
74669 states encountered during the random walk
Computing the Laplacian
Laplacian computed
Start training
1 iterations completed
Current distance 162.01363150220885
2 iterations completed
Current distance 42.887822281224935
3 iterations completed
Current distance 0.5071892069883043
4 iterations completed
Current distance 0.000245035023191832
71573 states encountered during the random walk
Computing the Laplacian
Laplacian computed
Start training
1 iterations completed
Current distance 177.12106399120594
2 iterations completed
Current distance 45.3814379202748
3 iterations completed
Current distance 0.4605846389122085
4 iterations completed
Current

In [9]:
# Five independent RBF runs on Acrobot; each entry is the list of evaluation
# episode lengths returned by run_off_lspi.
# The RBF branch of run_off_lspi reads kwargs['means'] and kwargs['gamma'],
# which are not set yet at this point of the notebook, so build them here for
# the current env without mutating the shared `kwargs`.
rbf_kwargs_acrobot = dict(kwargs,
                          means=build_rbf_centers(env, num_features),
                          gamma=kwargs.get('gamma', 1))

res_rbf_acrobot = []
for _ in range(5):
    # `covering` is only used by the PVF branch; the previous
    # `epsilon_covring` was undefined and raised NameError.
    res_rbf_acrobot.append(run_off_lspi(env, tol, horizon,
                                        n_episodes, 'RBF', covering=espilon_covering['acrobot'],
                                        **rbf_kwargs_acrobot))

77864 states encountered during the random walk
Start training
1 iterations completed
Current distance 37.41086352747375
2 iterations completed
Current distance 0.9826635640391354
3 iterations completed
Current distance 0.02053861204002026
69526 states encountered during the random walk
Start training
1 iterations completed
Current distance 37.41541669316846
2 iterations completed
Current distance 0.6918741733686892
3 iterations completed
Current distance 0.00855038658772186
84680 states encountered during the random walk
Start training
1 iterations completed
Current distance 37.39211235386282
2 iterations completed
Current distance 0.3523143814492172
3 iterations completed
Current distance 0.002179173168508964
82660 states encountered during the random walk
Start training
1 iterations completed
Current distance 37.40779669059466
2 iterations completed
Current distance 0.7386349896739763
3 iterations completed
Current distance 0.03971080428592329
65590 states encountered during the ran

In [14]:
import pickle
# Persist the Acrobot results where the analysis cells load them from
# ('Results/...'), and close the files deterministically.
os.makedirs('Results', exist_ok=True)
with open('Results/res_pvf_acrobot', 'wb') as fh:
    pickle.dump(res_pvf_acrobot, fh)
with open('Results/res_rbf_acrobot', 'wb') as fh:
    pickle.dump(res_rbf_acrobot, fh)

### b) CartPole

In [7]:
# Environment used by the CartPole experiments below
env = gym.make('CartPole-v0')
# Covering value used when subsampling CartPole states for the PVF graph
espilon_covering['cartpole'] = 0.08

# Sampling parameters (forwarded to collect_episodes through run_off_lspi)
horizon = 70
n_episodes = 700

# Custom reward (none provided by gym)
# Passed as `custom` to run_off_lspi; how it is applied is decided by
# collect_episodes, which is not shown in this notebook.
custom = -100

  result = entry_point.load(False)


In [23]:
# Five independent PVF runs on CartPole; each entry is the list of evaluation
# episode lengths returned by run_off_lspi.
# The list is initialised here so the cell works on a fresh kernel, and the
# run count matches the other experiments (5).
res_pvf_cartpole = []
for _ in range(5):
    res_pvf_cartpole.append(run_off_lspi(env, tol, 
                    horizon, n_episodes, 'PVF', custom=custom, covering=espilon_covering['cartpole'], **kwargs))

15841 states encountered during the random walk
Computing the Laplacian
Laplacian computed
Start training
1 iterations completed
Current distance 18021.698933237294
2 iterations completed
Current distance 15887.886142270498
3 iterations completed
Current distance 9679.367617612148
4 iterations completed
Current distance 8442.82583816962
5 iterations completed
Current distance 7856.926240115174
6 iterations completed
Current distance 12368.622615572887
7 iterations completed
Current distance 3946.5799666590074
8 iterations completed
Current distance 8562.895392718705
9 iterations completed
Current distance 7288.76729552844
10 iterations completed
Current distance 5252.955791016443
11 iterations completed
Current distance 411.220024994828
12 iterations completed
Current distance 63.674838320183035
13 iterations completed
Current distance 0.4142883118903049
14 iterations completed
Current distance 0.0
15409 states encountered during the random walk
Computing the Laplacian
Laplacian comput

In [28]:
# Persist the PVF CartPole results where the analysis cells load them from
# ('Results/...'), and close the file deterministically.
os.makedirs('Results', exist_ok=True)
with open('Results/res_pvf_cartpole', 'wb') as fh:
    pickle.dump(res_pvf_cartpole, fh)

In [8]:
# RBF parameters: centers built from the current env, plus the `gamma` value
# handed to RadialBasisFunction by run_off_lspi.
means = build_rbf_centers(env, num_features)
gamma = 1

# Expose both to run_off_lspi through the shared keyword arguments
kwargs.update(means=means, gamma=gamma)

In [18]:
# Five independent RBF runs on CartPole; each entry is the list of evaluation
# episode lengths returned by run_off_lspi (`covering` is unused for RBF).
res_rbf_cartpole = [
    run_off_lspi(env, tol, horizon, n_episodes, 'RBF',
                 custom=custom, covering=None, **kwargs)
    for _ in range(5)
]

15559 states encountered during the random walk
Start training
Training complete
15087 states encountered during the random walk
Start training
Training complete
15481 states encountered during the random walk
Start training
Training complete
15274 states encountered during the random walk
Start training
Training complete
15636 states encountered during the random walk
Start training
Training complete


In [40]:
# Persist the RBF CartPole results; the context manager closes the file
# deterministically instead of relying on garbage collection.
os.makedirs('Results', exist_ok=True)
with open('Results/res_rbf_cartpole', 'wb') as fh:
    pickle.dump(res_rbf_cartpole, fh)

### c) Mountain car

In [5]:
# Environment used by the Mountain car experiments below
env = gym.make('MountainCar-v0')
# Override private attributes of the env; presumably lifts gym's per-episode
# step/time caps so that `horizon` alone bounds an episode -- TODO confirm.
env._max_episode_steps = np.inf
env._max_episode_seconds = np.inf

# For subsampling
espilon_covering['mc'] = 0.01

# Sampling parameters (forwarded to collect_episodes through run_off_lspi)
horizon = 10000
n_episodes = 3

# Custom reward (none provided by gym)
# Passed as `custom` to run_off_lspi; how it is applied is decided by
# collect_episodes, which is not shown in this notebook.
custom = 100

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


In [51]:
# Five independent PVF runs on Mountain car; each entry is the list of
# evaluation episode lengths returned by run_off_lspi.
res_pvf_mc = [
    run_off_lspi(env, tol, horizon, n_episodes, 'PVF',
                 custom=custom, covering=espilon_covering['mc'], **kwargs)
    for _ in range(5)
]

NameError: name 'run_off_lspi' is not defined

In [None]:
# Five independent RBF runs on Mountain car; each entry is the list of
# evaluation episode lengths returned by run_off_lspi.
# kwargs['means'] (when set) was built from the CartPole env, so rebuild the
# centers for the current env without mutating the shared `kwargs`.
rbf_kwargs_mc = dict(kwargs,
                     means=build_rbf_centers(env, num_features),
                     gamma=kwargs.get('gamma', 1))

res_rbf_mc = []
for _ in range(5):
    # Append to the RBF list; the previous code appended to res_pvf_mc,
    # leaving res_rbf_mc empty and polluting the PVF results.
    res_rbf_mc.append(run_off_lspi(env, tol, 
                    horizon, n_episodes, 'RBF', custom=custom, covering=None, **rbf_kwargs_mc))

In [3]:
import pickle
import numpy as np

In [6]:
# Reload the saved results as arrays with one row per run.
# NOTE: pickle.load executes arbitrary code from the file; only load result
# files produced by this notebook.
# Context managers close each file deterministically.
with open('Results/res_pvf_cartpole', 'rb') as fh:
    res_pvf_cartpole = np.array(pickle.load(fh))
with open('Results/res_pvf_acrobot', 'rb') as fh:
    res_pvf_acrobot = np.array(pickle.load(fh))
with open('Results/res_rbf_cartpole', 'rb') as fh:
    res_rbf_cartpole = np.array(pickle.load(fh))
with open('Results/res_rbf_acrobot', 'rb') as fh:
    res_rbf_acrobot = np.array(pickle.load(fh))

In [7]:
# Average over axis 1 of each result array, giving one mean episode length
# per training run.
means_pvf_c, means_pvf_a, means_rbf_c, means_rbf_a = (
    np.mean(run_lengths, axis=1)
    for run_lengths in (res_pvf_cartpole, res_pvf_acrobot,
                        res_rbf_cartpole, res_rbf_acrobot)
)

In [13]:
# Report, per basis and environment, the mean and standard deviation (square
# root of the variance) of the per-run mean episode lengths.
print('PVF CartPole, mean perf = %s std = %s' % (np.mean(means_pvf_c), np.var(means_pvf_c)**0.5))
print('RBF CartPole, mean perf = %s std = %s' % (np.mean(means_rbf_c), np.var(means_rbf_c)**0.5))
print('')
print('PVF Acrobot, mean perf = %s std = %s' % (np.mean(means_pvf_a), np.var(means_pvf_a)**0.5))
print('RBF Acrobot, mean perf = %s std = %s' % (np.mean(means_rbf_a), np.var(means_rbf_a)**0.5))

PVF CartPole, mean perf = 160.85999999999999 std = 54.005910787616564
RBF CartPole, mean perf = 95.80666666666667 std = 9.736698048562914

PVF Acrobot, mean perf = 176.51333333333332 std = 82.30775203129072
RBF Acrobot, mean perf = 473.88 std = 41.31294443364909
