In [1]:
from __future__ import print_function, division
import gym
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import FeatureUnion
# Shared seed: passed to np.random.seed() before each actor-critic run and used
# as random_state for the RBFSamplers built in FeatureMaker.
seed = 417

### Action-Value function based Actor-Critic ([description](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/pg.pdf#25))

In [2]:
class Agent:
    """Action-value actor-critic with a linear softmax actor and a linear critic.

    Both the actor (weights ``theta``) and the critic (weights ``w``) are linear
    in the same feature vector phi(s, a), obtained by concatenating the action
    index with the observation, standardising, and projecting through three
    RBFSamplers of different bandwidths.

    Parameters
    ----------
    env : Gym-style environment exposing ``action_space.n``,
        ``action_space.sample()`` and ``observation_space.sample()``.
    alpha : float
        Actor (policy) step size.
    beta : float
        Critic (action-value) step size.
    gamma : float
        Discount factor used in the critic's TD error.
    """

    def __init__(self, env, alpha=0.01, beta=0.01, gamma=0.95):
        self.nA = env.action_space.n
        n_components = 100
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        # Random (action, observation) rows used only to fit the scaler and the
        # random Fourier feature maps.
        observation_examples = np.array([np.concatenate(([env.action_space.sample()],
                                                         env.observation_space.sample())) for x in range(100000)])

        self.scaler = StandardScaler()
        self.feature_map = FeatureUnion([("rbf1", RBFSampler(n_components=n_components, gamma=1., random_state=1)),
                                         ("rbf01", RBFSampler(n_components=n_components, gamma=0.1, random_state=1)),
                                         ("rbf10", RBFSampler(n_components=n_components, gamma=10, random_state=1))])

        self.feature_map.fit(self.scaler.fit_transform(observation_examples))

        self.theta = np.random.rand(3 * n_components)
        self.w = np.random.rand(3 * n_components)

    def compute_features(self, s, a):
        """Return phi(s, a) as an array of shape (1, n_features).

        The fitted scaler and feature maps are applied as-is. The global NumPy
        RNG is deliberately NOT reseeded here: doing so on every call made the
        subsequent ``np.random.choice`` in ``act`` draw the same number each
        time, which turned action sampling into a deterministic choice.
        """
        return self.feature_map.transform(self.scaler.transform(np.hstack((a, s))[np.newaxis, :]))

    def _action_features(self, s):
        """Stack phi(s, a) for every action into an (nA, n_features) matrix."""
        return np.vstack([self.compute_features(s, a) for a in range(self.nA)])

    def _softmax(self, logits):
        """Numerically stable softmax (shifting by the max leaves the result unchanged)."""
        shifted = np.exp(logits - np.max(logits))
        return shifted / np.sum(shifted)

    def act(self, s):
        """Sample an action from the softmax policy at state ``s``.

        Caches the feature matrix and the action probabilities of the most
        recently visited state in ``self.Phi_s`` and ``self.probs``.
        """
        self.Phi_s = self._action_features(s)
        self.probs = self._softmax(np.dot(self.Phi_s, self.theta))
        return np.random.choice(self.nA, p=self.probs)

    def update(self, s, a, r, sp, ap, done=False):
        """Apply one actor and one critic step for the transition (s, a, r, sp, ap).

        Parameters
        ----------
        s, a : state and action the transition started from.
        r : reward received.
        sp, ap : next state and the action already chosen for it.
        done : bool, optional
            When true, ``sp`` is treated as terminal and its value is taken as 0.
        """
        # Recompute features and policy for `s` itself. The caches written by
        # act() cannot be used: callers invoke act(sp) before update(), so the
        # caches describe the *next* state by the time we get here.
        Phi = self._action_features(s)
        probs = self._softmax(np.dot(Phi, self.theta))
        phi = Phi[a]

        Q_old = np.inner(phi, self.w)
        if done:
            Q_new = 0.0
        else:
            Q_new = np.inner(self.compute_features(sp, ap).ravel(), self.w)

        # Actor: score function of a linear softmax policy, phi(s,a) - E_pi[phi(s,.)],
        # weighted by the critic's estimate of Q(s, a).
        self.theta += self.alpha * (phi - np.dot(probs, Phi)) * Q_old
        # Critic: semi-gradient TD(0) step on Q.
        delta = r + self.gamma * Q_new - Q_old
        self.w += self.beta * delta * phi

In [3]:
# Episode-termination flag; the training loop resets it at the start of every episode.
done = False
# When true, the training loop calls env.render() on every step.
render = True 

In [4]:
# Train the action-value actor-critic agent on MountainCar for 20 episodes,
# capping each episode at 500 steps.
env_name = 'MountainCar-v0'
env = gym.make(env_name)
np.random.seed(seed)
agent = Agent(env, 0.001, 0.1)

for e in range(20):
    s = env.reset()
    t = 0
    done = False
    a = agent.act(s)
    
    while not done and t < 500:
        if render: env.render()
        
        sp, r, done, _ = env.step(a)
        # Choose the next action before updating: the update needs (s, a, r, sp, ap).
        ap = agent.act(sp)
        
        agent.update(s, a, r, sp, ap)
        
        # Advance BOTH the state and the action. Previously only `s` moved on,
        # so the action sampled before the loop was replayed for the whole
        # episode and `ap` was never executed.
        s, a = sp, ap
        t += 1
    
    print('episode {} finished in {} steps'.format(e, t))

INFO:gym.envs.registration:Making new env: MountainCar-v0
[2017-01-10 11:53:05,132] Making new env: MountainCar-v0


episode 0 finished in 500 steps
episode 1 finished in 500 steps
episode 2 finished in 500 steps
episode 3 finished in 500 steps
episode 4 finished in 500 steps
episode 5 finished in 500 steps
episode 6 finished in 500 steps
episode 7 finished in 500 steps
episode 8 finished in 500 steps
episode 9 finished in 500 steps
episode 10 finished in 500 steps
episode 11 finished in 500 steps
episode 12 finished in 500 steps
episode 13 finished in 500 steps
episode 14 finished in 500 steps
episode 15 finished in 500 steps
episode 16 finished in 500 steps
episode 17 finished in 500 steps
episode 18 finished in 500 steps
episode 19 finished in 500 steps


### Value function based Actor-Critic (can be found in book on p.294)

In [5]:
# Discount factor used in the TD error of the value-function actor-critic loop.
gamma = 0.95
# Episode-termination flag; the training loop resets it at the start of every episode.
done = False
# When true, the training loop calls env.render() on every step.
render = True 
# Number of random Fourier components per RBFSampler; passed to FeatureMaker.
n_components = 100

In [6]:
class FeatureMaker:
    """Maps raw observations to a concatenation of three RBFSampler projections.

    On construction, 100000 observations are sampled from
    ``env.observation_space`` and used to fit a StandardScaler followed by a
    FeatureUnion of three RBFSamplers (gamma = 1, 0.1 and 10), each with
    ``n_components`` outputs and ``random_state`` taken from the module-level
    ``seed``.
    """

    def __init__(self, env, n_components=100):
        samples = np.array([env.observation_space.sample() for _ in range(100000)])

        self.scaler = StandardScaler()
        # One sampler per kernel bandwidth; the labels are the FeatureUnion step names.
        rbf_bank = [(label, RBFSampler(n_components=n_components, gamma=width, random_state=seed))
                    for label, width in (("rbf1", 1.), ("rbf01", 0.1), ("rbf10", 10))]
        self.feature_map = FeatureUnion(rbf_bank)

        standardized = self.scaler.fit_transform(samples)
        self.feature_map.fit(standardized)

    def compute_features(self, s):
        """Return the 1-D feature vector for a single observation ``s`` (a NumPy array)."""
        # The scaler and feature map operate on 2-D batches, so wrap `s` as a
        # one-row batch and unwrap the single result row afterwards.
        batch = s[np.newaxis, :]
        projected = self.feature_map.transform(self.scaler.transform(batch))
        return projected[0]
    
    
# Create the environment and fit the shared feature maker on it.
env_name = 'MountainCar-v0'
env = gym.make(env_name)
# Module-level feature maker; PolicyApproximator and ValueApproximator read this global.
fm = FeatureMaker(env, n_components=n_components)

INFO:gym.envs.registration:Making new env: MountainCar-v0
[2017-01-10 11:53:38,439] Making new env: MountainCar-v0


In [7]:
class PolicyApproximator:
    """Linear softmax policy over the state features produced by the global ``fm``.

    Each action has its own weight row: logit(a | s) = theta[a] . phi(s).

    The previous parameterisation appended the action index to a copy of the
    same state features for every action. Those shared state terms add the same
    constant to every logit and cancel in the softmax, so the policy could not
    depend on the state at all. Per-action weight rows remove that degeneracy.

    Parameters
    ----------
    env : Gym-style environment exposing ``action_space.n``.
    n_components : int
        Components per RBFSampler; the state feature length is taken to be
        ``3 * n_components`` (three samplers in the feature maker).
    alpha : float
        Policy step size.
    """

    def __init__(self, env, n_components=100, alpha=0.01):
        self.alpha = alpha
        self.nA = env.action_space.n
        # Shape (nA, n_state_features): one weight vector per action.
        self.theta = np.random.rand(self.nA, 3*n_components)

    def act(self, s):
        """Sample an action for state ``s``.

        Caches ``self.phi_s`` (state features) and ``self.probs`` (action
        probabilities) for the following ``update`` call.
        """
        self.phi_s = fm.compute_features(s)
        logits = np.dot(self.theta, self.phi_s)
        # Shift by the max before exponentiating so large logits cannot overflow.
        probs = np.exp(logits - np.max(logits))
        self.probs = probs / np.sum(probs)
        return np.random.choice(self.nA, p=self.probs)

    def update(self, delta, a):
        """Policy-gradient step for the action ``a`` taken at the last ``act`` call.

        ``delta`` is the scalar weight on the score function (the TD error in
        the accompanying training loop).
        """
        # d/d theta[b] log pi(a|s) = (1[a == b] - pi(b|s)) * phi(s)
        grad_log_pi = -np.outer(self.probs, self.phi_s)
        grad_log_pi[a] += self.phi_s
        self.theta += self.alpha * delta * grad_log_pi
        
    
        
class ValueApproximator:
    """Linear state-value estimate w . phi(s), with phi supplied by the global ``fm``."""

    def __init__(self, env, n_components=100, beta=0.01):
        # `env` is accepted but not used by this class.
        n_features = 3*n_components
        self.beta = beta
        self.w = np.random.rand(n_features)

    def predict(self, s):
        """Return the current value estimate for state ``s``."""
        features = fm.compute_features(s)
        return np.inner(self.w, features)

    def update(self, delta, s):
        """Move the weights along the features of ``s``, scaled by ``beta * delta``."""
        features = fm.compute_features(s)
        step_size = self.beta * delta
        self.w += step_size * features

In [8]:
# Train the state-value actor-critic for 20 episodes, capping each at 500 steps.
np.random.seed(seed)
policy = PolicyApproximator(env, alpha=0.001)
value = ValueApproximator(env, beta=0.1)

for e in range(20):
    s = env.reset()
    t = 0
    done = False
    
    while not done and t < 500:
        if render: env.render()
        
        a = policy.act(s)
        sn, r, done, info = env.step(a)
        
        # A genuinely terminal state has value 0, so the TD target must not
        # bootstrap from it. An episode cut short by Gym's TimeLimit wrapper is
        # not terminal; when that wrapper reports truncation, keep bootstrapping.
        terminal = done and not info.get('TimeLimit.truncated', False)
        
        V_old = value.predict(s)
        V_new = 0.0 if terminal else value.predict(sn)
        delta = r + gamma * V_new - V_old
        
        # The same TD error drives both the critic and the actor.
        value.update(delta, s)
        policy.update(delta, a)
        
        s = sn
        t += 1
    
    print('episode {} finished in {} steps'.format(e, t))

episode 0 finished in 500 steps
episode 1 finished in 500 steps
episode 2 finished in 500 steps
episode 3 finished in 500 steps
episode 4 finished in 500 steps
episode 5 finished in 500 steps
episode 6 finished in 500 steps
episode 7 finished in 500 steps
episode 8 finished in 500 steps
episode 9 finished in 500 steps
episode 10 finished in 500 steps
episode 11 finished in 500 steps
episode 12 finished in 500 steps
episode 13 finished in 500 steps
episode 14 finished in 500 steps
episode 15 finished in 500 steps
episode 16 finished in 500 steps
episode 17 finished in 500 steps
episode 18 finished in 500 steps
episode 19 finished in 500 steps


### Monte-Carlo policy gradient (bad idea since estimate of Q is always the same)

In [3]:
done = False
discount = 1.0
render = True

In [9]:
class Agent:
    """Monte-Carlo policy-gradient (REINFORCE-style) agent with a linear softmax policy.

    Features phi(s, a) are built by concatenating the action index with the
    observation, standardising, and projecting through three RBFSamplers.

    Parameters
    ----------
    env : Gym-style environment exposing ``action_space.n``,
        ``action_space.sample()`` and ``observation_space.sample()``.
    learning_rate : float
        Step size for the policy weights ``theta``.
    """

    def __init__(self, env, learning_rate):
        self.nA = env.action_space.n
        n_components = 100
        self.lr = learning_rate
        # One (nA, n_features) matrix per act() call since the last update().
        self.feature_memory = []
        # Random (action, observation) rows used only to fit the scaler and feature maps.
        observation_examples = np.array([np.concatenate(([env.action_space.sample()],
                                                         env.observation_space.sample())) for x in range(100000)])

        self.scaler = StandardScaler()
        self.scaler.fit(observation_examples)

        self.feature_map = FeatureUnion([("rbf1", RBFSampler(n_components=n_components, gamma=1., random_state=1)),
                                         ("rbf01", RBFSampler(n_components=n_components, gamma=0.1, random_state=1)),
                                         ("rbf10", RBFSampler(n_components=n_components, gamma=10, random_state=1))])

        self.feature_map.fit(self.scaler.transform(observation_examples))

        self.theta = np.random.rand(3 * n_components)

    def _softmax(self, logits):
        """Numerically stable softmax (shifting by the max leaves the result unchanged)."""
        shifted = np.exp(logits - np.max(logits))
        return shifted / np.sum(shifted)

    def act(self, s):
        """Sample an action for state ``s`` and remember its feature matrix for ``update``."""
        # Row a of the input is (a, s...), so row a of Phi_s is phi(s, a).
        Phi_s = self.feature_map.transform(self.scaler.transform(
                np.hstack((np.arange(self.nA)[:, np.newaxis], np.repeat(s[np.newaxis, :], self.nA, axis=0)))))

        self.feature_memory.append(Phi_s)
        return np.random.choice(self.nA, p=self._softmax(np.dot(Phi_s, self.theta)))

    def update(self, memory, gamma=1.0):
        """Apply one policy-gradient step per recorded transition, then clear the feature memory.

        Parameters
        ----------
        memory : sequence of ``(s, a, r, sp)`` tuples, one per ``act`` call of
            the episode, in order. Only ``a`` and ``r`` are read.
        gamma : float, optional
            Discount factor for the returns (default 1.0, i.e. undiscounted).
        """
        rewards = [step[2] for step in memory]

        # Return-to-go G_t = r_t + gamma * G_{t+1}, accumulated backwards. The
        # previous expression indexed a `map` object (a TypeError on Python 3)
        # and yielded the reversed rewards rather than returns.
        returns = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            returns[t] = running

        for t in range(len(memory)):
            Phi_t = self.feature_memory[t]
            probs = self._softmax(np.dot(Phi_t, self.theta))
            # Softmax score function: phi(s,a) - E_pi[phi(s,.)]. The expectation
            # is policy-weighted; an unweighted mean is only right for a uniform policy.
            score = Phi_t[memory[t][1]] - np.dot(probs, Phi_t)
            self.theta += self.lr * score * returns[t]

        self.feature_memory = []

In [11]:
# Build the Monte-Carlo policy-gradient agent (learning rate 0.01) on the
# existing `env`. This cell does not re-seed NumPy's global RNG.
agent = Agent(env, 0.01)

In [12]:
# Run 20 episodes of at most 500 steps each; the agent learns once per
# episode from the full list of recorded transitions.
for e in range(20):
    memory = []
    done = False
    episode = 0  # counts steps taken within the current episode
    s = env.reset()
    
    while episode < 500 and not done:
        if render:
            env.render()
        a = agent.act(s)
        sp, r, done, _ = env.step(a)
        memory.append((s, a, r, sp))
        s, episode = sp, episode + 1
    
    print('episode {} finished in {} steps'.format(e, episode))
    
    agent.update(memory)

episode 0 finished in 500 steps
episode 1 finished in 500 steps
episode 2 finished in 500 steps
episode 3 finished in 500 steps
episode 4 finished in 500 steps
episode 5 finished in 500 steps
episode 6 finished in 500 steps
episode 7 finished in 500 steps
episode 8 finished in 500 steps
episode 9 finished in 500 steps
episode 10 finished in 500 steps
episode 11 finished in 500 steps
episode 12 finished in 500 steps
episode 13 finished in 500 steps
episode 14 finished in 500 steps
episode 15 finished in 500 steps
episode 16 finished in 500 steps
episode 17 finished in 500 steps
episode 18 finished in 500 steps
episode 19 finished in 500 steps
