In [None]:
from copy import deepcopy

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import hsv_to_rgb

from naf.priority_buffer import PriorityBuffer

In [None]:
class HoleEnvironment:
    """Planar point-mass task: reach a goal without entering a circular hole.

    The state is the end-effector ("eef") position ``[eef_x, eef_y]`` and an
    action is a displacement ``[dx, dy]`` added to it.  A move that ends inside
    the hole or outside the rectangular workspace is rejected (the position is
    restored) and reported as ``LOSE``; any other move is rewarded according to
    the distance to the goal.
    """

    def __init__(self):
        # Distance to the goal below which a step is reported as WIN.
        self.max_dist = 0.01
        # Rectangular workspace bounds (inclusive).
        self.min_x = -0.10
        self.max_x = 0.10
        self.min_y = 0.10
        self.max_y = 0.30
        self.hole_radius = 0.03
        # Outcome codes returned as the 4th element of `step`.
        self.LOSE = 0
        self.NEUTRAL = 1
        self.WIN = 2
        self.reset()

    def reset(self):
        """Place hole and goal, then draw a random start position.

        The start lies at a uniformly random angle around the hole centre, at
        a distance drawn from [0.03, 0.08) via ``np.random``.
        """
        self.hole_x = 0.0
        self.hole_y = 0.2
        # The goal sits directly above the hole, 0.02 beyond its rim.
        self.goal_x = self.hole_x
        self.goal_y = self.hole_y + self.hole_radius + 0.02
        theta = 2 * np.pi * np.random.rand()
        d = 0.03 + 0.05 * np.random.rand()
        self.eef_x = self.hole_x + d * np.cos(theta)
        self.eef_y = self.hole_y + d * np.sin(theta)

    @property
    def x_size(self):
        """Dimension of the state vector."""
        return 2

    @property
    def u_size(self):
        """Dimension of the action vector."""
        return 2

    def get_state(self):
        """Return the current state as ``np.array([eef_x, eef_y])``."""
        return np.array([
            self.eef_x,
            self.eef_y
        ])

    def step(self, u):
        """Apply displacement ``u`` and return ``(state, reward, done, outcome)``.

        Args:
            u: indexable with at least two elements; ``u[0]`` / ``u[1]`` are
                added to ``eef_x`` / ``eef_y``.

        Returns:
            state: ``get_state()`` after the move (the pre-move position when
                the move was rejected).
            reward: ``-1`` for a rejected move, otherwise
                ``10 * exp(-200 * distance_to_goal)``.
            done: ``True`` only when the outcome is ``LOSE``.
            outcome: one of ``LOSE``, ``NEUTRAL``, ``WIN``.
        """
        goal = np.array([self.goal_x, self.goal_y])
        hole = np.array([self.hole_x, self.hole_y])
        eef_before = np.array([self.eef_x, self.eef_y])
        self.eef_x += u[0]
        self.eef_y += u[1]
        eef_after = np.array([self.eef_x, self.eef_y])

        in_hole = np.linalg.norm(eef_after - hole) < self.hole_radius
        in_bounds = (self.min_x <= self.eef_x <= self.max_x
                     and self.min_y <= self.eef_y <= self.max_y)
        if in_hole or not in_bounds:
            reward = -1
            state = self.LOSE
            # Illegal move: roll the position back to where it was.
            self.eef_x, self.eef_y = eef_before
        else:
            dist_after = np.linalg.norm(eef_after - goal)
            reward = 10 * np.exp(-200 * dist_after)
            state = self.WIN if dist_after < self.max_dist else self.NEUTRAL
        return self.get_state(), reward, state == self.LOSE, state

    def plot(self, ax=None, eef_color='b'):
        """Draw the hole, the goal and the current end-effector position.

        Args:
            ax: matplotlib axes to draw on; a new figure is created when None.
            eef_color: colour of the '+' marker for the end effector.
        """
        # Local import keeps the class usable without a module-level pyplot import.
        import matplotlib.pyplot as plt
        if ax is None:
            fig, ax = plt.subplots()
        ax.add_artist(plt.Circle(
            (self.hole_x, self.hole_y),
            self.hole_radius,
            color='k',
            alpha=0.5
        ))
        ax.plot(self.eef_x, self.eef_y, '+', color=eef_color, markersize=10)
        # Goal marker: red dot with a black outline (two stacked markers).
        ax.plot(self.goal_x, self.goal_y, 'ko', markersize=10)
        ax.plot(self.goal_x, self.goal_y, 'ro', markersize=8)
        # Set the aspect on the axes we were given, not on pyplot's current axes.
        ax.axis('equal')
        ax.set_xlim((self.min_x, self.max_x))
        ax.set_ylim((self.min_y, self.max_y))


# Shared environment instance; later cells step, reset and plot this object.
env = HoleEnvironment()

In [None]:
# Smoke test: take one small step along +x, print the reward, and draw the scene.
_, r, _, _ = env.step([0.001, -0.000])
print(r)
env.plot()
plt.show()

In [None]:
import keras
import theano
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import Dense, BatchNormalization, Input, merge, Merge


class Critic:
    """Action-value network Q(x, u) built from a reusable stack of layers.

    The layer objects are kept in ``self.layers`` so that ``build`` can apply
    the same weights to other input tensors (the actor uses this to form
    Q(x, pi(x))).  ``self.q`` is the compiled Keras model taking ``[x, u]``.
    """

    def build(self, x, u):
        """Return the Q tensor for state tensor ``x`` and action tensor ``u``.

        The two inputs are concatenated and passed through ``self.layers`` in
        order; every call reuses the same layer objects.
        """
        y = merge([x, u], mode='concat', name='x_u_merge')
        for layer in self.layers:
            y = layer(y)
        return y

    def __init__(self, x_size, u_size):
        """Create the layers and compile ``self.q`` (MSE loss, Adam 1e-3).

        Args:
            x_size: width of the state input.
            u_size: width of the action input.
        """
        x = Input(shape=(x_size,))
        u = Input(shape=(u_size,))
        # The layers are called on tensors in `build`, so their input width is
        # inferred there; no `input_dim` is given to the Dense layers.
        self.layers = [
            BatchNormalization(name='critic_bn1', input_shape=(x_size + u_size,)),
            Dense(output_dim=400, W_regularizer=l2(), activation='elu', name='critic_fc1'),
            BatchNormalization(name='critic_bn2'),
            Dense(output_dim=300, W_regularizer=l2(), activation='elu', name='critic_fc2'),
            Dense(output_dim=1, name='critic_q'),
        ]
        self.q = Model(input=[x, u], output=self.build(x, u))
        self.q.compile(loss='mse', optimizer=Adam(1e-3))

    def soft_update(self, other, tau=0.001):
        """Move this network's weights a fraction ``tau`` towards ``other``'s.

        Each weight becomes ``(1 - tau) * own + tau * other``; weights are
        paired by their position in ``q.weights``.
        """
        for a, b in zip(self.q.weights, other.q.weights):
            a.set_value((1 - tau) * a.get_value() + tau * b.get_value())


def loss(y_true, y_pred):
    """Loss that ignores the target and returns the negated prediction.

    Minimising this value maximises ``y_pred``; ``y_true`` is accepted only to
    match the two-argument loss signature and is never read.
    """
    negated_prediction = -y_pred
    return negated_prediction


class Actor:
    """Deterministic policy network plus a model used to train it.

    ``self.u`` maps a state to an action (tanh output).  ``self.q`` maps a
    state to the critic's value of the policy's own action, Q(x, u(x)), by
    passing ``self.u``'s output through the critic's layers; it is compiled
    with the module-level ``loss`` and Adam(1e-4).
    """

    def __init__(self, x_size, u_size, critic):
        """Build the policy and the state -> Q(x, u(x)) training model.

        Args:
            x_size: width of the state input.
            u_size: width of the action output.
            critic: object whose ``build(x, u)`` returns a Q tensor.
        """
        state_in = Input(shape=(x_size,), name='actor_x')
        policy_layers = [
            BatchNormalization(input_shape=(x_size,), name='actor_bn1'),
            Dense(output_dim=400, name='actor_fc1', activation='elu'),
            BatchNormalization(name='actor_bn2'),
            Dense(output_dim=300, name='actor_fc2', activation='elu'),
            Dense(output_dim=u_size, name='actor_fc3', activation='tanh'),
        ]
        self.u = Sequential(policy_layers, name='actor_u_model')
        q_of_policy = critic.build(state_in, self.u(state_in))
        self.q = Model(input=state_in, output=q_of_policy, name='actor_q_part_model')

        # Mark the critic's layers non-trainable while compiling this model,
        # then restore the flag afterwards.
        # NOTE(review): this relies on Keras fixing the trainable set at
        # compile time -- confirm for the installed Keras version.
        frozen = []
        for layer in self.q.layers:
            currently_trainable = hasattr(layer, 'trainable') and layer.trainable
            if currently_trainable and layer.name.startswith('critic'):
                frozen.append(layer)
                layer.trainable = False
        self.q.compile(loss=loss, optimizer=Adam(1e-4))
        for layer in frozen:
            layer.trainable = True  # reset

    def soft_update(self, other, tau=0.001):
        """Blend the policy weights: ``own <- (1 - tau) * own + tau * other``."""
        for own_w, other_w in zip(self.u.weights, other.u.weights):
            blended = (1 - tau) * own_w.get_value() + tau * other_w.get_value()
            own_w.set_value(blended)
        
# Online critic plus a second, separately constructed critic used as its target network.
critic = Critic(env.x_size, env.u_size)
critic_target = Critic(env.x_size, env.u_size)
# Both actors are wired through the online critic's layers (third argument).
actor = Actor(env.x_size, env.u_size, critic)
actor_target = Actor(env.x_size, env.u_size, critic)
# Start each target network from the same weights as its online counterpart.
critic_target.q.set_weights(critic.q.get_weights())
actor_target.u.set_weights(actor.u.get_weights())

In [None]:
def sample_batch(buffer, env, size=32):
    """Draw ``size`` transitions from ``buffer`` and stack them into arrays.

    Args:
        buffer: object whose ``sample()`` returns a node with a ``data`` dict
            holding the keys ``'x'``, ``'xp'``, ``'u'``, ``'r'`` and ``'s'``.
        env: provides ``x_size`` and ``u_size`` for the array widths.
        size: number of transitions to draw.

    Returns:
        ``(X, U, Xp, R, S, exp_nodes)`` where ``X``/``Xp`` are
        ``(size, x_size)``, ``U`` is ``(size, u_size)``, ``R``/``S`` are
        ``(size, 1)`` and ``exp_nodes`` is the list of sampled nodes in row
        order.
    """
    X = np.zeros((size, env.x_size))
    Xp = np.zeros((size, env.x_size))
    U = np.zeros((size, env.u_size))
    R = np.zeros((size, 1))
    S = np.zeros((size, 1))
    exp_nodes = []
    for i in range(size):
        # Sample from the buffer that was passed in, not from a notebook global.
        sample = buffer.sample()
        exp_nodes.append(sample)
        # Stored vectors may carry a leading batch axis (e.g. shape (1, n)).
        X[i, :] = np.ravel(sample.data['x'])
        Xp[i, :] = np.ravel(sample.data['xp'])
        U[i, :] = np.ravel(sample.data['u'])
        R[i, :] = sample.data['r']
        S[i, :] = sample.data['s']
    return X, U, Xp, R, S, exp_nodes

In [None]:
def plot_v(actor, critic, env):
    """Draw a heat map of ``critic.q`` evaluated at the actor's own action.

    The workspace is sampled on a 64 x 64 grid; for each grid state ``x`` the
    plotted value is ``critic.q.predict([x, actor.u.predict(x)])``.  Drawing
    goes to the current pyplot axes.  The environment is deep-copied, so the
    caller's ``env`` is not modified.
    """
    env = deepcopy(env)
    res = 64
    xs = np.linspace(env.min_x, env.max_x, res)
    ys = np.linspace(env.min_y, env.max_y, res)

    # Collect every grid state first so both networks are evaluated once on
    # the whole batch instead of once per grid point.
    states = np.zeros((res * res, env.x_size))
    for i, x in enumerate(xs):
        for j, y in enumerate(ys):
            env.eef_x = x
            env.eef_y = y
            states[i * res + j, :] = env.get_state()
    values = critic.q.predict([states, actor.u.predict(states)])

    # `values` is ordered (x index, y index).  imshow wants rows = y with the
    # largest y in row 0, hence transpose then flip the rows.
    zss = values.reshape(res, res).T[::-1, :]

    plt.imshow(zss, cmap='inferno', interpolation='gaussian', aspect='auto',
               extent=[env.min_x, env.max_x, env.min_y, env.max_y])
    # Goal marker: red dot with a black outline (two stacked markers).
    plt.plot(env.goal_x, env.goal_y, 'ko', alpha=1.0, markersize=8)
    plt.plot(env.goal_x, env.goal_y, 'ro', alpha=1.0, markersize=6)

    plt.xticks(np.linspace(env.min_x, env.max_x, 5))
    plt.yticks(np.linspace(env.min_y, env.max_y, 5))
    # Raw string: the backslash belongs to the mathtext, not to a Python escape.
    plt.colorbar().set_label(r'$V(\mathbf{x})$')
    
# Value map of the current (online) actor/critic pair.
env.reset()
plot_v(actor, critic, env)
plt.show()

In [None]:
def plot_pi(nn, env, eef=None):
    """Draw the policy ``nn.u`` as an arrow field over the workspace.

    The workspace is sampled on a 25 x 25 grid and, at each grid state, an
    arrow of 0.01 times the predicted action is drawn on the current pyplot
    axes.  The environment is deep-copied, so the caller's ``env`` is not
    modified.

    Args:
        nn: object whose ``u.predict(states)`` returns one 2-element action
            per row.
        env: environment providing the bounds, hole and goal positions.
        eef: unused; kept for backward compatibility with existing callers.
    """
    env = deepcopy(env)
    res = 25

    # Gather all grid states so the policy is evaluated in a single batch.
    points = []
    states = []
    for x in np.linspace(env.min_x, env.max_x, res):
        for y in np.linspace(env.min_y, env.max_y, res):
            env.eef_x = x
            env.eef_y = y
            points.append((x, y))
            states.append(env.get_state())
    actions = nn.u.predict(np.array(states).reshape(len(states), env.x_size))

    for (x, y), (dx, dy) in zip(points, actions):
        plt.arrow(x, y, 0.01 * dx, 0.01 * dy, head_width=0.002)
    plt.plot(env.hole_x, env.hole_y, 'ko', alpha=0.5, markersize=68)
    # Goal marker: red dot with a black outline (two stacked markers).
    plt.plot(env.goal_x, env.goal_y, 'ko', markersize=8)
    plt.plot(env.goal_x, env.goal_y, 'ro', markersize=6)

    # Raw string: the backslashes belong to the mathtext, not to Python escapes.
    plt.title(r'$\mathbf{\pi(x)}$')
    plt.xlim(env.min_x, env.max_x)
    plt.ylim(env.min_y, env.max_y)
    
# Arrow field of the current (online) actor's policy.
env.reset()
plot_pi(actor, env)
plt.show()

In [None]:
# Replay memory used by the training cell below.
# NOTE(review): 2 ** 20 is presumably the capacity in transitions -- confirm
# against naf.priority_buffer.PriorityBuffer.
priority_buffer = PriorityBuffer(2 ** 20)

In [None]:
# Training loop (runs until interrupted).
#
# The iteration counter `i` drives the exploration schedule.  Keep the value
# left in the kernel by a previous, interrupted run of this cell so training
# can be resumed; on a fresh kernel start from 0.  Assign `i` by hand before
# running (e.g. `i = 200000`) to skip ahead in the schedule.
if 'i' not in globals():
    i = 0
last_reset = i
batch_size = 64
while True:
    i += 1
    x = env.get_state().reshape(1, env.x_size)
    # Exploration weight: decreases linearly with `i`, floored at 0.1.
    epsilon = max(0.1, 1 - 1e-6 * i)
    # Blend the policy action with uniform noise in [-1, 1).
    u = (1 - epsilon) * actor.u.predict(x) + epsilon * 2 * (np.random.rand(1, env.u_size) - 0.5)
    # The networks act in normalised units; scale to an environment displacement.
    xp, r, done, state = env.step(u.flatten() * env.max_dist)
    # New transitions get a fixed initial value of 10.0 on their buffer node.
    priority_buffer.add({
        'x': x,
        'u': u.flatten(),
        'r': r,
        'xp': xp.flatten(),
        's': state
    }).set_value(10.0)
    X, U, Xp, R, S, exp_nodes = sample_batch(priority_buffer, env, size=batch_size)
    # Bootstrapped target from the target networks, discount 0.99.
    # NOTE(review): S (the step outcome) is sampled but not used, so the
    # target also bootstraps through LOSE transitions -- confirm intended.
    target_q = R + 0.99 * critic_target.q.predict([Xp, actor_target.u.predict(Xp)])
    # Predictions before this update; used below for the per-sample error.
    q = critic.q.predict([X, U])
    critic.q.fit([X, U], target_q, batch_size=batch_size, nb_epoch=1, verbose=False)
    # The actor's loss ignores its targets, so zeros are passed as placeholders.
    actor.q.fit(X, np.zeros(target_q.shape), batch_size=batch_size, nb_epoch=1, verbose=False)
    actor_target.soft_update(actor)
    critic_target.soft_update(critic)
    # Store each sampled node's absolute error (plus a small positive offset).
    td_errors = abs(target_q - q).flatten() + 1e-9
    for node, td_error in zip(exp_nodes, td_errors):
        node.set_value(td_error)
    # Start a new episode after a LOSE step or after more than 64 steps.
    if done or i - last_reset > 64:
        last_reset = i
        env.reset()
    if i % 1024 == 0:
        print(priority_buffer)
        print('epsilon:', epsilon)
        print('iteration:', i)
        # Roll out the target policy for up to 16 steps and draw each position
        # with a hue that advances along the rollout.
        # NOTE(review): this rollout uses (and resets) the training `env`.
        fig, ax = plt.subplots()
        env.reset()
        env.plot(ax=ax, eef_color=hsv_to_rgb((1, 1, 1)))
        for j in range(16):
            u = actor_target.u.predict(env.get_state().reshape(1, env.x_size)) * env.max_dist
            _, r, done, _ = env.step(u.flatten())
            env.plot(ax=ax, eef_color=hsv_to_rgb((j / 16, 1, 1)))
            if done:
                break
        plt.show()
        # Policy and value of the online networks.
        plt.figure(figsize=(12, 4))
        plt.subplot(121)
        plot_pi(actor, env)
        plt.subplot(122)
        plot_v(actor, critic, env)
        plt.show()
        # Same pair of plots for the target networks.
        print('targets:')
        plt.figure(figsize=(12, 4))
        plt.subplot(121)
        plot_pi(actor_target, env)
        plt.subplot(122)
        plot_v(actor_target, critic_target, env)
        plt.show()

In [None]:
# Side-by-side policy (left) and value (right) plots of the target networks,
# written to a PDF in the working directory before being shown.
plt.figure(figsize=(12, 4))
plt.subplot(121)
plot_pi(actor_target, env)
plt.subplot(122)
plot_v(actor_target, critic_target, env)
plt.savefig('ddpg-hole-pi-and-v.pdf')
plt.show()

In [None]:
# Overlay 512 rollouts of the online actor (no exploration noise), each from a
# fresh random start and lasting at most 512 steps or until a LOSE step.
fig, ax = plt.subplots()
for it in range(512):
    env.reset()
    xs, ys = [env.eef_x], [env.eef_y]
    for j in range(512):
        u = actor.u.predict(env.get_state().reshape(1, env.x_size))
        _, r, done, _ = env.step(u.flatten() * env.max_dist)
        xs.append(env.eef_x)
        ys.append(env.eef_y)
        if done:
            break
    plt.plot(xs, ys, 'k', alpha=0.1, linewidth=0.5)
# -0.2 lies outside the x-limits that env.plot applies, presumably so the
# end-effector marker does not appear in the saved figure.
env.eef_x = -0.2
env.plot(ax=ax, eef_color='k')
plt.savefig('ddpg-hole-traces.pdf')
plt.show()

In [None]:
# NOTE(review): scratch arithmetic (evaluates to 99.5); its purpose is not
# evident from this notebook -- consider removing or explaining it.
0.5 + 0.99 * 100