In [None]:
from __future__ import print_function

import sys
import pickle
import threading
from operator import mul
from copy import deepcopy
from functools import reduce
if sys.version_info.major == 2:
    from Queue import Queue
else:
    from queue import Queue

import keras
import theano
import theano.tensor as T
from keras import backend as K
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import Dense, Merge, Input, Lambda, merge, Layer, BatchNormalization

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from naf.priority_buffer import PriorityBuffer

In [None]:
class DDPG:
    """Helpers shared by the actor and critic wrappers.

    Every method reads ``self.nn``; this class never assigns it, so a
    subclass (or the caller) has to provide the wrapped network.
    """

    def predict(self, *x):
        """Pack the positional inputs into a list and run ``self.nn.predict``."""
        inputs = list(x)
        return self.nn.predict(inputs)

    @property
    def trainable_weights(self):
        """Trainable weights of ``self.nn`` whose name does not start with 'bn'."""
        kept = []
        for weight in self.nn.trainable_weights:
            if weight.name.startswith('bn'):
                continue
            kept.append(weight)
        return kept

    def soft_update(self, weights, lr=0.001):
        """
        Blend ``weights`` into this network's own weights, in place.

        Each own weight becomes ``lr * new + (1 - lr) * old``.  Both sides
        must expose ``get_value()``; the own weights also ``set_value()``.
        The two sequences are paired positionally with ``zip``.
        """
        for target, source in zip(self.nn.weights, weights):
            blended = lr * source.get_value() + (1 - lr) * target.get_value()
            target.set_value(blended)
            

class Critic(DDPG):
    """Q-network wrapper: takes a state input and an action input, outputs one value."""

    def __init__(self, x_size, u_size, hidden_size=100):
        """
        Build the network, compile it, and compile the action-gradient function.

        x_size:      width of the state input
        u_size:      width of the action input
        hidden_size: units in each of the two hidden Dense layers
        """
        state_in = Input(shape=(x_size, ), name='x')
        action_in = Input(shape=(u_size, ), name='u')
        # Identity models so both inputs can be fed into the Merge layer.
        x_model = Model(input=state_in, output=state_in)
        u_model = Model(input=action_in, output=action_in)

        # NOTE(review): bn1 is given input_shape=(x_size,) although it follows
        # the concat of both inputs -- presumably ignored for a non-first
        # layer; confirm.
        layers = [
            Merge([x_model, u_model], mode='concat'),
            BatchNormalization(input_shape=(x_size,), name='bn1'),
            Dense(output_dim=hidden_size, activation='relu', name='fc1'),
            BatchNormalization(name='bn2'),
            Dense(output_dim=hidden_size, activation='relu', name='fc2'),
            BatchNormalization(name='bn3'),
            Dense(output_dim=(1), name='Q'),
        ]
        self.nn = Sequential(layers)

        self.nn.compile(loss='mse', optimizer=Adam(lr=0.0001))

        # Gradient of the first sample's Q output with respect to the action
        # input; the learning-phase flag is an explicit extra argument.
        first_q = self.nn.output[0, 0]
        function_inputs = self.nn.inputs + [K.learning_phase()]
        self._gradients = theano.function(
            function_inputs,
            T.grad(first_q, u_model.output),
            allow_input_downcast=True
        )

    def gradients(self, x, u):
        """Return dQ/du for a single (x, u) sample; ``x`` must have one row."""
        assert x.shape[0] == 1
        learning_phase = False
        return self._gradients(x, u, learning_phase)
        
    
class Actor(DDPG):
    """Deterministic policy wrapper: maps a state to a tanh-bounded, scaled action."""

    def __init__(self, x_size, u_size, mu_scaling, hidden_size=100):
        """
        Build the policy network and compile its parameter-gradient function.

        x_size:      width of the state input
        u_size:      number of action components
        mu_scaling:  factor applied to the tanh output, so every action
                     component lies in [-mu_scaling, mu_scaling]
        hidden_size: units in each of the two hidden Dense layers
        """
        # Kept so gradients() can shape its result for any action dimension.
        self.u_size = u_size
        self.nn = Sequential([
            BatchNormalization(input_shape=(x_size,), name='bn1'),
            Dense(input_shape=(x_size,), output_dim=hidden_size, activation='relu', name='fc1'),
            BatchNormalization(name='bn2'),
            Dense(output_dim=hidden_size, activation='relu', name='fc2'),
            BatchNormalization(name='bn3'),
            Dense(output_dim=u_size, name='mu_unscaled', activation='tanh'),
            Lambda(lambda x: mu_scaling * x, output_shape=(u_size, ), name='mu')
        ])

        # This optimizer won't be needed, learning from policy gradient
        self.nn.compile(loss='mse', optimizer='sgd')

        # One gradient per (action component, trainable weight), ordered by
        # action component first: [d mu_0 / d w_0, d mu_0 / d w_1, ...,
        # d mu_1 / d w_0, ...].  Only the first sample of the batch is used.
        params = self.trainable_weights
        gradients_list = []
        for i in range(u_size):
            gradients_list.extend(T.grad(self.nn.output[0, i], params))
        self._gradients = theano.function(
            self.nn.inputs + [K.learning_phase()],
            gradients_list,
            allow_input_downcast=True
        )

    def gradients(self, x):
        """
        Return d mu / d theta for a single state.

        ``x`` must have exactly one row.  The result has shape
        ``(u_size, n_params)``: row ``i`` is the gradient of action component
        ``i`` with respect to all trainable (non-'bn') weights, flattened and
        concatenated in ``self.trainable_weights`` order.
        """
        assert x.shape[0] == 1
        flat = np.concatenate([np.ravel(g) for g in self._gradients(x, False)])
        return flat.reshape((self.u_size, -1))

    def update_with_policy_gradient(self, policy_gradient, lr=0.0001):
        """
        Update from separate actor and critic gradients, which
        multiply to make the policy gradient

        Row 0 of ``policy_gradient`` holds the flattened gradient for every
        trainable weight, in ``self.trainable_weights`` order.  Each weight is
        moved by ``+lr * gradient`` (an ascent step).
        """
        offset = 0
        policy_gradient = policy_gradient.astype(np.float32)
        for param in self.trainable_weights:
            value = param.get_value()
            # .size is also correct for 0-d parameters, where reducing over an
            # empty shape tuple would raise.
            param_len = value.size
            step = policy_gradient[0, offset:offset + param_len].reshape(value.shape)
            param.set_value(value + lr * step)
            offset += param_len

In [None]:
def create_state_vector(eef_x, eef_y, circle_x, circle_y, goal_x, goal_y):
    """Pack the six coordinates, in argument order, into a (1, 6) float32 array."""
    row = [eef_x, eef_y, circle_x, circle_y, goal_x, goal_y]
    return np.array([row], dtype=np.float32)

In [None]:
# Outcome codes returned as the first element of Environment.interact().
WIN = 0
LOSE = 1
NEUTRAL = 2
# Longest end-effector step allowed per interaction; Environment.interact()
# rescales any longer requested step down to this length.
MAX_DIST = 0.01

class Circle:
    """A disc of fixed radius that is pushed away by a point entering it."""

    def __init__(self, x, y):
        """Place the disc centre at (x, y); the radius is fixed at 0.02."""
        self.radius = 0.02
        self.x = x
        self.y = y

    def interact(self, x, y):
        """
        Push the disc so that the point (x, y) ends up on its rim.

        Nothing happens when the point is farther from the centre than the
        radius.  Otherwise the centre is moved directly away from the point
        by the penetration depth.
        """
        center_distance = np.linalg.norm([self.y - y, self.x - x])
        if center_distance > self.radius:
            return
        # Direction from the disc centre towards the intruding point.
        theta = np.arctan2(y - self.y, x - self.x)
        overlap = self.radius - center_distance
        self.x -= overlap * np.cos(theta)
        self.y -= overlap * np.sin(theta)
        
class Environment:
    """
    Planar pushing task: an end effector (eef) pushes a circle towards a goal.

    The state is (eef_x, eef_y, circle_x, circle_y, goal_x, goal_y).
    """

    # Workspace bounds, used both for the LOSE test and for the plot limits.
    X_LIMITS = (-0.15, 0.15)
    Y_LIMITS = (0.10, 0.30)

    def __init__(self):
        self.reset()

    def reset(self):
        """Sample a new circle, goal and end-effector placement."""
        # Circle: random point on an ellipse around (0, 0.20).
        circle_theta = np.random.rand() * 2 * np.pi
        circle_x = 0.04 * np.cos(circle_theta)
        circle_y = 0.20 + 0.02 * np.sin(circle_theta)
        self.circle = Circle(circle_x, circle_y)
        # Goal: same ellipse, resampled until it is more than 0.04 away from
        # the circle.
        while True:
            goal_theta = np.random.rand() * 2 * np.pi
            self.goal_x = 0.04 * np.cos(goal_theta)
            self.goal_y = 0.20 + 0.02 * np.sin(goal_theta)
            if np.linalg.norm([self.goal_x - circle_x, self.goal_y - circle_y]) > 0.04:
                break
        # End effector: uniform in a box, resampled until it is at least 0.04
        # away from the circle centre.  (An earlier eef assignment that was
        # always overwritten by this loop has been removed.)
        while True:
            self.eef_x = -0.10 + np.random.rand() * 0.20
            self.eef_y = 0.12 + np.random.rand() * 0.17
            if np.linalg.norm([self.eef_x - circle_x, self.eef_y - circle_y]) >= 0.04:
                break

    def get_state(self):
        """Return the current state as a (1, 6) row via create_state_vector."""
        return create_state_vector(
            self.eef_x,
            self.eef_y,
            self.circle.x,
            self.circle.y,
            self.goal_x,
            self.goal_y,
        )

    def _in_workspace(self, x, y):
        """True when (x, y) lies inside the workspace bounds (inclusive)."""
        return (
            self.X_LIMITS[0] <= x <= self.X_LIMITS[1] and
            self.Y_LIMITS[0] <= y <= self.Y_LIMITS[1]
        )

    def interact(self, dx, dy):
        """
        Move the end effector by (dx, dy) and let it push the circle.

        Steps longer than MAX_DIST are rescaled to length MAX_DIST.

        Returns ``(state, reward, observation)``:
          * state is LOSE when the end effector or the circle leaves the
            workspace, WIN when the circle is within 0.005 of the goal,
            NEUTRAL otherwise;
          * reward is -4 on LOSE, otherwise
            ``exp(-200 * eef2circle**2) - 1 + 2 * exp(-200 * circle2goal**2) - 1``;
          * observation is ``get_state()`` after the move.
        """
        dist = np.linalg.norm([dx, dy])
        if dist > MAX_DIST:
            dx = MAX_DIST * dx / dist
            dy = MAX_DIST * dy / dist
        self.eef_x += dx
        self.eef_y += dy
        self.circle.interact(self.eef_x, self.eef_y)

        in_bounds = (
            self._in_workspace(self.eef_x, self.eef_y) and
            self._in_workspace(self.circle.x, self.circle.y)
        )
        if not in_bounds:
            return LOSE, -4, self.get_state()

        eef2circle = np.linalg.norm([self.eef_x - self.circle.x, self.eef_y - self.circle.y])
        circle2goal = np.linalg.norm([self.goal_x - self.circle.x, self.goal_y - self.circle.y])
        state = WIN if circle2goal < 0.005 else NEUTRAL
        reward = (
            np.exp(-200 * eef2circle ** 2) - 1 +
            2 * np.exp(-200 * circle2goal ** 2) - 1
        )
        return state, reward, self.get_state()

    def plot(self, ax=None):
        """
        Draw the goal (ring), the circle (translucent red) and the end
        effector (black '+') on ``ax``; a new figure is created when ``ax``
        is None.
        """
        if ax is None:
            _, ax = plt.subplots()
        # Everything is drawn on `ax` itself so a caller-supplied axes is
        # honoured even when it is not pyplot's current axes.
        ax.grid()
        # Goal ring: black disc with a slightly smaller white disc on top.
        ax.add_artist(plt.Circle(
            (self.goal_x, self.goal_y),
            self.circle.radius,
            color='k',
        ))
        ax.add_artist(plt.Circle(
            (self.goal_x, self.goal_y),
            self.circle.radius - 0.001,
            color='w',
        ))
        ax.add_artist(plt.Circle(
            (self.circle.x, self.circle.y),
            self.circle.radius,
            color='r',
            alpha=0.5
        ))
        ax.plot(self.eef_x, self.eef_y, 'k+', markersize=10)
        ax.set_xlim(self.X_LIMITS)
        ax.set_ylim(self.Y_LIMITS)
        
# Build the shared environment (its constructor already calls reset()) and
# draw the initial layout.
e = Environment()
e.plot()
plt.show()

In [None]:
# Width of both hidden layers in every network below.
hidden_size = 400
# State width is 2 + 2 + 2 (x/y of end effector, circle and goal); the action
# has 2 components and is scaled by MAX_DIST.
actor = Actor((2 + 2 + 2), 2, MAX_DIST, hidden_size=hidden_size)
actor_target = Actor((2 + 2 + 2), 2, MAX_DIST, hidden_size=hidden_size)
# The target network starts as an exact copy of the online network.
actor_target.nn.set_weights(actor.nn.get_weights())

critic = Critic((2 + 2 + 2), 2, hidden_size=hidden_size)
critic_target = Critic((2 + 2 + 2), 2, hidden_size=hidden_size)
# Same for the critic target.
critic_target.nn.set_weights(critic.nn.get_weights())

In [None]:
# Show the critic network's layer summary.
critic.nn.summary()

In [None]:
#def plot_v(nn, cube_x, cube_y, goal_x, goal_y):
#    xs = np.linspace(-0.15, 0.15, 12)
#    ys = np.linspace(0.10, 0.30, 12)
#    xss, yss = np.meshgrid(xs, ys)
#    zss = np.zeros(xss.shape)
#    for i, x in enumerate(xs):
#        for j, y in enumerate(ys):
#            zss[len(ys) - j - 1, i] = nn.v.predict(np.array([[x, y, cube_x, cube_y, goal_x, goal_y]]))[0, 0]
#    plt.imshow(zss, cmap='inferno', interpolation='gaussian', aspect='auto',
#               extent=[-0.15, 0.15, 0.10, 0.30])
#    plt.plot(cube_x, cube_y, 'ko', markersize=8)
#    plt.plot(cube_x, cube_y, 'ro', markersize=6)
#    plt.plot(goal_x, goal_y, 'ko', markersize=8)
#    plt.plot(goal_x, goal_y, 'wo', markersize=6)
#    plt.colorbar().set_label('$V(\mathbf{x})$')
    
def plot_q(nn, eef_x, eef_y, cube_x, cube_y, goal_x, goal_y):
    """
    Show Q(x, u) as a heat map over a 12 x 12 grid of actions in
    [-0.01, 0.01]^2 for one fixed state.

    ``nn`` must offer ``predict(state, action)`` returning an array whose
    ``[0, 0]`` entry is the value.  Draws on pyplot's current axes.
    """
    xs = np.linspace(-0.01, 0.01, 12)
    ys = np.linspace(-0.01, 0.01, 12)
    # The state is the same for every grid point, so build it once.
    state = np.array([[eef_x, eef_y, cube_x, cube_y, goal_x, goal_y]])
    zss = np.zeros((len(ys), len(xs)))
    for i, x in enumerate(xs):
        for j, y in enumerate(ys):
            # Rows are filled bottom-up so larger y ends up at the top of
            # the image.
            zss[len(ys) - j - 1, i] = nn.predict(state, np.array([[x, y]]))[0, 0]
    plt.imshow(zss, cmap='inferno', interpolation='gaussian', aspect='auto',
               extent=[-0.01, 0.01, -0.01, 0.01])
    # Mark the zero action.
    plt.plot(0.0, 0.0, 'ko', markersize=10)
    plt.plot(0.0, 0.0, 'w+', markersize=10)
    plt.xticks(np.linspace(-0.01, 0.01, 5))
    plt.yticks(np.linspace(0.01, -0.01, 5))
    # Raw string: '\m' is not a valid escape sequence in a normal literal.
    plt.colorbar().set_label(r'$Q(\mathbf{x, u})$')


def plot_pi(nn, cube_x, cube_y, goal_x, goal_y, eef=None):
    """
    Draw the policy as a field of arrows over a 20 x 20 grid of end-effector
    positions, for a fixed circle ("cube") and goal position.

    ``nn`` must offer ``predict(state)`` returning an array whose first row is
    (dx, dy).  Arrows are drawn at twice the predicted step.  ``eef``, when
    given, is an (x, y) pair marked on the plot.  Draws on pyplot's current
    axes and prints the action predicted at the last grid point.
    """
    for x in np.linspace(-0.15, 0.15, 20):
        for y in np.linspace(0.12, 0.30, 20):
            X = np.array([[x, y, cube_x, cube_y, goal_x, goal_y]])
            dx, dy = nn.predict(X)[0, :]
            plt.arrow(x, y, 2 * dx, 2 * dy)
    # `is not None` rather than truthiness, so a NumPy array works too.
    if eef is not None:
        plt.plot(eef[0], eef[1], 'ko', markersize=10)
        plt.plot(eef[0], eef[1], 'w+', markersize=10)
    plt.plot(cube_x, cube_y, 'ko', markersize=10)
    plt.plot(cube_x, cube_y, 'ro', markersize=8)
    plt.plot(goal_x, goal_y, 'ko', markersize=10)
    plt.plot(goal_x, goal_y, 'wo', markersize=8)
    # Raw string: '\m' is not a valid escape sequence in a normal literal.
    plt.title(r'$\mathbf{\mu(x)}$')
    plt.xlim(-0.15, 0.15)
    plt.ylim(0.12, 0.30)
    print('dx, dy:', dx, dy)
    
# Sanity check before training: policy field (left) and Q surface (right) of
# the freshly built networks for a newly sampled layout.
e.reset()
plt.figure(figsize=(13, 3))
plt.subplot(121)
plot_pi(actor, e.circle.x, e.circle.y, e.goal_x, e.goal_y, eef=(e.eef_x, e.eef_y))
plt.subplot(122)
plot_q(critic, e.eef_x, e.eef_y, e.circle.x, e.circle.y, e.goal_x, e.goal_y)
plt.show()

In [None]:
from datetime import datetime, timedelta

# --- Hyper-parameters -------------------------------------------------------
batch_size = 1024                        # env steps per outer iteration and minibatch size
replay_buffer = PriorityBuffer(2 ** 21)
gamma = 0.98                             # discount factor
epsilon = 0.1                            # added to |TD error| so no priority is zero

# --- Pre-allocated minibatch arrays ------------------------------------------
X = np.zeros((batch_size, 6))            # states
Xp = np.zeros((batch_size, 6))           # next states
U = np.zeros((batch_size, 2))            # actions that were executed
R = np.zeros((batch_size, 1))            # rewards
D = np.zeros((batch_size, 1))            # 1.0 where the transition ended in WIN/LOSE
gradient_len = actor.gradients(X[:1, :]).shape[1]
policy_gradient = np.zeros((1, gradient_len))

n_iterations = 2048.0
# Back-dated so the first diagnostic plots appear early.
latest_plot = datetime.now() - timedelta(seconds=30)
latest_trial_plot = datetime.now() - timedelta(seconds=60)
a = 0
for a in range(a, int(n_iterations)):
    print('iteration {} / {}'.format(a + 1, n_iterations))

    # ---- 1. Collect batch_size transitions with the noisy policy -----------
    e.reset()
    latest_trial = []
    latest_rewards = []
    for b in range(batch_size):
        x1 = e.get_state()
        mu = actor.predict(x1)

        # Gaussian exploration noise, then clip the step length to MAX_DIST.
        noise = np.random.randn(1, 2) * MAX_DIST * 1.0
        mu = mu + noise
        dist = np.linalg.norm(mu)
        if dist > MAX_DIST:
            mu = mu * MAX_DIST / dist
        state, reward, x2 = e.interact(*(mu)[0, :])
        terminal = state in (LOSE, WIN)
        latest_trial.append(x2[0, :])
        latest_rewards.append(reward)
        # 'done' records a true episode end so the TD target below does not
        # bootstrap through it.  Hitting the step budget (b == batch_size - 1)
        # is only a truncation and is deliberately not flagged.
        # The 10.0 is presumably the initial priority -- PriorityBuffer is
        # defined elsewhere; confirm.
        replay_buffer.add({
            'x1': x1,
            'x2': x2,
            'u': mu,
            'r': reward,
            'done': terminal
        }).set_value(10.0)
        if terminal or b == batch_size - 1:
            # At most once a minute, show the trajectory and rewards of the
            # episode that just finished.
            if datetime.now() > latest_trial_plot + timedelta(seconds=60):
                latest_trial_plot = datetime.now()
                x = np.array(latest_trial)
                plt.figure(figsize=(12, 3))
                plt.subplot(121)
                plt.plot(x[:, 0], x[:, 1], 'b')
                plt.plot(x[:, 2], x[:, 3], 'ro', markersize=14, alpha=0.2)
                plt.plot(x[:, 4], x[:, 5], 'ko', markersize=14)
                plt.plot(x[:, 4], x[:, 5], 'wo', markersize=12)
                plt.ylim((0.10, 0.30))
                plt.xlim((-0.15, 0.15))
                plt.subplot(122)
                plt.plot(latest_rewards)
                plt.show()
            latest_trial = []
            latest_rewards = []
            e.reset()

    # ---- 2. A few gradient updates on sampled minibatches ------------------
    n_inner = 4
    for i in range(n_inner):
        exp_nodes = []
        for b in range(batch_size):
            sample = replay_buffer.sample()
            exp_nodes.append(sample)
            X[b, :] = sample.data['x1']
            Xp[b, :] = sample.data['x2']
            R[b, :] = sample.data['r']
            U[b, :] = sample.data['u']
            D[b, :] = sample.data['done']

        # TD target from the target networks; no bootstrap after a terminal
        # transition.
        Q = critic.predict(X, U)
        Y = R + gamma * (1.0 - D) * critic_target.predict(Xp, actor_target.predict(Xp))

        # New priority = |TD error| + epsilon.
        for node, delta in zip(exp_nodes, (Q - Y)[:, 0]):
            node.set_value(abs(delta) + epsilon)

        # Importance weights (1 / priority) ** beta, with beta growing
        # towards 1 as `a` approaches n_iterations.
        beta = np.exp((a - n_iterations) / (0.1 * n_iterations))
        sample_weight = np.array([1.0 / node.value for node in exp_nodes]) ** beta
        critic.nn.fit([X, U], Y, verbose=0, sample_weight=sample_weight)

        # Deterministic policy gradient: dQ/du evaluated at the actor's
        # CURRENT action for each sampled state (not at the stored, noisy
        # action), chained with d mu / d theta.
        U_pi = actor.predict(X)
        policy_gradient *= 0
        for b in range(batch_size):
            policy_gradient += sample_weight[b] * np.dot(
                critic.gradients(X[b:b + 1, :], U_pi[b:b + 1, :]),
                actor.gradients(X[b:b + 1, :])
            ) / batch_size
        actor.update_with_policy_gradient(policy_gradient, lr=0.001)
        actor_target.soft_update(actor.nn.weights, lr=0.001)
        critic_target.soft_update(critic.nn.weights, lr=0.001)

        # ---- 3. At most once a minute, plot policy and Q for a fixed layout -
        if datetime.now() > latest_plot + timedelta(seconds=60):
            print('beta: {} outer: {}/{} inner: {}/{} {}'.format(beta, a, n_iterations, i, n_inner, replay_buffer))
            # Fixed probe layout held in locals, so the live environment is
            # left untouched by the diagnostics.
            probe_eef = (-0.04, 0.20)
            probe_circle = (0.00, 0.20)
            probe_goal = (0.04, 0.20)
            plt.figure(figsize=(13, 7))
            plt.subplot(221)
            plot_pi(actor, probe_circle[0], probe_circle[1], probe_goal[0], probe_goal[1], eef=probe_eef)
            plt.subplot(222)
            plot_q(critic, probe_eef[0], probe_eef[1], probe_circle[0], probe_circle[1], probe_goal[0], probe_goal[1])
            plt.subplot(223)
            plot_pi(actor_target, probe_circle[0], probe_circle[1], probe_goal[0], probe_goal[1], eef=probe_eef)
            plt.subplot(224)
            plot_q(critic_target, probe_eef[0], probe_eef[1], probe_circle[0], probe_circle[1], probe_goal[0], probe_goal[1])
            plt.show()
            latest_plot = datetime.now()

In [None]:
# Sample a new layout, then override the end-effector position by hand and
# show the result.
e.reset()
e.eef_x = 0.05
e.eef_y = 0.20
e.plot()
plt.show()

In [None]:
#fig, ax = plt.subplots()
#for i in range(100):
#    mu = actor.predict(e.get_state())
#    s, _, _ = e.interact(*mu.flatten())
#    e.plot(ax=ax)
#    #plt.savefig('res/move{:04d}.png'.format(i))
#    if s == LOSE:
#        break

def plot(e, ax, nn):
    """
    Roll the policy ``nn`` out in environment ``e`` and draw the trace on ``ax``.

    ``nn.predict(state)`` supplies the action for each step.  The environment
    is advanced in place for 256 steps; the rollout does not stop when the
    environment reports WIN or LOSE.  Each step adds a red circle and a '+'
    end-effector marker whose opacity grows with the step index; the final
    circle position is drawn as an opaque outlined disc.
    """
    # All drawing goes through `ax`, so the axes passed in is the one used
    # even when it is not pyplot's current axes.
    ax.grid()
    # Goal ring: black disc with a slightly smaller white disc on top.
    ax.add_artist(plt.Circle(
        (e.goal_x, e.goal_y),
        e.circle.radius,
        color='k',
    ))
    ax.add_artist(plt.Circle(
        (e.goal_x, e.goal_y),
        e.circle.radius - 0.001,
        color='w',
    ))
    n_steps = 512
    # Only even indices are stepped; `i` still runs up to n_steps so the
    # opacity ramp covers the 0..1 range.
    for i in range(0, n_steps, 2):
        mu = nn.predict(e.get_state())
        e.interact(*mu.flatten())
        fade = 1.0 * i / n_steps
        ax.add_artist(plt.Circle(
            (e.circle.x, e.circle.y),
            e.circle.radius,
            color='r',
            alpha=fade
        ))
        ax.plot(e.eef_x, e.eef_y, 'k+', markersize=10, alpha=np.sqrt(fade))
    # Final circle position: black outline with a red fill.
    ax.add_artist(plt.Circle(
        (e.circle.x, e.circle.y),
        e.circle.radius,
        color='k',
    ))
    ax.add_artist(plt.Circle(
        (e.circle.x, e.circle.y),
        e.circle.radius * 0.95,
        color='r',
    ))
    ax.set_xlim((-0.15, 0.15))
    ax.set_ylim((0.10, 0.30))
    
# Roll the trained actor out from a newly sampled layout and show the trace.
fig, ax = plt.subplots()
e.reset()
plot(e, ax, actor)
#plt.savefig('naf_sim_failure_mode_ideal.pdf')
plt.show()