In [None]:
from __future__ import print_function

import sys
import json
import pickle
import threading
from operator import mul
from copy import deepcopy
from functools import reduce
from multiprocessing import Process, Queue, Value, Pool
if sys.version_info.major == 2:
    from Queue import Empty
else:
    from queue import Empty

import keras
import theano
import theano.tensor as T
from keras import backend as K
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import Dense, Merge, Input, Lambda, merge, Layer, BatchNormalization

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

#from ddpg import Actor, Critic
from naf.priority_buffer import PriorityBuffer
from environment import Environment, WIN, LOSE, NEUTRAL

In [None]:
# Pre-fill the replay buffer with transitions produced by the environment's
# own heuristic controller, before any network is trained.
e = Environment(0.01)
replay_buffer = PriorityBuffer(1 << 20)

rewards = []
for trials in range(4096):
    e.reset()
    for i in range(32):
        # The heuristic action is queried first, then the pre-step state.
        mu = e.heuristic_move()
        x1 = e.get_state()
        s, r, x2 = e.interact(*mu)
        # Store the transition; every freshly added entry gets the same
        # initial value (presumably its sampling priority -- confirm in
        # naf.priority_buffer).
        node = replay_buffer.add(dict(x1=x1, x2=x2, u=mu, r=r))
        node.set_value(10.0)
        if s == WIN:
            # Stop this trial as soon as the environment reports WIN.
            break

# Last expression of the cell: display the buffer.
replay_buffer

In [None]:
from operator import mul
from functools import reduce

import keras
import theano
import numpy as np
import theano.tensor as T
from keras import backend as K
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.models import Sequential, Model
from keras.layers import Dense, Merge, Input, Lambda, BatchNormalization


class DDPG(object):
    """Helpers shared by the networks that keep a Keras model in ``self.nn``.

    Subclasses assign ``self.nn`` in their ``__init__``. The class derives
    from ``object`` explicitly so that it is a new-style class on Python 2
    as well; the ``super(...)`` calls in the subclasses need that.
    """

    def predict(self, *x):
        """Pass the positional inputs, as a list, to ``self.nn.predict``."""
        return self.nn.predict(list(x))

    @property
    def trainable_weights(self):
        """Trainable weights of ``self.nn`` whose name does not start with 'bn'."""
        return [w for w in self.nn.trainable_weights if not w.name.startswith('bn')]

    def soft_update(self, weights, lr=0.001):
        """
        Blend ``weights`` into this network's weights, in place.

        Each weight becomes ``lr * new + (1 - lr) * old``. ``weights`` must
        hold objects exposing ``get_value()`` (theano shared variables) and is
        paired positionally with ``self.nn.weights``; ``zip`` stops at the
        shorter of the two sequences.
        """
        for w_old, w_new in zip(self.nn.weights, weights):
            w_old.set_value(
                lr * w_new.get_value() + (1 - lr) * w_old.get_value()
            )
            

class Critic(DDPG):
    """Q-network taking a state input and an action input, with one output 'Q'.

    The state goes through batch normalisation and a first dense layer, is
    concatenated with the raw action, and then passes through a second dense
    layer before the single linear output unit.
    """

    def __init__(self, x_size, u_size, hidden_size=100):
        """
        Build and compile the network and the action-gradient function.

        x_size: number of state features.
        u_size: number of action components.
        hidden_size: width of the first dense layer. The second dense layer
            is 100 units narrower; when ``hidden_size`` is 100 or less that
            would leave it with no units at all, so it then falls back to
            ``hidden_size``.
        """
        super(Critic, self).__init__()
        # Guard against a zero-width (or negative) second layer.
        fc2_size = hidden_size - 100 if hidden_size > 100 else hidden_size

        # Identity model around the action input so it can be merged below
        # and differentiated against afterwards.
        u = Input(shape=(u_size, ), name='u')
        u_model = Model(input=u, output=u)

        first_part = Sequential([
            BatchNormalization(input_shape=(x_size,), name='bn1'),
            Dense(output_dim=hidden_size, activation='relu', name='fc1', W_regularizer=l2(0.01)),
        ])

        self.nn = Sequential([
            Merge([first_part, u_model], mode='concat'),
            BatchNormalization(name='bn2'),
            Dense(output_dim=fc2_size, activation='relu', name='fc2', W_regularizer=l2(0.01)),
            BatchNormalization(name='bn3'),
            Dense(output_dim=(1), name='Q', W_regularizer=l2(0.01)),
        ])

        adam = Adam(lr=0.0001)
        self.nn.compile(loss='mse', optimizer=adam)
        # d Q[0, 0] / d u: only the first row of the batch is differentiated,
        # which is why gradients() insists on a single-row input.
        self._gradients = theano.function(
            self.nn.inputs + [K.learning_phase()],
            T.grad(self.nn.output[0, 0], u_model.output),
            allow_input_downcast=True
        )

    def gradients(self, x, u):
        """Gradient of the Q output with respect to ``u`` for one sample.

        ``x`` must have exactly one row. The compiled function is called with
        the learning-phase flag set to False.
        """
        assert x.shape[0] == 1
        return self._gradients(x, u, False)
        
    
class Actor(DDPG):
    """Policy network ``mu(x)`` whose weights are updated by a manual Adam step.

    The network ends in a tanh layer multiplied by ``mu_scaling``. Instead of
    using a Keras optimizer, ``update_with_policy_gradient`` applies an Adam
    update computed in numpy from an externally supplied gradient.
    """

    def __init__(self, x_size, u_size, mu_scaling, hidden_size=100):
        """
        Build the network and compile the output-vs-parameters gradients.

        x_size: number of state features.
        u_size: number of action components.
        mu_scaling: factor applied to the tanh output.
        hidden_size: width of the first dense layer. The second dense layer
            is 100 units narrower; when ``hidden_size`` is 100 or less that
            would leave it with no units at all, so it then falls back to
            ``hidden_size``.
        """
        # for Adam
        self.t = 0
        self.alpha = 0.001
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.epsilon = 1e-8

        # Kept so gradients() can shape its result for any action size.
        self.u_size = u_size

        super(Actor, self).__init__()
        # Guard against a zero-width (or negative) second layer.
        fc2_size = hidden_size - 100 if hidden_size > 100 else hidden_size
        self.nn = Sequential([
            BatchNormalization(input_shape=(x_size,), name='bn1'),
            Dense(input_shape=(x_size,), output_dim=hidden_size, activation='relu', name='fc1'),
            BatchNormalization(name='bn2'),
            Dense(output_dim=fc2_size, activation='relu', name='fc2'),
            BatchNormalization(name='bn3'),
            Dense(output_dim=u_size, name='mu_unscaled', activation='tanh'),
            Lambda(lambda x: mu_scaling * x, output_shape=(u_size, ), name='mu')
        ])

        # This optimizer won't be needed, learning from policy gradient
        self.nn.compile(loss='mse', optimizer='sgd')

        # gradients: for every action component, the gradient of that output
        # (first batch row only) with respect to each non-'bn' parameter,
        # laid out component after component.
        params = self.trainable_weights
        gradients_list = []
        for i in range(u_size):
            gradients_list.extend(T.grad(self.nn.output[0, i], params))
        self._gradients = theano.function(
            self.nn.inputs + [K.learning_phase()],
            gradients_list,
            allow_input_downcast=True
        )

    def gradients(self, x):
        """Gradients of each action component w.r.t. the flattened parameters.

        ``x`` must have exactly one row. Returns an array with ``u_size``
        rows, one per action component; each row is the concatenation of the
        flattened per-parameter gradients in ``trainable_weights`` order.
        """
        assert x.shape[0] == 1
        res = []
        for g in self._gradients(x, False):
            res.extend(g.flatten())
        return np.array(res).reshape((self.u_size, -1))

    def update_with_policy_gradient(self, policy_gradient):
        """
        Update from separate actor and critic gradients, which
        multiply to make the policy gradient

        ``policy_gradient`` is a 2-D array whose row 0 holds one entry per
        parameter value, in the same flattened order as ``gradients()``. The
        Adam step is *added* to the weights.
        """
        if self.t == 0:
            # First call: start the moment estimates at zero.
            self.m = np.zeros(policy_gradient.shape)
            self.v = np.zeros(policy_gradient.shape)
        self.t += 1
        self.m = self.beta1 * self.m + (1 - self.beta1) * policy_gradient
        self.v = self.beta2 * self.v + (1 - self.beta2) * policy_gradient ** 2
        # Bias-corrected first and second moments.
        m_hat = self.m / (1 - self.beta1 ** self.t)
        v_hat = self.v / (1 - self.beta2 ** self.t)
        i = 0
        for g in self.trainable_weights:
            prev = g.get_value()
            # .size also copes with 0-d parameters, unlike reduce(mul, shape).
            param_len = prev.size
            mh = m_hat[0, i:i + param_len].reshape(prev.shape).astype(np.float32)
            vh = v_hat[0, i:i + param_len].reshape(prev.shape).astype(np.float32)
            g.set_value(prev + self.alpha * mh / (np.sqrt(vh) + self.epsilon))
            i += param_len

In [None]:
# Factor passed to the actor as its output scaling.
MAX_DIST = 0.01

# State layout used throughout the notebook: (eef_x, eef_y, cube_x, cube_y,
# goal_x, goal_y); the action has two components.
STATE_SIZE = 2 + 2 + 2
ACTION_SIZE = 2

hidden_size = 400
actor = Actor(STATE_SIZE, ACTION_SIZE, MAX_DIST, hidden_size=hidden_size)
# Target networks start as exact copies of the online networks. The training
# cell reads and soft-updates them, so they must exist after this cell runs.
actor_target = Actor(STATE_SIZE, ACTION_SIZE, MAX_DIST, hidden_size=hidden_size)
actor_target.nn.set_weights(actor.nn.get_weights())

critic = Critic(STATE_SIZE, ACTION_SIZE, hidden_size=hidden_size)
critic_target = Critic(STATE_SIZE, ACTION_SIZE, hidden_size=hidden_size)
critic_target.nn.set_weights(critic.nn.get_weights())

In [None]:
# Smoke test: apply one policy-gradient update computed from a single random
# state/action pair. Note that this really does modify the actor's weights.
X = np.random.randn(1, 6)
U = np.random.randn(1, 2)
dq_du = critic.gradients(X, U)        # gradient of Q w.r.t. the action
dmu_dtheta = actor.gradients(X)       # gradient of each action w.r.t. the parameters
actor.update_with_policy_gradient(np.dot(dq_du, dmu_dtheta))

In [None]:
#def plot_v(nn, cube_x, cube_y, goal_x, goal_y):
#    xs = np.linspace(-0.15, 0.15, 12)
#    ys = np.linspace(0.10, 0.30, 12)
#    xss, yss = np.meshgrid(xs, ys)
#    zss = np.zeros(xss.shape)
#    for i, x in enumerate(xs):
#        for j, y in enumerate(ys):
#            zss[len(ys) - j - 1, i] = nn.v.predict(np.array([[x, y, cube_x, cube_y, goal_x, goal_y]]))[0, 0]
#    plt.imshow(zss, cmap='inferno', interpolation='gaussian', aspect='auto',
#               extent=[-0.15, 0.15, 0.10, 0.30])
#    plt.plot(cube_x, cube_y, 'ko', markersize=8)
#    plt.plot(cube_x, cube_y, 'ro', markersize=6)
#    plt.plot(goal_x, goal_y, 'ko', markersize=8)
#    plt.plot(goal_x, goal_y, 'wo', markersize=6)
#    plt.colorbar().set_label('$V(\mathbf{x})$')
    
def plot_q(nn, eef_x, eef_y, cube_x, cube_y, goal_x, goal_y, u_max=0.01, resolution=12):
    """Draw a heat map of ``nn.predict(state, action)`` over a grid of actions.

    nn: object whose ``predict(x, u)`` returns an array indexable as [0, 0].
    eef_x, eef_y, cube_x, cube_y, goal_x, goal_y: the six state features, in
        the order they are fed to the network.
    u_max: both action components are swept over ``[-u_max, u_max]``.
    resolution: number of grid points per action component.

    Draws on the current matplotlib axes; returns nothing.
    """
    # The state is the same for every grid point, so build it once.
    state = np.array([[eef_x, eef_y, cube_x, cube_y, goal_x, goal_y]])
    xs = np.linspace(-u_max, u_max, resolution)
    ys = np.linspace(-u_max, u_max, resolution)
    zss = np.zeros((len(ys), len(xs)))
    for i, x in enumerate(xs):
        for j, y in enumerate(ys):
            # Rows are filled bottom-up so that the largest y ends up in row 0.
            zss[len(ys) - j - 1, i] = nn.predict(state, np.array([[x, y]]))[0, 0]
    plt.imshow(zss, cmap='inferno', interpolation='gaussian', aspect='auto',
               extent=[-u_max, u_max, -u_max, u_max])
    # Mark the zero action.
    plt.plot(0.0, 0.0, 'ko', markersize=10)
    plt.plot(0.0, 0.0, 'w+', markersize=10)
    plt.xticks(np.linspace(-u_max, u_max, 5))
    plt.yticks(np.linspace(u_max, -u_max, 5))
    # Raw string: '\m' is not a valid escape sequence in a normal literal.
    plt.colorbar().set_label(r'$Q(\mathbf{x, u})$')


def plot_pi(nn, cube_x, cube_y, goal_x, goal_y, eef=None):
    """Draw the action predicted by ``nn`` as an arrow at each grid position.

    nn: object whose ``predict(X)`` returns an array whose first row holds
        the two action components (dx, dy).
    cube_x, cube_y, goal_x, goal_y: state features held fixed while the first
        two features are swept over a 20 x 20 grid.
    eef: optional pair (x, y) to mark on the plot; any indexable pair works,
        including a NumPy array.

    Draws on the current matplotlib axes and prints the last (dx, dy) pair.
    """
    for x in np.linspace(-0.15, 0.15, 20):
        for y in np.linspace(0.12, 0.30, 20):
            X = np.array([[x, y, cube_x, cube_y, goal_x, goal_y]])
            dx, dy = nn.predict(X)[0, :]
            plt.arrow(x, y, dx, dy)
    # Explicit None test: truth-testing a NumPy array would raise.
    if eef is not None:
        plt.plot(eef[0], eef[1], 'ko', markersize=10)
        plt.plot(eef[0], eef[1], 'w+', markersize=10)
    plt.plot(cube_x, cube_y, 'ko', markersize=10)
    plt.plot(cube_x, cube_y, 'ro', markersize=8)
    plt.plot(goal_x, goal_y, 'ko', markersize=10)
    plt.plot(goal_x, goal_y, 'wo', markersize=8)
    # Raw string: '\m' is not a valid escape sequence in a normal literal.
    plt.title(r'$\mathbf{\mu(x)}$')
    plt.xlim(-0.15, 0.15)
    plt.ylim(0.12, 0.30)
    print('dx, dy:', dx, dy)
    
#e.reset()
#plt.figure(figsize=(13, 4))
#plt.subplot(121)
#plot_pi(actor, e.circle.x, e.circle.y, e.goal_x, e.goal_y, eef=(e.eef_x, e.eef_y))
#plt.subplot(122)
#plot_q(critic, e.eef_x, e.eef_y, e.circle.x, e.circle.y, e.goal_x, e.goal_y)
#plt.show()

In [None]:
def return_average(actor, gamma=0.98, n_trials=8, n_steps=256):
    """Mean and standard deviation of the discounted return over test trials.

    actor: object whose ``predict(state)`` returns an array that flattens to
        the arguments of ``e.interact``.
    gamma: discount factor applied per step.
    n_trials: number of trials; trial ``k`` runs with ``np.random.seed(k)``.
    n_steps: number of steps taken in every trial (the status returned by
        ``e.interact`` is ignored, so a trial is never cut short).

    Uses the module-level environment ``e``. The global NumPy random state is
    saved on entry and restored on exit so that the fixed per-trial seeds do
    not leak into whatever the caller does next.

    Returns a ``(mean, std)`` tuple.
    """
    rng_state = np.random.get_state()
    try:
        rewards = []
        for trial in range(n_trials):
            np.random.seed(trial)
            e.reset()
            return_ = 0.0
            for i in range(n_steps):
                mu = actor.predict(e.get_state())
                _, r, _ = e.interact(*mu.flatten())
                return_ += gamma ** i * r
            rewards.append(return_)
    finally:
        np.random.set_state(rng_state)
    return np.mean(rewards), np.std(rewards)

#return_average(actor)

In [None]:
from datetime import datetime, timedelta

# Training hyper-parameters.
epoch_size = 256   # transitions sampled from the replay buffer per inner step
batch_size = 32    # mini-batch size handed to critic.nn.fit
gamma = 0.98       # discount factor in the critic target
epsilon = 0.1      # added to |Q - Y| when re-valuing sampled buffer nodes

# History of evaluation returns. Created only when missing, so re-running this
# cell to resume training keeps the curve collected so far, while a fresh
# kernel starts with an empty list.
try:
    reward_averages
except NameError:
    reward_averages = []

X = np.zeros((epoch_size, 6))
Xp = np.zeros((epoch_size, 6))
U = np.zeros((epoch_size, 2))
R = np.zeros((epoch_size, 1))
gradient_len = actor.gradients(X[:1, :]).shape[1]
policy_gradient = np.zeros((1, gradient_len))

n_iterations = 2048.0
# Back-dated so the first plot is produced on the first pass through the loop.
latest_plot = datetime.now() - timedelta(seconds=30)

# Resume from the iteration an earlier (interrupted) run of this cell reached;
# on a fresh kernel `a` does not exist yet, so start from zero.
try:
    a
except NameError:
    a = 0
for a in range(a, int(n_iterations)):
    print('iteration {} / {}'.format(a + 1, n_iterations))
    # Disabled: collecting fresh experience with the current actor.
    #e.reset()
    #latest_trial = []
    #latest_rewards = []
    #for b in range(batch_size):
    #    x1 = e.get_state()
    #    mu = actor.predict(x1, epsilon=1.0 * (n_iterations - a) / n_iterations + 0.1 * a / n_iterations)
    #        
    #    state, reward, x2 = e.interact(*(mu)[0, :])
    #    latest_trial.append(x2[0, :])
    #    latest_rewards.append(reward)
    #    replay_buffer.add({
    #        'x1': x1,
    #        'x2': x2,
    #        'u': mu,
    #        'r': reward
    #    }).set_value(10.0)
    #    if state in [LOSE, WIN] or b == batch_size - 1 or len(latest_trial) % 32 == 0:
    #        latest_trial = []
    #        latest_rewards = []
    #        e.reset()
    
    n_inner = 4
    for i in range(n_inner):
        # Draw epoch_size transitions and unpack them into the batch arrays.
        exp_nodes = []
        for b in range(epoch_size):
            sample = replay_buffer.sample()
            exp_nodes.append(sample)
            X[b, :] = sample.data['x1']
            Xp[b, :] = sample.data['x2']
            R[b, :] = sample.data['r']
            U[b, :] = sample.data['u']
            
        timestamp = datetime.now()
        Q = critic.predict(X, U)
        # Critic target computed from the target networks.
        Y = R + gamma * critic_target.predict(Xp, actor_target.predict(Xp))
        # Re-value every sampled node with its absolute error plus epsilon.
        for node, delta in zip(exp_nodes, (Q - Y)[:, 0]):
            node.set_value(abs(delta) + epsilon)
        beta = np.exp((a - n_iterations) / (0.1 * n_iterations))
        sample_weight = np.array([1.0 / node.value for node in exp_nodes]) ** beta
        print('batch prepared, took {}'.format(datetime.now() - timestamp))

        timestamp = datetime.now()
        critic.nn.fit([X, U], Y, verbose=0, sample_weight=sample_weight, batch_size=batch_size, nb_epoch=1)
        print('fit() took {}'.format(datetime.now() - timestamp))
        
        timestamp = datetime.now()
        # Accumulate the weighted per-sample product of critic and actor
        # gradients; both gradient functions accept one row at a time.
        policy_gradient *= 0
        for b in range(epoch_size):
            ag = actor.gradients(X[b:b + 1, :])
            ac = critic.gradients(X[b:b + 1, :], U[b:b + 1, :])
            policy_gradient += sample_weight[b] * np.dot(ac, ag)
        print('gradients calculated: {}'.format(datetime.now() - timestamp))
        
        actor.update_with_policy_gradient(policy_gradient)
        actor_target.soft_update(actor.nn.weights, lr=0.001)
        critic_target.soft_update(critic.nn.weights, lr=0.001)

        # Refresh the diagnostic plots at most once every 15 seconds.
        if datetime.now() > latest_plot + timedelta(seconds=15):
            print('beta: {} outer: {}/{} inner: {}/{} {}'.format(beta, a, n_iterations, i, n_inner, replay_buffer))
            now = datetime.now()
            # Clock-derived seed, set just before the environment is reset.
            np.random.seed(now.microsecond + now.second + now.minute)
            e.reset()
            plt.figure(figsize=(13, 8))
            plt.subplot(221)
            plot_pi(actor, e.circle.x, e.circle.y, e.goal_x, e.goal_y, eef=(e.eef_x, e.eef_y))
            plt.subplot(222)
            plot_q(critic, e.eef_x, e.eef_y, e.circle.x, e.circle.y, e.goal_x, e.goal_y)
            plt.subplot(223)
            plot_pi(actor_target, e.circle.x, e.circle.y, e.goal_x, e.goal_y, eef=(e.eef_x, e.eef_y))
            plt.subplot(224)
            plot_q(critic_target, e.eef_x, e.eef_y, e.circle.x, e.circle.y, e.goal_x, e.goal_y)
            plt.show()
            
            r_avg, r_std = return_average(actor_target)
            reward_averages.append(r_avg)
            plt.title('Average test trial return')
            plt.plot(reward_averages)
            plt.show()
            
            latest_plot = datetime.now()

In [None]:
# Scratch check: display the seconds component of the current local time.
datetime.now().second

In [None]:
# Reset the environment, overwrite the eef position (presumably the end
# effector -- confirm in environment.py) with fixed coordinates, and draw it.
e.reset()
e.eef_x, e.eef_y = 0.05, 0.20
e.plot()
plt.show()