In [None]:
from __future__ import print_function

import sys
import pickle
import threading
if sys.version_info.major == 2:
    from Queue import Queue
else:
    from queue import Queue

import keras
import theano
import theano.tensor as T
from keras import backend as K
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import Dense, Merge, Input, Lambda, merge, Layer

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from naf.priority_buffer import PriorityBuffer

In [None]:
def create_state_vector(eef_x, eef_y, circle_x, circle_y, goal_x, goal_y):
    """Pack the six scene coordinates into a single-row float32 state matrix.

    Column order is: end-effector x, end-effector y, circle x, circle y,
    goal x, goal y.

    Returns:
        np.ndarray of shape (1, 6) and dtype float32.
    """
    row = np.empty((1, 6), dtype=np.float32)
    row[0] = (eef_x, eef_y, circle_x, circle_y, goal_x, goal_y)
    return row

In [None]:
# Outcome codes returned as the first element of Environment.interact().
WIN, LOSE, NEUTRAL = range(3)

# Longest end-effector displacement allowed in one step; Environment.interact()
# rescales any larger (dx, dy) down to this norm.
MAX_DIST = 0.01

class Circle:
    """A disc of fixed radius 0.02 that is shoved aside by an intruding point."""

    def __init__(self, x, y):
        # Centre coordinates; updated in place by interact().
        self.x, self.y = x, y
        self.radius = 0.02

    def interact(self, x, y):
        """Push the circle away from the point (x, y) if that point is inside it.

        When the point is farther from the centre than the radius, nothing
        happens. Otherwise the centre is moved directly away from the point by
        the penetration depth, which leaves the point on the circle's edge.
        """
        center_distance = np.linalg.norm([self.y - y, self.x - x])
        if center_distance > self.radius:
            return

        # Direction from the circle centre towards the intruding point.
        theta = np.arctan2(y - self.y, x - self.x)
        overlap = self.radius - center_distance
        self.x -= overlap * np.cos(theta)
        self.y -= overlap * np.sin(theta)
        
class Environment:
    """Planar task with an end effector, a pushable circle and a goal point.

    The workspace is the rectangle x in [-0.15, 0.15], y in [0.10, 0.30].
    An episode is lost when the end effector or the circle leaves it and won
    when the end effector comes within 0.005 of the goal.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Sample a fresh scene.

        The end effector is drawn uniformly from x in [-0.10, 0.10),
        y in [0.12, 0.29). The circle and the goal are drawn from Gaussians
        (std 0.01) centred at (0.00, 0.20) and (0.10, 0.20) respectively.
        Earlier variants (positions on rings around the centre, Gaussian
        end-effector start) were tried and are no longer used.
        """
        uniform = np.random.rand
        gaussian = np.random.randn

        # Draw order is significant for reproducibility under a fixed seed.
        self.eef_x = -0.10 + uniform() * 0.20
        self.eef_y = 0.12 + uniform() * 0.17
        circle_x = 0.00 + gaussian() * 0.01
        circle_y = 0.20 + gaussian() * 0.01
        self.goal_x = 0.10 + gaussian() * 0.01
        self.goal_y = 0.20 + gaussian() * 0.01
        self.circle = Circle(circle_x, circle_y)

    def get_state(self):
        """Return the current scene as a (1, 6) state vector."""
        scene = (
            self.eef_x, self.eef_y,
            self.circle.x, self.circle.y,
            self.goal_x, self.goal_y,
        )
        return create_state_vector(*scene)

    def interact(self, dx, dy):
        """Move the end effector by (dx, dy) and evaluate the outcome.

        Steps longer than MAX_DIST are rescaled to length MAX_DIST. The circle
        is pushed if the end effector ends up inside it.

        Returns:
            (state, reward, state_vector) where state is WIN, LOSE or NEUTRAL,
            reward is -4 on LOSE and exp(-200 * d**2) - 1 otherwise (d being
            the end-effector-to-goal distance), and state_vector is the result
            of get_state() after the move.
        """
        step_length = np.linalg.norm([dx, dy])
        if step_length > MAX_DIST:
            dx, dy = MAX_DIST * dx / step_length, MAX_DIST * dy / step_length

        self.eef_x += dx
        self.eef_y += dy
        self.circle.interact(self.eef_x, self.eef_y)

        inside_workspace = (
            -0.15 <= self.eef_x <= 0.15
            and 0.10 <= self.eef_y <= 0.30
            and -0.15 <= self.circle.x <= 0.15
            and 0.10 <= self.circle.y <= 0.30
        )
        if not inside_workspace:
            return LOSE, -4, self.get_state()

        # Both the win test and the reward are driven by the end effector's
        # distance to the goal; a circle-to-goal variant was tried and disabled.
        eef2goal = np.linalg.norm([self.goal_x - self.eef_x, self.goal_y - self.eef_y])
        state = WIN if eef2goal < 0.005 else NEUTRAL
        reward = np.exp(-200 * eef2goal ** 2) - 1
        return state, reward, self.get_state()

    def plot(self):
        """Draw the goal outline, the circle and the end effector, then show the figure."""
        _, ax = plt.subplots()
        ax.grid()

        goal_center = (self.goal_x, self.goal_y)
        radius = self.circle.radius
        patches = (
            # Black disc with a slightly smaller white disc on top -> thin ring at the goal.
            plt.Circle(goal_center, radius, color='k'),
            plt.Circle(goal_center, radius - 0.001, color='w'),
            # The pushable circle itself.
            plt.Circle((self.circle.x, self.circle.y), radius, color='r', alpha=0.5),
        )
        for patch in patches:
            ax.add_artist(patch)

        ax.plot(self.eef_x, self.eef_y, 'k+', markersize=10)
        ax.set_xlim((-0.15, 0.15))
        ax.set_ylim((0.10, 0.30))
        plt.show()
        
# Smoke test: build an environment, show it, then push the end effector
# 30 maximal steps in +x while recording what interact() returns.
e = Environment()
e.plot()

ss, rs, svs = [], [], []   # outcome codes, rewards, flat state vectors
for _step in range(30):
    outcome, step_reward, state_vec = e.interact(MAX_DIST, 0.000)
    ss.append(outcome)
    rs.append(step_reward)
    svs.append(state_vec[0])   # drop the leading batch axis

In [None]:
# Visualise the scripted rollout: path of the end effector (left) and the
# reward received at each step (right).
X = np.array(svs)
fig, (ax_path, ax_reward) = plt.subplots(1, 2, figsize=(12, 3))

ax_path.plot(X[:, 0], X[:, 1], 'b')                               # end-effector trajectory
ax_path.plot(X[:, 4], X[:, 5], 'ko', markersize=14, alpha=0.2)    # goal position
ax_path.set_ylim((0.10, 0.30))

ax_reward.plot(rs)
plt.show()

In [None]:
from naf import NNet

nn = NNet(
    x_size=6,             # state: end effector (x, y) + circle (x, y) + goal (x, y)
    u_size=2,             # action: end-effector displacement (dx, dy)
    mu_scaling=MAX_DIST,  # presumably scales the policy output to the step limit; confirm in naf.NNet
)

In [None]:
def plot_v(nn, cube_x, cube_y, goal_x, goal_y):
    """Draw the value estimate nn.v over a 12x12 grid of end-effector positions.

    The circle ("cube") and goal positions are held fixed at the given
    coordinates while the end effector sweeps x in [-0.15, 0.15] and
    y in [0.10, 0.30]. Draws into the current matplotlib axes and adds a
    colorbar; does not call plt.show().

    Args:
        nn: object whose ``v.predict`` maps an (N, 6) state array to an
            (N, 1) array of values.
        cube_x, cube_y: fixed circle position (marked in red).
        goal_x, goal_y: fixed goal position (marked in white).
    """
    xs = np.linspace(-0.15, 0.15, 12)
    ys = np.linspace(0.10, 0.30, 12)

    # Rows run from the largest y to the smallest so that, with imshow placing
    # row 0 at the top of ``extent``, the picture is the right way up.
    grid_x, grid_y = np.meshgrid(xs, ys[::-1])
    n_points = grid_x.size

    # One batched prediction instead of one predict() call per grid point.
    fixed = np.tile([cube_x, cube_y, goal_x, goal_y], (n_points, 1))
    states = np.column_stack([grid_x.ravel(), grid_y.ravel(), fixed])
    zss = nn.v.predict(states)[:, 0].reshape(grid_x.shape)

    plt.imshow(zss, cmap='inferno', interpolation='gaussian', aspect='auto',
               extent=[-0.15, 0.15, 0.10, 0.30])
    # Each marker is a coloured dot on a slightly larger black dot (outline).
    plt.plot(cube_x, cube_y, 'ko', markersize=8)
    plt.plot(cube_x, cube_y, 'ro', markersize=6)
    plt.plot(goal_x, goal_y, 'ko', markersize=8)
    plt.plot(goal_x, goal_y, 'wo', markersize=6)
    # Raw string: the label contains TeX backslashes, not Python escapes.
    plt.colorbar().set_label(r'$V(\mathbf{x})$')

def plot_pi(nn, cube_x, cube_y, goal_x, goal_y):
    """Draw the policy nn.mu as arrows on a 20x20 grid of end-effector positions.

    The circle ("cube") and goal positions are held fixed while the end
    effector sweeps x in [-0.15, 0.15] and y in [0.15, 0.30]. Each arrow starts
    at a grid point and has the predicted (dx, dy) as its extent. Draws into
    the current matplotlib axes; does not call plt.show(). Also prints the
    action predicted at the last grid point, (0.15, 0.30).

    Args:
        nn: object whose ``mu.predict`` maps an (N, 6) state array to an
            (N, 2) array of actions.
        cube_x, cube_y: fixed circle position (marked in red).
        goal_x, goal_y: fixed goal position (marked in white).
    """
    xs = np.linspace(-0.15, 0.15, 20)
    ys = np.linspace(0.15, 0.30, 20)

    # 'ij' indexing + C-order ravel visits points with x as the outer loop and
    # y as the inner loop.
    grid_x, grid_y = np.meshgrid(xs, ys, indexing='ij')
    n_points = grid_x.size

    # One batched prediction instead of one predict() call per grid point.
    fixed = np.tile([cube_x, cube_y, goal_x, goal_y], (n_points, 1))
    states = np.column_stack([grid_x.ravel(), grid_y.ravel(), fixed])
    actions = nn.mu.predict(states)

    for (x, y), (dx, dy) in zip(states[:, :2], actions):
        plt.arrow(x, y, dx, dy)

    # Each marker is a coloured dot on a slightly larger black dot (outline).
    plt.plot(cube_x, cube_y, 'ko', markersize=8)
    plt.plot(cube_x, cube_y, 'ro', markersize=6)
    plt.plot(goal_x, goal_y, 'ko', markersize=8)
    plt.plot(goal_x, goal_y, 'wo', markersize=6)
    # Raw string: the title contains TeX backslashes, not Python escapes.
    plt.title(r'$\mathbf{\mu(x)}$')
    plt.xlim(-0.15, 0.15)
    plt.ylim(0.15, 0.30)
    print('dx, dy:', dx, dy)
    
# Side-by-side preview of the policy (left) and value estimate (right)
# for one fixed circle/goal configuration.
preview_args = (nn, 0.04, 0.2, 0.10, 0.2)   # nn, cube_x, cube_y, goal_x, goal_y

plt.figure(figsize=(12, 3))
for panel, draw in enumerate((plot_pi, plot_v), start=1):
    plt.subplot(1, 2, panel)
    draw(*preview_args)
plt.show()

In [None]:
# --- Hyperparameters ---------------------------------------------------------
batch_size = 1024     # environment steps collected per outer iteration, and minibatch size
gamma = 0.80          # discount factor used in the TD target
epsilon = 0.1         # added to |TD error| so every transition keeps a non-zero priority
n_iterations = 256.0  # number of outer iterations (cast to int for range())
n_inner = 128         # gradient steps per outer iteration

#todo: check importance weights annealing if they seem to matter
#   2: try moving goal

# --- Persistent training state -----------------------------------------------
# The buffer and the outer-iteration counter are created only if they do not
# exist yet. On a fresh kernel this makes the cell runnable (previously both
# names were undefined); when the cell is re-executed after an interruption,
# training resumes from the iteration stored in `a` with the buffer intact.
# To start over, `del a, replay_buffer` (or restart the kernel).
if 'replay_buffer' not in globals():
    replay_buffer = PriorityBuffer(2 ** 18)
if 'a' not in globals():
    a = 0


def _plot_trial(states, rewards):
    """Show one finished episode: end-effector path (left) and rewards (right)."""
    trial = np.array(states)
    plt.figure(figsize=(12, 3))
    plt.subplot(121)
    plt.plot(trial[:, 0], trial[:, 1], 'b')
    # Goal marker: white dot on a slightly larger black dot.
    plt.plot(trial[:, 4], trial[:, 5], 'ko', markersize=14)
    plt.plot(trial[:, 4], trial[:, 5], 'wo', markersize=12)
    plt.ylim((0.10, 0.30))
    plt.xlim((-0.15, 0.15))
    plt.subplot(122)
    plt.plot(rewards)
    plt.show()


for a in range(a, int(n_iterations)):
    print('iteration {} / {}'.format(a + 1, n_iterations))

    # --- Phase 1: collect batch_size transitions with the noisy policy -------
    e.reset()
    latest_trial = []
    latest_rewards = []
    for b in range(batch_size):
        x1 = e.get_state()

        # Policy action plus Gaussian exploration noise, clipped to the step limit.
        mu = nn.mu.predict(x1) + np.random.randn(1, 2) * MAX_DIST
        dist = np.linalg.norm(mu)
        if dist > MAX_DIST:
            mu = mu * MAX_DIST / dist

        state, reward, x2 = e.interact(*mu[0, :])
        latest_trial.append(x2[0, :])
        latest_rewards.append(reward)

        # New transitions enter the buffer with a fixed priority of 10.0.
        # NOTE(review): no terminal flag is stored, so the TD target below
        # bootstraps from V(x2) even when x2 ended an episode; confirm intended.
        replay_buffer.add({
            'x1': x1,
            'x2': x2,
            'u': mu,
            'r': reward
        }).set_value(10.0)

        if state in (LOSE, WIN):
            # Plot roughly every second finished episode.
            if np.random.rand() < 0.5:
                _plot_trial(latest_trial, latest_rewards)
            latest_trial = []
            latest_rewards = []
            e.reset()

    # --- Phase 2: n_inner prioritized-replay updates of Q --------------------
    for i in range(n_inner):
        X = np.zeros((batch_size, 6))
        Xp = np.zeros((batch_size, 6))
        U = np.zeros((batch_size, 2))
        R = np.zeros((batch_size, 1))
        exp_nodes = []
        for b in range(batch_size):
            sample = replay_buffer.sample()
            exp_nodes.append(sample)
            X[b, :] = sample.data['x1']
            Xp[b, :] = sample.data['x2']
            R[b, :] = sample.data['r']
            U[b, :] = sample.data['u']

        # One-step TD target and current estimate.
        V = nn.v.predict(Xp)
        Y = R + gamma * V
        Q = nn.q.predict([X, U])

        # Re-prioritise the sampled transitions by their absolute TD error.
        for node, delta in zip(exp_nodes, (Q - Y)[:, 0]):
            node.set_value(abs(delta) + epsilon)

        # Importance weights (1 / priority) ** beta, normalised to sum to 1.
        # beta rises from exp(-10) at a == 0 towards 1 as a approaches n_iterations.
        beta = np.exp((a - n_iterations) / (0.1 * n_iterations))
        sample_weights = np.array([1.0 / node.value for node in exp_nodes]) ** beta
        sample_weights /= sample_weights.sum()

        # Every 32nd inner step: report progress and redraw value/policy plots.
        if i % 32 == 0:
            print(replay_buffer)
            print('beta: {} outer iteration: {}/{} inner iteration: {}/{}'.format(beta, a, n_iterations, i, n_inner))
            verbose = 1
            cx, cy, gx, gy = 0.00, 0.20, e.goal_x, e.goal_y
            plot_v(nn, cx, cy, gx, gy)
            plt.show()
            plot_pi(nn, cx, cy, gx, gy)
            plt.show()
        else:
            verbose = 0

        nn.q.fit([X, U], Y, verbose=verbose, nb_epoch=1, batch_size=batch_size, sample_weight=sample_weights)

In [None]:
# Draw one entry from the replay buffer and display the transition dict
# ('x1', 'x2', 'u', 'r') stored in it.
peeked_node = replay_buffer.sample()
peeked_node.data