In [None]:
# import os
# os.environ['KERAS_BACKEND'] = 'tensorflow'

In [3]:
from __future__ import print_function
import numpy as np
np.random.seed(0)

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.objectives import categorical_crossentropy

In [2]:
%time (X_train, y_train), (X_test, y_test) = mnist.load_data()

X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

CPU times: user 1.19 s, sys: 429 ms, total: 1.62 s
Wall time: 1.64 s
60000 train samples
10000 test samples


In [37]:
import theano
from theano.tensor.extra_ops import fill_diagonal
from theano import tensor as T
from keras import backend as K

# Mini-batch size. x2p() below reads this module-level value as its fixed row
# count n, and the training cell passes the same value to model.fit().
# NOTE(review): the 60000 training rows are not a multiple of 128 -- confirm
# how a shorter final batch is meant to be handled.
batch_size = 128

# function [H, P] = Hbeta(D, beta)
def Hbeta(D, beta):
    """Gaussian-kernel probabilities and their entropy for one row of distances.

    Port of the MATLAB helper ``[H, P] = Hbeta(D, beta)``.

    Args:
        D: backend tensor of (squared) distances from one point to the others.
        beta: precision of the Gaussian kernel (scalar or scalar tensor).

    Returns:
        Tuple ``(H, P)``: ``P`` is ``exp(-D * beta)`` normalised to sum to 1,
        and ``H`` is the entropy of that normalised distribution.
    """
    # P = exp(-D * beta);
    P = K.exp(-D * beta)
    # sumP = sum(P);
    sumP = K.sum(P)
    # H = log(sumP) + beta * sum(D .* P) / sumP;
    # `.*` is an element-wise product; K.prod is a product *reduction* whose
    # second argument is an axis, so it must not be used here.
    H = K.log(sumP) + beta * K.sum(D * P) / sumP
    # P = P / sumP;
    P = P / sumP
    return H, P

# https://github.com/kylemcdonald/Parametric-t-SNE/blob/master/src/x2p.m
def x2p(X, u=15, tol=1e-4):
    """Build the (n, n) matrix of conditional probabilities for one batch.

    Partial port of ``x2p.m``: the per-row binary search that tunes ``beta``
    until the row entropy equals ``log(u)`` is NOT implemented yet, so every
    row is currently computed with ``beta = 1``.

    Args:
        X: 2-D backend tensor holding one batch of inputs, one sample per row.
            The number of rows must equal the module-level ``batch_size``.
        u: target perplexity (only used to derive ``logU`` until the binary
            search is ported).
        tol: entropy tolerance for the binary search (unused until it is
            ported).

    Returns:
        Backend tensor ``P`` of shape (n, n). Row ``i`` holds the normalised
        Gaussian affinities of sample ``i`` to every other sample, with a zero
        on the diagonal.
    """
    # n = size(X, 1);                     % number of instances
    # The symbolic shape cannot be evaluated while the graph is being built,
    # so the fixed batch size is used instead.
    n = batch_size
    # beta = ones(n, 1);                  % empty precision vector
    beta = K.ones((n, 1))
    # logU = log(u);                      % log of perplexity (= entropy)
    # u is a plain Python number, so the log is taken on the host.
    logU = np.log(u)  # target entropy for the binary search (not ported yet)

    # sum_X = sum(X .^ 2, 2);
    # keepdims gives an (n, 1) column so that column + row broadcasts to (n, n).
    sum_X = K.sum(K.square(X), axis=1, keepdims=True)
    # D = bsxfun(@plus, sum_X, bsxfun(@plus, sum_X', -2 * X * X'));
    # MATLAB `X * X'` is a matrix product, hence K.dot rather than `*`.
    D = sum_X + K.transpose(sum_X) - 2 * K.dot(X, K.transpose(X))

    # Symbolic tensors do not support item assignment, so the rows are
    # collected in a list and joined once at the end.
    zero = K.zeros((1,))
    rows = []
    for i in range(n):
        # Di = D(i, [1:i-1 i+1:end]);
        # Leave out the self-distance so it does not contribute to sumP and H.
        Di = K.concatenate([D[i, :i], D[i, i + 1:]], axis=0)
        # [H, thisP] = Hbeta(Di, beta(i));
        # beta[i, 0] is a true scalar, which broadcasts against Di on any backend.
        H, thisP = Hbeta(Di, beta[i, 0])

        # TODO: port the binary search over beta[i] (compare H with logU, stop
        # once the difference is within tol) from x2p.m.

        # P(i, [1:i-1 i+1:end]) = thisP;  -- re-insert a zero at position i
        row = K.concatenate([thisP[:i], zero, thisP[i:]], axis=0)
        rows.append(K.expand_dims(row, 0))

    P = K.concatenate(rows, axis=0)
    return P #, beta
    
# curX is the high-dimensional input (Keras loss functions call this y_true)
# activations is the low-dimensional output (Keras loss functions call this y_pred)
def tsne(curX, activations):
    """Parametric t-SNE loss: KL divergence between input and embedding affinities.

    Args:
        curX: high-dimensional input batch (what Keras passes as ``y_true``).
        activations: low-dimensional network output (what Keras passes as
            ``y_pred``).

    Returns:
        Scalar backend tensor ``C = sum(P .* log((P + eps) ./ (Q + eps)))``.
    """
    perplexity = 30
    eps = K.epsilon()

    # these joint probabilities should be pre-computed per-batch and passed to the fit() function

    # P{i} = x2p(curX{i}, perplexity, 1e-5); % compute affinities using fixed perplexity
    P = x2p(curX, perplexity, 1e-5)
    # P{i}(isnan(P{i})) = 0;                 % make sure we don't have NaN's
    P = T.switch(T.isnan(P), 0, P) # Theano-only
    # P{i} = (P{i} + P{i}') / 2;             % make symmetric
    P = (P + K.transpose(P)) / 2 # this seems to be missing the step of normalizing by "2n", just normalizes by "2"
    # P{i} = P{i} ./ sum(P{i}(:));           % obtain estimation of joint probabilities
    P = P / K.sum(P) # but maybe this makes up for the missing "n" above?
    # P{i} = max(P{i}, eps);
    P = K.maximum(P, eps)

    # v = length(network{end}.bias_upW) - 1
    # Cast to float: the symbolic shape is an integer and (v + 1) / 2 below
    # must be a true division.
    v = K.cast(K.shape(activations)[1] - 1, K.floatx())

    # sum_act = sum(activations .^ 2, 2)
    # keepdims gives an (n, 1) column so that column + row broadcasts to (n, n).
    sum_act = K.sum(K.square(activations), axis=1, keepdims=True)
    # Q = (1 + (bsxfun(@plus, sum_act, bsxfun(@plus, sum_act', -2 * activations * activations')) ./ v)) .^ -((v + 1) / 2)
    # MATLAB `activations * activations'` is a matrix product, hence K.dot.
    D_act = sum_act + K.transpose(sum_act) - 2 * K.dot(activations, K.transpose(activations))
    Q = K.pow(1 + D_act / v, -((v + 1) / 2))
    # Q(1:n+1:end) = 0
    # fill_diagonal returns a new tensor; the result has to be kept.
    Q = fill_diagonal(Q, 0) # Theano-only
    # Q = Q ./ sum(Q(:))
    Q = Q / K.sum(Q)
    # Q = max(Q, eps)
    Q = K.maximum(Q, eps)

    # C = sum(sum(P{1} .* log((P{1} + eps) ./ (Q + eps))))
    # `.*` is element-wise; a single K.sum already reduces over every axis.
    C = K.sum(P * K.log((P + eps) / (Q + eps)))
    return C

In [None]:
model = Sequential()
model.add(Dense(500, input_shape=(784,)))
model.add(Activation('relu'))
model.add(Dense(500))
model.add(Activation('relu'))
model.add(Dense(2000))
model.add(Activation('relu'))
model.add(Dense(2))

sgd = SGD()
%time model.compile(loss=tsne, optimizer=sgd)

In [None]:
# The tsne loss receives the fit() target as y_true and treats it as the
# high-dimensional input, so the inputs double as the targets. (The original
# cell referenced an undefined name, Y_train.)
# x2p() assumes every batch has exactly batch_size rows, so trailing samples
# that would form a shorter final batch are left out.
n_full = (X_train.shape[0] // batch_size) * batch_size
model.fit(X_train[:n_full], X_train[:n_full],
          batch_size=batch_size,
          nb_epoch=20,
          verbose=2)