In [1]:
import numpy as np

In [34]:
EPS = 1e-6
ALPHA = 0.01
MAX_EPOCHS = 20

In [3]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

In [39]:
def softmax(x):
    e = np.exp(x)
    return e / np.sum(e)

In [27]:
def relu(x):
    return max(0, x)

In [4]:
def forward_prop(X, weights, biases, activations):
    assert len(weights) == len(biases) == len(activations)
    result = X
    outputs = []
    for W, b, activation in zip(weights, biases, activations):
        layer_output = np.dot(result, W) + b
        result = activation(layer_output)
        outputs.append(result)
    return result, outputs

In [5]:
def cross_entropy_loss(y, t):
    return -np.mean(t * np.log(y) + (1 - t) * np.log(1 - y))

In [6]:
def back_prop(x, pred, target, weights, biases, outputs):
    Ew = pred - target
    Ev = outputs[0] * np.dot(weights[1], Ew)
    loss = cross_entropy_loss(pred, target)
    dW = np.outer(outputs[0], Ew)
    dV = np.outer(x, Ev)
    return loss, (dV, dW, Ev, Ew)

In [7]:
def predict(x, weights, biases, activations):
    return np.argmax(forward_prop(x, weights, biases, activations)[0])

In [33]:
def fit(X, y):
    assert X.shape[0] == y.shape[0]

    input_dim = X.shape[1]
    hidden_dim = 3
    output_dim = y.shape[-1]

    input_to_hidden_W = np.random.random((input_dim, hidden_dim))
    input_to_hidden_b = np.random.random(hidden_dim)
    hidden_to_output_W = np.random.random((hidden_dim, output_dim))
    hidden_to_output_b = np.random.random(output_dim)

    weights = [input_to_hidden_W, hidden_to_output_W]
    biases = [input_to_hidden_b, hidden_to_output_b]
    activations = [np.vectorize(relu), np.vectorize(softmax)]
    layers = weights + biases
    err = [9999]
    epoch = 0
    while EPS < np.mean(err) and epoch < MAX_EPOCHS:
        err = []
        upd = [0] * len(layers)
        for i in range(X.shape[0]):
            predicted, outputs = forward_prop(X[i], weights, biases, activations)
            loss, grad = back_prop(X[i], predicted, y[i], weights, biases, outputs)

            for j in range(len(layers)):
                layers[j] -= upd[j]

            for j in range(len(layers)):
                upd[j] = ALPHA * grad[j]
            err.append(loss)
        print 'Epoch {}: loss {}'.format(epoch, np.mean(err))
        epoch += 1
    return weights, biases, activations

In [20]:
X = np.random.binomial(1, 0.5, (1000, 10))
print X[:10]
y = X[:,1]
print y[:10]

y_one_hot = np.zeros((y.shape[0], 2))
for i, y_i in enumerate(y):
    y_one_hot[i][y_i] = 1

[[1 1 0 0 1 1 0 1 1 0]
 [1 1 1 1 1 1 0 1 1 0]
 [0 1 1 0 1 1 1 0 1 0]
 [0 1 0 1 0 0 0 1 0 0]
 [1 1 1 0 0 0 1 0 1 1]
 [0 1 0 0 1 0 0 1 0 1]
 [0 0 1 1 1 1 0 1 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [1 0 0 1 1 1 0 1 0 1]
 [1 0 1 0 0 0 1 1 0 1]]
[1 1 1 1 1 1 0 1 0 0]


In [40]:
params = fit(X, y)
print predict(X[0], *params)

Epoch 0: loss 0.705409721159
Epoch 1: loss 0.694123610032
Epoch 2: loss 0.694110114229
Epoch 3: loss 0.694113430547
Epoch 4: loss 0.694113730628
Epoch 5: loss 0.694113755316
Epoch 6: loss 0.694113757332
Epoch 7: loss 0.694113757496
Epoch 8: loss 0.69411375751
Epoch 9: loss 0.694113757511
Epoch 10: loss 0.694113757511
Epoch 11: loss 0.694113757511
Epoch 12: loss 0.694113757511
Epoch 13: loss 0.694113757511
Epoch 14: loss 0.694113757511


KeyboardInterrupt: 

In [38]:
predict(X[4], *params)

0