In [1]:
#tutorial based on 6.1 Example: Learning XOR (page 170 in Deep Learning book)
#The XOR function ("exclusive or") is an operation on two binary values, x1 and x2.
#When exactly one of these values is equal to 1, the XOR function returns 1. Otherwise, it returns 0.
#For now, we are not concerned with statistical generalization.
#We want our network to perform correctly on the four points X = {[0,0] , [0,1] , [1,0] , and [1,1] }.

In [3]:
#We can treat this problem as a regression problem and use a mean squared error loss function. 
#We choose this loss function to simplify the math for this example as much as possible. 
#(there are other, more appropriate approaches for modeling binary data)

In [4]:
#load tensorflow
import tensorflow as tf

In [5]:
#We can minimize in closed form with respect to w and b using the normal equations.
#After solving the normal equations, we obtain w = 0 and b = 1/2.
#The linear model simply outputs 0.5 everywhere.

#input data: the four XOR input points
XOR_X = [[0,0],[0,1],[1,0],[1,1]] #input

#build this model in its own graph, so that re-running this cell (or running the
#other cells) does not pile duplicate ops/variables into the shared default graph
graph = tf.Graph()
with graph.as_default():
    #placeholders
    x_ = tf.placeholder(tf.float32, shape=[4,2], name="x-input")
    #use weights/biases from book solution (page 171)
    #note: dtype has to be passed by keyword, because the second positional
    #argument of tf.Variable is `trainable`, not the dtype
    w = tf.Variable(tf.zeros([2,1]), dtype=tf.float32)
    b = tf.Variable([1/2.], dtype=tf.float32)

    #operation node
    linear_model = tf.matmul(x_,w) + b

    #initializer for the variables of this graph only
    init = tf.global_variables_initializer()

#session bound to this cell's graph; it is left open so it can be reused
#interactively -- call sess.close() once it is no longer needed
sess = tf.Session(graph=graph)
sess.run(init)

#see what the predictions are
print(sess.run(linear_model, {x_: XOR_X}))

[[ 0.5]
 [ 0.5]
 [ 0.5]
 [ 0.5]]


In [6]:
#solve the problem using a model that learns a different feature space in which a
#linear model is able to represent the solution.
#introduce a very simple feedforward network with one hidden layer containing two hidden units.
#the output layer now receives the hidden layer's output instead of the raw input

#input data: the four XOR input points
XOR_X = [[0,0],[0,1],[1,0],[1,1]] #input

#build this model in its own graph, so that its initializer only touches the
#variables defined here and re-running the cell does not duplicate ops
graph2 = tf.Graph()
with graph2.as_default():
    #placeholders
    x_ = tf.placeholder(tf.float32, shape=[4,2], name="x-input")
    #use weights/biases from book example (page 173)
    #note: dtype has to be passed by keyword, because the second positional
    #argument of tf.Variable is `trainable`, not the dtype
    w1 = tf.Variable(tf.ones([2,2]), dtype=tf.float32) #W
    w2 = tf.Variable([[1.],[-2.]], dtype=tf.float32) #w
    b1 = tf.Variable([[0.,-1.]], dtype=tf.float32) #c
    b2 = tf.Variable(tf.zeros(1), dtype=tf.float32) #b

    #operation nodes
    transformedH = tf.nn.relu(tf.matmul(x_,w1) + b1) #hidden layer with rect. linear act. func.
    linear_model = tf.matmul(transformedH, w2) + b2

    #initializer for the variables of this graph only
    init2 = tf.global_variables_initializer()

#session bound to this cell's graph; it is left open so it can be reused
#interactively -- call sess2.close() once it is no longer needed
sess2 = tf.Session(graph=graph2)
sess2.run(init2)

#see what the predictions are
print(sess2.run(linear_model, {x_: XOR_X}))

[[ 0.]
 [ 1.]
 [ 1.]
 [ 0.]]


In [8]:
#In a real situation there are lots of model parameters and training examples, so
#we cannot guess the solution as we did above. Instead, a gradient-based optimization
#algorithm can find parameters that produce very little error.
#Now solve the same problem, but use gradient-based optimization to find the params.
#To do so we need to measure the error/loss, which also requires the target values.

#input data
XOR_X = [[0,0],[0,1],[1,0],[1,1]] #input
XOR_Y = [[0],[1],[1],[0]] #target (expected) outputs

#build this model in its own graph: the initializer and optimizer.minimize() act on
#every variable of the graph they are created in, so sharing the default graph with
#the other cells would drag those cells' variables into this training run
graph3 = tf.Graph()
with graph3.as_default():
    #placeholders, now we need one for the target values too
    x_ = tf.placeholder(tf.float32, shape=[4,2], name="x-input")
    y_ = tf.placeholder(tf.float32, shape=[4,1], name="y-input")
    #now we start from random values
    #note: dtype has to be passed by keyword, because the second positional
    #argument of tf.Variable is `trainable`, not the dtype
    w1 = tf.Variable(tf.random_uniform([2,2], -2, 2), dtype=tf.float32) #W
    w2 = tf.Variable(tf.random_uniform([2,1], -2, 2), dtype=tf.float32) #w
    b1 = tf.Variable(tf.zeros([2]), dtype=tf.float32) #c
    b2 = tf.Variable(tf.zeros([1]), dtype=tf.float32) #b

    #operation nodes
    transformedH = tf.nn.relu(tf.matmul(x_,w1) + b1) #hidden layer with rect. linear act. func.
    linear_model = tf.matmul(transformedH, w2) + b2

    #sum of squared errors over the four points; this is 4 * MSE, so it has the
    #same minimizer as the mean squared error discussed above
    loss = tf.reduce_sum(tf.square(linear_model - y_))

    #gradient descent
    optimizer = tf.train.GradientDescentOptimizer(0.01) #0.01 is learning rate
    train = optimizer.minimize(loss) #feed optimizer loss function

    #initializer for the variables of this graph only
    init3 = tf.global_variables_initializer()

#session bound to this cell's graph; it is left open so it can be reused
#interactively -- call sess3.close() once it is no longer needed
sess3 = tf.Session(graph=graph3)
sess3.run(init3)

#train it
feed = {x_: XOR_X, y_: XOR_Y}
for i in range(10000):
    sess3.run(train, feed)

#take a look at the results (one run call, so all values come from the same state)
predictions, hidlay, curr_w1, curr_w2, curr_b1, curr_b2, curr_loss = sess3.run(
    [linear_model, transformedH, w1, w2, b1, b2, loss], feed)
print("predictions:\n %s\n hlayout:\n %s\n"%(predictions,hidlay))
print("w1:\n %s \nw2:\n %s \nb1: %s \nb2: %s \nloss: %s"%(curr_w1, curr_w2, curr_b1, curr_b2, curr_loss))

predictions:
 [[  1.80513598e-06]
 [  9.99997258e-01]
 [  9.99999046e-01]
 [  1.68359838e-06]]
 hlayout:
 [[  0.00000000e+00   1.09952327e-03]
 [  1.19209290e-07   7.78593838e-01]
 [  4.13178086e-01   1.47565293e+00]
 [  1.33489180e+00   2.25314736e+00]]

w1:
 [[ 1.33489168  1.47455347]
 [ 0.92171371  0.77749431]] 
w2:
 [[-2.16986394]
 [ 1.28617752]] 
b1: [-0.92171359  0.00109952] 
b2: [-0.00141238] 
loss: 1.45201e-11


In [9]:
#Using the approach above we will often find a different solution, because the minimum
#that is found depends on the random initial weights (if the sess3 cell is run enough
#times, it will every once in a while find a solution similar to the book example's).
#If we draw the initial weights closer to the values provided in the example,
#we consistently get the same results.

#input data
XOR_X = [[0,0],[0,1],[1,0],[1,1]] #input
XOR_Y = [[0],[1],[1],[0]] #target (expected) outputs

#build this model in its own graph: the initializer and optimizer.minimize() act on
#every variable of the graph they are created in, so sharing the default graph with
#the other cells would drag those cells' variables into this training run
graph4 = tf.Graph()
with graph4.as_default():
    #placeholders, now we need one for the target values too
    x_ = tf.placeholder(tf.float32, shape=[4,2], name="x-input")
    y_ = tf.placeholder(tf.float32, shape=[4,1], name="y-input")
    #constrain the random starting values
    #note: dtype has to be passed by keyword, because the second positional
    #argument of tf.Variable is `trainable`, not the dtype
    w1 = tf.Variable(tf.random_uniform([2,2], .7, 1.3), dtype=tf.float32) #W
    w2 = tf.Variable(tf.random_uniform([2,1], -2, 1), dtype=tf.float32) #w
    b1 = tf.Variable(tf.zeros([2]), dtype=tf.float32) #c
    b2 = tf.Variable(tf.zeros([1]), dtype=tf.float32) #b

    #operation nodes
    transformedH = tf.nn.relu(tf.matmul(x_,w1) + b1) #hidden layer with rect. linear act. func.
    linear_model = tf.matmul(transformedH, w2) + b2

    #sum of squared errors over the four points; this is 4 * MSE, so it has the
    #same minimizer as the mean squared error discussed above
    loss = tf.reduce_sum(tf.square(linear_model - y_))

    #gradient descent
    optimizer = tf.train.GradientDescentOptimizer(0.01) #0.01 is learning rate
    train = optimizer.minimize(loss) #feed optimizer loss function

    #initializer for the variables of this graph only
    init4 = tf.global_variables_initializer()

#session bound to this cell's graph; it is left open so it can be reused
#interactively -- call sess4.close() once it is no longer needed
sess4 = tf.Session(graph=graph4)
sess4.run(init4)

#train it
feed = {x_: XOR_X, y_: XOR_Y}
for i in range(10000):
    sess4.run(train, feed)

#take a look at the results (one run call, so all values come from the same state)
predictions, hidlay, curr_w1, curr_w2, curr_b1, curr_b2, curr_loss = sess4.run(
    [linear_model, transformedH, w1, w2, b1, b2, loss], feed)
print("predictions:\n %s\n hlayout:\n %s\n"%(predictions,hidlay))
print("w1:\n %s \nw2:\n %s \nb1: %s \nb2: %s \nloss: %s"%(curr_w1, curr_w2, curr_b1, curr_b2, curr_loss))

predictions:
 [[  2.63455718e-06]
 [  9.99998033e-01]
 [  9.99997914e-01]
 [  1.41855867e-06]]
 hlayout:
 [[  2.79839547e-08   0.00000000e+00]
 [  1.17059112e+00   0.00000000e+00]
 [  1.17059100e+00   0.00000000e+00]
 [  2.34118223e+00   1.05052531e+00]]

w1:
 [[ 1.170591    1.05052531]
 [ 1.17059112  1.05052531]] 
w2:
 [[ 0.85426533]
 [-1.9038018 ]] 
b1: [  2.79839547e-08  -1.05052531e+00] 
b2: [  2.61065156e-06] 
loss: 1.71742e-11
