# Jonathan Halverson
# Wednesday, October 25, 2017
# Geron Chapter 9: Intro to Tensorflow

Let's import the module and then create a simple computation graph:

In [1]:
import tensorflow as tf

# Build the graph: two integer variables and the expression x*x + x*y + 4.
# Nothing is computed here; f is a symbolic tensor until it is run in a session.
x = tf.Variable(3, name="x")
y = tf.Variable(9, name="y")
f = x * x + x * y + 4

The graph has been created. Note that no computation has taken place at this point. Next we create a TF session and execute the graph:

In [2]:
# Inside the with-block, sess is the default session, which is what the
# .run() / .eval() shortcuts below rely on.
with tf.Session() as sess:
     # each variable has to be initialized before f can be evaluated
     x.initializer.run()
     y.initializer.run()
     result = f.eval()  # 3*3 + 3*9 + 4

In [3]:
print(result)

40


Note that x.initializer.run() is equivalent to tf.get_default_session().run(x.initializer) and f.eval() is equivalent to calling tf.get_default_session().run(f).

In [4]:
type(x)

tensorflow.python.ops.variables.Variable

In [5]:
type(f)

tensorflow.python.framework.ops.Tensor

In [6]:
type(result)

numpy.int32

### Graph 2

In [7]:
tf.reset_default_graph()

Let's try another graph:

In [8]:
# Float-valued graph: g = a / b - 1.
a = tf.Variable(-10.0, name='a')
b = tf.Variable(1e3, name='b')
g = a / b - 1.0
# A single node that initializes all variables, replacing the per-variable
# initializer calls used in the first example.
init = tf.global_variables_initializer()

In [9]:
# Initialize every variable through the session, then fetch the value of g.
with tf.Session() as sess:
    sess.run(init)
    result = sess.run(g)

In [10]:
print(result)

-1.01


### Interactive sessions

In [11]:
tf.reset_default_graph()

In [12]:
# Integer graph for the interactive-session example: h = v * u.
u = tf.Variable(4, name='u')
v = tf.Variable(7, name='v')
h = v * u

In [13]:
# An InteractiveSession is not used as a context manager here, so it must be
# closed by hand. The try/finally makes sure that happens even when
# initialization or evaluation raises; otherwise the session would stay open.
sess = tf.InteractiveSession()
try:
    u.initializer.run()
    v.initializer.run()
    result = sess.run(h)
    print(result)
finally:
    sess.close()

28


Unlike a `with tf.Session()` block, an interactive session is not closed automatically, so one needs to close it explicitly, as is done above.

### Managing graphs

In [14]:
tf.reset_default_graph()

In [15]:
# A variable created outside any explicit graph context belongs to the
# default graph.
x1 = tf.Variable(1)
x1.graph is tf.get_default_graph()

True

In [16]:
# Create a second, independent graph. It is the default graph only inside the
# with-block, so x2 is added to `graph` rather than to the global default.
graph = tf.Graph()
with graph.as_default():
     x2 = tf.Variable(2)

In [17]:
x2.graph is graph

True

In [18]:
x2.graph is tf.get_default_graph()

False

Any node that you create is automatically added to the default graph. To assign nodes to a different graph, you have to explicitly create that graph and then make it the default.

In [19]:
# The graph entered here is the current default graph itself, so x3 is added
# to the default graph.
with tf.get_default_graph().as_default():
     x3 = tf.Variable(8, name='x3')

In [20]:
type(tf.get_default_graph())

tensorflow.python.framework.ops.Graph

### Lifecycle of a node

In [21]:
tf.reset_default_graph()

In [22]:
# Small dependency chain: y and z both depend on x, which depends on w.
w = tf.constant(5)
x = w + 2
y = x + 5
z = x + 3

In [23]:
# y and z are evaluated by two separate eval() calls, i.e. two graph runs.
with tf.Session() as sess:
     print(y.eval())
     print(z.eval())

12
10


When z is evaluated, the graph is traversed again, so w and x are recomputed. This is inefficient; instead, we can evaluate y and z in a single run:

In [24]:
# Fetch y and z together in a single sess.run() call.
with tf.Session() as sess:
     y_val, z_val = sess.run([y, z])
     print(y_val)
     print(z_val)

12
10


### Working with tensors

In [25]:
tf.reset_default_graph()

In [26]:
import numpy as np

# Variables initialized from NumPy arrays: x holds 10.0 .. 19.0 and y holds ten
# uniform random numbers (unseeded, so z differs from run to run).
x = tf.Variable(np.arange(10.0, 20.0, 1.0))
y = tf.Variable(np.random.rand(10), dtype=tf.float64)
z = x + y  # element-wise sum
init = tf.global_variables_initializer()

In [27]:
# Close the interactive session in a finally-block so it is released even if
# initialization or evaluation raises.
sess = tf.InteractiveSession()
try:
    init.run()
    print(z.eval())
finally:
    sess.close()

[ 10.75653926  11.31889316  12.30415366  13.68870848  14.77124283
  15.16063615  16.22633327  17.90904999  18.85783076  19.62886324]


### Linear regression with tensorflow (the normal equation)

In [28]:
# Path to the housing CSV, relative to the notebook's working directory.
# Note: this rebinds `f`, which named a tensor in the first example.
f = '../machine_learning/geron_housing/housing.csv'

import pandas as pd
# header=0: the first row of the file supplies the column names.
data = pd.read_csv(f, header=0)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [30]:
# Keep the regression target as its own Series before removing it from data.
housing_labels = data.median_house_value.copy()
# Drops the target and the non-numeric ocean_proximity column in place. After
# this, `data` holds only the features, so this cell cannot be re-run without
# reloading the CSV: both columns are already gone.
data.drop(['ocean_proximity', 'median_house_value'], axis=1, inplace=True)

### An aside on concatenate and append

In [31]:
# 3x3 integer matrix holding 1..9 in row-major order; used by the
# concatenation examples that follow.
z = np.arange(1, 10).reshape(3, 3)
z

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [32]:
# np.c_ with an explicit 3x1 column: appended as a new last column.
np.c_[z, [[-9], [-9], [-9]]]

array([[ 1,  2,  3, -9],
       [ 4,  5,  6, -9],
       [ 7,  8,  9, -9]])

In [33]:
# np.c_ with a flat length-3 list: it is treated as a column, giving the same
# result as the explicit 3x1 form.
np.c_[z, [-9, -9, -9]]

array([[ 1,  2,  3, -9],
       [ 4,  5,  6, -9],
       [ 7,  8,  9, -9]])

In [34]:
# Same column append with np.concatenate; here the new column must already be
# given as a 3x1 array.
np.concatenate([z, [[-9], [-9], [-9]]], axis=1)

array([[ 1,  2,  3, -9],
       [ 4,  5,  6, -9],
       [ 7,  8,  9, -9]])

In [35]:
# np.append along axis 0 adds a new row; the row is passed as a 1x3 array.
np.append(z, [[-8, -8, -8]], axis=0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [-8, -8, -8]])

In [36]:
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# Preprocessing pipeline for the feature frame `data`:
#   1. replace missing values by the column median,
#   2. standardize every feature,
#   3. prepend a column of ones that acts as the bias/intercept term.
# NOTE(review): `Imputer` is the class this notebook imports; newer
# scikit-learn releases presumably need sklearn.impute.SimpleImputer - confirm
# against the installed version.
imp = Imputer(strategy='median')
std_sc = StandardScaler()
housing = std_sc.fit_transform(imp.fit_transform(data))

# after this step housing has one more column than `data`
housing = np.hstack([np.ones((len(housing), 1)), housing])

In [37]:
m, n = housing.shape
m, n

(20640, 9)

Begin to create the graph:

In [38]:
# Closed-form least squares (normal equation): theta = (X^T X)^-1 X^T y.
# X is the preprocessed feature matrix including the bias column; y is the
# target reshaped into a single column.
X = tf.constant(housing, name="X", dtype=tf.float32)
y = tf.constant(housing_labels.values.reshape(-1, 1), name="y", dtype=tf.float32)
XT = tf.transpose(X)
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)

In [39]:
# The graph has no variables, so theta can be fetched without any initializer.
with tf.Session() as sess:
    theta_value = sess.run(theta)

In [40]:
print(theta_value)

[[ 206856.1875    ]
 [ -85369.03125   ]
 [ -90723.1796875 ]
 [  14403.26171875]
 [ -14444.0703125 ]
 [  34037.9453125 ]
 [ -45153.77734375]
 [  30319.48828125]
 [  75520.3125    ]]


In [41]:
from sklearn.linear_model import LinearRegression

# Cross-check the normal-equation result with scikit-learn. The bias column
# (column 0) is left out because LinearRegression fits its own intercept.
lin_reg = LinearRegression().fit(housing[:, 1:], housing_labels)
# Function-call form of print: valid on Python 2 and 3. The %-formatting keeps
# the two values on one line separated by a space, as the print statement did.
print("%s %s" % (lin_reg.intercept_, lin_reg.coef_))

206855.816909 [-85369.22518    -90723.40175504  14403.20315262 -14443.94445799
  34037.42560482 -45153.79498679  30319.8204304   75520.30834439]


### Linear regression with tensorflow (the gradient descent)

In [42]:
tf.reset_default_graph()

In [43]:
# Hyperparameters shared by the batch gradient-descent sections below.
epochs = 5000          # number of full-batch update steps
learning_rate = 0.01   # step size of each update

In [44]:
# Batch gradient descent with a hand-derived gradient.
X = tf.constant(housing, name="X", dtype=tf.float32)
y = tf.constant(housing_labels.values.reshape(-1, 1), name="y", dtype=tf.float32)
# parameters start at uniform random values in [-1, 1)
theta = tf.Variable(tf.random_uniform([n, 1], -1.0, 1.0), name='theta')
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')
# gradient of the MSE with respect to theta: (2/m) * X^T (X theta - y)
gradients = (2.0 / m) * tf.matmul(tf.transpose(X), error)
# one update step: theta <- theta - learning_rate * gradient
training_op = tf.assign(theta, theta - learning_rate * gradients)
init = tf.global_variables_initializer()

The graph is now complete. Now we start a session to execute the computation graph:

In [45]:
# Run `epochs` gradient steps, reporting the loss every 500 epochs.
with tf.Session() as sess:
    init.run()
    for epoch in range(epochs):
        # the loss is reported before this epoch's update is applied
        if not epoch % 500:
            print('Epoch', epoch, "MSE=", sess.run(mse))
        sess.run(training_op)
    # read the trained parameters while the session is still open
    best_theta = sess.run(theta)

('Epoch', 0, 'MSE=', 5.6104817e+10)
('Epoch', 500, 'MSE=', 5.143807e+09)
('Epoch', 1000, 'MSE=', 4.9188029e+09)
('Epoch', 1500, 'MSE=', 4.8696177e+09)
('Epoch', 2000, 'MSE=', 4.8575012e+09)
('Epoch', 2500, 'MSE=', 4.8540954e+09)
('Epoch', 3000, 'MSE=', 4.8529997e+09)
('Epoch', 3500, 'MSE=', 4.8525962e+09)
('Epoch', 4000, 'MSE=', 4.8524242e+09)
('Epoch', 4500, 'MSE=', 4.8523535e+09)


In [46]:
print(best_theta)

[[ 206855.4375    ]
 [ -85015.7265625 ]
 [ -90382.6171875 ]
 [  14422.95507812]
 [ -14494.73828125]
 [  33150.375     ]
 [ -45356.44140625]
 [  31449.94335938]
 [  75547.515625  ]]


### Using Autodiff

Tensorflow offers several ways to compute gradients using autodiff:

In [47]:
tf.reset_default_graph()

In [48]:
# Same model as the manual gradient-descent section; only the gradient is
# obtained differently.
X = tf.constant(housing, name="X", dtype=tf.float32)
y = tf.constant(housing_labels.values.reshape(-1, 1), name="y", dtype=tf.float32)
theta = tf.Variable(tf.random_uniform([n, 1], -1.0, 1.0), name='theta')
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')
# tf.gradients returns one gradient per listed variable; [0] picks theta's.
gradients = tf.gradients(mse, [theta])[0] # reverse-mode autodiff
training_op = tf.assign(theta, theta - learning_rate * gradients)
init = tf.global_variables_initializer()

In [49]:
# Train with the autodiff gradient; same loop as the manual-gradient section.
with tf.Session() as sess:
    init.run()
    for epoch in range(epochs):
        if not epoch % 500:
            # progress report, taken before the update of this epoch
            print('Epoch', epoch, "MSE=", sess.run(mse))
        sess.run(training_op)
    best_theta = sess.run(theta)

('Epoch', 0, 'MSE=', 5.6105034e+10)
('Epoch', 500, 'MSE=', 5.1438065e+09)
('Epoch', 1000, 'MSE=', 4.9188019e+09)
('Epoch', 1500, 'MSE=', 4.8696161e+09)
('Epoch', 2000, 'MSE=', 4.8575017e+09)
('Epoch', 2500, 'MSE=', 4.8540954e+09)
('Epoch', 3000, 'MSE=', 4.8529997e+09)
('Epoch', 3500, 'MSE=', 4.8525957e+09)
('Epoch', 4000, 'MSE=', 4.8524242e+09)
('Epoch', 4500, 'MSE=', 4.8523535e+09)


### Using an optimizer

In [50]:
tf.reset_default_graph()

In [51]:
# Same model again; the update rule now comes from a built-in optimizer
# instead of an explicit tf.assign on theta.
X = tf.constant(housing, name="X", dtype=tf.float32)
y = tf.constant(housing_labels.values.reshape(-1, 1), name="y", dtype=tf.float32)
theta = tf.Variable(tf.random_uniform([n, 1], -1.0, 1.0), name='theta')
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) # define optimizer
training_op = optimizer.minimize(mse) # tell it what to minimize
init = tf.global_variables_initializer()

In [52]:
# Train with the optimizer-provided update op.
with tf.Session() as sess:
    init.run()
    for epoch in range(epochs):
        if not epoch % 500:
            print('Epoch', epoch, "MSE=", sess.run(mse))
        # training_op is an operation, so running it only applies the update
        training_op.run()
    best_theta = sess.run(theta)

('Epoch', 0, 'MSE=', 5.6105005e+10)
('Epoch', 500, 'MSE=', 5.1437993e+09)
('Epoch', 1000, 'MSE=', 4.9188019e+09)
('Epoch', 1500, 'MSE=', 4.8696166e+09)
('Epoch', 2000, 'MSE=', 4.8575012e+09)
('Epoch', 2500, 'MSE=', 4.8540948e+09)
('Epoch', 3000, 'MSE=', 4.8529981e+09)
('Epoch', 3500, 'MSE=', 4.8525957e+09)
('Epoch', 4000, 'MSE=', 4.8524242e+09)
('Epoch', 4500, 'MSE=', 4.852352e+09)


### Implementing Newton-Raphson in TF

In [53]:
tf.reset_default_graph()

In [54]:
# Newton-Raphson for f(x) = x^2 - 3 with f'(x) = 2x, started at x = 2.
# Each run of training_op applies one step x <- x - f(x) / f'(x).
x = tf.Variable(2.0, name="x")
training_op = tf.assign(x, x - (x * x - 3.0) / (2.0 * x))

In [55]:
# Apply ten Newton-Raphson steps and read back the estimate of sqrt(3).
with tf.Session() as sess:
    x.initializer.run()
    for i in range(10):
        sess.run(training_op)
    root = x.eval()
# Function-call form of print: valid on Python 2 and 3. The %-formatting keeps
# both numbers on one line separated by a space. The second value is the ratio
# to the exact square root and should be close to 1.
print("%s %s" % (root, root / 3**0.5))

1.73205 0.999999982052


### Working with placeholders

In [56]:
tf.reset_default_graph()

In [57]:
# A is fed at run time; the None leading dimension lets the number of rows vary
# between runs, while every row must have 3 columns.
A = tf.placeholder(tf.float32, shape=(None, 3))
B = A + 5
with tf.Session() as sess:
    # same graph, fed once with a single row and once with two rows
    B_val_1 = sess.run(B, feed_dict={A: [[1, 2, 3]]})
    B_val_2 = sess.run(B, feed_dict={A: [[1, 2, 3], [4, 5, 6]]})
print(B_val_1)
print(B_val_2)

[[ 6.  7.  8.]]
[[  6.   7.   8.]
 [  9.  10.  11.]]


An exception will be thrown if you don't specify a value for a placeholder at run time. With a placeholder node you can implement mini-batch optimization.

Let's try mini-batch gradient descent:

In [58]:
tf.reset_default_graph()

In [59]:
def fetch_batch(epoch, batch_index, batch_size):
    """Draw a random mini-batch from the preprocessed housing data.

    Rows are sampled without replacement within one batch; separate calls are
    independent, so a row may appear in several batches of the same epoch.

    Parameters
    ----------
    epoch, batch_index : int
        Accepted so existing call sites keep working; not used by the sampling.
    batch_size : int
        Number of rows to draw. Must not exceed the number of rows in
        `housing`, otherwise np.random.choice raises ValueError.

    Returns
    -------
    (X_batch, y_batch)
        X_batch is the selected rows of `housing`; y_batch is the matching
        labels as a column of shape (batch_size, 1).
    """
    # Size the draw from the array itself so the indices are always valid for
    # `housing`, instead of relying on a separately maintained global row count.
    indices = np.random.choice(housing.shape[0], size=batch_size, replace=False)
    # Index the label values positionally. Indexing the Series directly would
    # look the numbers up as index labels, which only matches row positions
    # while the Series keeps its default 0..m-1 index.
    y_batch = housing_labels.values[indices].reshape(-1, 1)
    return housing[indices], y_batch

In [60]:
# Mini-batch version of the model: X and y are placeholders fed per batch.
# The feature dimension is taken from n (the column count of `housing`) rather
# than a literal 9, so it cannot drift apart from theta's shape [n, 1].
X = tf.placeholder(shape=(None, n), name="X", dtype=tf.float32)
y = tf.placeholder(shape=(None, 1), name="y", dtype=tf.float32)
theta = tf.Variable(tf.random_uniform([n, 1], -1.0, 1.0), name='theta')
y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name='mse')
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) # define optimizer
training_op = optimizer.minimize(mse) # tell it what to minimize
init = tf.global_variables_initializer()

In [61]:
epochs = 10
batch_size = 100
# float() forces true division. Under Python 2, `m / batch_size` on two ints
# floors before np.ceil sees it, so the rounding up never happens
# (20640 / 100 gives 206 instead of 207).
n_batches = int(np.ceil(float(m) / batch_size))

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(epochs):
        for batch in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, batch, batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
    # Only the final parameters are used, so read them once after training
    # rather than at the end of every epoch.
    best_theta = theta.eval()
best_theta

array([[ 206239.171875  ],
       [ -78551.421875  ],
       [ -84910.5       ],
       [  16154.18359375],
       [ -13690.53027344],
       [  33491.1875    ],
       [ -42058.05078125],
       [  37615.6953125 ],
       [  75720.9453125 ]], dtype=float32)

### Tensorboard

In [62]:
tf.reset_default_graph()

In [63]:
from datetime import datetime

# Each run logs into its own sub-directory of root_logdir, named after the UTC
# time (YYYYmmddHHMMSS) at which this cell was executed.
root_logdir = "tf_logs"
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
logdir = "{}/run-{}/".format(root_logdir, now)

In [64]:
# NOTE: n_epochs is reassigned to 10 in a later cell before the training loop
# runs, so the value set here is not the one used for training.
n_epochs = 1000
learning_rate = 0.01

# Mini-batch linear regression graph. theta uses a fixed op-level seed so its
# initial values are repeatable.
X = tf.placeholder(tf.float32, shape=(None, n), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")
theta = tf.Variable(tf.random_uniform([n, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

In [65]:
# Summary node that records the current MSE under the tag 'MSE'.
mse_summary = tf.summary.scalar('MSE', mse)
# Writer for this run's log directory; it is also given the graph definition.
# It is closed in a later cell, after training.
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [66]:
n_epochs = 10
batch_size = 100
# float() forces true division. Under Python 2, `m / batch_size` on two ints
# floors before np.ceil sees it, so the rounding up never happens.
n_batches = int(np.ceil(float(m) / batch_size))

In [67]:
# Mini-batch training that also logs the batch MSE every 10th batch.
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for batch_index in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
            if not batch_index % 10:
                # global step: number of batches processed so far
                step = epoch * n_batches + batch_index
                # the summary is taken on this batch before the update
                summary_str = sess.run(mse_summary, feed_dict={X: X_batch, y: y_batch})
                file_writer.add_summary(summary_str, step)
            training_op.run(feed_dict={X: X_batch, y: y_batch})

    best_theta = sess.run(theta)

In [68]:
file_writer.close()

In [69]:
best_theta

array([[ 207612.71875   ],
       [ -79206.9921875 ],
       [ -83784.3515625 ],
       [  16509.82617188],
       [ -16438.02734375],
       [  32183.95507812],
       [ -41060.65234375],
       [  31065.28320312],
       [  78242.8515625 ]], dtype=float32)

### Name scopes

In [70]:
tf.reset_default_graph()

In [71]:
# Three constants and one variable; the variable's graph name is 'i' although
# the Python name is idx.
x = tf.constant(3.0, name='x')
y = tf.constant(6.0, name='y')
z = tf.constant(4.0, name='z')
idx = tf.Variable(1.0, name='i')
# Ops created inside a name scope get the scope as a name prefix
# ('print/Assign' and 'cmb/add' in the outputs further down).
# NOTE(review): both with-statements bind the same Python name `printing`,
# which is not read anywhere in the visible cells.
with tf.name_scope('print') as printing:
     simple_op = tf.assign(idx, x)
with tf.name_scope('cmb') as printing:
     simple_add = tf.add(x, y, name='add')

In [72]:
# Two chained additions created outside any name scope; f refers to the second
# one, which is why its auto-generated op name printed below is 'add_1'.
f = x + y + z

In [73]:
with tf.Session() as sess:
    idx.initializer.run()
    # running the assign op copies x's value (3.0) into idx
    simple_op.eval()
    print(f.eval())
    print(idx.eval())
    result = simple_add.eval()
# Function-call form, consistent with the other print calls in this cell;
# with a single argument the output is the same on Python 2 and 3.
print(result)

13.0
3.0
9.0


In [74]:
print(simple_op.op.name)

print/Assign


In [75]:
print(f.op.name)

add_1


In [76]:
print(simple_add.op.name)

cmb/add


### Modularity

In [77]:
# Fix both random sources for this section. The TensorFlow seed is set after
# the reset so that it applies to the fresh default graph.
seed = 42
tf.reset_default_graph()
tf.set_random_seed(seed)
np.random.seed(seed)

In [78]:
# Input placeholder: any number of rows, n_features columns.
n_features = 3
X = tf.placeholder(tf.float32, shape=(None, n_features), name='X')

In [79]:
# One (n_features, 1) weight column per unit, drawn from a normal distribution.
w1 = tf.Variable(tf.random_normal((n_features, 1)), name='weights1')
w2 = tf.Variable(tf.random_normal((n_features, 1)), name='weights2')

In [80]:
# Scalar bias per unit, starting at zero.
b1 = tf.Variable(0.0, name='bias1')
b2 = tf.Variable(0.0, name='bias2')

In [81]:
# Linear part of each unit: X w + b.
z1 = tf.add(tf.matmul(X, w1), b1, name='z1')
z2 = tf.add(tf.matmul(X, w2), b2, name='z2')

In [82]:
# ReLU activation written as an element-wise max(z, 0).
relu1 = tf.maximum(z1, 0.0, name='relu1')
relu2 = tf.maximum(z2, 0.0, name='relu2')

In [83]:
# The graph output is the sum of the two ReLU units.
output = tf.add(relu1, relu2, name='output')
init = tf.global_variables_initializer()

In [84]:
# Two sample rows with three features each, matching the X placeholder.
ipt = np.array([[4.5, 2.1, -6.0], [1.2, -0.3, -9.1]])
with tf.Session() as sess:
    init.run()
    # Function-call form of print: with a single argument the output is the
    # same on Python 2 and 3, and it matches the print() calls used elsewhere.
    print(relu1.eval(feed_dict={X: ipt}))
    print(relu2.eval(feed_dict={X: ipt}))
    print(output.eval(feed_dict={X: ipt}))
    print(w1.get_shape())

[[ 0.]
 [ 0.]]
[[ 5.1492486 ]
 [ 1.33924854]]
[[ 5.1492486 ]
 [ 1.33924854]]
(3, 1)


#### Stay DRY

In [85]:
tf.reset_default_graph()

In [86]:
def relu(X):
    """Add one ReLU unit on top of X to the graph and return its output.

    Every call creates a fresh weight column (one weight per column of X,
    normally distributed) and a fresh zero bias inside a 'relu' name scope.
    The returned tensor is max(X w + b, 0). The second dimension of X must be
    statically known because it sizes the weights.
    """
    n_inputs = int(X.get_shape()[1])
    with tf.name_scope('relu'):
        w = tf.Variable(tf.random_normal((n_inputs, 1)), name='weights')
        b = tf.Variable(0.0, name='bias')
        z = tf.add(tf.matmul(X, w), b, name='z')
        activation = tf.maximum(z, 0.0, name='relu')
    return activation

In [87]:
# Build five ReLU units on the same input with the relu() helper and sum them.
n_features = 3
X = tf.placeholder(tf.float32, name='X', shape=(None, n_features))
# each relu(X) call creates its own weights and bias
relus = [relu(X) for i in range(5)]
# element-wise sum over the list of unit outputs
output = tf.add_n(relus, name='output')

init = tf.global_variables_initializer()

In [88]:
# Evaluate the summed ReLU output on the sample rows `ipt` defined earlier.
with tf.Session() as sess:
    sess.run(init)
    result = sess.run(output, feed_dict={X: ipt})
print(result)

[[ 0.        ]
 [ 2.71456933]]


In [89]:
# NOTE(review): `output` is built from a placeholder with a variable number of
# rows, so it is presumably not a scalar tensor; confirm tf.summary.scalar
# accepts it before this summary is ever evaluated (it is not run here).
# This also rebinds mse_summary, although the summarized tensor is not an MSE.
mse_summary = tf.summary.scalar('output', output)
# logdir is the path built in the Tensorboard section, so this graph is written
# into that same run directory.
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
file_writer.close()