# Linear Regression with Tensorflow

In [1]:
# It starts by fetching the dataset
import tensorflow as tf
import numpy as np
from sklearn.datasets import fetch_california_housing

# Adding an extra bias input feature (x0 = 1) to all training instances
housing = fetch_california_housing()
m, n = housing.data.shape # (20640, 8)
housing_data_plus_bias = np.c_[np.ones((m, 1)), housing.data] # (20640, 9)

# Creating two TensorFlow constant nodes, X and y
X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name="X") # (20640, 9)
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y') # (20640, 1)
XT = tf.transpose(X)
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y) # Review: The Normal Equation

with tf.Session() as sess:
    theta_value = theta.eval()

### Review: The Normal Equation

A mathematical equation that gives the result directly

$$ \hat{\theta} = (\mathbf{X^T} \cdot  \mathbf{X})^{-1} \cdot \mathbf{X^T} \cdot y $$

* $\hat{\theta}$ is the value of $\theta$ that minimizes the cost function.
* $y$ is the vector of target values containing $y^{(1)}$ to $y^{(m)}$

### NumPy vs. TensorFlow: The main benefit
TensorFlow will automatically run this on your GPU card


# Implementing Gradient Descent

Using Batch Gradient Descent instead of the Normal Equation, we will do this:

1. by manually computing the gradients
2. by using TensorFlow's autodiff feature
3. by using Tensorflow's out-of-the-box optimizers

**important notes: normalize the input feature vectors first**

## Manually Computing the Gradient

In [3]:
from sklearn.preprocessing import StandardScaler

# Normalize the feature vector: scaled_housing_data_plus_bias
scaled_housing = StandardScaler().fit_transform(housing.data)
scaled_housing_data_plus_bias = np.c_[np.ones((m, 1)), scaled_housing]

In [4]:
n_epochs = 1000
learning_rate = 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1,1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name="theta") # random_uniform() generating a tensor containing random values
y_pred = tf.matmul(X, theta, name="prediction")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
gradients = 2/m * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    
    best_theta = theta.eval()   
    

Epoch 0 MSE = 11.1691
Epoch 100 MSE = 0.901225
Epoch 200 MSE = 0.659847
Epoch 300 MSE = 0.622223
Epoch 400 MSE = 0.598159
Epoch 500 MSE = 0.580397
Epoch 600 MSE = 0.567124
Epoch 700 MSE = 0.557157
Epoch 800 MSE = 0.549641
Epoch 900 MSE = 0.543947


# Using autodiff
Simply replcae the `gradients = ...` line in the Gradient Descent code in the previous section

In [4]:
n_epochs = 1000
learning_rate = 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1,1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name="theta") # random_uniform() generating a tensor containing random values
y_pred = tf.matmul(X, theta, name="prediction")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
#gradients = 2/m * tf.matmul(tf.transpose(X), error)
gradients = tf.gradients(mse, [theta])[0]
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    
    best_theta = theta.eval()   
    

Epoch 0 MSE = 3.60388
Epoch 100 MSE = 0.683695
Epoch 200 MSE = 0.613145
Epoch 300 MSE = 0.593266
Epoch 400 MSE = 0.578705
Epoch 500 MSE = 0.567447
Epoch 600 MSE = 0.558677
Epoch 700 MSE = 0.55181
Epoch 800 MSE = 0.546405
Epoch 900 MSE = 0.54213


# Using an Optimizer
You can simply replace the preceding `gradients = ...` and `training_op = ...` lines

In [5]:
n_epochs = 1000
learning_rate = 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1,1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name="theta") # random_uniform() generating a tensor containing random values
y_pred = tf.matmul(X, theta, name="prediction")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# gradients = 2/m * tf.matmul(tf.transpose(X), error)
# gradients = tf.gradients(mse, [theta])[0]
optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate)
training_op = optimizer.minimize(mse)
#training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    
    best_theta = theta.eval()   
    

Epoch 0 MSE = 5.33706
Epoch 100 MSE = 0.698064
Epoch 200 MSE = 0.619365
Epoch 300 MSE = 0.593583
Epoch 400 MSE = 0.57533
Epoch 500 MSE = 0.561998
Epoch 600 MSE = 0.552235
Epoch 700 MSE = 0.545069
Epoch 800 MSE = 0.539798
Epoch 900 MSE = 0.535911


# Feeding Data to the Training Algorithm

To implement Mni-batch GRadient Descent, you need a way to replcae X and y at every iteration with the next mini-batch. The placeholder nodes are used to pass the training data to TensorFlow during training. To create a placeholder node,
1. call the `placeholder()` function.
2. specify the output tensor's data type.
3. optionally, you can specify its shape, if you want to enforce it. `None` means "any size."

In [6]:
A = tf.placeholder(tf.float32, shape=(None, 3))
B = A + 5

with tf.Session() as sess:
    B_val_1 = B.eval(feed_dict = {A:[[1,2,3]]})
    B_val_2 = B.eval(feed_dict = {A:[[4,5,6], [7,8,9]]})
    
    print(B_val_1)
    print(B_val_2)

[[ 6.  7.  8.]]
[[  9.  10.  11.]
 [ 12.  13.  14.]]


In [7]:
def fetch_batch(epoch, batch_index, batch_size):
    np.random.seed(epoch * batch_index)
    batch_mark = np.random.choice(m, batch_size)
    X_batch = scaled_housing_data_plus_bias[batch_mark]
    y_batch = housing.target.reshape(-1,1)[batch_mark]
    return X_batch, y_batch

In [8]:
n_epochs = 1000
learning_rate = 0.01

# Changing the definition of X and y in the construction phase to make them placeholder nodes

# X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
X = tf.placeholder(tf.float32, shape=(None, n+1), name="X")
# y = tf.constant(housing.target.reshape(-1,1), dtype=tf.float32, name="y")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")

theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0), name="theta") # random_uniform() generating a tensor containing random values
y_pred = tf.matmul(X, theta, name="prediction")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate)

training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

# define the batch size and compute the total number of batches 
batch_size = 100
n_batches = int(np.ceil(m / batch_size))

with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(n_epochs):
        for batch_index in range(n_batches):
            # fetch the mini-batches one by one, then provide the value of X and y via the feed_dict parameter
            X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
    
    best_theta = theta.eval()   
    