# Based on Chapter 9 of [Hands-on Machine Learning with Scikit-Learn and TensorFlow](http://shop.oreilly.com/product/0636920052289.do) by A. Géron

### Example of a simple computation

In [None]:
import tensorflow as tf

# Two integer variables; they hold no value until their initializers are run
x = tf.Variable(3, name='x')
y = tf.Variable(4, name='y')

# Only builds the graph node for x^2 * y + y + 2; nothing is computed yet
f = x * x * y + y + 2

### Initialize a TensorFlow session, initialize the variables, and run the computation for the expected result of 42
A TensorFlow session places the operations onto devices such as CPUs and GPUs and runs them. The session needs to be closed to free the resources.

In [None]:
# Create the session explicitly; it has to be closed by hand at the end
sess = tf.Session()
# Run each variable's initializer before evaluating anything that uses it
sess.run(x.initializer)
sess.run(y.initializer)
# Evaluate `f` inside the session: 3*3*4 + 4 + 2 = 42
result = sess.run(f)
print('Result %.1f' % result)
# Free the resources held by the session
sess.close()

Another way to do the same thing using `with`:

In [None]:
# Inside the `with` block, `sess` is the default session, i.e. the one
# returned by `tf.get_default_session()`
with tf.Session() as sess:
    x.initializer.run()
    y.initializer.run()
    result = f.eval() # Equivalent to `tf.get_default_session().run(f)`
    print('Result %.1f' % result)

One can also create an initializer that initializes all variables when it is run:

In [None]:
# A single node that initializes all variables when it is run
init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Replaces the separate `x.initializer` / `y.initializer` runs
    init.run()
    # `result` is computed here but not printed in this cell
    result = f.eval()

In Jupyter, it's useful to create an `InteractiveSession` that is set as the default session:

In [None]:
# The interactive session is set as the default session on creation,
# so `run()` and `eval()` work without a `with` block
sess = tf.InteractiveSession()
init.run()
result = f.eval()
print('Result %.1f' % result)
# Still has to be closed by hand
sess.close()

### Managing graphs
Any created node is automatically added to the default graph:

In [None]:
# No graph is given, so the new node is added to the default graph
x1 = tf.Variable(1)
assert x1.graph is tf.get_default_graph()

It is possible to create multiple independent computation graphs:

In [None]:
# A second graph, independent of the default one
graph = tf.Graph()
# Within this block `graph` temporarily acts as the default graph
with graph.as_default():
    x2 = tf.Variable(2)
    
# `x2` belongs to `graph`, and outside the block the default graph is a different one
assert x2.graph is graph
assert x2.graph is not tf.get_default_graph()

Running the same code multiple times adds duplicate nodes to the default graph. The graph can be reset as follows:

In [None]:
# Running the same definition twice adds two separate nodes to the default graph
x1 = tf.Variable(1, name='x')
x1 = tf.Variable(1, name='x')

# After a reset the default graph contains no nodes at all
tf.reset_default_graph()
assert len(tf.get_default_graph().as_graph_def().node) == 0

### Lifecycle of node values
For each node added to the graph, TF automatically determines the set of nodes it depends on and evaluates these nodes first.

In [None]:
# Dependency structure: y and z both depend on x, which depends on w;
# z does NOT depend on y
w = tf.constant(3)
x = w + 2
y = x + 5 
z = x * 3

with tf.Session() as sess:
    print(y.eval()) # Evaluates w, then x, then y (prints 10)
    print(z.eval()) # Evaluates w and x again, then z (prints 15); values from the previous run are not re-used

Re-evaluation can be avoided by evaluating `y` and `z` in one graph run:

In [None]:
with tf.Session() as sess:
    # Both nodes are fetched in a single graph run
    y_val, z_val = sess.run([y, z])
    print(y_val)
    print(z_val)

### Managing tensors
Inputs and outputs in TF operations are tensors, represented as NumPy `ndarray`s in the Python API. Previous examples only involved scalars, but one can perform computations on arrays of any shape. The example below performs linear regression for the California housing data set, which has 8 input features and one target (price). The optimal parameters are found by solving the normal equation $\Theta = (\mathbf{X}^T\mathbf{X})^{-1} \mathbf{X}^T\mathbf{y}$, where $\mathbf{X}$ is the matrix of input features (including the "bias" column of ones), $\mathbf{y}$ contains the target values, and $\Theta$ minimizes $||\mathbf{X}\Theta - \mathbf{y}||^2$.

In [None]:
import numpy as np
from sklearn.datasets import fetch_california_housing

# Start from an empty default graph
tf.reset_default_graph()

housing = fetch_california_housing()
n_samples, n_features = housing.data.shape  # rows are samples, columns are features

assert n_features == 8

# Prepend the bias feature `x_0 = 1` as the first column of the feature matrix
housing_data_plus_bias = np.c_[np.ones((n_samples, 1)), housing.data]

assert housing_data_plus_bias.shape[1] == n_features + 1

X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name='X')
# Targets reshaped to a column vector
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')

XT = tf.transpose(X)

# Normal equation: theta = (X^T X)^-1 X^T y
theta = tf.matmul(
    tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT),
    y
)

with tf.Session() as sess:
    theta_value = theta.eval()  # Runs on a GPU if one is available

print('Theta ', theta_value)

Real problems require gradient descent, which can also be performed "manually" with TensorFlow. Here the gradient for the cost function is evaluated as 
$$
\nabla_{\Theta} L = \frac{2}{N} \mathbf{X}^T (\mathbf{y}^{pred} - \mathbf{y}),
$$
or 
$$
\partial L/\partial \theta_i = \frac{2}{N} \sum_j X_{ji} (y_j^{pred} - y_j).
$$

In [None]:
from sklearn.preprocessing import StandardScaler

tf.reset_default_graph()

# Feature scaling is required for gradient-descent methods
X_scaler = StandardScaler()
scaled_housing_data = X_scaler.fit_transform(housing.data)

# Bias column of ones goes in front of the scaled features
scaled_housing_data_plus_bias = np.c_[np.ones((n_samples, 1)), scaled_housing_data]

n_epochs = 1000
learning_rate = 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name='X')
y = tf.constant(housing.target.reshape(-1, 1), dtype=tf.float32, name='y')
# Parameters start from uniform random values in [-1, 1)
theta = tf.Variable(tf.random_uniform([n_features + 1, 1], -1.0, 1.0), name='theta')

y_pred = tf.matmul(X, theta, name='predictions')
error = y_pred - y

mse = tf.reduce_mean(tf.square(error), name='mse')

# Analytical gradient of the MSE: (2 / N) * X^T (y_pred - y)
gradients = 2 / n_samples * tf.matmul(tf.transpose(X), error)
# Autodiff alternative: gradients = tf.gradients(mse, [theta])[0]

# One gradient-descent step: theta <- theta - learning_rate * gradients
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        # Report the loss every 100 epochs, before that epoch's update
        if epoch % 100 == 0:
            print('Epoch %d, mse %.2f' % (epoch, mse.eval()))
        sess.run(training_op)

    best_theta = theta.eval()

print('Theta ', best_theta)

Of course, we do not generally need to evaluate the gradients manually. TensorFlow can do it using [reverse-mode autodiff](https://en.wikipedia.org/wiki/Automatic_differentiation#Reverse_accumulation) as:
```python
gradients = tf.gradients(mse, [theta])[0]
```
Replacing the analytical expression for `gradients` in the cell above leaves the result unchanged.

Similarly, one does not need to implement gradient descent oneself, as TensorFlow provides optimizers. One can replace the `training_op` in the above cell with the following code (and also delete `gradients`):

```python
optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate)
training_op = optimizer.minimize(mse)
```

One can also implement Mini-batch GD using special `placeholder` nodes. `placeholder` nodes do not perform any computation; they only output the data that they are told to output at runtime:

In [None]:
# `None` leaves the first dimension of A open, so any number of rows can be fed
A = tf.placeholder(tf.float32, shape=(None, 3))
B = A + 5
with tf.Session() as sess:
    # The same node is evaluated with a one-row and then a two-row input
    B_val_1 = B.eval(feed_dict={A: [[1, 2, 3]]})
    B_val_2 = B.eval(feed_dict={A: [[4, 5, 6], [7, 8, 9]]})
print(B_val_1)
print(B_val_2)

The below code implements Mini-batch GD using `placeholder` nodes. In addition, it demonstrates saving the model and writing logs for Tensorboard visualization.

In [None]:
import os
from datetime import datetime

# Directory that receives the model checkpoints
models_dir = 'models/'

# Training hyperparameters
n_epochs = 100
learning_rate = 0.01

# Mini-batch layout; rounding up means the last batch may be smaller than `batch_size`
batch_size = 100
n_batches = int(np.ceil(n_samples / batch_size))

def tb_logdir():
    """Return a timestamped TensorBoard log directory path.

    The path has the form ``tf_logs/run-<YYYYmmddHHMMSS>``, where the
    timestamp is the current UTC time. Nothing is created on disk here.
    """
    timestamp = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    run_name = 'run-%s' % timestamp
    return os.path.join('tf_logs', run_name)

logdir = tb_logdir()
print('Using %s for TensorBoard logs' % logdir)

tf.reset_default_graph()

# Inputs are fed one mini-batch at a time, so the batch dimension is left open
X = tf.placeholder(tf.float32, shape=(None, n_features + 1), name='X')
y = tf.placeholder(tf.float32, shape=(None, 1), name='y')

theta = tf.Variable(tf.random_uniform([n_features + 1, 1], -1.0, 1.0), name='theta')

y_pred = tf.matmul(X, theta, name='predictions')

# Group the error and MSE nodes under the "loss" name scope to reduce clutter
with tf.name_scope("loss") as scope:
    error = y_pred - y
    mse = tf.reduce_mean(tf.square(error), name='mse')

# Nodes created inside the scope carry the "loss/" prefix
assert error.op.name == 'loss/sub'

optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

# Scalar MSE summary for TensorBoard; the writer is also handed the graph
mse_summary = tf.summary.scalar('MSE', mse)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

saver = tf.train.Saver()

init = tf.global_variables_initializer()

# Targets as a column vector, matching the shape of placeholder `y`
y_targets = housing.target.reshape(-1, 1)

def fetch_batch(epoch, batch_index, batch_size):
    """Return the `batch_index`-th consecutive mini-batch of the training data.

    Rows `batch_index * batch_size` up to (but excluding)
    `(batch_index + 1) * batch_size`, capped at `n_samples`, are taken from
    the module-level `scaled_housing_data_plus_bias` and `y_targets`.
    `epoch` is accepted but not used.

    Returns:
        Tuple `(X_batch, y_batch)` holding the selected feature and target rows.
    """
    first_row = batch_index * batch_size
    # The last batch stops at the end of the data set and may be shorter
    end_row = min(first_row + batch_size, n_samples)
    rows = range(first_row, end_row)
    return scaled_housing_data_plus_bias[rows, :], y_targets[rows, :]

    
# Checkpoint locations and the full-data feed are loop-invariant
checkpoint_path = os.path.join(models_dir, 'mini-batch-gd.ckpt')
final_checkpoint_path = os.path.join(models_dir, 'mini-batch-gd-final.ckpt')
full_feed_dict = {X: scaled_housing_data_plus_bias, y: y_targets}

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for batch_index in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
            feed_dict = {X: X_batch, y: y_batch}

            # Every 10th batch, log the batch MSE for TensorBoard (before the update)
            if batch_index % 10 == 0:
                step = epoch * n_batches + batch_index
                summary_str = mse_summary.eval(feed_dict=feed_dict)
                file_writer.add_summary(summary_str, step)

            sess.run(training_op, feed_dict=feed_dict)

        # Report the MSE over the whole data set after epochs 1, 10, 20, ...
        if epoch == 0 or (epoch + 1) % 10 == 0:
            full_mse = mse.eval(feed_dict=full_feed_dict)
            print('Epoch %d/%d, mse %.3f' % (epoch + 1, n_epochs, full_mse))
        # Intermediate checkpoint every 25 epochs, overwriting the previous one
        if epoch % 25 == 0:
            save_path = saver.save(sess, checkpoint_path)
            print('Saved model to %s' % save_path)

    best_theta = theta.eval()
    save_path = saver.save(sess, final_checkpoint_path)

file_writer.close()
print('Theta', best_theta)

## Reusing variables
The example below defines a graph that computes the sum of five ReLU nodes. Each ReLU uses the same variable `threshold`, which is created in the first call to the `relu()` function. Later calls then re-use the variable created in the first call (`reuse = tf.AUTO_REUSE`).

In [None]:
def relu(X):
    """Build one ReLU-like unit on top of `X` and return its output node.

    The unit computes `max(X @ weights + bias, threshold)`. `threshold` is
    obtained with `tf.get_variable`, so whether it is created or re-used
    depends on the enclosing variable scope; `weights` and `bias` are new
    `tf.Variable`s on every call.

    Args:
        X: 2-D tensor whose second dimension is statically known.

    Returns:
        The `tf.maximum` node named 'max'.
    """
    n_inputs = int(X.get_shape()[1])
    threshold = tf.get_variable(
        'threshold', shape=(), initializer=tf.constant_initializer(0.0)
    )
    weights = tf.Variable(tf.random_normal((n_inputs, 1)), name='weights')
    bias = tf.Variable(0.0, name='bias')
    linear = tf.add(tf.matmul(X, weights), bias, name='z')
    return tf.maximum(linear, threshold, name='max')

tf.reset_default_graph()
X = tf.placeholder(tf.float32, shape=(None, n_features), name='X')
relus = []
for relu_index in range(5):
    # Enter variable scope "relu" with `reuse=tf.AUTO_REUSE` so that "threshold" is
    # created by the first `relu()` call and re-used by the following ones.
    # `reuse=True` would raise an error on the first call because the variable does
    # not exist yet; `reuse=False` would raise on the second call because it already exists.

    # Each `with tf.variable_scope(...)` also opens a new name scope for nodes such as
    # `tf.Variable()`. Name scopes avoid clashes by running enumeration: `relu_1`, `relu_2`, etc.
    with tf.variable_scope('relu', reuse=tf.AUTO_REUSE) as scope:
        relus.append(relu(X))
output = tf.add_n(relus, name='output')

# Write the graph for TensorBoard, then close the writer so the event file is
# flushed to disk and the file handle is released
file_writer = tf.summary.FileWriter(tb_logdir(), tf.get_default_graph())
file_writer.close()