# Exercise 7: Discovering Tensorflow

In [None]:
# Load packages we need
import sys
import os
import datetime

import numpy as np
import sklearn

import scipy as sp
import pandas as pd

import tensorflow as tf

# we'll use keras for neural networks
import tensorflow.keras as keras
from tensorflow.keras.datasets import mnist

%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams.update({'font.size': 20})

# Let's check our software versions
print('### Python version: ' + sys.version)
print('### Numpy version: ' + np.__version__)
print('### Scikit-learn version: ' + sklearn.__version__)
print('### Tensorflow version: ' + tf.__version__)
print('------------')

# load our packages / code
sys.path.insert(1, '../common/')
import utils
import plots

In [None]:
# global parameters to control behavior of the pre-processing, ML, analysis, etc.

seed = 42 # deterministic seed
np.random.seed(seed) 
tf.random.set_seed(seed)

prop_vec = [24, 2, 2]

## How to think of Tensorflow? Is it like scikit-learn but for neural networks?

### Not really, think of Tensorflow as a kind of NumPy with additional features (i.e., ability to create computational graphs on tensors, automatically compute derivative, run operations on GPUs). (Tensorflow also has many high-level APIs.)

### What are tensors? Formally, they are multilinear maps from vector spaces to the reals, but that doesn't matter here; the point is that tensors can represent scalars, vectors, matrices, etc.

### Beware that Tensorflow 2.0 is different from Tensorflow 1.0! In this course we'll use Tensorflow 2.0.

### Compared to TF 1.0:
### - TF 2.0 incorporates Keras as a high-level API
### - TF 2.0 does *eager* execution by default!
#### In TF 1.0 you would first build the computational graph (construction phase) and then you would execute it in a session (execution phase).

In [None]:
tf.executing_eagerly()

## How do we set the seed for Tensorflow?

In [None]:
tf.random.set_seed(seed)

## Let's get familiar with Tensorflow

In [None]:
scalar = 7 # a scalar in Python

scalar_tf = tf.constant(7) # a TF scalar

print(scalar)
print(scalar_tf)

### Just like NumPy arrays, tensors have shape and dtype properties

In [None]:
vector_np = np.array([3, -5, 9, 1])
print(vector_np)

vector_tf = tf.constant([3, -5, 9, 1])
print(vector_tf)

### We can get the dtype and shape of a tensor. We can also get at the underlying NumPy array using numpy().

In [None]:
print('dtype: ' + str(vector_tf.dtype))
print('shape: ' + str(vector_tf.shape))

numpy_arr = vector_tf.numpy()
print('numpy array: {}, type: {}'.format(str(numpy_arr), type(numpy_arr)))

In [None]:
# we can also build a tensor out of a numpy array
matrix_np = np.array([[3, -7], [0, 9]])
matrix_tf = tf.constant(matrix_np)

print(matrix_tf)

In [None]:
# We can construct tensors in similar ways to how we construct some numpy arrays. For example:

tf_ones = tf.ones((3,3))
print(tf_ones)
print()

# and

tf_unifrand = tf.random.uniform((2, 4))
print(tf_unifrand)
print()

tf_zeros_like_ones = tf.zeros_like(tf_ones)
print(tf_zeros_like_ones)

### We can check if something is a Tensor. For example:

In [None]:
print(tf.is_tensor(matrix_tf))

In [None]:
print(tf.is_tensor(matrix_tf.numpy()))

### We can also place tensors onto devices. For example:

In [None]:
with tf.device('/gpu:0'):
    matrix_on_gpu0 = tf.identity(matrix_tf) # won't work if you don't have a GPU
    
print(matrix_on_gpu0.device)

In [None]:
print("Num CPUs Available: ", len(tf.config.list_physical_devices('CPU')))
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

### We can do operations as follows

In [None]:
x = tf.constant([1, 3])
y = tf.constant([-1, 2])

add_x_y = tf.add(x, y)
print(add_x_y)

In [None]:
x = 7
y = np.array([8, 9])

x + y

In [None]:
# TF counterpart of the NumPy broadcast above: tf.add takes the two operands directly.
# (tf.add_n expects a single list of same-shaped tensors, so tf.add_n(x, y) raises.)
tf.add(x, y)

In [None]:
x = np.zeros((1,3), dtype=np.int32)
y = 7.1
x+y


tf.add(x, y)

In [None]:
# Can we do x + y?
x_plus_y = x + y
print(x_plus_y)

In [None]:
# multiplication by a scalar
x_mult_mone = x * -1
print(x_mult_mone)

In [None]:
# elementwise multiplication
x_mult_y = x * y
# or: x_mult_y = tf.multiply(x,y)
print(x_mult_y)

### what about matrix multiplication and similar ops?

In [None]:
A = tf.constant([[1, 0, 3], [0, -2, 5]])
B = tf.constant([2, -3])

print(A.shape)
print(B.shape)

A_transposed = tf.transpose(A)
print(A_transposed.shape)

B_reshaped = tf.reshape(B, (-1, 1))

print(B_reshaped.shape)

In [None]:
A_T_matrix_mult_B = tf.linalg.matmul(A_transposed, B_reshaped)
# or A_transposed @ B_reshaped

print(A_T_matrix_mult_B)

### Because tensors are immutable, we cannot change their values in place. This seems like it could be a problem because parameters of a model are variables whose values should change frequently.
### For this we can use: tf.Variable

#### We'll typically use those for model parameters and other variables that need to change often in place.

In [None]:
# Let's declare a variable
# variables in TF represent tensors and you change their values by running operations (ops) on them
x = tf.Variable([7, 3], name="x")   # we can name variables (we don't have to, but we can)

In [None]:
print(x)

In [None]:
# Variables also have shape and dtype, etc.
print(x.shape, x.dtype, x.name)

In [None]:
# if you do ops on a variable the result is a tensor not a variable!
xsquared = tf.square(x)
print(xsquared)

In [None]:
# but variables unlike constant can have their values changed in-place (e.g., using one of the assign*() methods). 
# For example:
x.assign(tf.constant([-1, 0]))
print(x)

x.assign_add(tf.constant([3, 3]))
print(x)

In [None]:
# However, shapes must be compatible!
x.assign(tf.constant([5, 9, -17]))

## Cool (and important) feature: automatic differentiation

In [None]:
x = tf.Variable(2, name="x")

### Suppose we want to compute the derivative of x ** 3. Clearly it's 3 x ** 2
### We can do it using tf.GradientTape to keep track of the operations on tensors and then compute the gradient afterwards

In [None]:
# Note: to watch a tensor it must be floating point, so we'll cast x
x = tf.cast(x, dtype=tf.float16)

with tf.GradientTape() as tape:
    tape.watch(x) # we tell the tape to watch variable 'x'
    # now we can do operations like x ** 3
    y = x ** 3
    
    
## What is y?
print(y)

In [None]:
## What is the gradient of y wrt x?
# we want the gradient of y (x**3) with respect to x
grad_xcube = tape.gradient(target=y, sources=x)

In [None]:
print(grad_xcube)

In [None]:
print((3 * x**2).numpy())

### Note: once we get the gradients from the tape, the resources are released.

In [None]:
# This will cause an error
grad_xcube2 = tape.gradient(target=y, sources=x)

### But we can create a persistent tape if we want. For example (a bit more complicated example):

In [None]:
x_np = np.array([1, 2, 3, 4, 5])
x = tf.Variable(x_np, name="x", dtype=tf.float32)

with tf.GradientTape(persistent=True, watch_accessed_variables=True) as tape:
    # watch_accessed_variables=True allows us to not have to set each variable we want to watch
    
    z = tf.constant(7, dtype=tf.float32)
    #z = tf.Variable([7, 7, 7, 7, 7], dtype=tf.float32, name='z')
    
    y = z * tf.math.log(x)
    
print(y)

In [None]:
grad_y_wrt_x = tape.gradient(target=y, sources=x)
print(grad_y_wrt_x)

In [None]:
grad_y_wrt_x2 = tape.gradient(target=y, sources=x) # we can grab it again

In [None]:
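# we can even grab the gradient with respect to something else (e.g., z), as long as the tape watched it:
# trainable tf.Variables are watched automatically, a tf.constant only after tape.watch(z); otherwise the result is None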
grad_y_wrt_z = tape.gradient(target=y, sources=z)
print(grad_y_wrt_z)

## So this is nice but what can we do with it? Let's train a linear regression model with Tensorflow!

### For this, we'll create some simple data

In [None]:
# First make up a model
true_theta = tf.constant([-1, 5, 2, -7, 3], dtype=tf.float32)[:, tf.newaxis]
true_theta

In [None]:
n = 1500
ntr = 1000

# make some random data
x = tf.constant(tf.random.uniform((n, 5), minval=-1, maxval=+1), dtype=tf.float32)

# now calculate the y based on the true parameters
y = tf.constant(x @ true_theta, dtype=tf.float32)

# split the data
train_x = x[:ntr,:]
train_y = y[:ntr]

val_x = x[ntr:,:].numpy()
val_y = y[ntr:].numpy()

In [None]:
# This is batch gradient descent
def train_lr_tf(x, y, eta=0.05, num_iter=250, verbose=False):
    """Fit a bias-free linear regression with batch gradient descent.

    Every iteration evaluates the mean squared error of ``x @ theta`` against
    ``y`` on the whole data set, asks a ``tf.GradientTape`` for the gradient
    of that loss with respect to ``theta``, and moves ``theta`` one step of
    size ``eta`` against the gradient.

    Args:
        x: 2-D feature matrix; its second dimension ``m`` sets the number of
            weights.
        y: regression targets, subtracted element-wise from the ``(n, 1)``
            predictions (so ``(n, 1)`` is the shape that avoids broadcasting).
        eta: learning rate (step size).
        num_iter: number of gradient-descent steps to run.
        verbose: when True, print the training MSE about ten times over the
            run (at every step when ``num_iter`` is below 10).

    Returns:
        The learned weights as a ``tf.Variable`` of shape ``(m, 1)``.
    """
    _, m = x.shape

    # weights / parameters (randomly initialized)
    theta = tf.Variable(tf.random.uniform((m, 1), minval=-1, maxval=1), dtype=tf.float32)

    # Report roughly ten times; clamp to 1 so that num_iter < 10 does not
    # end up as a modulo by zero below.
    report_every = max(1, num_iter // 10)

    for i in range(num_iter):

        # record the forward pass so the tape can differentiate the loss
        with tf.GradientTape() as tape:
            y_pred = tf.linalg.matmul(x, theta)  # prediction
            mse = tf.reduce_mean(tf.square(y - y_pred))

        # extract the gradient of the loss with respect to the weights
        gradient_vec = tape.gradient(mse, theta)

        # gradient descent step (assign_sub() updates theta in place)
        theta.assign_sub(tf.constant([eta], dtype=tf.float32) * gradient_vec)

        if verbose and i % report_every == 0:
            print('Iteration {}: the (training) loss (MSE) is {:.5f}'.format(i, mse))

    return theta

In [None]:
# Let's do the training (on the training split only, so the validation rows stay unseen)
theta = train_lr_tf(train_x, train_y, verbose=True)

In [None]:
print(theta)

In [None]:
# given model parameters 'theta' and a feature matrix 'x', this will return predictions
def predict_theta(theta, x):
    """Return the linear-model predictions ``x . theta``.

    The model has no bias term 'b', so the prediction is just the dot
    product of the feature matrix with the parameter vector.
    """
    predictions = np.dot(x, theta)
    return predictions
    
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error

def print_scores(desc, true_y, pred_y):
    """Print R^2, RMSE and median absolute error for a set of predictions.

    Args:
        desc: label placed in square brackets at the start of the line.
        true_y: ground-truth target values.
        pred_y: predicted values, aligned with ``true_y``.

    The RMSE is the square root of the plain MSE rather than
    ``mean_squared_error(..., squared=False)``: that keyword was deprecated in
    scikit-learn 1.4 and later removed. For single-output targets the two are
    identical; for multi-output targets this is the root of the averaged MSE.
    """
    r2 = r2_score(true_y, pred_y)
    rmse = np.sqrt(mean_squared_error(true_y, pred_y))
    medae = median_absolute_error(true_y, pred_y)

    print('[{}] R^2: {:.2f}, RMSE: {:.2f}, MedAE: {:.2f}'.format(desc, r2, rmse, medae))
        
print_scores('TF-GD Train', train_y, predict_theta(theta.numpy(), train_x))
print_scores('TF-GD Val', val_y, predict_theta(theta.numpy(), val_x))

## This is nice but it seems tedious. Do we have to implement the gradient descent ourselves and do all the low-level stuff?
### => No, we can use a higher-level API like Keras.

In [None]:
# This is the function to define the architecture
def create_model(input_shape, num_outputs=1):
    """Build (but do not compile) a one-layer linear Keras model.

    Args:
        input_shape: shape of the feature matrix; only ``input_shape[1]``
            (the number of input features) is read.
        num_outputs: number of units in the linear output layer.

    Returns:
        A ``keras.models.Sequential`` made of an input spec followed by a
        single ``Dense`` layer with linear activation.
    """
    # keras needs to know how many input features to expect
    num_features = input_shape[1]

    return keras.models.Sequential([
        keras.Input(shape=(num_features,)),
        # output layer: linear activation, i.e. plain w.x + b
        keras.layers.Dense(num_outputs, activation='linear'),
    ])

In [None]:
# first we create the model (i.e., define the architecture)
model = create_model(train_x.shape)

# Tip: before you go on, use summary() to check that the architecture is what you intended
model.summary()

In [None]:
# then we compile it to specify optimizer, loss, and metrics
model.compile(optimizer='sgd', loss='mse', metrics=['mae'])

In [None]:
# finally, we train the model
model.fit(train_x, train_y, epochs=100, batch_size=50, validation_data=(val_x, val_y))

In [None]:
# can we extract the parameters?
def extract_weights(model):
    """Return the weights of every layer of ``model`` as one flat list.

    The weights are concatenated in layer order, each layer contributing
    whatever its ``get_weights()`` returns. For a model with a single layer
    the result is exactly that layer's ``get_weights()`` list; a model
    without layers yields an empty list.
    """
    return [weight for layer in model.layers for weight in layer.get_weights()]

### What are the weights? Are they similar to the ones we found before?

In [None]:
weights = extract_weights(model)
print(weights)

## Let's try a more complex problem with a more complex neural network architecture

### We'll use the Adult data

In [None]:
### In this case, we'll directly load the Adult dataset pre-processed in a similar way as for assignment 1
### and we'll immediately split it into train, test, validation.

data_fp = '../data/adult.preproc.npz'
data = np.load(data_fp)

train_x = data['train_x']; train_y = data['train_y']
test_x = data['test_x']; test_y = data['test_y']
val_x = data['val_x']; val_y = data['val_y']
features = data['features']; labels = data['labels']


# check that we have what we expect
print('Training: {}, {}'.format(train_x.shape, train_y.shape))
print('Test: {}, {}'.format(test_x.shape, test_y.shape))
print('Validation: {}, {}'.format(val_x.shape, val_y.shape))

### Let's train a neural network

In [None]:
# This is the function to define the architecture
def create_model_adult(input_shape, hidden_widths=(96, 32), num_outputs=1):
    """Build (but do not compile) a feed-forward binary classifier.

    Args:
        input_shape: shape of the feature matrix; only ``input_shape[1]``
            (the number of input features) is read.
        hidden_widths: number of units of each hidden layer, in order. One
            ReLU ``Dense`` layer is added per entry, so any number of hidden
            layers (including none) is supported.
        num_outputs: number of units in the sigmoid output layer.

    Returns:
        A ``keras.models.Sequential`` with the requested hidden layers
        followed by a sigmoid output layer.
    """
    model = keras.models.Sequential()

    # declare input layer (keras needs to know the number of input features to expect)
    model.add(keras.Input(shape=(input_shape[1],)))

    # one hidden layer with ReLU activation per requested width
    for width in hidden_widths:
        model.add(keras.layers.Dense(width, activation='relu'))

    # output layer (binary classification with 1 output, so sigmoid makes the most sense)
    model.add(keras.layers.Dense(num_outputs, activation='sigmoid'))

    return model

In [None]:
# create the model (i.e., define the architecture)
model = create_model_adult(train_x.shape)

# Tip: before you go on, use summary() to check that the architecture is what you intended
model.summary()

# then we compile it to specify optimizer, loss, and metrics
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# we train the model
model.fit(x=train_x, y=train_y, epochs=100, batch_size=100, validation_data=(val_x, val_y))

In [None]:
loss, accuracy = model.evaluate(x=test_x, y=test_y, verbose=0)
print('Test accuracy: {:.2f}%'.format(accuracy*100))

## Let's use TensorBoard

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
model = create_model_adult(train_x.shape)
#model.summary()
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

# set up tensorboard log directory and callback
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.fit(x=train_x, y=train_y, epochs=100, batch_size=100, validation_data=(val_x, val_y), 
          callbacks=[tensorboard_callback])

In [None]:
# Start tensorboard (notebook experience)
%tensorboard --logdir logs/fit