In [None]:
# execute this cell before you start

import tensorflow as tf
from tensorflow.keras import layers

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras

# Report the library versions in use. `tf.__version__` is available in both
# TF 1.x and TF 2.x, whereas the `tf.VERSION` alias exists only in TF 1.x.
print(tf.__version__)
print(tf.keras.__version__)


#  CA4
## due on 02/04/2019

to submit the assignment, please do the following:

- do `Cell -> All output -> Clear` to clear all your output
- save the notebook (CA3.ipynb)

# The Boston Housing Data

Consider the data in `keras.datasets.boston_housing`. In this case, there are only about 400 training samples, where each sample consists of 13 input values which are characteristic for a given property. The output corresponds to the property price. The meaning of the various columns is explained in https://www.kaggle.com/c/boston-housing.

In contrast to the previous examples, which were categorisation problems, this is now a regression problem. The challenge is to train a network, which is able to predict the price of the property. 

You will again find lots of examples on the internet, and it is okay to use inspiration as long as you provide the source. 

Adhere to the following rules:

a) Train the network on the logarithm of the price, not on the price itself. Explain why this makes sense. 

b) You will find many examples which use `scikit-learn` or other packages that we did not cover in the course. Do not use them, and restrict yourself to methods and libraries which we covered.

c) Try to find a network, which has the smallest amount of trainable parameters, while still providing good predictions of the price.  Discuss, how small you can go. 

d) Once you have trained the network, explore the correlations which this network predicts:
    - Which inputs have a positive price correlation? 
    - Which inputs have a negative price correlation? 
    - Which inputs have little/no influence on the price?
    
  Investigate this by feeding into the network some artificial data, which you obtain from the testing data by varying one of the input columns.
  
  
Optional challenge (no extra points but extra insight!):

Compare the results with standard regression methods, for example as in ST4060/ST4061 in case you have covered them. 



In [None]:
# Load the Boston housing train/test split bundled with Keras.
(train_x, train_y), (test_x, test_y) = keras.datasets.boston_housing.load_data()

# Column labels, from https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html
# The last label (MEDV) heads the target column that printFirstN prints
# after the input columns.
var_names = (
    'CRIM ZN INDUS CHAS NOX RM AGE '
    'DIS RAD TAX PTRATIO BLACK LSTAT MEDV'
).split()

In [None]:
# plt.scatter(range(1, len(train_y) + 1), train_y)

In [None]:
# log_train_y = np.log(train_y)
# log_test_y = np.log(test_y)

Method to print first n entries in the training data

In [None]:
def printFirstN(n, x=None, y=None, names=None):
    """Print a header row followed by the first ``n`` samples.

    Parameters
    ----------
    n : int
        Number of rows to print. Values larger than ``len(x)`` are clamped,
        and values <= 0 print only the header.
    x : sequence of rows, optional
        Input values, one row per sample. Defaults to the module-level
        ``train_x`` as it is *at call time*.
    y : sequence, optional
        Target value for each row of ``x``. Defaults to the module-level
        ``train_y`` as it is at call time.
    names : sequence of str, optional
        Column labels for the header. Defaults to the module-level
        ``var_names``.
    """
    # Resolve the defaults when the function is called, not when it is
    # defined: a default of `x=train_x` would keep pointing at the array that
    # existed at definition time, even after `train_x` is rebound later.
    if x is None:
        x = train_x
    if y is None:
        y = train_y
    if names is None:
        names = var_names

    print(("{:8}" * len(names)).format(*names))
    # Honour `n` and never index past the end of the data.
    for i in range(min(n, len(x))):
        print(("{:<8.4}" * len(x[i])).format(*x[i]),
              "{:<8.3}".format(y[i]))

In [None]:
# Explore the data: array shapes of the train/test inputs and targets
print('Train X shape', train_x.shape)
print('Train Y shape', train_y.shape)
print('Test X shape', test_x.shape)
print('Test Y shape', test_y.shape)  # label previously read 'Train Y shape'
print()

# Inspect a few elements to get an idea of the data

printFirstN(10)

    

It looks like the values of each predictor are very different in scale when compared to each other.
This may lead to difficulties in building a good enough model.
Hence scale them uniformly.

In [None]:
# train_x = (train_x - train_x.min(axis=0)) / (train_x.max(axis=0) - train_x.min(axis=0))
# test_x = (test_x - test_x.min(axis=0)) / (test_x.max(axis=0) - test_x.min(axis=0))

In [None]:
# Scaling criteria from https://www.kaggle.com/shanekonaung/boston-housing-price-dataset-with-keras
# Standardise each input column: subtract the training mean, then divide by
# the standard deviation of the centred training data.
mean = train_x.mean(axis=0)
std = (train_x - mean).std(axis=0)
train_x = (train_x - mean) / std

# The test inputs are scaled with the *training* statistics.
test_x = (test_x - mean) / std

In [None]:
# Pass the current (standardised) arrays explicitly so the printout cannot
# fall back to arrays that were captured before scaling.
printFirstN(10, train_x, train_y)

In [None]:
%%time

model = keras.models.Sequential()
model.add(keras.layers.Dense(512, activation=tf.nn.relu, input_shape=(train_x.shape[1],)))
model.add(keras.layers.Dense(512, activation=tf.nn.relu))
model.add(keras.layers.Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

fit_result = model.fit(train_x, train_y, epochs=100, batch_size=1, validation_data=(test_x, test_y))


In [None]:
# Per-epoch training record from model.fit; indexed below by metric name
# (e.g. 'loss', 'val_loss') and plotted against fit_result.epoch.
history = fit_result.history

In [None]:
# The history key for the MAE metric depends on the Keras/TF release:
# 'mean_absolute_error' in older versions, 'mae' in newer ones. Use whichever
# is present so the plot does not fail with a KeyError.
mae_key = 'mae' if 'mae' in history else 'mean_absolute_error'

plt.plot(fit_result.epoch, history[mae_key], 'b', label='Training MAE')
plt.plot(fit_result.epoch, history['val_' + mae_key], 'r', label='Validation MAE')
plt.title('Epoch vs MAE')
plt.xlabel('Epochs')
plt.ylabel('Mean Absolute Error')
plt.legend()

The plot shows that although the training MAE goes down almost monotonically, the test MAE remains more or less the same after about 60 epochs. This hints that the model is overfitting. 
This is confirmed by the Epochs vs Loss plot below:

In [None]:
# Training vs. validation loss per epoch, drawn from the fit history.
loss_curves = (
    ('loss', 'b', 'Training loss'),
    ('val_loss', 'r', 'Validation loss'),
)
for history_key, line_style, curve_label in loss_curves:
    plt.plot(fit_result.epoch, history[history_key], line_style, label=curve_label)

plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

d) Once you have trained the network, explore the correlations which this network predicts:

In [None]:
# NOTE(review): looks like a leftover scratch cell. This prints the range
# object itself, not its elements, and nothing below uses the result.
print(range(10))

In [None]:

# Sanity check: model output for the first test sample (kept 2-D via slicing,
# i.e. shape (1, n_columns)) next to its true target value.
print(model.predict(test_x[:1]))
print(test_y[0])

In [None]:
# Sensitivity analysis: for every input column, shift that column of the test
# inputs upwards by the column's maximum and compare the model's predictions
# on the modified data with its predictions on the unmodified data.
original_predictions = model.predict(test_x)

# Number of input columns, taken from the data instead of a hard-coded 13.
num_inputs = test_x.shape[1]

# Per-column maxima of the unmodified test inputs, computed once instead of
# once per row inside the loop.
column_max = test_x.max(axis=0)

new_predictions = []
for j in range(num_inputs):
    # Make a copy of the original so that its column can be modified
    test_x_copy = np.copy(test_x)

    # Shift the whole column j in one step (same values as the row-by-row loop)
    test_x_copy[:, j] += column_max[j]

    # Now make the prediction again with the modified data
    new_prediction = model.predict(test_x_copy)
    new_predictions.append(new_prediction)

num_predictions = len(original_predictions)

# One subplot per input column, three per row (5 rows for 13 columns).
plots_per_row = 3
num_rows = (num_inputs + plots_per_row - 1) // plots_per_row
x = list(range(num_predictions))

plt.figure(figsize=(40,40))
for i in range(num_inputs):
    plt.subplot(num_rows, plots_per_row, i + 1)
    plt.plot(x, original_predictions, label='Original Predictions')
    plt.plot(x, new_predictions[i], label='Modified Predictions')
    plt.title(var_names[i])
    plt.legend()

From the plots, conclusions about the correlation of each predictor with the price can be drawn as follows:

* Positive Price Correlation : RM, RAD
* Negative Price Correlation : CRIM
* Little/No Price Correlation : All others



In [86]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
