# Boston Housing Price with Keras

In [None]:
import keras
from keras import models, layers
from keras.datasets import boston_housing

import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the Boston housing dataset bundled with Keras; it comes pre-split into
# a training pair and a test pair of (features, targets).
(train_data, train_labels), (test_data, test_labels) = boston_housing.load_data()

In [None]:
# Display the dimensions of the training feature array.
train_data.shape

In [None]:
# Display the dimensions of the test feature array.
test_data.shape

## Normalization
Vamos a normalizar los datos ya que, aunque todas las columnas son numéricas, los rangos de valores entre los que oscilan son muy diferentes. Esto puede causar que las columnas con valores más altos tengan más peso que otras con valores más pequeños.

In [None]:
# Standardize each feature column in place: subtract the per-column mean,
# then divide by the per-column standard deviation.
mean = train_data.mean(axis=0)
train_data -= mean
# The standard deviation is taken on the already-centered training data.
std = train_data.std(axis=0)
train_data /= std

# The test set is scaled with the statistics computed on the training set,
# so no information from the test data is used in the preprocessing.
test_data -= mean
test_data /= std

In [None]:
# Display the first training sample to inspect the normalized values.
train_data[0]

## Building model
En este caso vamos a crear una función que construya nuestro modelo; de este modo no tenemos que repetir el mismo código cada vez que necesitemos un modelo nuevo.

In [None]:
def build_model():
    """Create and compile a fresh regression network.

    The network stacks two 64-unit ReLU hidden layers followed by a single
    output unit. The input width is read from the module-level ``train_data``
    array, so that array must exist before this function is called.

    Returns:
        A compiled Keras ``Sequential`` model using the ``rmsprop`` optimizer,
        mean-squared-error loss, and mean absolute error as a metric.
    """
    n_features = train_data.shape[1]
    network = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(n_features,)),
        layers.Dense(64, activation='relu'),
        # No activation on the output layer: it emits one unbounded value.
        layers.Dense(1),
    ])
    network.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return network

## Cross-validation
Como nuestro conjunto de datos es pequeño, el conjunto de validación también sería muy pequeño y podría ser insuficiente: los resultados podrían cambiar mucho dependiendo de qué datos usáramos para validar. Para evitar esto usaremos k-fold cross-validation.

In [None]:
# K-fold cross-validation: each fold serves once as the validation set while
# a new model is trained on the remaining folds.
k = 4
num_val_samples = len(train_data) // k
num_epochs = 100
all_scores = []

for i in range(k):
    print('Processing fold #', i)

    # Boundaries of the slice held out for validation in this fold.
    fold_start = i * num_val_samples
    fold_stop = fold_start + num_val_samples

    val_data = train_data[fold_start:fold_stop]
    val_targets = train_labels[fold_start:fold_stop]

    # Training data is everything before and after the held-out slice.
    partial_train_data = np.concatenate(
        (train_data[:fold_start], train_data[fold_stop:]), axis=0)
    partial_train_targets = np.concatenate(
        (train_labels[:fold_start], train_labels[fold_stop:]), axis=0)

    # Start from a newly built model so folds do not share weights.
    model = build_model()
    model.fit(partial_train_data, partial_train_targets,
              epochs=num_epochs, batch_size=1, verbose=0)

    # evaluate() returns the loss (MSE) followed by the compiled metric (MAE).
    val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0)
    all_scores.append(val_mae)

In [None]:
# Display the validation MAE obtained on each fold.
all_scores

In [None]:
# Display the validation MAE averaged over all folds.
np.mean(all_scores)

In [None]:
# Repeat the k-fold run, this time recording the per-epoch validation MAE of
# every fold. k, num_val_samples and num_epochs are reused from the earlier
# cross-validation cell; assign a larger num_epochs here (e.g. 500) to track
# a longer training curve.
all_mae_histories = []

for i in range(k):
    print('Processing fold #', i)

    # Boundaries of the slice held out for validation in this fold.
    fold_start = i * num_val_samples
    fold_stop = fold_start + num_val_samples

    val_data = train_data[fold_start:fold_stop]
    val_targets = train_labels[fold_start:fold_stop]

    # Training data is everything before and after the held-out slice.
    partial_train_data = np.concatenate(
        (train_data[:fold_start], train_data[fold_stop:]), axis=0)
    partial_train_targets = np.concatenate(
        (train_labels[:fold_start], train_labels[fold_stop:]), axis=0)

    # Start from a newly built model so folds do not share weights.
    model = build_model()
    history = model.fit(partial_train_data, partial_train_targets,
                        validation_data=(val_data, val_targets),
                        epochs=num_epochs, batch_size=1, verbose=0)

    # One validation-MAE value per epoch for this fold.
    mae_history = history.history['val_mae']
    all_mae_histories.append(mae_history)

In [None]:
# For every epoch, average the validation MAE recorded by each fold.
average_mae_history = []
for epoch in range(num_epochs):
    fold_values = [fold_history[epoch] for fold_history in all_mae_histories]
    average_mae_history.append(np.mean(fold_values))

In [None]:
# Plot the fold-averaged validation MAE against the epoch number (1-based).
epoch_axis = range(1, len(average_mae_history) + 1)
plt.plot(epoch_axis, average_mae_history)
plt.xlabel("Epochs")
plt.ylabel("Validation MAE")
plt.show()

Como puede ser difícil ver la curva debido a la alta varianza que hay, vamos a realizar lo siguiente:

- Eliminar los 10 primeros puntos, ya que están en una escala diferente a la del resto.
- Sustituir cada punto por una media móvil exponencial de los puntos anteriores, para ver así una curva suavizada.

In [None]:
def smooth_curve(points, factor=0.9):
    """Smooth a sequence with an exponential moving average.

    The first point is kept unchanged; every later point is replaced by
    ``previous_smoothed * factor + point * (1 - factor)``.

    Args:
        points: Iterable of numeric values to smooth.
        factor: Weight given to the previous smoothed value. Higher values
            produce a smoother curve.

    Returns:
        A new list of smoothed values, one per input point.
    """
    result = []
    for value in points:
        if not result:
            # Nothing to average with yet: the first point seeds the series.
            result.append(value)
            continue
        result.append(result[-1] * factor + value * (1 - factor))
    return result

In [None]:
# Drop the first 10 epochs (their scale differs from the rest of the curve)
# and smooth the remainder with an exponential moving average.
smooth_mae_history = smooth_curve(average_mae_history[10:])

In [None]:
# The smoothed series omits the leading epochs of average_mae_history, so the
# x-axis is shifted by the number of dropped points. Without the shift the
# curve would be labelled as starting at epoch 1 and any epoch read off the
# plot (e.g. where validation MAE bottoms out) would be too small.
skipped_epochs = len(average_mae_history) - len(smooth_mae_history)
epoch_axis = range(skipped_epochs + 1, len(average_mae_history) + 1)
plt.plot(epoch_axis, smooth_mae_history)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.show()

In [None]:
# Train a new model on the full training set with fixed settings
# (50 epochs, batches of 16 samples, no progress output).
model = build_model()
model.fit(train_data, train_labels, epochs=50, batch_size=16, verbose=0)

In [None]:
# Evaluate the final model on the test set; the two returned values are the
# loss (MSE) and the metric (MAE) the model was compiled with.
test_mse_score, test_mae_score = model.evaluate(test_data, test_labels)

In [None]:
# Display the mean absolute error measured on the test set.
test_mae_score