# Deep Learning with Python - Chapter 3

In [1]:
# A Dense layer with 32 output units
from keras import layers
layer = layers.Dense(32, input_shape=(784,))

Using TensorFlow backend.


We’re creating a layer that will only accept as input 2D tensors where the first dimension is 784 (axis 0, the batch dimension, is unspecified, and thus any value would be accepted). This layer will return a tensor where the first dimension has been transformed to be 32. Thus this layer can only be connected to a downstream layer that expects 32-dimensional vectors as its input. When using Keras, you don’t have to worry about compatibility, because the layers you add to your models are dynamically built to match the shape of the incoming layer. For instance, suppose you write the following:


In [2]:
from keras import models
from keras import layers

# Stack two Dense layers; only the first one is given an explicit input shape.
model = models.Sequential()
model.add(layers.Dense(32, input_shape=(784,)))
model.add(layers.Dense(32))

The second layer didn’t receive an input shape argument—instead, it automatically inferred its input shape as being the output shape of the layer that came before.

In [3]:
# A two-layer model built with the Sequential class
from keras import models
from keras import layers

# Hand the whole layer stack to the constructor instead of add()-ing one by one.
model = models.Sequential([
    layers.Dense(32, activation='relu', input_shape=(784,)),
    layers.Dense(10, activation='softmax'),
])

In [4]:
# Same model as above using the functional API
input_tensor = layers.Input(shape=(784,))
# The hidden layer has 32 units so that this matches the Sequential model above.
x = layers.Dense(32, activation='relu')(input_tensor)
output_tensor = layers.Dense(10, activation='softmax')(x)

# Model is built from its input and output tensors; the keyword arguments
# are spelled `inputs` and `outputs`.
model = models.Model(inputs=input_tensor, outputs=output_tensor)




In [5]:
# Compile a model with a single loss function
from keras import optimizers

# RMSprop with an explicit learning rate, mean squared error as the loss,
# and accuracy as the monitored metric.
model.compile(optimizer=optimizers.RMSprop(lr=0.001),
             loss='mse',
             metrics=['accuracy'])

In [6]:
# model.fit(input_tensor, target_tensor, batch_size=128, epochs=10)

### Binary Classification Example: IMDB dataset

In [7]:
from keras.datasets import imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

The argument num_words=10000 means you’ll only keep the top 10,000 most frequently occurring words in the training data. Rare words will be discarded. This allows you to work with vector data of manageable size.

In [8]:
# The variables train_data and test_data are lists of reviews;
# each review is a list of word indices (encoding a sequence of words).
train_data[0]

[1,
 1028,
 4,
 22,
 6437,
 9,
 4064,
 448,
 23,
 4,
 6159,
 7,
 2560,
 178,
 7735,
 6437,
 7684,
 2,
 2,
 4,
 2,
 1796,
 2850,
 8155,
 3052,
 168,
 3109,
 37,
 2178,
 56,
 6,
 603,
 7,
 17,
 76,
 17,
 6609,
 8626,
 5,
 2,
 2,
 5,
 2397,
 2,
 1248,
 1098,
 315,
 27,
 107,
 2,
 11,
 2,
 63,
 287,
 43,
 89,
 2495,
 5,
 1134,
 6,
 4415,
 1248,
 6437,
 66,
 9,
 10,
 10,
 827,
 8461,
 9,
 646,
 2,
 5,
 2495,
 17,
 6334,
 2102,
 2,
 871,
 33,
 4,
 130,
 7,
 27,
 611,
 17,
 6,
 350,
 178,
 7735,
 6437,
 21,
 37,
 303,
 11,
 4,
 20,
 2503,
 15,
 6,
 113,
 17,
 6,
 2,
 80,
 30,
 1149,
 237,
 225,
 164,
 1005,
 18,
 90,
 8,
 81,
 19,
 27,
 1959,
 15,
 29,
 2051,
 11,
 4,
 178,
 8853,
 894,
 29,
 1068,
 8,
 413,
 6,
 3024,
 569,
 132,
 2,
 7708,
 5446,
 27,
 1949,
 17,
 6,
 2646,
 1624,
 455,
 18,
 27,
 704,
 10,
 10,
 4,
 65,
 7,
 4,
 22,
 6437,
 9,
 2462,
 23,
 6334,
 2,
 19,
 4,
 7988,
 7,
 1138,
 2455,
 1205,
 3390,
 2,
 5,
 178,
 1372,
 1544,
 745,
 3088,
 1514,
 6344,
 112,
 1412,
 933,
 10

In [9]:
# train_labels and test_labels are lists of 0s and 1s, 
#where 0 stands for negative and 1 stands for positive:
train_labels[0]

1

In [10]:
# Largest word index found in any training review (no temporary list needed).
max(max(sequence) for sequence in train_data)

9999

In [11]:
# Decoding one of these reviews back to English
word_index = imdb.get_word_index()
# Invert the word -> index mapping so that indices can be looked up.
reverse_word_index = {value: key for (key, value) in word_index.items()}
# Each index is shifted down by 3 before the lookup; indices without an
# entry are rendered as '?'.
decoded_review = ' '.join(
    reverse_word_index.get(i - 3, '?') for i in train_data[0])

##### Preparing the Data

In [12]:
# Encoding the integer sequence into a binary matrix
import numpy as np

def vectorize_sequence(sequences, dimension=10000):
    """Multi-hot encode integer sequences into a 2D float matrix.

    Args:
        sequences: sized iterable of integer index sequences, one per sample.
        dimension: number of columns in the output matrix; every index in
            a sequence must be a valid column index.

    Returns:
        Array of shape ``(len(sequences), dimension)`` holding 1.0 at
        ``[row, index]`` for every index that occurs in that row's sequence
        and 0.0 everywhere else.
    """
    encoded = np.zeros((len(sequences), dimension))
    # Each `row` is a view into `encoded`, so writing to it fills the matrix.
    for row, word_indices in zip(encoded, sequences):
        row[word_indices] = 1.
    return encoded

x_train = vectorize_sequence(train_data)
x_test = vectorize_sequence(test_data)

In [16]:
x_train.shape

(25000, 10000)

In [None]:
# Here is what the sample look like now
x_train[0]

In [None]:
# You should also vectorize your labels, which is straightforward
train_labels

In [None]:
# Convert the label lists to float32 NumPy arrays.
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

In [None]:
y_train

##### Building your network
The input data is vectors, and the labels are scalars (1s and 0s): this is the easiest setup you’ll ever encounter. A type of network that performs well on such a problem is a simple stack of fully connected (Dense) layers with relu activations: Dense(16, activation='relu'). The argument being passed to each Dense layer (16) is the number of hidden units of the layer.

In [None]:
# Model implementation in Keras
from keras import models
from keras import layers

# Two hidden layers of 16 units each, followed by a single sigmoid unit.
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

Here’s the step where you configure the model with the rmsprop optimizer and the binary_crossentropy loss function. Note that you’ll also monitor accuracy during training.

In [None]:
# Compiling the model
model.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['accuracy'])

You’re passing your optimizer, loss function, and metrics as strings, which is possible because rmsprop, binary_crossentropy, and accuracy are packaged as part of Keras. Sometimes you may want to configure the parameters of your optimizer or pass a custom loss function or metric function. The former can be done by passing an optimizer class instance as the optimizer argument, as shown in listing 3.5; the latter can be done by passing function objects as the loss and/or metrics arguments, as shown in listing 3.6.

In [None]:
# Configuring a custom optimizer (optional)
from keras import optimizers

model.compile(optimizer=optimizers.RMSprop(lr=0.0001),
             loss='binary_crossentropy',
             metrics=['accuracy'])

In [None]:
# Using custom losses and metrics
from keras import losses
from keras import metrics

# The loss and the metric are passed as function objects rather than strings.
model.compile(optimizer=optimizers.RMSprop(lr=0.0001),
             loss=losses.binary_crossentropy,
             metrics=[metrics.binary_accuracy])

In [None]:
# Setting aside a validation set: the first 10,000 samples are held out
# and the remainder is kept for training.
x_val, partial_x_train = x_train[:10000], x_train[10000:]
y_val, partial_y_train = y_train[:10000], y_train[10000:]

You’ll now train the model for 20 epochs (20 iterations over all samples in the x_train and y_train tensors), in mini-batches of 512 samples. At the same time, you’ll monitor loss and accuracy on the 10,000 samples that you set apart. You do so by passing the validation data as the validation_data argument.

In [None]:
# Training your model

# The metric is registered under the name 'acc'; the plotting cells read the
# history entries under the 'acc' and 'val_acc' keys.
model.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['acc'])

# Keep the returned History object so the per-epoch values can be plotted.
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data = (x_val, y_val))

Note that the call to model.fit() returns a History object. This object has a member history, which is a dictionary containing data about everything that happened during training. Let’s look at it:

In [None]:
history_dict = history.history
history_dict.keys()

The dictionary contains four entries: one per metric that was being monitored during training and during validation. In the following two listings, let’s use Matplotlib to plot the training and validation loss side by side (see figure 3.7), as well as the training and validation accuracy (see figure 3.8). Note that your own results may vary slightly due to a different random initialization of your network.

In [None]:
# Plotting the training and validation loss
import matplotlib.pyplot as plt

history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']

# One x value per recorded epoch, numbered from 1. The length is taken from
# the loss series itself so this plot does not depend on any accuracy key.
epochs = range(1, len(loss_values) + 1)

# 'bo' draws blue dots, 'b' draws a solid blue line.
plt.plot(epochs, loss_values, 'bo', label='Training loss')
plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Plotting the training and validation accuracy
plt.clf()  # Clear the previous figure before drawing the new one
acc_values = history_dict['acc']
val_acc_values = history_dict['val_acc']

plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # this figure shows accuracy, not loss
plt.legend()
plt.show()

As you can see, the training loss decreases with every epoch, and the training accuracy increases with every epoch. That’s what you would expect when running gradient-descent optimization—the quantity you’re trying to minimize should be less with every iteration. But that isn’t the case for the validation loss and accuracy: they seem to peak at the fourth epoch. This is an example of what we warned against earlier: a model that performs better on the training data isn’t necessarily a model that will do better on data it has never seen before. In precise terms, what you’re seeing is overfitting: after the fourth epoch, you’re overoptimizing on the training data, and you end up learning representations that are specific to the training data and don’t generalize to data outside of the training set. In this case, to prevent overfitting, you could stop training after four epochs. This is discussed further in Chapter 4.

In [None]:
# Retraining the model from scratch

# A new, untrained network with the same layer stack.
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy',
             metrics=['accuracy'])
# Train on the full training set (no validation split) for 4 epochs,
# then evaluate on the test data.
model.fit(x_train, y_train, epochs=4, batch_size=512)
results = model.evaluate(x_test, y_test)

In [None]:
results

This fairly naive approach achieves an accuracy of 88%. With state-of-the-art approaches, you should be able to get close to 95%.

##### Using a trained network

After having trained a network, you’ll want to use it in a practical setting. You can generate the likelihood of reviews being positive by using the predict method:

In [None]:
model.predict(x_test)

### Multiclass Classification: Classifying Newswires

In [None]:
# Loading the Reuters dataset
from keras.datasets import reuters
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

In [None]:
len(train_data)

In [None]:
len(test_data)

In [None]:
train_data[10]

In [None]:
# Decoding newswires back to text
word_index = reuters.get_word_index()
# Invert the word -> index mapping so that indices can be looked up.
reverse_word_index = {value: key for (key, value) in word_index.items()}
# Each index is shifted down by 3 before the lookup; indices without an
# entry are rendered as '?'.
decoded_newswire = ' '.join(
    reverse_word_index.get(i - 3, '?') for i in train_data[0])

In [None]:
# Training labels are in between 0 and 45
train_labels[10]

##### Preparing the data

In [None]:
# Encoding Sequences into Tensors
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences into a 2D float matrix.

    Args:
        sequences: sized iterable of integer index sequences, one per sample.
        dimension: number of columns in the output matrix; every index in
            a sequence must be a valid column index.

    Returns:
        Array of shape ``(len(sequences), dimension)`` with ones at the
        positions named by each sequence and zeros elsewhere.
    """
    matrix = np.zeros((len(sequences), dimension))
    # Rows of `matrix` are views, so assigning through them updates `matrix`.
    for row, indices in zip(matrix, sequences):
        row[indices] = 1
    return matrix

x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

In [None]:
# One-Hot Encoding Categorical Values
def to_one_hot(labels, dimension=46):
    """One-hot encode a sequence of integer class labels.

    Args:
        labels: sized iterable of integer class indices, each a valid
            column index for a row of length ``dimension``.
        dimension: number of classes, i.e. the width of each encoded row.

    Returns:
        Array of shape ``(len(labels), dimension)`` in which row ``i`` is
        all zeros except for a single 1 at column ``labels[i]``.
    """
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        # Index with this row's own label; indexing with the whole `labels`
        # sequence would switch on every class that occurs anywhere.
        results[i, label] = 1
    return results

one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)

In [None]:
# Built-in equivalent of the hand-written one-hot encoder. Import it from the
# public keras.utils namespace rather than the internal np_utils module.
from keras.utils import to_categorical

one_hot_train_labels = to_categorical(train_labels)
one_hot_test_labels = to_categorical(test_labels)

##### Model Definition

In [None]:
from keras import models
from keras import layers

# Two 64-unit hidden layers, then a 46-way softmax output layer.
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))

##### Compile the Model

In [None]:
model.compile(optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

##### Setting aside a validation set

In [None]:
# Hold out the first 1,000 samples for validation; train on the rest.
x_val, partial_x_train = x_train[:1000], x_train[1000:]
y_val, partial_y_train = one_hot_train_labels[:1000], one_hot_train_labels[1000:]

##### Training the Model

In [None]:
history = model.fit(partial_x_train,
                   partial_y_train,
                   epochs=20,
                   batch_size=512,
                   validation_data=(x_val, y_val))

##### Plotting the training and validation loss

In [None]:
# Plotting the training and validation loss
import matplotlib.pyplot as plt

loss = history.history['loss']
val_loss = history.history['val_loss']

# One x value per recorded epoch, numbered from 1.
epochs = range(1, len(loss) + 1)

# 'bo' draws blue dots, 'b' draws a solid blue line.
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Plotting the training and validation accuracy
plt.clf()  # Clear the previous figure before drawing the new one

# NOTE(review): the model in this section is compiled with
# metrics=['accuracy'], while the history is read under 'acc'/'val_acc' —
# confirm these key names against the installed Keras version.
acc_values = history.history['acc']
val_acc_values = history.history['val_acc']

plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # this figure shows accuracy, not loss
plt.legend()
plt.show()

##### Retraining the model from scratch (nine epochs)

In [None]:
# A new, untrained network with the same layer stack as before.
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
             metrics=['accuracy'])
# Train for nine epochs only, still monitoring the held-out validation set.
model.fit(partial_x_train,
         partial_y_train,
         epochs=9,
         batch_size=512,
         validation_data=(x_val, y_val))

# Final score on the test set, using the one-hot encoded test labels.
results = model.evaluate(x_test, one_hot_test_labels)

In [None]:
results


This approach reaches an accuracy of ~80%. With a balanced binary classification problem, the accuracy reached by a purely random classifier would be 50%. But in this case it’s closer to 19%, so the results seem pretty good, at least when compared to a random baseline:

In [None]:
import copy
# Random baseline: shuffle a copy of the test labels and measure how often
# the shuffled labels happen to coincide with the true ones.
test_labels_copy = copy.copy(test_labels)
np.random.shuffle(test_labels_copy)
hits_array = np.array(test_labels) == np.array(test_labels_copy)
# Fraction of positions where the shuffled label equals the true label.
float(np.sum(hits_array)) / len(test_labels)

##### Generating predictions for new data

In [None]:
predictions = model.predict(x_test)

In [None]:
# Each entry in predictions is a vector of length 46:
predictions[0].shape

In [None]:
# The coefficients in this vector sum to 1:
np.sum(predictions[0])

In [None]:
# View all the probabilities for each class
predictions[0]

In [None]:
# The largest entry is the predicted class—the class with the highest probability:
np.argmax(predictions[0])

##### A different way to handle the labels and the loss

In [None]:
# Keep the labels as plain integer arrays instead of one-hot encoding them.
# The test labels go into y_test; x_train must keep holding the features.
y_train = np.array(train_labels)
y_test = np.array(test_labels)

The only thing this approach would change is the choice of the loss function. The loss function used in listing 3.21, categorical_crossentropy, expects the labels to follow a categorical encoding. With integer labels, you should use sparse_categorical_crossentropy:


In [None]:
model.compile(optimizer='rmsprop', 
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])

This new loss function is still mathematically the same as categorical_crossentropy;
it just has a different interface.

##### The importance of having sufficiently large intermediate layers

We mentioned earlier that because the final outputs are 46-dimensional, you should avoid intermediate layers with many fewer than 46 hidden units. Now let’s see what happens when you introduce an information bottleneck by having intermediate layers that are significantly less than 46-dimensional: for example, 4-dimensional.

In [None]:
# A model with an information bottleneck
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
# The bottleneck: only 4 units, far fewer than the 46 output classes.
model.add(layers.Dense(4, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))

model.compile(optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

model.fit(partial_x_train,
         partial_y_train,
         epochs =20, 
         batch_size=128, 
         validation_data=(x_val, y_val))

The network now peaks at ~71% validation accuracy, an 8% absolute drop. This drop is mostly due to the fact that you’re trying to compress a lot of information (enough information to recover the separation hyperplanes of 46 classes) into an intermediate space that is too low-dimensional. The network is able to cram most of the necessary information into these four-dimensional representations, but not all of it.


### A Regression Example: Predicting House Prices

In [None]:
# loading the boston dataset
from keras.datasets import boston_housing

(train_data, train_targets), (test_data, test_targets) = boston_housing.load_data()

In [None]:
train_data.shape

In [None]:
test_data.shape

In [None]:
train_targets

##### Preparing Data

In [None]:
# Normalize the data: subtract the per-column mean and divide by the
# per-column standard deviation (both reduced over axis 0).
mean = train_data.mean(axis=0)
train_data -= mean
std = train_data.std(axis=0)
train_data /= std

# The test set is scaled with the statistics of the training data: subtract
# the training mean, then divide by the training standard deviation.
test_data -= mean
test_data /= std

##### Building your network

In [None]:
from keras import models
from keras import layers

def build_model():
    """Create and compile a fresh, untrained regression network.

    The input width is read from the module-level ``train_data`` array
    (its second dimension). The network has two 64-unit relu layers
    followed by a single output unit without an activation.

    Returns:
        The compiled model, using the rmsprop optimizer, mse loss and
        the mae metric.
    """
    n_features = train_data.shape[1]
    network = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(n_features,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),
    ])
    network.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return network

The network ends with a single unit and no activation (it will be a linear layer). This is a typical setup for scalar regression (a regression where you’re trying to predict a single continuous value). 

You’re also monitoring a new metric during training: mean absolute error (MAE). It’s the absolute value of the difference between the predictions and the targets. For instance, an MAE of 0.5 on this problem would mean your predictions are off by $500 on average.


##### K-Fold Cross Validation

In [None]:
import numpy as np
k = 4

# Size of one validation fold; any remainder samples never land in a
# validation slice.
num_val_samples = len(train_data)//k
num_epochs = 100
all_scores = []

for i in range(k):
    print('processing fold #', i)
    # Validation data: the i-th consecutive slice of the training set.
    val_data = train_data[i * num_val_samples: (i+1) * num_val_samples]
    val_targets = train_targets[i * num_val_samples: (i+1) * num_val_samples]
    
    # Training data: everything before and after the validation slice.
    partial_train_data = np.concatenate(
        [train_data[:i * num_val_samples],
        train_data[(i + 1) * num_val_samples:]], 
        axis =0)
    
    partial_train_targets = np.concatenate(
        [train_targets[:i * num_val_samples],
        train_targets[(i + 1) * num_val_samples:]], 
        axis =0)
    
    # A new model is built for every fold, so folds do not share weights.
    model = build_model()
    model.fit(partial_train_data, partial_train_targets,
             epochs=num_epochs, batch_size=1, verbose=0)
    # evaluate() yields the loss (mse) followed by the mae metric.
    val_mse, val_mae = model.evaluate(val_data, val_targets, 
                                      verbose=0)
    all_scores.append(val_mae)
    

In [None]:
all_scores

In [None]:
np.mean(all_scores)

Let’s try training the network a bit longer: 500 epochs. To keep a record of how well the model does at each epoch, you’ll modify the training loop to save the per-epoch validation score log.

In [None]:
# Saving the validation logs at each fold
import numpy as np
k = 4

# Size of one validation fold; any remainder samples never land in a
# validation slice.
num_val_samples = len(train_data)//k
num_epochs = 500
all_mae_histories = []

for i in range(k):
    print('processing fold #', i)
    # Validation data: the i-th consecutive slice of the training set.
    val_data = train_data[i * num_val_samples: (i+1) * num_val_samples]
    val_targets = train_targets[i * num_val_samples: (i+1) * num_val_samples]
    
    # Training data: everything before and after the validation slice.
    partial_train_data = np.concatenate(
        [train_data[:i * num_val_samples],
        train_data[(i + 1) * num_val_samples:]], 
        axis =0)
    
    partial_train_targets = np.concatenate(
        [train_targets[:i * num_val_samples],
        train_targets[(i + 1) * num_val_samples:]], 
        axis =0)
    
    # A new model is built for every fold, so folds do not share weights.
    model = build_model()
    # Passing validation_data makes fit() record validation metrics per epoch.
    history = model.fit(partial_train_data, partial_train_targets,
             validation_data=(val_data, val_targets),
             epochs=num_epochs, batch_size=1, verbose=0)
    # NOTE(review): assumes the validation MAE is stored under
    # 'val_mean_absolute_error' — confirm for the installed Keras version.
    mae_history = history.history['val_mean_absolute_error']
    all_mae_histories.append(mae_history)

##### Building the history of successive mean K-fold validation scores

In [None]:
# For every epoch index, average that epoch's validation MAE over all folds.
average_mae_history = [np.mean([x[i] for x in all_mae_histories]) for i in range(num_epochs)]

##### Plotting Validation Scores

In [None]:
import matplotlib.pyplot as plt

plt.plot(range(1, len(average_mae_history)+1), average_mae_history)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.show()

It may be a little difficult to see the plot, due to scaling issues and relatively high variance. Let’s do the following:
- Omit the first 10 data points, which are on a different scale than the rest of the curve.
- Replace each point with an exponential moving average of the previous points, to obtain a smooth curve.

##### Plotting Validation Scores, excluding the first 10 data points

In [None]:
def smooth_curve(points, factor=0.9):
    """Smooth a series with an exponential moving average.

    Args:
        points: iterable of numeric values, in order.
        factor: weight given to the previous smoothed value; the current
            raw point is weighted by ``1 - factor``.

    Returns:
        A list with one smoothed value per input point. The first point is
        copied unchanged; an empty input yields an empty list.
    """
    smoothed = []
    for value in points:
        if not smoothed:
            # Nothing to average with yet: the first point seeds the curve.
            smoothed.append(value)
            continue
        last = smoothed[-1]
        smoothed.append(last * factor + value * (1 - factor))
    return smoothed

smooth_mae_history = smooth_curve(average_mae_history[10:])

plt.plot(range(1, len(smooth_mae_history) + 1), smooth_mae_history)
plt.xlabel('Epochs')
plt.ylabel('Validation MAE')
plt.show()

##### Training the Final Model

In [None]:
# Train a fresh model on the entire training set, then score it on the test set.
model = build_model()
model.fit(train_data, train_targets, epochs=80,
         batch_size=16, verbose=0)
# evaluate() yields the loss (mse) followed by the mae metric.
test_mse_score, test_mae_score = model.evaluate(test_data, test_targets)

In [None]:
test_mae_score