In [35]:
from __future__ import print_function
import json
import random
import numpy as np
import math
import datetime
from collections import Counter
import matplotlib as pyplot

# Scikit-learn imports
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Tensorflow and keras imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

In [3]:
# Start from a clean slate: drop Keras state and the default TF graph left over
# from any earlier run in this kernel before a new model is built.
K.clear_session()
tf.compat.v1.reset_default_graph()

In [11]:
def data():
    """Load the dataset and character embeddings and build the model inputs.

    Reads character vectors from ``../data/glove.6B.50d-char.txt`` and the
    ``positives`` / ``negatives`` string lists from ``../data/dataset.json``.
    The first half of each list is kept. The positives plus the first 80% of
    the kept negatives are shuffled, tokenized at character level, padded to
    ``maxlen`` and split into train / validation / test arrays. The embedding
    matrix built from the character vectors is reduced to 16 dimensions with PCA.

    Returns:
        A 13-tuple ``(X_train, X_test, y_train, y_test, X_val, y_val,
        embedding_matrix_pca, maxlen, vocab_size, positives, negatives_train,
        negatives_dev, negatives_test)``. The ``X_*`` / ``y_*`` entries are
        integer-encoded arrays and their 0/1 labels (1 = positive); the last
        four entries are the raw string lists.
    """
    maxlen = 50
    embedding_size = 50

    # Path of the character vectors; each line is "<char> <float> <float> ..."
    embeddings_path = '../data/glove.6B.50d-char.txt'

    # Map every character to its embedding vector
    embedding_vectors = {}
    with open(embeddings_path, 'r') as f:
        for line in f:
            line_split = line.strip().split(" ")
            vec = np.array(line_split[1:], dtype=float)
            char = line_split[0]
            embedding_vectors[char] = vec

    # loading the dataset
    with open('../data/dataset.json', 'r') as f:
        dataset = json.load(f)
        positives = dataset['positives']
        negatives = dataset['negatives']

    # dividing the dataset to make small models
    data_fraction = 0.5
    positives = positives[:int(data_fraction * len(positives))]
    negatives = negatives[:int(data_fraction * len(negatives))]

    # Dividing the negatives dataset between train (80%), dev (10%) and test (10%)
    negatives_train = negatives[0: int(len(negatives) * .8)]
    negatives_dev = negatives[int(len(negatives) * .8): int(len(negatives) * .9)]
    negatives_test = negatives[int(len(negatives) * .9): ]
    print("Split sizes:")
    print(len(positives), len(negatives_train), len(negatives_dev), len(negatives_test))

    # Label the samples (0 = negative, 1 = positive) and shuffle them together
    a = [(i, 0) for i in negatives_train]
    b = [(i, 1) for i in positives]
    combined = a + b
    random.shuffle(combined)
    shuffled = list(zip(*combined))
    text_X = shuffled[0]
    labels = shuffled[1]

    # Character-level tokenizer fitted on the shuffled strings
    tk = keras.preprocessing.text.Tokenizer(char_level=True)
    tk.fit_on_texts(text_X)

    # Vocabulary; index 0 is reserved for padding, hence the +1
    word_index = tk.word_index
    vocab_size = len(word_index) + 1

    # integer encode the documents
    sequences = tk.texts_to_sequences(text_X)

    # Pad / truncate every sequence to `maxlen` characters. With the Keras
    # defaults (padding='pre', truncating='pre') zeros are added at the FRONT and
    # over-long inputs keep their LAST `maxlen` characters.
    padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    labels = np.asarray(labels)

    # Dividing the dataset into train and test.
    X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)

    # split the training data into a training set and a validation set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

    # Weight matrix with one row per vocabulary index; characters without a
    # pre-trained vector (and the padding row 0) stay all-zero.
    embedding_matrix = np.zeros((vocab_size, embedding_size))
    for char, i in word_index.items():
        embedding_vector = embedding_vectors.get(char)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Reduce the embeddings with PCA. The padding row is excluded from the fit
    # and re-inserted afterwards as an all-zero row at index 0.
    pca_embedding_dim = 16
    pca = PCA(n_components = pca_embedding_dim)
    pca.fit(embedding_matrix[1:])
    embedding_matrix_pca = np.array(pca.transform(embedding_matrix[1:]))
    embedding_matrix_pca = np.insert(embedding_matrix_pca, 0, 0, axis=0)
    print("PCA matrix created")

    return (X_train, X_test, y_train, y_test, X_val, y_val, embedding_matrix_pca, maxlen, vocab_size,
            positives, negatives_train, negatives_dev, negatives_test)

### Define GRU Model

##### Defining embedding layer
The embedding layer is the first layer of the model. In this notebook it is initialized with the 16-dimensional PCA-reduced embedding matrix returned by `data()`, not with the original 50-dimensional embeddings.

1. Use weights= embedding_matrix, if using the original embedding size else,
2. Use weights= embedding_matrix_pca if using PCA.

##### Defining a GRU layer with necessary arguments

1. argument_1 = gru_size
2. argument_2 = return sequences is False if there is no second gru layer, True otherwise

In [12]:
def create_model(embedding_matrix, vocab_size, maxlen):
    """Build and compile the GRU classifier and its training callbacks.

    Args:
        embedding_matrix: 2-D array of initial embedding weights with
            ``vocab_size`` rows; its column count is used as the embedding
            dimension.
        vocab_size: number of rows of the embedding table.
        maxlen: length of the padded input sequences.

    Returns:
        ``(model, callbacks_list)``: the compiled ``keras.Sequential`` model and
        a list holding the early-stopping and checkpoint callbacks.
    """
    # Take the layer geometry from the arguments instead of hard-coding it, so
    # the layer always agrees with the weight matrix and the padded inputs.
    embedding_dim = embedding_matrix.shape[1]

    model = keras.Sequential([
            keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, weights=[embedding_matrix]),
            keras.layers.GRU(16, return_sequences = False),
            keras.layers.Dense(8, activation='relu'),
            keras.layers.Dense(1, activation = 'sigmoid')
    ])

    # `learning_rate` is the current argument name; `lr` is a deprecated alias.
    optimizer = keras.optimizers.Adam(learning_rate = 0.001, decay = 0.0001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Stop when val_loss has not improved by at least 0.001 for 5 epochs.
    earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, min_delta = 0.001)
    # Keep only the best weights seen so far on disk.
    file_path = '../model_weights/keras_weights_GRU.hdf5'
    checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path, verbose=1, save_best_only=True)
    callbacks_list = [earlyStopping, checkpoint]

    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()
    return model, callbacks_list

In [13]:
# Run the whole preprocessing pipeline once and measure how long it takes.
before_dataset = datetime.datetime.now()
# NOTE: the name `embedding_matrix` receives the seventh value returned by data(),
# which inside data() is the PCA-reduced matrix (`embedding_matrix_pca`).
(X_train, X_test, y_train, y_test, X_val, y_val, embedding_matrix, maxlen, vocab_size, 
    positives, negatives_train, negatives_dev, negatives_test) = data()
after_dataset = datetime.datetime.now()
delta_dataset = after_dataset - before_dataset
print("Data Preprocessing time:", delta_dataset)

Split sizes:
745589 574210 71776 71777
PCA matrix created
Data Preprocessing time: 0:00:35.468132


In [14]:
# Build the model and train it, timing model creation and fitting together.
training_start = datetime.datetime.now()
model, callbacks_list = create_model(embedding_matrix, vocab_size, maxlen)
# Up to 50 epochs; the early-stopping callback from create_model() may end training sooner.
# `history` is kept so the loss curves can be plotted later.
history = model.fit(X_train, y_train, batch_size = 2048, epochs = 50, verbose=2, 
          validation_data=(X_val, y_val), callbacks = callbacks_list)
training_stop = datetime.datetime.now()
delta_training = training_stop - training_start
print("Model training time:", delta_training)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 16)            624       
_________________________________________________________________
gru (GRU)                    (None, 16)                1632      
_________________________________________________________________
dense (Dense)                (None, 8)                 136       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 9         
Total params: 2,401
Trainable params: 2,401
Non-trainable params: 0
_________________________________________________________________
None
Train on 950255 samples, validate on 105584 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.54705, saving model to ../model_weights/keras_weights_GRU.hdf5
950255/950255 - 42s - loss: 0.5792 - accuracy: 0.6737 - val_loss: 0.5471 - 

Epoch 35/50

Epoch 00035: val_loss improved from 0.35265 to 0.35088, saving model to ../model_weights/keras_weights_GRU.hdf5
950255/950255 - 39s - loss: 0.3527 - accuracy: 0.8351 - val_loss: 0.3509 - val_accuracy: 0.8350
Epoch 36/50

Epoch 00036: val_loss did not improve from 0.35088
950255/950255 - 39s - loss: 0.3518 - accuracy: 0.8355 - val_loss: 0.3510 - val_accuracy: 0.8355
Epoch 37/50

Epoch 00037: val_loss improved from 0.35088 to 0.34959, saving model to ../model_weights/keras_weights_GRU.hdf5
950255/950255 - 39s - loss: 0.3512 - accuracy: 0.8357 - val_loss: 0.3496 - val_accuracy: 0.8359
Epoch 38/50

Epoch 00038: val_loss improved from 0.34959 to 0.34907, saving model to ../model_weights/keras_weights_GRU.hdf5
950255/950255 - 39s - loss: 0.3504 - accuracy: 0.8362 - val_loss: 0.3491 - val_accuracy: 0.8354
Epoch 39/50

Epoch 00039: val_loss improved from 0.34907 to 0.34847, saving model to ../model_weights/keras_weights_GRU.hdf5
950255/950255 - 39s - loss: 0.3496 - accuracy: 0.836

In [40]:
# evaluate the model
# Loss/accuracy on the training split, with the wall-clock time of the evaluation.
before_train_evaluation = datetime.datetime.now()
train_loss, train_acc = model.evaluate(X_train, y_train, verbose=0)
after_train_evaluation = datetime.datetime.now()
delta_train_evaluation = after_train_evaluation - before_train_evaluation
print("Model evaluation time on training data", delta_train_evaluation)

# Same measurement on the held-out test split.
before_test_evaluation = datetime.datetime.now()
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
after_test_evaluation = datetime.datetime.now()
delta_test_evaluation = after_test_evaluation - before_test_evaluation
print("Model evaluation time on testing data", delta_test_evaluation)

# Side-by-side summary of both splits.
print('Training Loss: %.3f, Testing loss: %.3f' % (train_loss, test_loss) )
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

Train: 0.835, Test: 0.834


In [None]:
# plot training history
# Import the plotting interface here: the notebook-level name `pyplot` is bound
# to the top-level `matplotlib` package, which has no plot()/legend()/show().
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='train')
# val_loss is measured on the validation split passed to fit(), not on the test set.
plt.plot(history.history['val_loss'], label='validation')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

In [16]:
# Saving the model and loading the saved model

# Persist the trained model to an HDF5 file.
model.save('../saved_models/model_GRU_PCA_prune.h5')
# Load it back from disk. The result is not bound to a name, so it is only
# shown as the cell's output.
keras.models.load_model('../saved_models/model_GRU_PCA_prune.h5')

<tensorflow.python.keras.engine.sequential.Sequential at 0x7fe00224dc88>

## Pruning - Trim Insignificant Weights

Magnitude-based weight pruning gradually zeroes out model weights during the training process to achieve model sparsity. Sparse models are easier to compress, and we can skip the zeroes during inference for latency improvements.

This technique brings improvements via model compression. In the future, framework support for this technique will provide latency improvements. We've seen up to 6x improvements in model compression with minimal loss of accuracy.

The technique is being evaluated in various speech applications, such as speech recognition and text-to-speech, and has been experimented on across various vision and translation models.

In [17]:
import tempfile
import os

%load_ext tensorboard

In [18]:
# Evaluate baseline model accuracy and saving it for later usage

# evaluate() returns (loss, accuracy); only the accuracy is kept.
_, baseline_model_accuracy = model.evaluate(
    X_test, y_test, verbose=0)

print('Baseline test accuracy:', baseline_model_accuracy)

# mkstemp() returns an already-open OS-level descriptor together with the path.
# Only the path is needed here, so the descriptor is closed instead of leaked.
baseline_fd, keras_file = tempfile.mkstemp('.h5')
os.close(baseline_fd)
model.save(keras_file, include_optimizer=False)
print('Saved baseline model to:', keras_file)

Baseline test accuracy: 0.8394188
Saved baseline model to: /tmp/tmpgtmlehn4.h5


### Fine-tune pre-trained model with pruning  
Define the model  
You will apply pruning to the whole model and see this in the model summary.

In this example, you start the model with 30% sparsity (30% zeros in weights) and end with 50% sparsity.

In the comprehensive guide, you can see how to prune some layers for model accuracy improvements.

In [36]:
import tensorflow_model_optimization as tfmot

prune_low_magnitude = tfmot.sparsity.keras.prune_low_magnitude

# Compute the end step so that the sparsity schedule spans all `epochs` epochs
# of fine-tuning.
batch_size = 2048
epochs = 50
validation_split = 0.1 # 10% of training set will be used for validation set. 

# Only the samples left after the validation split are fed as training batches.
num_train = X_train.shape[0] * (1 - validation_split)
end_step = np.ceil(num_train / batch_size).astype(np.int32) * epochs

# Define model for pruning: sparsity ramps from 30% at step 0 to 50% at end_step.
pruning_params = {'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(initial_sparsity=0.30,
                                                               final_sparsity=0.50,
                                                               begin_step=0,
                                                               end_step=end_step)
}

model_for_pruning = prune_low_magnitude(model, **pruning_params)
# `learning_rate` is the current argument name; `lr` is a deprecated alias.
optimizer = keras.optimizers.Adam(learning_rate = 0.001, decay = 0.0001)
# `prune_low_magnitude` requires a recompile.
model_for_pruning.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

model_for_pruning.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
prune_low_magnitude_embeddin (None, 50, 16)            1250      
_________________________________________________________________
prune_low_magnitude_gru (Pru (None, 16)                3171      
_________________________________________________________________
prune_low_magnitude_dense (P (None, 8)                 266       
_________________________________________________________________
prune_low_magnitude_dense_1  (None, 1)                 19        
Total params: 4,706
Trainable params: 2,401
Non-trainable params: 2,305
_________________________________________________________________


#### Train and evaluate the model against baseline  
Fine-tune with pruning for 50 epochs.

In [24]:
# Temporary directory that receives the pruning summaries for TensorBoard.
logdir = tempfile.mkdtemp()

# UpdatePruningStep is passed alongside PruningSummaries, which writes its
# logs to `logdir`.
callbacks = [
  tfmot.sparsity.keras.UpdatePruningStep(),
  tfmot.sparsity.keras.PruningSummaries(log_dir=logdir),
]
  
# Fine-tune the wrapped model; batch_size, epochs and validation_split are the
# same values that were used to compute the schedule's end_step.
model_for_pruning.fit(X_train, y_train,
                  batch_size=batch_size, epochs=epochs, validation_split=validation_split,
                  callbacks=callbacks)

Train on 855229 samples, validate on 95026 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fdfda70f048>

In [25]:
# Accuracy of the pruned model on the same test split as the baseline;
# evaluate() returns (loss, accuracy) and the loss is discarded.
_, model_for_pruning_accuracy = model_for_pruning.evaluate(
   X_test, y_test, verbose=0)

# Print both numbers next to each other for comparison.
print('Baseline test accuracy:', baseline_model_accuracy) 
print('Pruned test accuracy:', model_for_pruning_accuracy)

Baseline test accuracy: 0.8394188
Pruned test accuracy: 0.83370584


In [26]:
%tensorboard --logdir={logdir}

### Create 3x smaller models from pruning

Both tfmot.sparsity.keras.strip_pruning and applying a standard compression algorithm (e.g. via gzip) are necessary to see the compression benefits of pruning.  

First, create a compressible model for TensorFlow.

In [30]:
# Remove the pruning wrappers so that only the final (sparse) weights are saved.
model_for_export = tfmot.sparsity.keras.strip_pruning(model_for_pruning)

# mkstemp() returns an already-open OS-level descriptor together with the path.
# Only the path is needed here, so the descriptor is closed instead of leaked.
pruned_fd, pruned_keras_file = tempfile.mkstemp('.h5')
os.close(pruned_fd)
tf.keras.models.save_model(model_for_export, pruned_keras_file, include_optimizer=False)
print('Saved pruned Keras model to:', pruned_keras_file)

Saved pruned Keras model to: /tmp/tmpwdrwbcl7.h5


Then, create a compressible model for TFLite.

In [39]:
converter = tf.lite.TFLiteConverter.from_keras_model(model_for_export)
converter.experimental_new_converter = True
# NOTE(review): the output recorded for this cell is "KeyError: 'kernel'"; the
# cause is not visible in this notebook and still needs to be investigated.
pruned_tflite_model = converter.convert()

# mkstemp() hands back an open descriptor; write through it (and close it via
# the context manager) instead of leaking it and re-opening the path.
tflite_fd, pruned_tflite_file = tempfile.mkstemp('.tflite')

with os.fdopen(tflite_fd, 'wb') as f:
    f.write(pruned_tflite_model)

print('Saved pruned TFLite model to:', pruned_tflite_file)

KeyError: 'kernel'

Define a helper function to actually compress the models via gzip and measure the zipped size.

In [None]:
def get_gzipped_model_size(file):
    """Return the size in bytes of `file` after DEFLATE compression.

    The file is written into a temporary single-entry zip archive, the size of
    that archive is measured, and the archive is deleted again.

    Args:
        file: path of the file to compress.

    Returns:
        Size of the temporary zip archive in bytes (int).
    """
    import os
    import tempfile
    import zipfile

    # mkstemp() returns an open descriptor plus the path; only the path is
    # needed, so close the descriptor right away instead of leaking it.
    fd, zipped_file = tempfile.mkstemp('.zip')
    os.close(fd)
    try:
        with zipfile.ZipFile(zipped_file, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
            archive.write(file)
        return os.path.getsize(zipped_file)
    finally:
        # The archive only exists to be measured; do not leave it behind.
        os.remove(zipped_file)

In [None]:
# Compare the compressed size of the baseline model with the pruned exports.
# NOTE(review): `pruned_tflite_file` is only defined if the TFLite export cell
# ran to completion.
print("Size of gzipped baseline Keras model: %.2f bytes" % (get_gzipped_model_size(keras_file)))
print("Size of gzipped pruned Keras model: %.2f bytes" % (get_gzipped_model_size(pruned_keras_file)))
print("Size of gzipped pruned TFlite model: %.2f bytes" % (get_gzipped_model_size(pruned_tflite_file)))

In [None]:
# def predict(text_x):
# 	x = np.zeros((1, maxlen), dtype=np.int)
# 	offset = max(maxlen - len(text_x), 0)
# 	for t, char in enumerate(text_x):
# 	    if t >= maxlen:
# 	        break
# 	    x[0, t + offset] = char_indices[char]
#     pred = model.predict(x)
# 	return pred[0][0]


# Like predict, but you pass in an array of URLs, and it is all
# vectorized in one step, making it more efficient
def predicts(text_X):
    """Score a batch of strings with the notebook-level ``model``.

    Every string is encoded into one row of length ``maxlen`` using the
    notebook-level ``char_indices`` mapping (character -> integer index) and all
    rows are passed to ``model.predict`` in a single call.

    NOTE(review): ``char_indices`` is not defined in any cell of this notebook.
    It presumably has to be the training tokenizer's ``word_index``; confirm
    where it comes from (and whether inputs must be lower-cased first).

    Args:
        text_X: sequence of strings to score.

    Returns:
        List with one prediction per input string.

    Raises:
        KeyError: if a string contains a character missing from ``char_indices``.
    """
    # The builtin `int` replaces the `np.int` alias, which NumPy >= 1.24 no longer provides.
    X = np.zeros((len(text_X), maxlen), dtype=int)
    for row, text in enumerate(text_X):
        # Mirror the training-time encoding (pad_sequences defaults): over-long
        # inputs keep their LAST `maxlen` characters, shorter ones are
        # left-padded with zeros.
        tail = text[max(len(text) - maxlen, 0):]
        first_col = maxlen - len(tail)
        for col, char in enumerate(tail, first_col):
            X[row, col] = char_indices[char]
    preds = [pred[0] for pred in model.predict(X)]
    return preds

In [None]:
# Model Evaluation
# Decision threshold: a score above it counts as a positive prediction, a score
# at or below it as a negative prediction.
threshold = 0.5

def evaluate_model(positives, negatives_train, negatives_dev, negatives_test, threshold):
    """Print the miss rate on the positives and the false-positive rate per negative split.

    A positive scored at or below ``threshold`` is a false negative; a negative
    scored above ``threshold`` is a false positive. Scores come from ``predicts``.

    Args:
        positives: items that should be scored above the threshold.
        negatives_train: negative items of the train split.
        negatives_dev: negative items of the dev split.
        negatives_test: negative items of the test split.
        threshold: decision threshold applied to the scores.
    """
    # Positives the model fails to recognise.
    missed = float(sum(1 for score in predicts(positives) if score <= threshold))
    print(missed / len(positives), "false negatives for positives set.")

    # Score all three negative splits first, then report them together.
    splits = (negatives_train, negatives_dev, negatives_test)
    false_alarms = [
        float(sum(1 for score in predicts(split) if score > threshold))
        for split in splits
    ]
    alarms_train, alarms_dev, alarms_test = false_alarms

    print(alarms_train / len(negatives_train), "false positive rate for negative train.")
    print(alarms_dev / len(negatives_dev), "false positive rate for negative dev.")
    print(alarms_test / len(negatives_test), "false positive rate for negative test.")

# Report the error rates at the threshold defined above.
evaluate_model(positives, negatives_train, negatives_dev, negatives_test, threshold)

In [None]:
# Getting predictions on negative_dev set to find a suitable threshold value.

# defining the false positive rate which we can change.
fp_rate = 0.01

print("Getting threshold for fp_rate", fp_rate)
preds = predicts(negatives_dev)
preds.sort()
# Take the score at the (1 - fp_rate) quantile of the sorted dev scores, so that
# at most a fraction fp_rate of the dev negatives score above it. For very small
# fp_rate the ceil() can reach len(preds); clamp to the last valid index (the
# largest score, i.e. zero dev false positives) instead of raising IndexError.
fp_index = min(math.ceil((len(negatives_dev) * (1 - fp_rate))), len(preds) - 1)
threshold = preds[fp_index]

print("Using threshold", threshold) 

evaluate_model(positives, negatives_train, negatives_dev, negatives_test, threshold)

### Bloom Filter 

In [None]:
##  Adapted from https://www.geeksforgeeks.org/bloom-filters-introduction-and-python-implementation/ 

import math 
import mmh3 
from bitarray import bitarray 

class BloomFilter(object):
    '''
    Class for Bloom filter, using murmur3 hash function
    '''

    def __init__(self, items_count, fp_prob):
        '''
        items_count : int
            Number of items expected to be stored in bloom filter
        fp_prob : float
            False Positive probability in decimal
        '''
        # False positive probability in decimal
        self.fp_prob = fp_prob

        # Size of bit array to use
        self.size = self.get_size(items_count, fp_prob)

        # number of hash functions to use
        self.hash_count = self.get_hash_count(self.size, items_count)

        # Bit array of given size
        self.bit_array = bitarray(self.size)

        # Number of bytes needed to hold `size` bits (eight per byte, rounded up)
        self.byte_size = (self.size + 7) // 8

        # initialize all bits as 0
        self.bit_array.setall(0)

    def add(self, item):
        '''
        Add an item in the filter
        '''
        for seed in range(self.hash_count):
            # The loop index is the seed passed to mmh3.hash(), so every pass
            # uses a different hash. Python's % always yields a value in
            # [0, size), even when the hash is negative.
            digest = mmh3.hash(item, seed) % self.size

            # set the bit True in bit_array
            self.bit_array[digest] = True

    def check(self, item):
        '''
        Check for existence of an item in filter.
        Returns False if the item was definitely never added, True if it may
        have been added.
        '''
        for seed in range(self.hash_count):
            digest = mmh3.hash(item, seed) % self.size
            if not self.bit_array[digest]:
                # if any of the bits is unset the item is not present in the
                # filter; otherwise there is a probability that it exists
                return False
        return True

    @classmethod
    def get_size(cls, n, p):
        '''
        Return the size of bit array(m) to used using
        following formula
        m = -(n * lg(p)) / (lg(2)^2)
        n : int
            number of items expected to be stored in filter
        p : float
            False Positive probability in decimal

        The result is never smaller than 1, so that an empty filter (n == 0)
        still has a bit to index and add()/check() do not divide by zero.
        '''
        m = -(n * math.log(p)) / (math.log(2) ** 2)
        return max(1, int(m))

    @classmethod
    def get_hash_count(cls, m, n):
        '''
        Return the hash function(k) to be used using
        following formula
        k = (m/n) * lg(2)

        m : int
            size of bit array
        n : int
            number of items expected to be stored in filter

        The result is never smaller than 1: with zero hash functions check()
        would report every item as present. n <= 0 also yields 1 instead of a
        division by zero.
        '''
        if n <= 0:
            return 1
        k = (m / n) * math.log(2)
        return max(1, int(k))


### Defining Deep Bloom Model

In [None]:
# Defining the threshold value
fp_rate = 0.01
print("Getting threshold for fp_rate", fp_rate)
preds = predicts(negatives_dev)
preds.sort()
# The model is tuned to fp_rate / 2 here; the other half of the budget is the
# fp_rate / 2 given to the backup Bloom filter. For very small fp_rate the
# ceil() can reach len(preds); clamp to the last valid index (the largest
# score) instead of raising IndexError.
fp_index = min(math.ceil((len(negatives_dev) * (1 - fp_rate/2))), len(preds) - 1)
threshold = preds[fp_index]
print("The threhold value to use is:", threshold)

In [None]:
def create_bloom_filter(data):
    """Build the backup Bloom filter from the items the model misses.

    Every item of ``data`` is scored with ``predicts``; items scored at or
    below the notebook-level ``threshold`` are inserted into a ``BloomFilter``
    created for that many items and a false-positive probability of
    ``fp_rate / 2``.

    Args:
        data: sequence of items to score.

    Returns:
        The populated ``BloomFilter``.
    """
    print("Creating bloom filter")
    # Score everything in one batch.
    scores = predicts(data)
    missed = [data[idx] for idx in range(len(data)) if scores[idx] <= threshold]
    print("Number of false negatives at bloom time", len(missed))

    bloom = BloomFilter(len(missed), fp_rate / 2)
    for entry in missed:
        bloom.add(entry)
    print("Created bloom filter")
    return bloom

# Backup filter holding the positives that the model scores at or below the threshold.
bloom_filter = create_bloom_filter(positives)

In [None]:
# Function to predict the output from the machine learning model
def predict(text_x):
    """Score a single string with the notebook-level ``model``.

    The string is encoded into one row of length ``maxlen`` using the
    notebook-level ``char_indices`` mapping (character -> integer index).

    NOTE(review): ``char_indices`` is not defined in any cell of this notebook.
    It presumably has to be the training tokenizer's ``word_index``; confirm
    where it comes from (and whether inputs must be lower-cased first).

    Args:
        text_x: string to score.

    Returns:
        The model's prediction for the string (``pred[0][0]``).

    Raises:
        KeyError: if the string contains a character missing from ``char_indices``.
    """
    # The builtin `int` replaces the `np.int` alias, which NumPy >= 1.24 no longer provides.
    x = np.zeros((1, maxlen), dtype=int)
    # Mirror the training-time encoding (pad_sequences defaults): over-long
    # inputs keep their LAST `maxlen` characters, shorter ones are left-padded
    # with zeros.
    tail = text_x[max(len(text_x) - maxlen, 0):]
    first_col = maxlen - len(tail)
    for col, char in enumerate(tail, first_col):
        x[0, col] = char_indices[char]
    pred = model.predict(x)
    return pred[0][0]


def check_item(item):
    """Membership test of the combined model + Bloom filter.

    The model is asked first; only items it scores at or below ``threshold``
    are looked up in the backup ``bloom_filter``.

    Returns:
        True if the model accepts the item, otherwise the result of
        ``bloom_filter.check(item)``.
    """
    accepted_by_model = predict(item) > threshold
    return True if accepted_by_model else bloom_filter.check(item)

#### Test Deep Bloom Model

In [None]:
# Report the footprint of the backup Bloom filter.
print("Bloom filter bits needed", bloom_filter.size)
print("Bloom fiter size in bytes", bloom_filter.byte_size)
print("Hash functions needed", bloom_filter.hash_count)
    
# End-to-end false-positive rate on the held-out negatives: every negative that
# check_item() accepts is a false positive.
# NOTE(review): check_item() calls model.predict once per item, which is slow
# for a large negatives_test; the batched predicts() may be an option.
false_positives = 0.0
for neg in negatives_test:
    if check_item(neg):
        false_positives += 1
print("Test false positive rate: ", str(false_positives / len(negatives_test)))