In [None]:
import csv
import os

import matplotlib.pyplot as plt
import numpy
import pandas
import sklearn
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_table.html
# Load the tab-separated SMS corpus into two columns: 'Class' (the label string,
# e.g. 'ham') and 'Text' (the raw message).
# quoting=csv.QUOTE_NONE: the messages are free text, not CSV-quoted fields. With the
# default quoting, a message that begins with a double quote is parsed as a quoted
# field, which strips the quotes and can merge several records into a single row.
input_data = pandas.read_table(
    'Inputs/SMSSpamCollection.txt',
    header=None,
    names=['Class', 'Text'],
    quoting=csv.QUOTE_NONE,
)

In [None]:
input_data.head()

In [None]:
input_data['labels'] = input_data.apply(lambda r:0 if r['Class']=='ham' else 1,axis=1)

In [None]:
input_data['lower'] = input_data['Text'].str.lower()

In [None]:
print("{} Records".format(len(input_data))) 

In [None]:
max_words=150

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words, lower=True)

In [None]:
tokenizer.fit_on_texts(input_data['lower'])

In [None]:
class_data_tokens = tokenizer.texts_to_sequences(input_data['lower'])

In [None]:
class_data_tokens = tf.keras.preprocessing.sequence.pad_sequences(class_data_tokens, maxlen=max_words)

In [None]:
input_data['preprocessed'] = list(class_data_tokens)

In [None]:
input_data.head()

In [None]:
# Hold out 20% of the records for testing, then 20% of the remainder for validation
# (roughly 64% / 16% / 20% overall).
# - random_state makes the partition identical on every Restart & Run All.
# - stratify keeps the ratio of the 'labels' classes the same in every partition.
# - the intermediate frame gets its own name so train_df is never derived from itself.
SPLIT_SEED = 42
train_val_df, test_df = train_test_split(
    input_data,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=input_data['labels'],
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=train_val_df['labels'],
)

In [None]:
train_labels = numpy.array(train_df['labels'].values)
val_labels = numpy.array(val_df['labels'].values)
test_labels = numpy.array(test_df['labels'].values)

In [None]:
train_labels

In [None]:
train_features = numpy.array([a for a in train_df['preprocessed'].values])
val_features = numpy.array([a for a in val_df['preprocessed'].values])
test_features = numpy.array([a for a in test_df['preprocessed'].values])

In [None]:
print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

In [None]:
# Pre-trained GloVe word vectors.
# Project page: https://nlp.stanford.edu/projects/glove/
# Source code:  https://github.com/stanfordnlp/GloVe
# Archive: glove.6B.zip (Wikipedia 2014 + Gigaword 5; 6B tokens, 400K vocab, uncased, 822 MB download).
# This notebook loads the 50-dimensional vectors (glove.6B.50d.txt); the dimension
# must match embed_size, which is used when the embedding matrix is built.
GLOVE_EMBEDDING = "Inputs/glove.6B.50d.txt"

In [None]:
# Lookup table from GloVe token to its float32 vector, parsed from the text file
# (one token per line, followed by its space-separated coefficients).
embeddings_index = {}

with open(GLOVE_EMBEDDING, encoding='utf8') as glove_file:
    for raw_line in glove_file:
        # First field is the token; everything after it is the vector.
        token, *coefficients = raw_line.rstrip().split(' ')
        embeddings_index[token] = numpy.asarray(coefficients, dtype='float32')
 

In [None]:
word_index = tokenizer.word_index
 
num_words = min(max_words, len(word_index) + 1)

In [None]:
embed_size = 50
embedding_matrix = numpy.zeros((num_words, embed_size), dtype='float32')

In [None]:
for word, i in word_index.items():
 
    if i >= max_words:
        continue
 
    embedding_vector = embeddings_index.get(word)
 
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
input = tf.keras.layers.Input(shape=(max_words,))

In [None]:
x = tf.keras.layers.Embedding(max_words, embed_size, weights=[embedding_matrix], trainable=False)(input)

In [None]:
x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True, dropout=0.1,
                                                      recurrent_dropout=0.1))(x)

In [None]:
x = tf.keras.layers.Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)

In [None]:
avg_pool = tf.keras.layers.GlobalAveragePooling1D()(x)
max_pool = tf.keras.layers.GlobalMaxPooling1D()(x)

In [None]:
x = tf.keras.layers.concatenate([avg_pool, max_pool])

In [None]:
preds = tf.keras.layers.Dense(1, activation="sigmoid")(x)

In [None]:
model = tf.keras.Model(input, preds)

In [None]:
model.summary()

In [None]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
] 

In [None]:
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=METRICS)

In [None]:
batch_size = 128
 
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

In [None]:
cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [None]:
callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir='logs'),
    cp_callback
]

In [None]:
# Train for 20 epochs with shuffling, passing the validation split as validation_data
# and the callbacks configured above.
history = model.fit(
    x=train_features,
    y=train_labels,
    batch_size=batch_size,
    epochs=20,
    shuffle=True,
    validation_data=(val_features, val_labels),
    callbacks=callbacks,
)

In [None]:
import matplotlib.pyplot as plt

history_dict = history.history


epochs = range(1, len(history_dict['accuracy']) + 1)

plt.figure(figsize=(12,9))
plt.plot(epochs, history_dict['loss'], 'bo', label='Training loss')
# plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(12,9))
plt.plot(epochs, history_dict['accuracy'], 'bo', label='Training acc')
# plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim((0.5,1))
plt.show()

In [None]:
plt.figure(figsize=(12,9))
plt.plot(epochs, history_dict['precision'], 'bo', label='Training prec')
# plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training precision')
plt.xlabel('Epochs')
plt.ylabel('Precision')
plt.legend(loc='lower right')
plt.ylim((0.5,1))
plt.show()

In [None]:
plt.figure(figsize=(12,9))
plt.plot(epochs, history_dict['recall'], 'bo', label='Training reca')
#plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training recall')
plt.xlabel('Epochs')
plt.ylabel('Recall')
plt.legend(loc='lower right')
# plt.ylim((0.5,1))
plt.show()

In [None]:
# Restore the most recently written checkpoint before scoring the held-out test set.
latest = tf.train.latest_checkpoint(checkpoint_dir)
if latest is None:
    # latest_checkpoint returns None when the directory holds no checkpoint; fail with
    # a clear message instead of passing None on to load_weights.
    raise FileNotFoundError(
        "No checkpoint found in '{}'; run the training cell first.".format(checkpoint_dir))

model.load_weights(latest)

# These are test-set results (not validation results), so they are named accordingly.
# return_dict=True keys each value by metric name, so every compiled metric can be
# reported instead of only the loss.
test_results = model.evaluate(test_features, test_labels,
                              batch_size=batch_size, return_dict=True)
print("Loss: {:0.4f}".format(test_results['loss']))
for metric_name, metric_value in test_results.items():
    if metric_name != 'loss':
        print("{}: {:0.4f}".format(metric_name, metric_value))