In [278]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import string
import docx

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental import preprocessing

In [279]:
# Stream train.csv as batches of (feature dict, label); the label is read from
# the 'spam' column and the remaining columns become the feature dict.
csv_ds = tf.data.experimental.make_csv_dataset(
    'train.csv',
    batch_size=5, # Artificially small to make examples easier to show.
    label_name='spam',
    num_epochs=1,
    # make_csv_dataset shuffles by default and draws a fresh order on every
    # iteration. Splitting such a dataset with take()/skip() produces
    # overlapping train/validation sets, so keep file order here; any
    # shuffling belongs after the split.
    # NOTE(review): if train.csv is sorted by label, shuffle the file on disk
    # once so both splits see both classes.
    shuffle=False,
    ignore_errors=True)

In [252]:
# take(1200) keeps the first 1200 batches of an iteration over csv_ds;
# skip(1200) yields whatever batches come after them.
train_ds, val_ds = csv_ds.take(1200), csv_ds.skip(1200)

In [253]:
AUTOTUNE = tf.data.AUTOTUNE

# Same input pipeline for both splits: cache the elements after the first pass
# and let tf.data pick the prefetch buffer size.
train_ds, val_ds = (
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds)
)

In [254]:
# Show the dataset's element structure (feature names, shapes and dtypes).
train_ds

<PrefetchDataset shapes: (OrderedDict([(id, (None,)), (subject, (None,)), (email, (None,))]), (None,)), types: (OrderedDict([(id, tf.int32), (subject, tf.string), (email, tf.string)]), tf.int32)>

In [255]:
# Standalone example: a lookup table mapping 1,000 token ids to 5-dimensional
# vectors.
# NOTE(review): no later cell shown here references embedding_layer -- confirm
# whether it is still needed.
embedding_layer = layers.Embedding(input_dim=1000, output_dim=5)

In [256]:
# Create a custom standardization function to strip HTML break tags '<br />'.
def custom_standardization(input_data):
  """Normalize raw text before tokenization.

  Lowercases the text, replaces each '<br />' with a space, then deletes
  every character listed in string.punctuation.

  Args:
    input_data: string tensor holding the raw text.

  Returns:
    String tensor with the three transformations applied.
  """
  # Character class matching any single ASCII punctuation character.
  punctuation_class = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_class, '')

In [257]:
# Upper bound on vocabulary entries, and the fixed number of tokens each
# text is padded or truncated to.
vocab_size = 10000
sequence_length = 100

# TextVectorization standardizes each string with custom_standardization,
# splits it into tokens and maps the tokens to integer ids. A fixed output
# length is needed because the texts differ in length.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int')

# The vocabulary is built from the 'email' column only, so drop the labels
# and the other feature columns before calling adapt.
text_ds = train_ds.map(lambda features, _label: features['email'])
vectorize_layer.adapt(text_ds)

text_ds

<MapDataset shapes: (None,), types: tf.string>

In [258]:
# Width of each learned token vector.
embedding_dim=16

# Pipeline: vectorize text -> embed token ids -> average over the sequence
# -> small hidden layer -> single output unit.
model = tf.keras.Sequential(layers=[
  vectorize_layer,
  layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding"),
  layers.GlobalAveragePooling1D(),
  layers.Dense(units=16, activation='relu'),
  layers.Dense(units=1),
])

In [259]:
# Callback that writes TensorBoard logs into the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard("logs")

In [260]:
# The final Dense(1) layer has no activation, so the model emits raw logits.
# The loss is told so via from_logits=True; the accuracy metric must match.
# The string 'accuracy' resolves to binary accuracy with a 0.5 threshold,
# which is the wrong decision boundary for logits (probability 0.5 is
# logit 0.0). The metric keeps the name 'accuracy' so History/TensorBoard keys
# are unchanged.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0,
                                                       name='accuracy')])

In [280]:
# train_ds / val_ds yield (feature dict, label) batches, but the model's first
# layer is a TextVectorization layer that takes a string tensor. Feed it only
# the 'email' text -- the same column the vocabulary was adapted on.
def _email_and_label(features, label):
  """Select the 'email' column as model input and pass the label through."""
  return features['email'], label

model.fit(
    train_ds.map(_email_and_label),
    validation_data=val_ds.map(_email_and_label),
    epochs=5,
    callbacks=[tensorboard_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f36c051bb50>

In [206]:
# Print the layer-by-layer architecture and parameter counts.
# NOTE(review): the captured output shows 16,000 embedding parameters, which does
# not match vocab_size = 10000 with embedding_dim = 16 -- it looks like it came
# from an earlier run; re-run after Restart & Run All.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_23 (TextV (None, 100)               0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 16)           16000     
_________________________________________________________________
global_average_pooling1d_5 ( (None, 16)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 17        
Total params: 16,289
Trainable params: 16,289
Non-trainable params: 0
_________________________________________________________________
