In [0]:
# Colab-only step: prompt for a file upload into the runtime's working directory.
# Presumably used to upload ling-spam.zip, which a later cell unzips — confirm.
from google.colab import files
files.upload()

In [0]:
# List the working directory to check what is present before unzipping.
!ls

In [0]:
!unzip ling-spam.zip

In [0]:
import pathlib
import tensorflow as tf
import re
import io
import pandas as pd
import tensorflow_datasets as tfds

from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Bidirectional, LSTM
from tensorflow.keras import Sequential

In [0]:
# Directories holding the training and test mails, relative to the working
# directory (presumably created by unzipping ling-spam.zip — confirm).
TRAIN_DIR = pathlib.Path('train-mails')
TEST_DIR  = pathlib.Path('test-mails')

In [0]:
def _mail_paths_frame(mail_dir):
  """Return a one-column DataFrame ('file_path') listing the .txt files in mail_dir.

  Paths are sorted so the row order does not depend on filesystem listing
  order, and a missing/empty directory fails here with a clear message
  instead of a pandas "Length mismatch" error.

  Raises:
    FileNotFoundError: if mail_dir contains no .txt files.
  """
  paths = sorted(str(path) for path in mail_dir.glob("*.txt"))
  if not paths:
    raise FileNotFoundError("no .txt files found in {}".format(mail_dir))
  return pd.DataFrame({'file_path': paths})


# Get text files.
train_df = _mail_paths_frame(TRAIN_DIR)
test_df = _mail_paths_frame(TEST_DIR)

In [0]:
# Preview the first rows of the training file list.
train_df.head()

In [0]:
# remove useless text
import unicodedata
def unicode_to_ascii(s):
  """Return s with combining accent marks removed.

  The string is NFD-decomposed so accented letters become a base letter
  followed by combining marks; every character in Unicode category 'Mn'
  is then dropped. All other characters are kept unchanged.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept_chars)


def remove_junk_text(w):
  """Lower-case w, strip accents and reduce it to letters and basic punctuation.

  Steps, in order: lower-case and trim; remove accents via unicode_to_ascii;
  surround each of ? . ! , ¿ with spaces; collapse runs of quotes/spaces;
  replace every run of other characters with one space; trim again.
  """
  cleaned = unicode_to_ascii(w.lower().strip())
  substitutions = (
      (r"([?.!,¿])", r" \1 "),      # pad punctuation so it becomes its own token
      (r'[" "]+', " "),             # collapse runs of double quotes / spaces
      (r"[^a-zA-Z?.!,¿]+", " "),    # everything else becomes a single space
  )
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.strip()

In [0]:
def get_text(file_path):
  """Read one mail file as UTF-8 and return its cleaned text.

  Args:
    file_path: path of the text file to read.

  Returns:
    The file contents after stripping and passing through remove_junk_text.
  """
  # Use a context manager so the file handle is closed even if reading fails;
  # this runs once per mail, so unclosed handles would otherwise pile up.
  with io.open(file_path, encoding='UTF-8') as mail_file:
    raw_text = mail_file.read().strip()
  return remove_junk_text(raw_text)

In [0]:
def get_label(file_path):
  """Derive the class label from a mail's file name.

  Args:
    file_path: path string; either '/' or '\\' may separate directories,
      so paths produced by str(pathlib.Path) are handled on any OS.

  Returns:
    0 if the file name starts with "spm", otherwise 1.
  """
  # Normalise separators before taking the last component; splitting on "/"
  # alone would treat a whole backslash-separated path as the file name.
  file_name = file_path.replace("\\", "/").rsplit("/", 1)[-1]
  return 0 if file_name.startswith("spm") else 1
    
# Add the cleaned mail body and its label to both splits, train first.
for frame in (train_df, test_df):
  frame['text'] = frame['file_path'].apply(get_text)
  frame['label'] = frame['file_path'].apply(get_label)

In [0]:
# Preview the test split with its new 'text' and 'label' columns.
test_df.head()

In [0]:
# The path column is no longer needed once text and label are extracted.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because 'file_path' has already been dropped.
train_df = train_df.drop(columns=['file_path'], errors='ignore')
test_df = test_df.drop(columns=['file_path'], errors='ignore')

In [0]:
# Buffer size passed to Dataset.shuffle in the cells below.
BUFFER_SIZE = 50000
# Number of examples per padded batch.
BATCH_SIZE = 64
# NOTE(review): not referenced in the visible cells — confirm whether it is still needed.
TAKE_SIZE = 5000

In [0]:
# Wrap each split's (text, label) columns in a tf.data pipeline. The shuffle
# order is fixed after the first pass (reshuffle_each_iteration=False).
train_list_ds = (
    tf.data.Dataset
    .from_tensor_slices((train_df.text, train_df.label))
    .shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)
)
test_list_ds = (
    tf.data.Dataset
    .from_tensor_slices((test_df.text, test_df.label))
    .shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)
)

In [0]:
# Print ten test examples as "<label> <text>" to sanity-check the pipeline.
for sample_text, sample_label in test_list_ds.take(10):
  print(sample_label.numpy(), sample_text.numpy())

In [0]:
# Newer TFDS releases expose the subword encoder under tfds.deprecated.text,
# older ones under tfds.features.text. Resolve whichever is available so the
# cell runs on both.
try:
  _subword_encoder_cls = tfds.deprecated.text.SubwordTextEncoder
except AttributeError:
  _subword_encoder_cls = tfds.features.text.SubwordTextEncoder

# Learn a subword vocabulary (target size 2**15) from the training texts only.
tokenizer = _subword_encoder_cls.build_from_corpus(
    (text.numpy() for text, label in train_list_ds), target_vocab_size=2**15)

In [0]:
# Size of the learned vocabulary; also used as the Embedding input dimension below.
tokenizer.vocab_size

In [0]:
# Round-trip check: encoding then decoding must reproduce the input exactly.
sample_string = 'Transformer is awesome.'

tokenized_string = tokenizer.encode(sample_string)
original_string = tokenizer.decode(tokenized_string)

print('Tokenized string is {}'.format(tokenized_string))
print('The original string: {}'.format(original_string))

assert original_string == sample_string

In [0]:
def encode(text_tensor, label):
  """Tokenise one example eagerly.

  Args:
    text_tensor: eager tensor holding the mail text; its .numpy() value is
      passed to the module-level tokenizer.
    label: passed through unchanged.

  Returns:
    (token ids from tokenizer.encode, label).
  """
  raw_text = text_tensor.numpy()
  return tokenizer.encode(raw_text), label

def encode_map_fn(text, label):
  """Dataset.map wrapper that runs encode through tf.py_function.

  Returns:
    (token_ids, label): both declared int64; token_ids is given a rank-1
    shape of unknown length and label a scalar shape.
  """
  token_ids, label_out = tf.py_function(
      func=encode,
      inp=[text, label],
      Tout=(tf.int64, tf.int64),
  )
  # Re-attach the static shapes explicitly after the py_function call.
  token_ids.set_shape([None])
  label_out.set_shape([])
  return token_ids, label_out

# Apply the tokenising wrapper to every (text, label) pair of both splits.
all_encoded_train_data = train_list_ds.map(encode_map_fn)
all_encoded_test_data = test_list_ds.map(encode_map_fn)

In [0]:
# Shuffle each encoded split, then group it into batches of BATCH_SIZE with
# the token sequences padded to a common length per batch.
train_data = all_encoded_train_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)

test_data = all_encoded_test_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)


In [0]:
# Take one training batch, keep its 11th example (index 10) in `text`/`label`
# for the next cells, and print the batch shapes.
for batch_tokens, batch_labels in train_data.take(1):
  text, label = batch_tokens[10], batch_labels[10]
  print(batch_tokens.shape, batch_labels.shape)

In [0]:
# Display the label of the example picked from the training batch above.
label

In [0]:
# Decode the picked example's token ids back to text to inspect the encoding.
tokenizer.decode(text.numpy())

In [0]:
embedding_dim = 64

# Subword embedding -> bidirectional LSTM -> two ReLU layers -> one output
# unit with no activation (raw logit).
# NOTE(review): batches are padded but the Embedding does not mask padding;
# consider mask_zero=True — confirm that id 0 is the padding id first.
model = Sequential()
model.add(Embedding(tokenizer.vocab_size, embedding_dim))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))
model.summary()

In [0]:
# The model emits raw logits (the loss uses from_logits=True), so accuracy
# must threshold at 0.0, i.e. probability 0.5. The plain 'accuracy' string
# thresholds the logit itself at 0.5 and therefore mis-reports the metric.
# name='accuracy' keeps the history keys ('accuracy', 'val_accuracy') unchanged.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')],
)

# Train for 10 epochs, evaluating on the test batches after each epoch.
history = model.fit(
    train_data,
    epochs=10,
    validation_data=test_data
)

In [0]:
# Print one full training batch: token ids, labels, then both shapes.
for text, label in train_data.take(1):
  print(text)
  print(label)
  print(text.shape, label.shape)

In [0]:
# Scratch value: a sample file name for trying out the 3-character prefix slice.
string = "spmsgc94.txt"

In [0]:
# First three characters of the sample name — the prefix get_label compares with "spm".
string[:3]

In [0]:
import unicodedata

def remove_junk_text(w):
  """Lower-case w, strip accents and reduce it to letters and basic punctuation.

  Args:
    w: the text as str, or as UTF-8 encoded bytes (for example the value
      of tensor.numpy() on a string tensor). Undecodable bytes are dropped.

  Returns:
    A str in which accented letters are replaced by their base letter, each
    of ? . ! , ¿ is separated by single spaces, every other run of
    characters outside a-z is replaced by one space, and leading/trailing
    whitespace is removed.
  """
  if isinstance(w, bytes):
    # The regexes below use str patterns and would raise TypeError on bytes.
    w = w.decode('utf-8', errors='ignore')

  w = w.lower().strip()

  # Decompose accented letters and drop the combining marks so the base
  # letter survives the filter below. The result is assigned back; computing
  # it without using it leaves the text unchanged.
  w = ''.join(c for c in unicodedata.normalize('NFD', w)
              if unicodedata.category(c) != 'Mn')

  # Pad punctuation so it becomes its own token.
  w = re.sub(r"([?.!,¿])", r" \1 ", w)

  # Replace every other run of characters with ONE SPACE. Replacing with an
  # empty string would also delete the whitespace and glue all words together.
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  return w.strip()

In [0]:
def get_label(file_path):
  """Derive the class label from a file path using tf.strings ops.

  The path is split on "/", the first three characters of the last
  component are taken, and the result is 0 when they equal "spm", else 1.
  """
  file_name = tf.strings.split(file_path, "/")[-1]
  prefix = tf.strings.substr(file_name, 0, 3)
  if prefix == "spm":
    return 0
  else:
    return 1

In [0]:
# def get_email_text(file_path)

In [0]:
def process_path(file_path):
  """Turn a mail file path into a (raw file contents, label) pair.

  The label comes from get_label(file_path); the contents are read with
  tf.io.read_file and returned without any cleaning.
  """
  mail_label = get_label(file_path)
  raw_contents = tf.io.read_file(file_path)
  return raw_contents, mail_label

In [0]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

# process_path takes a single file path, so it must be mapped over a dataset
# of paths. train_list_ds (built earlier) yields (text, label) pairs, and
# mapping over it fails on a fresh kernel.
train_path_ds = tf.data.Dataset.list_files(str(TRAIN_DIR / "*.txt"))
labeled_ds = train_path_ds.map(process_path, num_parallel_calls=AUTOTUNE)

In [0]:
# Show five (contents, label) pairs, each value on its own line.
for text, label in labeled_ds.take(5):
  print(text, label, sep="\n")

In [0]:
def remove_junk(text, label):
  """Clean one example's text eagerly (intended to run inside tf.py_function).

  Args:
    text: eager tensor holding the mail contents.
    label: passed through unchanged.

  Returns:
    (cleaned text as str, label).
  """
  raw_text = text.numpy()
  if isinstance(raw_text, bytes):
    # remove_junk_text applies str regex patterns, which raise TypeError on
    # bytes. Decode first; undecodable bytes are junk and are dropped.
    raw_text = raw_text.decode('utf-8', errors='ignore')
  return remove_junk_text(raw_text), label

In [0]:
def remove_junk_map_fn(text, label):
  """Dataset.map wrapper that runs remove_junk through tf.py_function.

  Returns:
    (cleaned_text, label): cleaned_text is a scalar string tensor; label is
    a scalar with the same dtype as the incoming label.
  """
  cleaned_text, label_out = tf.py_function(
      remove_junk,
      inp=[text, label],
      # remove_junk returns cleaned *text*, so the first output is a string,
      # not int64. The label keeps whatever dtype it arrived with instead of
      # being forced to a hard-coded int8.
      Tout=(tf.string, label.dtype),
  )
  # Both outputs are scalars: one string and one label per example.
  cleaned_text.set_shape([])
  label_out.set_shape([])
  return cleaned_text, label_out

# Clean the text of every (text, label) pair.
# NOTE(review): rebinding labeled_ds makes this cell non-idempotent — re-running
# it maps the cleaning step over the already-mapped dataset.
labeled_ds = labeled_ds.map(remove_junk_map_fn)

In [0]:
# Pull the first (text, label) pair out of the dataset. Unpacking the dataset
# object itself iterates over its elements, and take(1) has only one, so
# `a, b = labeled_ds.take(1)` raises ValueError.
a, b = next(iter(labeled_ds.take(1)))