In [None]:
import os
import string
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [None]:
# download stopwords
# Fetches the NLTK 'stopwords' corpus, which the stopwords.words('english')
# calls in the following cells read from.
import nltk
nltk.download('stopwords')

In [None]:
# Show NLTK's built-in English stop-word list; custom stop words are added in
# the next cell.

print(stopwords.words('english'))

In [None]:
# NLTK's English list plus 'subject:' (colon included). cleanEmail matches stop
# words against lower-cased, whitespace-split tokens before punctuation is
# stripped, so the colon has to be part of the entry for it to match.
my_stopwords = [*stopwords.words('english'), 'subject:']
my_stopwords

## data set path

In [None]:
# Location of the CSV, relative to the current working directory.
emails_path = os.path.join('../../data', 'emails.csv')

In [None]:
# Load the raw e-mail dataset into a DataFrame.
emailDf = pd.read_csv(emails_path)

In [None]:
emailDf.info()

In [None]:
# Row count before de-duplication; used later to report how many rows were dropped.
original_length = len(emailDf)

## explore data

In [None]:
def spamVhamPieChart(df):
    """Draw a pie chart of the spam/ham class balance in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'spam'`` column in which 1 marks spam and 0 marks ham.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``.
    """
    # Count from the frame that was passed in rather than a notebook-level
    # global, so the helper reports on whatever subset the caller supplies.
    spamValCounts = df['spam'].value_counts()
    # .get() avoids a KeyError when one of the two classes is absent.
    hamCount = spamValCounts.get(0, 0)
    spamCount = spamValCounts.get(1, 0)
    labels = ['Spam','Ham']
    sizes = [spamCount,hamCount]
    plt.pie(sizes,labels=labels,autopct='%1.1f%%',startangle=90)
    plt.show()

In [None]:
# we can see that as-is, the dataset is about one-quarter spam emails
# (this chart is drawn before the duplicate rows are removed further down)

spamVhamPieChart(emailDf)

In [None]:
# Print the text of rows 0-4, each followed by an empty line.

for row_label in range(5):
    print(emailDf['text'][row_label], end='\n\n')

## remove duplicates

In [None]:
emailDf.drop_duplicates(keep='first',inplace=True)

In [None]:
emailDf.info()

In [None]:
emailDf.reset_index(drop=True,inplace=True)

In [None]:
original_length - len(emailDf)

## text preprocessing

In [None]:
# cleaning raw text data
# Compiled / built once here rather than on every call.
_URL_PATTERN = re.compile(r"http\S+")
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(string.punctuation))


def cleanEmail(emailText, stop_words=None):
    """Normalise one raw e-mail text for tokenisation.

    Steps, in order:

    1. lower-case the text;
    2. split on whitespace and drop every token found in ``stop_words``
       (tokens still carry their punctuation at this point, which is why an
       entry such as ``'subject:'`` includes the colon);
    3. delete every run of non-whitespace characters starting with ``http``;
    4. delete every character listed in ``string.punctuation``.

    Parameters
    ----------
    emailText : str
        Raw message text. ``None`` or a float NaN yields ``''`` instead of
        raising on ``.lower()``.
    stop_words : iterable of str, optional
        Lower-case tokens to drop. When omitted, the notebook-level
        ``my_stopwords`` list is used.

    Returns
    -------
    str
        The cleaned text. Surviving tokens are joined by single spaces; the
        spaces on either side of a removed URL are left in place.
    """
    # Missing cell: None, or NaN (the only float that is not equal to itself).
    if emailText is None or (isinstance(emailText, float) and emailText != emailText):
        return ''
    if stop_words is None:
        stop_words = my_stopwords
    # Hash lookup per token instead of scanning a list for every word.
    stop_set = frozenset(stop_words)

    # lower case
    lower = emailText.lower()
    # remove stop words
    removeStop = ' '.join(word for word in lower.split() if word not in stop_set)
    # remove hyperlinks
    removeUrl = _URL_PATTERN.sub("", removeStop)
    # remove special characters
    return removeUrl.translate(_PUNCTUATION_TABLE)

In [None]:
copyDf = emailDf.copy()

In [None]:
%%time

emailDf['cleaned_text'] = copyDf.text.apply(lambda x: cleanEmail(x))

In [None]:
emailDf.head()

## split into train/test sets

In [None]:
x_train,x_test,y_train,y_test = train_test_split(emailDf.cleaned_text, emailDf.spam, test_size=0.2)

In [None]:
x_train.shape

## tokenizing cleaned data

In [None]:
# Hyper-parameters for tokenisation, padding and the embedding layer.
EMBED_SIZE = 100      # NOTE(review): not referenced by any other cell visible in this notebook
MAX_FEATURES = 50000  # passed as num_words to the Tokenizer and as the Embedding input dimension
MAX_LEN = 2000        # passed as maxlen to pad_sequences and as the Embedding input_length

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=MAX_FEATURES)

In [None]:
tokenizer.fit_on_texts(x_train)

In [None]:
x_train_features = np.array(tokenizer.texts_to_sequences(x_train), dtype=object)
x_test_features = np.array(tokenizer.texts_to_sequences(x_test), dtype=object)

## padding

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad / truncate both splits to MAX_LEN entries per sequence.
x_train_features, x_test_features = (
    pad_sequences(sequences, maxlen=MAX_LEN)
    for sequences in (x_train_features, x_test_features)
)

## encode target labels

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [None]:
y_train_enc = le.fit_transform(y_train.values)
y_test_enc = le.transform(y_test.values)

# model

In [None]:
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.models import Model

In [None]:
# create the model - note to self, will not work with Python 3.7 version tf, does work with Python 3.6
import tensorflow as tf
embedding_vector_length = 32

# Embedding -> bidirectional LSTM -> small dense head -> sigmoid for binary spam/ham.
model = tf.keras.Sequential()
model.add(Embedding(MAX_FEATURES, embedding_vector_length, input_length=MAX_LEN))
# A 64-unit LSTM here ran out of memory on my 1060, hence 32 units.
model.add(Bidirectional(tf.keras.layers.LSTM(32)))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
model.summary()

In [None]:
# Train on the label-encoded targets (y_train_enc / y_test_enc) so the
# LabelEncoder step is actually what feeds the network; the held-out split is
# also used as validation data during training.
history = model.fit(
    x_train_features,
    y_train_enc,
    batch_size=512,
    epochs=20,
    validation_data=(x_test_features, y_test_enc),
)

## model accuracy

In [None]:
from  matplotlib import pyplot as plt
# Accuracy per epoch on the training data and on the validation data.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.plot(history.history['val_accuracy'])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
ax.grid()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix,f1_score, precision_score,recall_score

In [None]:
y_predict  = [1 if o > 0.5 else 0 for o in model.predict(x_test_features)]

In [None]:
cf_matrix = confusion_matrix(y_test, y_predict)

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt     

ax = plt.subplot(1, 1, 1)
# annot=True writes each count into its cell
sns.heatmap(cf_matrix, annot=True, ax=ax, cmap='Blues', fmt='')

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['Not Spam', 'Spam'])
ax.yaxis.set_ticklabels(['Not Spam', 'Spam'])

In [None]:
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_predict)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_predict)))
print("F1 Score: {:.2f}%".format(100 * f1_score(y_test, y_predict)))

In [None]:
f1_score(y_test, y_predict)

## save model

In [None]:
# Write the trained model to 'spam_or_ham.h5', relative to the current working directory.
model.save('spam_or_ham.h5')