# Spam detection using a multi-layer perceptron

#### **Import libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from sklearn.metrics import classification_report, confusion_matrix, precision_score, accuracy_score

#### **Data Loading**

In [None]:
# Load the tab-separated, header-less corpus; column 1 is the label, column 2 the message text.
column_names = ['spam', 'email']
df = pd.read_csv('spam_emails', sep='\t', header=None, names=column_names)
df.shape

In [None]:
# Encode the textual labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['spam'] = df['spam'].map(label_codes)
print(df.shape)
df.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# Model input: the 'email' column of df (the message text).
X = df['email']

In [None]:
# Target: the 'spam' column of df (0 = ham, 1 = spam after the label mapping).
Y = df['spam']

#### **Data Cleaning**

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Remove exact duplicate (label, message) rows.
df.drop_duplicates(inplace=True)

# X and Y were taken from df before this point, so they would still contain the
# duplicate rows and carry an index that no longer matches df. Re-derive them
# here so everything downstream (plots, vectorizer, models) sees the
# deduplicated data.
X = df['email']
Y = df['spam']

#### **Data Visualization**

In [None]:
# Number of messages per class (0 = ham, 1 = spam).
Y.value_counts()

In [None]:
# Pie chart of the class balance.
# value_counts() orders classes by frequency, so a fixed label list is only
# correct while class 0 happens to be the majority. Derive each slice label from
# the class code found in the index instead.
class_counts = Y.value_counts()
class_names = {0: 'non-spam', 1: 'spam'}
slice_labels = [class_names[code] for code in class_counts.index]

colors = ['#BBD2EC', '#A7AED3']
plt.pie(class_counts, labels=slice_labels, autopct="%0.2f", colors=colors)
plt.show()
#plt.savefig("spam.png")

In [None]:
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

# Word cloud of all spam messages.
# The row mask is built from df's own 'spam' column rather than from Y: Y was
# taken before duplicates were dropped, so its index can differ from df's and
# pandas would have to realign the boolean mask.
spam_text = df.loc[df['spam'] == 1, 'email'].str.cat(sep=" ")
spam_words = wc.generate(spam_text)
plt.figure(figsize=(15,6))
plt.imshow(spam_words)
#plt.savefig("spam_words.png")

In [None]:
# Word cloud of all non-spam (ham) messages, reusing the `wc` generator
# configured in the previous cell.
# The row mask is built from df's own 'spam' column rather than from Y: Y was
# taken before duplicates were dropped, so its index can differ from df's and
# pandas would have to realign the boolean mask.
non_spam_text = df.loc[df['spam'] == 0, 'email'].str.cat(sep=" ")
non_spam_words = wc.generate(non_spam_text)
plt.figure(figsize=(15,6))
plt.imshow(non_spam_words)
#plt.savefig("non_spam_words.png")

#### **TF-IDF matrix**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF document-term matrix for every message in X.
# max_df=0.8: ignore terms that appear in more than 80% of the documents.
tf = TfidfVectorizer(max_df=0.8) # ignore terms that appear in more than 80% of the documents
# `message` holds one row per document and one column per vocabulary term.
message = tf.fit_transform(X)

In [None]:
# Inspect the vocabulary learned by the fitted vectorizer.
tf.get_feature_names_out()

#### **CountVectorizer**

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer

# cv = CountVectorizer()
# message = cv.fit_transform(X)
# cv.get_feature_names_out()

#### **Data Splitting** (test and train)

In [None]:
# Split the raw text first (80% train / 20% test, fixed seed), then fit the
# TF-IDF vectorizer on the training messages only. Fitting on the full corpus
# before splitting lets test-set vocabulary and document frequencies leak into
# the training features and inflates the reported scores.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

X_train = tf.fit_transform(X_train_text)  # learn vocabulary + IDF from train only
X_test = tf.transform(X_test_text)        # reuse the train vocabulary + IDF


#### **MLP**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers

In [None]:
# Feed-forward binary classifier: three ReLU hidden layers (100 -> 64 -> 128)
# followed by a single sigmoid output unit. Only the first layer's kernel is
# L1-regularised.
n_features = X_train.shape[1]
model = Sequential([
    Dense(units=100,
          input_dim=n_features,
          activation='relu',
          kernel_regularizer=regularizers.L1(l1=1e-5)),
    Dense(units=64, activation='relu'),
    Dense(units=128, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy tracked as metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 40 epochs on the densified training matrix, with
# validation_split=0.2 reserving part of the training data for validation.
X_train_dense = X_train.toarray()
history = model.fit(
    X_train_dense,
    y_train,
    epochs=40,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Plot training vs. validation loss per epoch and save the figure.
epochs = history.epoch
loss_curves = (
    ('loss', "training set"),        # blue
    ('val_loss', "validation set"),  # orange
)
for history_key, curve_label in loss_curves:
    plt.plot(epochs, history.history[history_key], label=curve_label)
plt.title('loss')
plt.legend()
plt.savefig("loss.png")

In [None]:
# Plot training vs. validation accuracy per epoch and save the figure.
plt.plot(epochs, history.history['accuracy'], label="training set") #blue
plt.plot(epochs, history.history['val_accuracy'], label="validation set") #orange
plt.title('accuracy')
# Draw the legend before saving; otherwise accuracy.png is written without it.
plt.legend()
plt.savefig("accuracy.png")

In [None]:
# Evaluate the compiled loss and metrics on the held-out test set (dense input, batches of 64).
model.evaluate(X_test.toarray(), y_test, batch_size=64)

In [None]:
# Model outputs (sigmoid activations) for every test message.
y_pred = model.predict(X_test.toarray())

#### **Metrics**

In [None]:
import seaborn as sns

# Threshold the sigmoid outputs at 0.5 (>= 0.5 -> spam) in one vectorized step
# and flatten to a 1-D label vector.
y_pred_new = (~(y_pred < 0.5)).astype(int).ravel()

cm=confusion_matrix(y_test, y_pred_new)
plt.figure(dpi=100)
# fmt='d' prints the integer counts as-is; the default format switches to
# scientific notation for larger counts.
sns.heatmap(cm, annot=True, fmt='d')
plt.title("Confusion matrix")
plt.show()

In [None]:
# Per-class precision/recall/F1 for the MLP, followed by overall accuracy in percent.
print(classification_report(y_test, y_pred_new))
mlp_accuracy = accuracy_score(y_test, y_pred_new)
print(f"Accuracy: {mlp_accuracy * 100:.2f}%")

In [None]:
# Precision of the MLP on the positive (spam = 1) class.
precision_score(y_test, y_pred_new)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Baseline 1: multinomial naive Bayes trained on the same sparse features.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred3 = mnb.predict(X_test)
mnb_accuracy = accuracy_score(y_test, y_pred3)
print(f"Accuracy: {mnb_accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred3))

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline 2: logistic regression (C=2, up to 1000 iterations, all cores).
lr = LogisticRegression(max_iter=1000, C=2, n_jobs=-1).fit(X_train, y_train)
y_pred4 = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred4)
print(f"Accuracy: {lr_accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred4))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline 3: random forest of 300 depth-limited trees with balanced class weights.
forest_params = {
    'n_estimators': 300,
    'max_depth': 15,
    'random_state': 42,
    'class_weight': 'balanced',
}
rfc = RandomForestClassifier(**forest_params).fit(X_train, y_train)
y_pred5 = rfc.predict(X_test)
rfc_accuracy = accuracy_score(y_test, y_pred5)
print(f"Accuracy: {rfc_accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred5))