In [None]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from keras.layers import Dense,LSTM, Embedding, Dropout, Activation, Bidirectional
import matplotlib.pyplot as plt

In [None]:
# Fetch NLTK's 'stopwords' corpus, which text_preprocess reads via stopwords.words('english').
nltk.download('stopwords')

In [None]:
# Load the raw CSV and discard the three unnamed columns, keeping everything else.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = pd.read_csv('spam.csv', encoding="latin").drop(columns=unused_columns)
data.head()

In [None]:
# Take independent copies of the two columns: 'v2' feeds the text pipeline and
# 'v1' is encoded as the target further down, leaving `data` itself untouched.
xdata, ydata = data['v2'].copy(), data['v1'].copy()

In [None]:
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the list only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def text_preprocess(text, stop_words=None):
    """Normalise one message before tokenisation.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is split on whitespace, and stop words are dropped.

    Args:
        text: The raw message as a ``str``.
        stop_words: Optional collection of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used, which requires
            the 'stopwords' corpus to have been downloaded.

    Returns:
        The surviving words joined by single spaces (an empty string when
        nothing survives).
    """
    if stop_words is None:
        # Look the list up once and test membership against a set, instead of
        # calling stopwords.words('english') again for every single word.
        stop_words = _english_stopwords()
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Punctuation is removed before filtering, so a contraction such as
    # "don't" reaches this point as "dont" and is compared in that form.
    return " ".join(word for word in text.split() if word not in stop_words)


In [None]:
# Clean every message with text_preprocess (lower-case, strip punctuation, drop stop words).
xdata= xdata.apply(text_preprocess)
# Preview the first few cleaned messages.
xdata.head()

In [None]:
# Replace each label with its integer category code so it can serve as a binary target.
# NOTE(review): presumably the labels are 'ham'/'spam', which would map to 0/1 — confirm against data['v1'].
ydata = ydata.astype('category').cat.codes
ydata.head()

In [None]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    xdata,
    ydata,
    test_size=0.2,
    random_state=20,
)

In [None]:
# Vocabulary cap passed to the Tokenizer; the same value is used as the
# Embedding layer's input dimension when the model is built.
max_feature = 50000
tokenizer = Tokenizer(num_words=max_feature)
# Fit on the training split only, so the vocabulary is not learned from test messages.
tokenizer.fit_on_texts(xtrain)
# Keep the encoded messages as plain lists of integer word indices. Messages
# differ in length, and wrapping such ragged lists in np.array() raises
# ValueError on NumPy >= 1.24; pad_sequences accepts the lists directly.
x_train_features = tokenizer.texts_to_sequences(xtrain)
x_test_features = tokenizer.texts_to_sequences(xtest)

In [None]:
# Peek at the first few encoded training messages (sequences of integer word indices).
x_train_features[:5]

In [None]:
# Pad the training sequences to the length of the longest training message.
x_train_features = pad_sequences(x_train_features)
# Pad/truncate the test sequences to that same width. Without an explicit
# maxlen they would be padded to the longest *test* message instead, which
# need not match the input width the model is built with.
x_test_features = pad_sequences(x_test_features, maxlen=x_train_features.shape[1])

In [None]:
# (number of training messages, padded sequence length); the second value is the model's input width.
x_train_features.shape

In [None]:
# Size of the vector learned for each vocabulary index.
embedding_length = 32
# Every layer is taken from tf.keras.layers so that the layers and the
# Sequential container come from the same Keras implementation; adding
# standalone `keras.layers` objects to a `tf.keras` model fails on installs
# where the two are separate packages.
model = tf.keras.Sequential()
# Input sequences are padded to x_train_features.shape[1] tokens.
model.add(tf.keras.layers.Embedding(max_feature, embedding_length, input_length=x_train_features.shape[1]))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dropout(0.1))
# Single sigmoid unit: one score in [0, 1] per message, trained with binary cross-entropy.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 20 epochs. The held-out split is supplied as validation data, so the
# val_* entries recorded in `history` are measured on x_test_features / ytest.
history = model.fit(
    x_train_features,
    ytrain,
    batch_size=1024,
    epochs=20,
    validation_data=(x_test_features, ytest),
)

In [None]:
# One figure per tracked metric: the training curve is drawn first, then the
# validation curve (labelled 'test' in the legend).
for metric, title in (('accuracy', 'model accuracy'), ('loss', 'model loss')):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# TESTING: run one hand-picked email through the same preprocessing as the training data.

email = "All good things do not cost money. You can learn online for free! Our team of experts have curated a list of some great programming-related online courses that you can access for free. Check them out in this blog post: Link: https://www.starttechacademy.com/post/free-courses-and-audio-books-for-data-analytics-and-ml Stay Safe and Keep Learning!"

# Wrap the string in a one-element Series so it can go through .apply like xdata did.
email= pd.Series([email])
print(type(email))
# Same cleaning as the training messages: lower-case, strip punctuation, drop stop words.
email=email.apply(text_preprocess)

In [None]:
# Sanity check: the Series holds exactly one message, so this is (1,).
email.shape

In [None]:
# Encode the cleaned email with the tokenizer fitted on the training split.
# A single message yields one row, so np.array() gives a regular 2-D array here.
email_features = np.array(tokenizer.texts_to_sequences(email))
print(email_features.shape)
# Pad/truncate to the width the model was trained on instead of a hard-coded
# number, so this cell keeps working if the data or the split changes.
email_features = pad_sequences(email_features, maxlen=x_train_features.shape[1])

In [None]:
# After padding this should be (1, padded sequence length).
email_features.shape

In [None]:
# The model ends in a single sigmoid unit, so `pred` holds one score in [0, 1] per input row.
# NOTE(review): a score near 1 means label code 1 — presumably spam; confirm against the label encoding.
pred=model.predict(email_features)
print(pred)

In [None]:
# The model handles this example well: the email above was in fact spam.