**Mounting drive to access the dataset**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Importing modules**

In [2]:
import pandas as pd
import numpy as np


**Opening the dataset**

In [3]:
#dataset link https://www.kaggle.com/datasets/purusinghvi/email-spam-classification-dataset/versions/1

data=pd.read_csv("/content/drive/MyDrive/spam email dataset.csv")

In [4]:
data

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...
...,...,...
83443,0,hi given a date how do i get the last date of ...
83444,1,now you can order software on cd or download i...
83445,1,dear valued member canadianpharmacy provides a...
83446,0,subscribe change profile contact us long term ...


In [5]:
data=data.dropna()

In [6]:
data.columns

Index(['label', 'text'], dtype='object')

**Creating x and y**

In [7]:
x=data.drop("label", axis=1)
y=data['label']

In [8]:
x.head()

Unnamed: 0,text
0,ounce feather bowl hummingbird opec moment ala...
1,wulvob get your medircations online qnb ikud v...
2,computer connection from cnn com wednesday es...
3,university degree obtain a prosperous future m...
4,thanks for all your answers guys i know i shou...


In [9]:
x.shape

(83448, 1)

**Importing tensorflow modules for preprocessing**

In [2]:
import tensorflow as tf

from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot


In [47]:
#fixing vocabulary size
vocab_size=13000

**Getting maximum sentence length to set sentence length**

In [19]:
sentenceLength=0
for i in data['text']:
  s=i.split(" ")
  if sentenceLength<len(s):
    sentenceLength=len(s)

print(sentenceLength)

101984


**Natural lang library to remove unwanted words**

In [31]:
import nltk
import re
from nltk.corpus import stopwords

nltk.download('stopwords')
print(len(stopwords.words('english')))

179


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
print(len(x_Copy['text']))

83448


In [46]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus=[]
x_Copy=x.copy()

for i in range(0, len(x_Copy['text'])):
  if i==len(x_Copy['text'])//4 or i==len(x_Copy['text'])//2 or i==len(x_Copy['text']):
    print("almost done")

  review = re.sub('[^a-zA-Z]', ' ', x_Copy['text'][i])
  review = review.lower()
  review = review.split()

  review=[ps.stem(word) for word in review if not word in stopwords.words('english') and stopwords.words('french') and stopwords.words('spanish')]
  review = ' '.join(review)
  corpus.append(review)

almost done
almost done


**One hot representation**

In [48]:
onehot_repr=[one_hot(words,vocab_size)for words in corpus]

**Pad seqeuncing to make all the sentence of same length**

In [None]:
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sentenceLength)
print(embedded_docs)

**Creating the model with simple architecture including word embedding**

In [None]:
model=Sequential()
embedding_features=40
model.add(Embedding(vocab_size,embedding_features,input_length=sentenceLength)) #layer 1 for embedding
#stacked architecture for extraction of features from data
model.add(LSTM(100)) #layer 2 for capturing low level features
model.add(LSTM(50)) #layer 3 for capturing high level features
model.add(Dense(1, activation='sigmoid')) #last layer for doing final prediction

#using ADAM for better optimization
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


**Converting the dataset into a numpy array**

In [None]:
X_final=np.array(embedded_docs)
y_final=np.array(y)

**Splitting data into train and test**

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

**Training the model**

In [None]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

**Predictions**

In [None]:
y_pred=model.predict_classes(X_test)

**Accuracy score**

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)