In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# NN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout,Embedding, Flatten

# preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

In [2]:
df = pd.read_csv("spam (1).csv")
df.head()

Unnamed: 0,Category,Message
0,not spam,"Go until jurong point, crazy.. Available only ..."
1,not spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,not spam,U dun say so early hor... U c already then say...
4,not spam,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.shape

(5572, 2)

In [4]:
# feature target split
X = df["Message"]
y = df["Category"]

In [5]:
le = LabelEncoder()
y = le.fit_transform(y)
y

array([0, 0, 1, ..., 0, 0, 0])

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [7]:
# Keras tokenization

tok = Tokenizer()
tok.fit_on_texts(X_train)

In [8]:
vocabulary = tok.index_word
print(vocabulary)



In [9]:
vocab_length = len(vocabulary)
vocab_length

7382

In [10]:
# Sequence
train_sequence = tok.texts_to_sequences(X_train)
print(train_sequence)

[[53, 21, 13, 111, 500, 1013], [918, 5, 1123, 3493, 1295, 301, 74, 13, 3494], [1124, 58, 42, 1, 26, 1125, 72, 7, 188, 43, 36, 51, 1296, 97, 60, 159, 115, 52, 39, 850], [44, 103, 184, 1126, 2, 3, 129], [784, 57, 2, 98, 3, 11, 3495, 3496, 1297, 16, 10, 3497, 38, 2346, 199, 2, 82, 423, 16, 1127, 2347, 424], [1014, 1015, 13, 1816, 3498, 42, 1, 81, 14], [8, 14, 13, 1817, 2348, 18, 400, 9, 5, 3499, 28, 2349], [1509, 851, 20, 3500, 115, 1128, 113, 9, 1510, 3501, 55, 229, 3502, 19, 111, 1298, 524, 24, 1, 63, 3503, 102, 263, 2350, 6, 1016, 30, 595, 244, 11, 140, 3504, 3505, 20, 1017], [3506, 32, 1299, 67, 321, 3507, 1300, 200, 852, 2, 13, 107, 401, 5, 3508, 1511, 711, 356, 79, 3509, 2, 2351, 22, 712, 1301, 1818, 370, 257, 130], [44, 54, 6, 98, 10, 132, 60, 6, 245, 90, 84], [156, 218, 81, 218, 67, 218, 66, 218], [425, 346, 7, 3510, 178, 126, 31, 3511, 3512, 14, 63, 133], [138, 402, 1, 81, 2, 2352, 561, 13, 1819], [104, 104, 36, 73, 2, 2353, 39, 113, 2, 86, 11, 919, 1, 145, 920, 5, 1820, 72, 596,

In [11]:
# Length of all documents
doc_length = []
for doc in train_sequence:
  doc_length.append(len(doc))

In [12]:
print(doc_length)

[6, 9, 20, 7, 22, 9, 12, 35, 29, 11, 8, 12, 9, 32, 29, 17, 4, 24, 6, 13, 7, 13, 11, 12, 13, 11, 27, 32, 6, 21, 23, 11, 9, 5, 13, 18, 26, 9, 22, 7, 14, 10, 6, 14, 12, 6, 22, 5, 10, 9, 8, 8, 5, 16, 8, 6, 12, 31, 14, 27, 26, 9, 18, 29, 9, 59, 32, 10, 7, 31, 5, 6, 34, 6, 5, 15, 5, 7, 50, 9, 28, 10, 23, 8, 27, 24, 6, 22, 14, 6, 6, 23, 19, 23, 21, 13, 10, 8, 49, 8, 22, 27, 14, 15, 26, 15, 9, 9, 33, 14, 6, 16, 5, 14, 8, 17, 26, 13, 16, 35, 30, 29, 5, 4, 28, 12, 20, 11, 8, 4, 27, 5, 26, 12, 10, 11, 7, 17, 48, 9, 13, 9, 5, 24, 6, 20, 6, 22, 6, 5, 24, 10, 19, 8, 7, 5, 10, 9, 11, 9, 19, 27, 5, 7, 39, 9, 7, 23, 26, 8, 5, 17, 10, 32, 20, 14, 1, 26, 16, 9, 29, 30, 31, 9, 14, 22, 16, 7, 4, 8, 8, 7, 12, 15, 29, 9, 21, 10, 17, 16, 18, 7, 32, 7, 22, 6, 17, 7, 9, 21, 29, 18, 5, 10, 8, 22, 10, 23, 33, 3, 30, 10, 8, 22, 9, 14, 4, 11, 25, 18, 15, 8, 18, 25, 8, 14, 24, 7, 24, 22, 26, 28, 7, 24, 4, 7, 9, 25, 2, 23, 7, 17, 17, 17, 24, 12, 6, 6, 34, 13, 29, 27, 17, 28, 65, 7, 26, 15, 15, 18, 31, 6, 12, 10, 6, 2

In [13]:
max(doc_length)

189

In [14]:
# 95% quantile
# 95% document length is less than or equal to 33
np.quantile(doc_length, 0.95)

33.0

In [15]:
# 99% quantile
# 99% document length is less than or equal to 51
np.quantile(doc_length, 0.99)

51.00999999999976

In [16]:
max_length = 51

In [17]:
# Padding
train_matrix = sequence.pad_sequences(train_sequence,maxlen=max_length)
train_matrix

array([[   0,    0,    0, ...,  111,  500, 1013],
       [   0,    0,    0, ...,   74,   13, 3494],
       [   0,    0,    0, ...,   52,   39,  850],
       ...,
       [   0,    0,    0, ...,  121,  741, 7381],
       [   0,    0,    0, ..., 1790, 7382, 1919],
       [   0,    0,    0, ...,  267,   31,   10]])

In [18]:
# Testing data
test_sequence = tok.texts_to_sequences(X_test)
test_matrix = sequence.pad_sequences(test_sequence,maxlen=max_length)
test_matrix

array([[   0,    0,    0, ...,   72,    5,  719],
       [   0,    0,    0, ...,  142,   10, 1592],
       [   0,    0,    0, ..., 5282, 2962,   69],
       ...,
       [   0,    0,    0, ...,    0,  205, 1753],
       [   0,    0,    0, ...,  171,   12,    5],
       [   0,    0,    0, ...,   78,   16,   90]])

In [19]:
# model
model = Sequential()
model.add(Embedding(input_dim=vocab_length+1, # vocabulary length, +1 -> 0 tokens
                    input_length=max_length,  # max length of document -> 51
                    output_dim=100,           # hyperparameter -> vector length of each token
                    mask_zero=True))          # do not learn for 0 token
model.add(Flatten())
model.add(Dense(32,activation="relu"))
model.add(Dense(16,activation="relu"))
model.add(Dense(1,activation="sigmoid"))



In [20]:
model.compile(optimizer="adam",loss="binary_crossentropy")
model.fit(train_matrix,y_train,epochs=20,batch_size=32)

Epoch 1/20




[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 0.3240
Epoch 2/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - loss: 0.0888
Epoch 3/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.0089
Epoch 4/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 0.0040
Epoch 5/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 2.8130e-04
Epoch 6/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 1.3150e-04
Epoch 7/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 7.8068e-05
Epoch 8/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 5.7056e-05
Epoch 9/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 6.0695e-05
Epoch 10/20
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[

<keras.src.callbacks.history.History at 0x2c520453620>

In [21]:
# prediction
y_pred = model.predict(test_matrix)
y_pred = np.where(y_pred >= 0.5, 1, 0)
print(classification_report(y_test,y_pred))

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1442
           1       0.99      0.92      0.95       230

    accuracy                           0.99      1672
   macro avg       0.99      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672



In [22]:
sms1 = "Hey how are you ? let's catchup"
sms2 = "FREE FREE, claim your prize worth $20000 and click on the following link http:fake.com"

In [23]:
def predict_sms(sms):
  data_seq = tok.texts_to_sequences([sms])
  data_matrix = sequence.pad_sequences(data_seq,maxlen=max_length)
  y_pred = model.predict(data_matrix)
  y_pred = np.where(y_pred >= 0.5, 1, 0)
  output = le.inverse_transform(y_pred[0])[0]
  return output

In [24]:
predict_sms(sms1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step


'not spam'

In [25]:
predict_sms(sms2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step


'spam'