In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding

from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [2]:
spam = pd.read_csv("spam.csv")
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
labelEnc = LabelEncoder()
y = labelEnc.fit_transform(spam['Category'])
print(y)

[0 0 1 ... 0 0 0]


In [15]:
mensagens = spam['Message'].values
X_train, X_test, y_train, y_test = train_test_split(mensagens, y, test_size=0.3)

In [16]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(X_train)

X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

In [18]:
print(len(X_train))
print(X_train)

3900
[[215, 465, 39, 512, 98, 278, 728, 39, 21], [120, 8, 40, 5, 191, 61, 6, 253, 544, 631, 15, 632], [545, 271, 545, 74, 131, 19, 385, 18, 310, 444, 926, 17, 782, 12, 5, 203, 310, 848, 849, 27, 927], [356, 1, 272, 23, 689, 170, 445, 80], [546, 10, 10, 132, 10, 24, 9, 5, 296, 15, 11, 160, 46, 1, 783, 408, 209, 75, 18, 11, 7, 149, 784, 1, 178, 6, 16, 4, 236, 66, 785], [8, 155, 57, 283, 15, 445, 7, 8, 57], [179, 49, 21, 3, 166, 117, 21, 3, 111, 21, 3, 928, 2, 26, 3, 110, 15, 10, 21, 3, 322, 4, 57, 367, 21, 3, 342, 10, 26, 3, 58, 10], [729, 11, 23, 64, 18, 228, 90], [216, 386, 284, 9, 191, 343], [729, 90, 1, 64, 217, 12, 44, 513, 43, 229], [409, 42, 1, 178, 3, 1, 850, 3, 1, 79, 3], [49, 61, 8, 30, 446], [113, 600, 5, 486, 273, 15, 487, 196, 9, 15, 2, 447, 33, 44, 43, 96, 113, 88], [4, 466, 786, 8, 249, 12, 3, 98, 851, 114, 3, 52, 98, 97, 2, 323, 4, 786, 35, 17], [21, 3, 167, 49, 156, 448, 117, 21, 3], [467, 3, 23, 127, 46, 39, 852, 7, 39, 449, 29, 5, 15, 14, 853, 128, 3, 21, 57, 3, 120, 3

In [19]:
X_train = pad_sequences(X_train, padding='post', maxlen=500)
X_test = pad_sequences(X_test, padding='post', maxlen=500)

In [20]:
print(X_train.shape)
print(X_train)

(3900, 500)
[[215 465  39 ...   0   0   0]
 [120   8  40 ...   0   0   0]
 [545 271 545 ...   0   0   0]
 ...
 [ 92  32  93 ...   0   0   0]
 [ 98   6 459 ...   0   0   0]
 [116  73 149 ...   0   0   0]]


In [34]:
model = Sequential()
model.add(Embedding(input_dim=len(token.word_index),output_dim=50, input_length=500))
model.add(Flatten())

model.add(Dense(units=10,activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=1,activation='sigmoid'))

In [35]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model.summary()

In [36]:
model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=True, validation_data=(X_test, y_test))

Epoch 1/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8864 - loss: 0.0971 - val_accuracy: 0.9791 - val_loss: 0.0206
Epoch 2/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9813 - loss: 0.0180 - val_accuracy: 0.9827 - val_loss: 0.0172
Epoch 3/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9901 - loss: 0.0103 - val_accuracy: 0.9731 - val_loss: 0.0227
Epoch 4/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9870 - loss: 0.0123 - val_accuracy: 0.9856 - val_loss: 0.0125
Epoch 5/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9970 - loss: 0.0053 - val_accuracy: 0.9862 - val_loss: 0.0135
Epoch 6/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9958 - loss: 0.0070 - val_accuracy: 0.9839 - val_loss: 0.0142
Epoch 7/50
[1m390/390[0m 

<keras.src.callbacks.history.History at 0x155d569bef0>

In [37]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss:",loss)
print("Accuracy:",accuracy)

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9886 - loss: 0.0113   
Loss: 0.014131169766187668
Accuracy: 0.9856459498405457


In [38]:
new_prediction = model.predict(X_test)
print(new_prediction)

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[[9.9996871e-01]
 [2.5771459e-14]
 [1.0859236e-17]
 ...
 [1.0000000e+00]
 [6.0432341e-29]
 [1.0783378e-22]]


In [39]:
prev = (new_prediction> 0.5)
print(prev)

[[ True]
 [False]
 [False]
 ...
 [ True]
 [False]
 [False]]


In [40]:
cm = confusion_matrix(y_test, prev)
print(cm)

[[1438    3]
 [  21  210]]
