<a href="https://colab.research.google.com/github/kebtes/Agent-Ivy/blob/main/spam_model/SpamDetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
# !unzip smsspamcollection.zip

import urllib.request
import zipfile
import os

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
filename = "smsspamcollection.zip"

if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)
    print("Downloaded.")

# Unzip the dataset
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall("data")
    print("Unzipped into ./data/")

Unzipped into ./data/


In [20]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [21]:
df = pd.read_csv('/content/data/SMSSpamCollection', sep='\t', header=None, names=['label', 'text'])

df['label'] = df['label'].map({'ham': 0, 'spam': 1})

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [22]:
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [23]:
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

In [24]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=64, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])



In [25]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_split=0.1)

loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

Epoch 1/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 124ms/step - accuracy: 0.8723 - loss: 0.3443 - val_accuracy: 0.9664 - val_loss: 0.0946
Epoch 2/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 121ms/step - accuracy: 0.9901 - loss: 0.0488 - val_accuracy: 0.9753 - val_loss: 0.1077
Epoch 3/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 144ms/step - accuracy: 0.9933 - loss: 0.0284 - val_accuracy: 0.9798 - val_loss: 0.1018
Epoch 4/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 121ms/step - accuracy: 0.9984 - loss: 0.0068 - val_accuracy: 0.9776 - val_loss: 0.1094
Epoch 5/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 129ms/step - accuracy: 0.9999 - loss: 0.0015 - val_accuracy: 0.9776 - val_loss: 0.1373
Epoch 6/10
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 134ms/step - accuracy: 0.9985 - loss: 0.0060 - val_accuracy: 0.9709 - val_loss: 0.1275
Epoch 7/10

In [29]:
# export datas
model.save("SpamDetectorModel.keras")

import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
