In [1]:
!pip install pandas



In [2]:
import pandas as pd

In [3]:
# Load the spam dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./../dataset/spam.csv')

In [4]:
# Preview the first five rows to check the loaded columns.
df.head(5)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Pull the message texts and their category labels out as plain Python lists.
sentences, labels = (df[column].tolist() for column in ('Message', 'Category'))

In [7]:
# Number of examples used for training: the first 90% of the data (rounded down).
train_size = int(0.9 * len(sentences))

In [8]:
# Display the computed number of training examples.
train_size

5014

In [9]:
# Sequential split: the first `train_size` messages train the model,
# the remainder is held out.
training_sentences, testing_sentences = sentences[:train_size], sentences[train_size:]

In [10]:
# Split the labels at the same boundary as the sentences so they stay aligned.
training_labels, testing_labels = labels[:train_size], labels[train_size:]

In [11]:
import numpy as np

In [12]:
# Convert both label lists to NumPy arrays before handing them to the model.
training_labels_final, testing_labels_final = (
    np.array(split) for split in (training_labels, testing_labels)
)

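`vocab` is the vocabulary size: it caps how many distinct words (most frequent first) the tokenizer keeps and sets the input dimension of the embedding layer. Any other word is replaced by the out-of-vocabulary token.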

In [13]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
vocab, embedding_size, max_length = 500, 32, 50

# Sequences are both cut and padded at the end.
truncation_type = padding_type = 'post'

# Placeholder token for words outside the kept vocabulary.
out_of_vocab_token = '<OOV>'

### Tokenizer

In [16]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [17]:
# Tokenizer capped at `vocab` words, with a dedicated out-of-vocabulary token.
tokenizer = Tokenizer(
    oov_token=out_of_vocab_token,
    num_words=vocab,
)

In [19]:
# Build the vocabulary from the training split only.
tokenizer.fit_on_texts(texts=training_sentences)

In [20]:
# Word -> integer-id mapping exposed by the fitted tokenizer.
word_index = tokenizer.word_index

In [22]:
# word_index

In [23]:
# Encode each training message as a list of integer token ids.
sequences = tokenizer.texts_to_sequences(texts=training_sentences)

In [24]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [25]:
# Bring every training sequence to exactly `max_length` ids.
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    truncating=truncation_type,
    padding=padding_type,
)

In [26]:
# Invert the tokenizer's mapping so integer ids can be looked up as words.
reverse_word_index = {index: word for word, index in word_index.items()}

In [28]:
def decode_review(text, index_to_word=None):
    """Turn a sequence of token ids back into a space-separated string.

    Args:
        text: Iterable of integer token ids (e.g. one encoded message).
        index_to_word: Optional mapping from token id to word. When omitted,
            the notebook-level ``reverse_word_index`` is used, so existing
            one-argument calls keep working.

    Returns:
        The looked-up words joined by single spaces. Ids that are missing
        from the mapping are rendered as '?'.
    """
    if index_to_word is None:
        # Fall back to the mapping built earlier in the notebook.
        index_to_word = reverse_word_index
    return ' '.join(index_to_word.get(i, '?') for i in text)

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

In [30]:
# Start from an empty sequential model; layers are added in the next cell.
model = Sequential()

In [31]:
# Token ids -> dense vectors of size `embedding_size`.
model.add(
    Embedding(input_dim=vocab, output_dim=embedding_size, input_length=max_length)
)
# Recurrent layer with 20 units summarising the whole sequence.
model.add(LSTM(units=20))
# Single sigmoid unit for the binary output.
model.add(Dense(units=1, activation='sigmoid'))

In [32]:
from tensorflow.keras.optimizers import Adam

In [33]:
# Adam optimizer constructed with no arguments, i.e. library defaults.
adam = Adam()

In [34]:
# Binary cross-entropy loss for the single sigmoid output; track accuracy.
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [35]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 32)            16000     
                                                                 
 lstm (LSTM)                 (None, 20)                4240      
                                                                 
 dense (Dense)               (None, 1)                 21        
                                                                 
Total params: 20261 (79.14 KB)
Trainable params: 20261 (79.14 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [37]:
# Encode and pad the held-out messages with the tokenizer fitted on the
# training split, using the same length and padding settings.
testing_sequences = tokenizer.texts_to_sequences(texts=testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=truncation_type,
)

In [38]:
# Train for 10 epochs; the held-out split doubles as the validation set.
model.fit(
    x=padded,
    y=training_labels_final,
    epochs=10,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x28ae28e1630>

In [39]:
# Persist the trained model next to the notebook's project folders.
# NOTE(review): the recorded output shows a fragment pointing at
# `saving_api.save_model(` - presumably a warning about the legacy HDF5
# (.h5) format; confirm before changing the file name, since other code may
# load this exact path.
model.save('./../savedModels/model.h5')

  saving_api.save_model(


In [40]:
import pickle

In [41]:
# Store the fitted tokenizer alongside the model so the same word -> id
# mapping can be reused later.
with open('./../savedModels/tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, pickle.HIGHEST_PROTOCOL)