In [13]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Bidirectional

In [14]:
# Load the SMS spam dataset and derive a numeric target column.
csv_path = os.path.join(os.getcwd(), '../data/spam.csv')

# Read with latin-1, drop the three unnamed columns and give the two remaining
# columns descriptive names. Chaining (instead of inplace=True) keeps the cell
# a single expression that builds `df` from the file in one go.
df = (
    pd.read_csv(csv_path, delimiter=',', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Binary target: 1 for 'spam', 0 for every other label. The previous mapping
# was inverted (spam -> 0), which contradicted the column name.
df['spam'] = (df['label'] == 'spam').astype(int)
df.head()

Unnamed: 0,label,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [15]:
from sklearn.model_selection import train_test_split

# Hyper-parameters for tokenisation and the embedding layer.
VOCAB_SIZE = 1000     # passed to the Tokenizer as num_words
MAX_LEN = 150         # length every sequence is padded / truncated to
EMBEDDING_DIM = 64    # consumed by the Embedding layer in the model cell

# 80/20 split, stratified on the target column, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    list(df['text']),
    list(df['spam']),
    test_size=0.2,
    stratify=df['spam'],
    random_state=42,
)

# The vocabulary is fitted on the training texts only; "SPL" is the token
# used for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="SPL")
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences.
train_seq = tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)

# Bring every sequence to MAX_LEN; over-long sequences lose their tail.
train_pad = pad_sequences(train_seq, maxlen=MAX_LEN, truncating="post")
test_pad = pad_sequences(test_seq, maxlen=MAX_LEN, truncating="post")

# Convert the label lists returned by the split into NumPy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

In [21]:
# Build the classifier: token embeddings -> two stacked bidirectional LSTMs
# -> small dense head with dropout -> single sigmoid output.
inputs = Input(name='inputs', shape=(MAX_LEN,))
x = Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN)(inputs)
# The first recurrent layer returns the full sequence so the second one can
# consume it; the second returns only its final output.
x = Bidirectional(LSTM(units=64, return_sequences=True))(x)
x = Bidirectional(LSTM(units=64))(x)
x = Dense(32, activation="relu")(x)
x = Dropout(0.5)(x)
x = Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs=inputs, outputs=x)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# Stop when val_loss has not improved for 10 epochs and roll back to the
# weights of the best epoch.
early_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    mode="auto",
    restore_best_weights=True,
    verbose=1
)

model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

# `use_multiprocessing` was dropped: it only applies to generator / Sequence
# inputs, has no effect on in-memory NumPy arrays, and is not accepted by
# newer Keras versions of fit().
# NOTE(review): the test split doubles as the validation set that drives early
# stopping and best-weight restoration, so metrics later reported on
# (test_pad, y_test) will be optimistic. Consider a separate validation split.
history = model.fit(
    train_pad,
    y_train,
    epochs=30,
    batch_size=32,
    shuffle=True,
    validation_data=(test_pad, y_test),
    callbacks=[early_callback],
)

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 150)]             0         
                                                                 
 embedding_6 (Embedding)     (None, 150, 64)           64000     
                                                                 
 bidirectional_10 (Bidirecti  (None, 150, 128)         66048     
 onal)                                                           
                                                                 
 bidirectional_11 (Bidirecti  (None, 128)              98816     
 onal)                                                           
                                                                 
 dense_10 (Dense)            (None, 32)                4128      
                                                                 
 dropout_25 (Dropout)        (None, 32)                0   

In [27]:
# Persist everything needed to rebuild the classifier: the architecture
# (JSON), the fitted tokenizer (JSON) and the trained weights (HDF5).
save_path = os.path.join(os.getcwd(), '../checkpoints')

# Create the target directory up front; otherwise the first open() below
# raises FileNotFoundError on a fresh checkout.
os.makedirs(save_path, exist_ok=True)

# Model architecture. Written with an explicit UTF-8 encoding so the file does
# not depend on the platform default and matches how the tokenizer is saved.
json_config = model.to_json()
with open(os.path.join(save_path, 'model_config.json'), 'w', encoding='utf-8') as json_file:
    json_file.write(json_config)

# Fitted tokenizer, serialised to JSON.
tokenizer_json = tokenizer.to_json()
with open(os.path.join(save_path, 'tokenizer.json'), 'w', encoding='utf-8') as json_file:
    json_file.write(tokenizer_json)

# Trained weights.
model.save_weights(os.path.join(save_path, 'best_weights.h5'))