In [None]:
import os
import pathlib
import pandas as pd
import pickle
import random
import numpy as np
import pickle

In [None]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Filesystem layout: every input and artefact path is derived from EXPORT_DIR.
# NOTE(review): '/content/' looks like the Google Colab working directory - confirm
# before running this notebook elsewhere.
EXPORT_DIR = pathlib.Path('/content/')
# Semicolon-separated source dataset read by the loading cell.
DATASET_CSV_PATH = EXPORT_DIR.joinpath('trainspam.csv')
# Pickle file holding the preprocessed train/test split and the fitted tokenizer.
TRAINING_DATA_PATH = EXPORT_DIR.joinpath('spam-training-data.pkl')

In [None]:
# Parse the ';'-delimited dataset; rows that cannot be parsed are skipped
# instead of aborting the load.
df = pd.read_csv(
    DATASET_CSV_PATH,
    sep=';',
    on_bad_lines='skip',
)
# Show the first rows as the cell output for a quick sanity check.
df.head()

In [None]:
# Targets: the raw values of the 'cate' column as a plain Python list.
labels = df['cate'].tolist()
# Inputs: the 'text' column coerced to str so every entry can be tokenised
# (missing values therefore become their string representation).
text_column = df['text'].astype(str)
texts = text_column.tolist()

In [None]:
# Class name -> integer id used for training.
labels_legend = {
    'False': 0,
    'True': 1,
}
# Reverse lookup keyed by the id rendered as a string ('0', '1') -> class name.
labels_legend_inverted = {
    str(class_id): class_name
    for class_name, class_id in labels_legend.items()
}

In [None]:
# Encode each raw label as its integer class id. Labels are stringified first
# because the legend is keyed by 'False' / 'True'; any other value raises KeyError.
labels_as_int = [labels_legend[label_key] for label_key in map(str, labels)]
# Echo the id -> name mapping for reference.
print(labels_legend_inverted)

In [None]:
# Preprocessing hyperparameters.
MAX_NUM_WORDS = 280          # vocabulary cap handed to the Tokenizer
MAX_SEQUENCE_LENGTH = 280    # every padded sequence has exactly this many ids

# Fit the vocabulary on the corpus and turn each text into a list of word ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Bring every sequence to MAX_SEQUENCE_LENGTH so the samples form one 2-D array.
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the class ids. The width is pinned to the number of classes in
# the legend: left to infer it, to_categorical would use max(label) + 1, which
# yields a single column (and a shape mismatch with the 2-unit softmax head)
# whenever the loaded data happens to contain no class-1 rows.
y = to_categorical(np.asarray(labels_as_int), num_classes=len(labels_legend))

# Hold out 33% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Bundle everything the training half of the notebook needs into one object so
# it can be pickled and reloaded: the split arrays, the preprocessing limits,
# both label mappings and the fitted tokenizer.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_sequence=MAX_SEQUENCE_LENGTH,
    legend=labels_legend,
    labels_legend_inverted=labels_legend_inverted,
    tokenizer=tokenizer,
)

In [None]:
# Serialise the preprocessing bundle to disk, overwriting any previous file.
with TRAINING_DATA_PATH.open('wb') as pickle_out:
    pickle.dump(training_data, pickle_out)

In [None]:
# Start from an empty mapping, then replace it with the bundle read back from
# the pickle written by the preceding cell.
# NOTE: unpickling can execute arbitrary code - only load files produced by
# this notebook or another trusted source.
data = {}

with TRAINING_DATA_PATH.open('rb') as pickle_in:
    data = pickle.load(pickle_in)

In [None]:
# Rebind the working variables from the reloaded bundle so the cells below
# depend on the pickle contents.
X_train, X_test = data['X_train'], data['X_test']
y_train, y_test = data['y_train'], data['y_test']

# Label mappings in both directions.
legend = data['legend']
labels_legend_inverted = data['labels_legend_inverted']

# Preprocessing limits and the fitted tokenizer.
max_words = data['max_words']
max_sequence = data['max_sequence']
tokenizer = data['tokenizer']

# Quick look at the padded training matrix.
print(X_train)

In [None]:
# Network hyperparameters.
embed_dim = 128   # size of each learned word vector
lstm_out = 196    # number of LSTM units

# Embedding -> spatial dropout -> LSTM -> 2-way softmax classifier.
# The vocabulary size comes from the reloaded bundle (`max_words`), which holds
# the same value the tokenizer was created with, so this cell relies only on
# what was unpacked from the pickle.
model = Sequential([
    Embedding(max_words, embed_dim, input_length=X_train.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3),
    Dense(2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after the table.
model.summary()

In [None]:
# Callback that writes weights only (not the full model) to 'model_weights.h5',
# overwriting the file only when the monitored validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    'model_weights.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# Training schedule.
batch_size = 32
epochs = 5

# Train the network; the held-out split doubles as validation data, and its
# loss is what the checkpoint callback monitors.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint],
)

In [None]:
# Export the trained model twice under EXPORT_DIR: first to a '.h5' file, then
# to an extension-less 'spam-model' path. MODEL_EXPORT_PATH ends up pointing at
# the second target.
for export_name in ('spam-model.h5', 'spam-model'):
    MODEL_EXPORT_PATH = EXPORT_DIR / export_name
    model.save(str(MODEL_EXPORT_PATH))

In [None]:
# Rebuild the same architecture as the trained network, then load the weights
# saved by the checkpoint callback instead of the final-epoch weights.
static_model = Sequential([
    Embedding(MAX_NUM_WORDS, embed_dim, input_length=X_train.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3),
    Dense(2, activation='softmax'),
])
static_model.load_weights('model_weights.h5')
static_model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

# Save the checkpoint-weighted model; note this path is relative to the current
# working directory rather than built from EXPORT_DIR.
static_model.save('spam-model')