In [13]:
import csv
import tensorflow as tf
import numpy as np
import nltk
# Fetch the NLTK 'stopwords' corpus so the import on the next line can load it
nltk.download('stopwords')
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Dataset download, kept commented out: it would save spam.csv to /tmp/spam.csv,
# which is the path the CSV-reading cell opens later in this notebook.
# !wget --no-check-certificate \
    # https://storage.googleapis.com/kagglesdsdata/datasets/483/982/spam.csv \
    # -O /tmp/spam.csv

# Record the TensorFlow version this notebook was run with
print(tf.__version__)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2.1.0


In [0]:
# pip install tensorflow --upgrade

In [0]:
# Hyper-parameters and preprocessing settings used throughout the notebook
vocab_size = 1000         # Vocabulary cap: Tokenizer num_words and Embedding input size
embedding_dim = 16        # Width of each learned word vector in the Embedding layer
max_length = 120          # Every sequence is padded/cut to this many tokens
trunc_type = 'post'       # Side on which over-long sequences should be cut (pad_sequences `truncating`)
padding_type = 'post'     # Side on which short sequences are zero-padded (pad_sequences `padding`)
oov_tok = "<OOV>"         # Placeholder token for words outside the vocabulary
training_portion = 0.8    # Fraction of the data used for training (0.8 means 80%)

In [16]:
# Containers filled by the CSV-reading cell: one message text and one label per row
sentences = []
labels = []

# Build the stop-word set from NLTK's English list (this replaces an earlier
# hand-written list). The corpus is reached through `nltk.corpus.stopwords`
# instead of the imported name `stopwords`, because this cell rebinds `stopwords`
# to a plain set: going through the module keeps the cell re-runnable, whereas
# `stopwords.words(...)` would raise AttributeError on a second execution.
stopwords = set(nltk.corpus.stopwords.words('english'))
print(len(stopwords))              # Number of stop words that get stripped from every message
# The recorded run printed 179; the exact count depends on the installed NLTK data.

179


In [17]:
# Start from empty lists so re-running this cell never appends duplicate rows
sentences = []
labels = []

# Pre-build the " word " patterns once. Sorting fixes the replacement order:
# iterating the set directly gives a different order per kernel session, and the
# chained replaces below are order-dependent, so results would not be reproducible.
stopword_tokens = [" " + word + " " for word in sorted(stopwords)]

# newline='' lets the csv module handle line breaks inside quoted fields itself
with open("/tmp/spam.csv", 'r', encoding='ISO 8859-1', newline='') as csvfile: # Read csv file and pass through stopwords
    reader = csv.reader(csvfile, delimiter=',')
    next(reader, None)                 # Skip the header row (tolerates an empty file)
    for row in reader:
        if len(row) < 2:               # Ignore blank or malformed rows lacking label/text
            continue
        labels.append(row[0])          # Column 0 holds the label
        sentence = row[1]              # Column 1 holds the message text
        # Drop every stop word that appears surrounded by single spaces
        for token in stopword_tokens:
            sentence = sentence.replace(token, " ")
        sentences.append(sentence)

print(len(labels))
print(len(sentences))
print(sentences[0])
# The raw sentences[0] in the csv is  -  Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
# The printed version differs slightly because stop words have been stripped from it

5572
5572
Go jurong point, crazy.. Available bugis n great world la e buffet... Cine got amore wat...


In [18]:
# Sequential split: the first `training_portion` of the rows is used for training,
# the remainder for validation (no shuffling is applied).
train_size = int(training_portion * len(sentences))

train_sentences, validation_sentences = sentences[:train_size], sentences[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]

# Sanity check: the split point followed by the size of each of the four pieces
print(train_size)
print(
    len(train_sentences),
    len(train_labels),
    len(validation_sentences),
    len(validation_labels),
    sep="\n",
)

4457
4457
4457
1115
1115


In [0]:
# Fit the vocabulary on the training split only; words outside it map to `oov_tok`
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Convert each training message to integer ids, then pad/cut to `max_length`.
# `truncating` is passed explicitly so the configured `trunc_type` takes effect;
# without it pad_sequences falls back to its default of cutting from the front ('pre').
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)

In [0]:
# Encode the validation messages with the tokenizer fitted on the training split,
# then pad/cut to `max_length`. `truncating` is passed explicitly so the configured
# `trunc_type` is honoured instead of pad_sequences' default ('pre').
validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(validation_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)

In [0]:
# A second Tokenizer is used purely as a label encoder: it is fitted on every label
# string and assigns an integer id to each distinct label.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

# Encode both splits the same way and wrap the results in NumPy arrays for Keras
training_label_seq, validation_label_seq = (
    np.array(label_tokenizer.texts_to_sequences(split_labels))
    for split_labels in (train_labels, validation_labels)
)

In [22]:
# Build the classifier: word embedding -> average over the sequence -> two dense layers.
# This is a feed-forward bag-of-embeddings model; it contains no recurrent layer.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
# Softmax over 6 units; the class ids come from label_tokenizer.
# NOTE(review): the dataset appears to have only two labels, so most units are presumably unused - confirm.
model.add(tf.keras.layers.Dense(6, activation='softmax'))

# Integer class ids are used as targets, hence the sparse categorical loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           16000     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 150       
Total params: 16,558
Trainable params: 16,558
Non-trainable params: 0
_________________________________________________________________


In [23]:
num_epochs = 50  # Number of full passes over the training data

# Train on the padded training sequences and evaluate on the validation split
# after every epoch; verbose=2 prints one summary line per epoch.
history = model.fit(
    train_padded,
    training_label_seq,
    epochs=num_epochs,
    validation_data=(validation_padded, validation_label_seq),
    verbose=2,
)

Train on 4457 samples, validate on 1115 samples
Epoch 1/50
4457/4457 - 1s - loss: 1.1466 - accuracy: 0.8185 - val_loss: 0.4470 - val_accuracy: 0.8700
Epoch 2/50
4457/4457 - 1s - loss: 0.4041 - accuracy: 0.8649 - val_loss: 0.3700 - val_accuracy: 0.8700
Epoch 3/50
4457/4457 - 1s - loss: 0.3697 - accuracy: 0.8649 - val_loss: 0.3495 - val_accuracy: 0.8700
Epoch 4/50
4457/4457 - 1s - loss: 0.3491 - accuracy: 0.8649 - val_loss: 0.3266 - val_accuracy: 0.8700
Epoch 5/50
4457/4457 - 1s - loss: 0.3204 - accuracy: 0.8647 - val_loss: 0.2902 - val_accuracy: 0.8691
Epoch 6/50
4457/4457 - 1s - loss: 0.2648 - accuracy: 0.8687 - val_loss: 0.2190 - val_accuracy: 0.8933
Epoch 7/50
4457/4457 - 1s - loss: 0.1885 - accuracy: 0.9228 - val_loss: 0.1469 - val_accuracy: 0.9570
Epoch 8/50
4457/4457 - 1s - loss: 0.1313 - accuracy: 0.9605 - val_loss: 0.1079 - val_accuracy: 0.9695
Epoch 9/50
4457/4457 - 1s - loss: 0.0995 - accuracy: 0.9684 - val_loss: 0.0906 - val_accuracy: 0.9731
Epoch 10/50
4457/4457 - 1s - loss:

In [24]:
# Two hand-picked messages used to sanity-check the trained model
test_sentences = ["WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.",
                  "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"]

# Strip stop words (those surrounded by single spaces) as is done for the CSV data.
# The cleaned text is collected in a new list: rebinding the loop variable alone
# would discard the result and leave the messages unfiltered.
# Sorted order keeps the order-dependent chained replaces reproducible.
stopword_tokens = [" " + word + " " for word in sorted(stopwords)]
cleaned_test_sentences = []
for sentence in test_sentences:
    for token in stopword_tokens:
        sentence = sentence.replace(token, " ")
    cleaned_test_sentences.append(sentence)

# Encode with the tokenizer fitted on the training data, then pad/cut to `max_length`
test_sequences = tokenizer.texts_to_sequences(cleaned_test_sentences)
test_padded = pad_sequences(test_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)

# Class id = index of the largest softmax output. This is equivalent to
# Sequential.predict_classes, which is deprecated and absent from newer TF 2.x releases.
# The ids follow label_tokenizer.word_index.
predict_mail = np.argmax(model.predict(test_padded), axis=-1)
print(predict_mail)

[2 1]
