In [1]:
## Table of Contents

# 1. Import Libraries 
# 2. Import dataset
# 3. Check for missing values
# 4. Feature selection and split data 
# 5. Encode and tokenize selected columns 
# 6. Build model 
# 7. Train model
# 8. Evaluation and prediction

In [2]:
# import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import utils
from tensorflow.keras.layers import Dense, Activation, Dropout, Embedding
from sklearn import preprocessing

In [3]:
# Load the SMS dataset (SMS.csv is resolved relative to the working directory).
# NOTE(review): 'unicode_escape' also interprets backslash sequences inside the
# text; if the file is plain Latin-1, encoding='latin-1' may be the intended
# codec -- confirm against the source file before changing it.
sms_data = pd.read_csv(
    'SMS.csv',
    encoding='unicode_escape',
)

# Bare last expression: render the frame as the cell output.
sms_data

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam
...,...,...,...
952,953,hows my favourite person today? r u workin har...,Non-Spam
953,954,How much you got for cleaning,Non-Spam
954,955,Sorry da. I gone mad so many pending works wha...,Non-Spam
955,956,Wat time ü finish?,Non-Spam


In [4]:
# Count missing entries per column (a column with 0 has no gaps).
sms_data.isnull().sum(axis=0)

S. No.          0
Message_body    0
Label           0
dtype: int64

In [5]:
# Train/test split, part 1: work out the size of each subset.
# int() truncates, so any fractional row ends up in the test subset.
train_size = int(0.8 * len(sms_data))
print('Train:', train_size)
print('Test:', len(sms_data) - train_size)

Train: 765
Test: 192


In [6]:
# Train/test split, part 2: slice the columns by position.
# The first `train_size` rows are used for training and the remaining rows for
# testing; the rows are taken in file order (no shuffling happens here).
train_mb = sms_data['Message_body'].iloc[:train_size]
test_mb = sms_data['Message_body'].iloc[train_size:]

train_label = sms_data['Label'].iloc[:train_size]
test_label = sms_data['Label'].iloc[train_size:]

In [7]:
# Map the string labels in sms_data['Label'] to integer class ids.
# The encoder learns its classes from the training labels only and is then
# applied to both subsets. The name `endcoder` (sic) is kept because the
# prediction cell reads `endcoder.classes_`.
endcoder = preprocessing.LabelEncoder().fit(train_label)

train_label2 = endcoder.transform(train_label)
test_label2 = endcoder.transform(test_label)

In [8]:
# Convert the integer-encoded sms_data['Label'] values to one-hot vectors.
#
# The class count comes from the fitted encoder, and the integer ids are
# re-derived from the raw label columns instead of being read from
# train_label2/test_label2. The cell therefore no longer consumes its own
# output: re-running it yields the same arrays, whereas the previous version
# would one-hot encode already one-hot data on a second run.
num_c = len(endcoder.classes_)
train_label2 = utils.to_categorical(endcoder.transform(train_label), num_c)
test_label2 = utils.to_categorical(endcoder.transform(test_label), num_c)

In [9]:
# Turn each message in sms_data['Message_body'] into a fixed-width vector.
# The vocabulary is capped with num_words=max_words and is learned from the
# training messages only; the test messages reuse that fitted vocabulary.
max_words = 1000

# NOTE(review): the out-of-vocabulary token is spelled with zeros ("<00V>"),
# not the letters "OOV"; kept as-is here.
tokenizer = Tokenizer(oov_token="<00V>", num_words=max_words)
tokenizer.fit_on_texts(train_mb)

# 'binary' is the default mode: a 1 marks that a word occurs in the message.
train_mb2 = tokenizer.texts_to_matrix(train_mb, mode='binary')
test_mb2 = tokenizer.texts_to_matrix(test_mb, mode='binary')

In [10]:
# build model
# Fix TensorFlow's global seed before any layer is created so that weight
# initialisation, dropout masks and fit-time shuffling are repeatable across
# "Restart & Run All" (as far as the global seed controls them).
tf.random.set_seed(42)

# Bag-of-words input (max_words features) -> 100 ReLU units -> dropout ->
# num_c-way softmax over the label classes.
model = tf.keras.Sequential()
model.add(Dense(100, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_c))
model.add(Activation('softmax'))

# The targets are one-hot vectors and the output is a softmax, so the matching
# loss is categorical cross-entropy. With 'binary_crossentropy' Keras treats
# each output unit as an independent binary problem and resolves 'accuracy'
# to binary accuracy, which only coincides with real accuracy when there are
# exactly two classes.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [11]:
# Fit the network on the training matrix and its one-hot labels.
# validation_split=0.1 holds back 10% of the training rows for validation, and
# the returned History object is kept in `history`.
epochs = 7

history = model.fit(
    x=train_mb2,
    y=train_label2,
    epochs=epochs,
    validation_split=0.1,
    verbose=1,
)


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [12]:
# Score the model on the held-out test subset.
# With the single 'accuracy' metric set at compile time, evaluate() returns
# [loss, accuracy], so index 1 is the accuracy.
score = model.evaluate(x=test_mb2, y=test_label2, verbose=1)
print('Accuracy score of:', score[1])

Accuracy score of: 0.9739583134651184


In [13]:
# prediction
# Print the message, actual label and predicted label for the first few test
# samples.
text_labels = endcoder.classes_

# Show at most 20 samples; min() keeps the loop inside the test subset when it
# holds fewer rows than that (the fixed range(20) raised IndexError there).
n_show = min(20, len(test_mb))

if n_show:
    # One batched forward pass instead of one model.predict call per message.
    pred = model.predict(test_mb2[:n_show])
    # Index of the highest-probability class for every row.
    pred_idx = np.argmax(pred, axis=1)

    for x in range(n_show):
        prediction = text_labels[pred_idx[x]]
        print(test_mb.iloc[x][:60], '...')
        print('Actual Label:', test_label.iloc[x][:20])
        print('Predicted Label:', prediction + '\n')

The guy at the car shop who was flirting with me got my phon ...
Actual Label: Non-Spam
Predicted Label: Non-Spam

its cool but tyler had to take off so we're gonna buy for hi ...
Actual Label: Non-Spam
Predicted Label: Non-Spam

I dont know why she.s not getting your messages ...
Actual Label: Non-Spam
Predicted Label: Non-Spam

Just come home. I don't want u to be miserable ...
Actual Label: Non-Spam
Predicted Label: Non-Spam

Santa calling! Would your little ones like a call from Santa ...
Actual Label: Spam
Predicted Label: Spam

URGENT This is our 2nd attempt to contact U. Your £900 prize ...
Actual Label: Spam
Predicted Label: Spam

Saw Guys and Dolls last night with Patrick Swayze it was gre ...
Actual Label: Non-Spam
Predicted Label: Non-Spam

Sorry that was my uncle. I.ll keep in touch ...
Actual Label: Non-Spam
Predicted Label: Non-Spam

Or u ask they all if next sat can a not. If all of them can  ...
Actual Label: Non-Spam
Predicted Label: Non-Spam

Gd luck 4 ur exams :-) ..