In [2]:
#Since I created this notebook in google colaboratory, I mounted my google drive to access the dataset.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [272]:
import pandas as pd

data = pd.read_csv('/content/drive/My Drive/SMS_Spam_Dataset/spam.csv',encoding='Windows-1252', usecols=['v1','v2'])

In [320]:
import numpy as np

#Get a count of each label to see if there is a bias towards one.
print("Spam samples: " + str(np.count_nonzero(data.values[:,0] == 'spam', axis=0)))
print("Ham samples: " + str(np.count_nonzero(data.values[:,0] == 'ham', axis=0)))

Spam samples: 747
Ham samples: 4825


In [296]:
# As seem above, the dataset provides far more ham examples than spam examples. This will result in our model being biased towards predicting ham.
# To fix this, we will use only as many ham samples as there are spam samples.

x_data = []
y_data = []
count=0
j=0
while j<(len(data.values[:,0])):
  if data.values[j,0] == 'spam':
    y_data.append(1)
    x_data.append(data.values[j,1])
  if data.values[j,0] == 'ham':
    if count<747:
      y_data.append(0)
      x_data.append(data.values[j,1])
      count+=1
  j+=1

In [297]:
#Confirm equal number of ham and spam samples
print(y_data.count(0))
print(y_data.count(1))

747
747


In [322]:
#Below is code to gather a list of all unique words used throughout the samples.
#This will allow us to convert each message into a one hot encoded vector.

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import nltk
nltk.download('punkt')
from nltk.corpus import words as allwords
from string import punctuation
from nltk.stem.lancaster import LancasterStemmer

st=LancasterStemmer()
customStopWords=set(stopwords.words('english')+list(punctuation)+["’","”","“","",".","..","...","``","**"])

#Tokenize words in every message.
words=[word_tokenize(i) for i in x_data]

#Tokenization produced a list of tokenized words for each message, making a list of lists. This flattens the list.
words=[item for sublist in words for item in sublist]

#Remove all stopwords
wordsWOStopwords=[word for word in words if word not in customStopWords]

#Stem each word
stemmedWords=[st.stem(word) for word in wordsWOStopwords]

wordlist=list(set(stemmedWords))

print("Number of unique words: " + str(len(wordlist)))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Number of unique words: 4173


In [299]:
#Convert messages into one hot encoded vectors using the list of words.
X_data = []
for j in range(len(x_data)):
  Input=[]
  for i in range(len(wordlist)):
      Input.append(x_data[j].count(wordlist[i]))
  X_data.append(np.array(Input))

X_data = np.array(X_data)
Y_data = np.array(y_data)

In [None]:
#Split the data into test and train data.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.33, random_state=42)

In [310]:
#We now can define a simple neural network that takes the one hot encoded vector as input.
#It has a single output, which will ideally be either 0 or 1 for our binary classification.
from keras.models import Sequential
from keras import layers

input_dim = len(wordlist)

model = Sequential()
model.add(layers.Dense(128, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(16, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='relu'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_50"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_126 (Dense)            (None, 128)               534272    
_________________________________________________________________
dense_127 (Dense)            (None, 16)                2064      
_________________________________________________________________
dense_128 (Dense)            (None, 1)                 17        
Total params: 536,353
Trainable params: 536,353
Non-trainable params: 0
_________________________________________________________________


In [311]:
history = model.fit(X_train,y_train,epochs=7,verbose=True,batch_size=64)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [313]:
#Make a function that takes in a string message and predicts whether it is spam or not.
def is_spam(message):
  Input=[]
  for i in range(len(wordlist)):
    Input.append(message.count(wordlist[i]))
  output=model.predict([Input])
  if output>0.1: #Since the model does not output only 0 and 1, this threshold is used to determine what counts as a 1 and what counts as a 0.
    return True
  else:
    return False

In [314]:
model.evaluate(X_test,y_test)



[0.14739477634429932, 0.9655870199203491]

In [315]:
is_spam('We have detected fraudulent activity in your account. Please contact us for further information.')

True

In [316]:
is_spam('When do you think we could schedule a meeting?')

False

In [317]:
is_spam('Sorry for the late reply, when could we discuss the details further?')

False

In [318]:
is_spam('You have been selected to recieve a prize. You have 24 hours to claim it.')

True

In [319]:
is_spam('Hey Richard, can we count on your vote in November?')

True