In [1]:
import nltk
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')
from sklearn import model_selection 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# 1. PREPROCESSING

In [2]:
# load the dataset of SMS messages
df = pd.read_table('SMSSPamCollection', header=None, encoding='utf-8')
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Labelling

In [3]:
classes = df[0]
classes.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

In [4]:
# convert class labels to binary values, 0 = ham and 1 = spam
encoder = LabelEncoder()
y = encoder.fit_transform(classes)
y[:5]

array([0, 0, 1, 0, 0])

## Feature selection

In [5]:
# store the SMS message data
text_messages = df[1].str.lower()
text_messages.head()

0    go until jurong point, crazy.. available only ...
1                        ok lar... joking wif u oni...
2    free entry in 2 a wkly comp to win fa cup fina...
3    u dun say so early hor... u c already then say...
4    nah i don't think he goes to usf, he lives aro...
Name: 1, dtype: object

### use regular expressions to replace email addresses, URLs, phone numbers, other numbers


In [6]:
# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')

# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
'phonenumbr')

# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

# Remove punctuation [,:; etc]
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace and change words to lower case
processed = processed.str.strip()
processed.head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in numbr a wkly comp to win fa cup ...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
Name: 1, dtype: object

In [7]:
stop_words = set(stopwords.words('english'))
print(len(stop_words))
stop_words_list = list(stop_words)
stop_words_list.sort()
print(stop_words_list)

179
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'so

In [8]:
#removing stopwords
processed = processed.apply(lambda x: [term for term in word_tokenize(x) if term not in stop_words])
processed.head()

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, numbr, wkly, comp, win, fa, cup,...
3        [u, dun, say, early, hor, u, c, already, say]
4       [nah, think, goes, usf, lives, around, though]
Name: 1, dtype: object

In [9]:
# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()
processed = processed.apply(lambda x:[ps.stem(term) for term in x])
processed.head()

0    [go, jurong, point, crazi, avail, bugi, n, gre...
1                         [ok, lar, joke, wif, u, oni]
2    [free, entri, numbr, wkli, comp, win, fa, cup,...
3        [u, dun, say, earli, hor, u, c, alreadi, say]
4         [nah, think, goe, usf, live, around, though]
Name: 1, dtype: object

In [10]:
# create bag-of-words
all_words = []
for message in processed:
    for w in message:
        all_words.append(w)
all_words = nltk.FreqDist(all_words)
# all_words.items()

# use the 1500 most common words as features
word_features = [word_tuple[0] for word_tuple in all_words.most_common(1500)]
print(word_features[:10])

['numbr', 'u', 'call', 'go', 'get', 'ur', 'gt', 'lt', 'come', 'moneysymbnumbr']


In [11]:
featureset = []
for current_words in processed:
    features = [0]* len(word_features)
    for word in current_words:
        if word in word_features:
            index_value = word_features.index(word)
            features[index_value] += 1
    featureset.append(features)
    
X=np.array(featureset)
print(X.shape)
X

(5572, 1500)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# 2. TRAINING & TESTING

In [12]:
X_train, X_test, y_train, y_test = model_selection .train_test_split(X, y, test_size=0.2)
print(len(X_train), len(X_test), len(y_train), len(y_test))

4457 1115 4457 1115


In [13]:
# Define models to train
classifiers = [
        ("K Nearest Neighbors", KNeighborsClassifier()),
        ("Decision Tree"      , DecisionTreeClassifier()),
        ("Random Forest"      , RandomForestClassifier()),
        ("Logistic Regression", LogisticRegression()),
        ("SGD Classifier"     , SGDClassifier(max_iter = 100)),
        ("Naive Bayes"        , MultinomialNB()),
        ("SVM Linear"         , SVC(kernel = 'linear'))
        ]


In [14]:
# train and test for each classifier
for classifier in classifiers:
    clf = classifier[1]
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)*100
    print("{} Accuracy: {}".format(classifier[0], accuracy))
    

K Nearest Neighbors Accuracy: 94.26008968609865
Decision Tree Accuracy: 97.04035874439462
Random Forest Accuracy: 98.38565022421525
Logistic Regression Accuracy: 98.65470852017937
SGD Classifier Accuracy: 98.56502242152466
Naive Bayes Accuracy: 98.83408071748879
SVM Linear Accuracy: 98.38565022421525


In [15]:
# Voting classifier
voting_clf = VotingClassifier(estimators = classifiers, voting='hard', weights=None)
voting_clf = voting_clf.fit(X_train,y_train)
accuracy = voting_clf.score(X_test,y_test)*100
print("Accuracy for voting classifier: {}".format(accuracy))

Accuracy for voting classifier: 98.7443946188341
