In [1]:
import nltk
import pandas as pd
import numpy as np

## Load Data

In [2]:
df = pd.read_table('SMSSpamCollection', header = None, encoding='utf-8')
#print data and check the infomation
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
#Check the class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## Preprocess the Data

In [4]:
from sklearn.preprocessing import LabelEncoder

#Conver the calss label to binary value, ham = 0, sapn = 1
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(Y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [5]:
#Load the message data
text_message = df[1]
print(text_message[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


## Regular Expration
Resourcees: http://regexlib.com/

Some common regular expression metacharacters - from wikipedia.com

**^** Matches the starting position within the string. In line-based tools, it matches the starting position of any line.

**.** Matches any single character (many applications exclude newlines, and exactly which characters are considered newlines is flavor-, character-encoding-, and platform-specific, but it is safe to assume that the line feed character is included). Within POSIX bracket expressions, the dot character matches a literal dot. For example, a.c matches "abc", etc., but [a.c] matches only "a", ".", or "c".

**[ ]** A bracket expression. Matches a single character that is contained within the brackets. For example, [abc] matches "a", "b", or "c". [a-z] specifies a range which matches any lowercase letter from "a" to "z". These forms can be mixed: [abcx-z] matches "a", "b", "c", "x", "y", or "z", as does [a-cx-z]. The - character is treated as a literal character if it is the last or the first (after the ^, if present) character within the brackets: [abc-], [-abc]. Note that backslash escapes are not allowed. The ] character can be included in a bracket expression if it is the first (after the ^) character: []abc].

**[^ ]** Matches a single character that is not contained within the brackets. For example, [^abc] matches any character other than "a", "b", or "c". [^a-z] matches any single character that is not a lowercase letter from "a" to "z". Likewise, literal characters and ranges can be mixed.

**$** Matches the ending position of the string or the position just before a string-ending newline. In line-based tools, it matches the ending position of any line.

**( )** Defines a marked subexpression. The string matched within the parentheses can be recalled later (see the next entry, \n). A marked subexpression is also called a block or capturing group. BRE mode requires ( ).

**\n** Matches what the nth marked subexpression matched, where n is a digit from 1 to 9. This construct is vaguely defined in the POSIX.2 standard. Some tools allow referencing more than nine capturing groups.

**\*** Matches the preceding element zero or more times. For example, abc matches "ac", "abc", "abbbc", etc. [xyz] matches "", "x", "y", "z", "zx", "zyx", "xyzzy", and so on. (ab)* matches "", "ab", "abab", "ababab", and so on.

**{m,n}** Matches the preceding element at least m and not more than n times. For example, a{3,5} matches only "aaa", "aaaa", and "aaaaa". This is not found in a few older instances of regexes. BRE mode requires {m,n}.

In [6]:
#Use regular expression to replease Email, url,Money, Phone number, other number symbols 

#Replease email address with 'emailaddr'
processed = text_message.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')

#Replease web address with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')

#Replease money symbol with 'moneysymb'
processed = processed.str.replace(r'£|\$', 'moneysymb')

#Replease 10 digit phone number with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber')

#Replease other number with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [7]:
#Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

#Replease the whitespace between terms with a single spaces
processed = processed.str.replace(r'\s+', ' ')

#Remove the leading and training whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [8]:
#Change the word to lower case
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [9]:
#Remove stop word from text messages
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [10]:
# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

In [11]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbrnd time tri numbr contact u u moneysymbnu...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [12]:
from nltk.tokenize import word_tokenize

#Create a bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
all_words = nltk.FreqDist(all_words)

In [13]:
print('Number of words: {}'.format(len(all_words)))
print('Number of common words: {}'.format(all_words.most_common(15)))

Number of words: 6579
Number of common words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [14]:
#use most 1500 most common words as a feature

word_features = list(all_words.keys())[:1500]

In [15]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

In [16]:
# Lets see an example!
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [17]:
#Find Feature for all message
messages = list(zip(processed,Y))

#Define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

#Call find_feature function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [18]:
#Split Training and testing data using sklearn
from sklearn import model_selection
train, test = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)

In [19]:
print('Train Data: {}'.format(len(train)))
print('Test Data: {}'.format(len(test)))

Train Data: 4179
Test Data: 1393


## Scikit-Learn Classifiers with NLTK

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [30]:
#Define the model to train
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'SVM Linear']

classifier = [KNeighborsClassifier(),
             DecisionTreeClassifier(),
             RandomForestClassifier(),
             LogisticRegression(),
             SGDClassifier(max_iter = 100),
             MultinomialNB(),
             SVC(kernel = 'linear')]

models = list(zip(names, classifier))

In [33]:
#Wrap model in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(train)
    accuracy = nltk.classify.accuracy(nltk_model, test) * 100
    print('{} Accuracy: {}'.format(name, accuracy))

K Nearest Neighbors Accuracy: 95.76453697056712
Decision Tree Accuracy: 97.12849964106246
Random Forest Accuracy: 98.49246231155779
Logistic Regression Accuracy: 98.63603732950466
SGD Classifier Accuracy: 97.63101220387652
SVM Linear Accuracy: 97.98994974874373


In [37]:
from sklearn.ensemble import VotingClassifier

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))

nltk_ensemble.train(train)
accuracy = nltk.classify.accuracy(nltk_ensemble, test) * 100
print('Enseble Method Accuracy: {}'.format(accuracy))

Enseble Method Accuracy: 98.63603732950466


In [38]:
#Make class label predicting for testing set
txt_features, labels = zip(*test)

prediction = nltk_ensemble.classify_many(txt_features)
print('Done')

Done


In [46]:
#Print Confution matrix and a classification report
print(classification_report(labels,prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham','spam']],
    columns = [['prediction','prediction'],['ham','spam']]
)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1211
           1       0.98      0.91      0.95       182

    accuracy                           0.99      1393
   macro avg       0.98      0.95      0.97      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,prediction,prediction
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1208,3
actual,spam,16,166
