In [2]:
import sys
import nltk
import sklearn
import pandas
import numpy

## 1. Load the Dataset

In [4]:
import pandas as pd
import numpy as np

# load the dataset of sms messages
df = pd.read_table('SMSSpamCollection', header = None, encoding='utf-8')

In [5]:
# print useful information about the dataset

print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [6]:
# check class distribution
classes = df[0]
print(classes.value_counts())

0
ham     4825
spam     747
Name: count, dtype: int64


## 2. Preprocess the Data

In [9]:
# convert class labels to binary values, 0 = ham, 1 = spam

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [10]:
# store the SMS message data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [13]:
# use regular expressions to replace email addresses, urls, phone numbers, other numbers, symbols

# replace email addresses with 'emailaddr'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')

# replace urls with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')

# replace money symbols with 'moneysymb'
processed = processed.str.replace(r'£|\$', 'moneysymb')

#replace 10digit phone number with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[s\-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumbr')

# replace normal number with 'number'
processed = processed.str.replace(r'\d+(\.\d+)?', 'number')

# remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

# change words to lower case = Hello, HELLO, hello are all the same word!
processed = processed.str.lower()

print(processed)

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                 will ü b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
Name: 1, Length: 5572, dtype: object


In [14]:
# remove the stop words from text messages

from nltk.corpus import stopwords
sw = set(stopwords.words('english'))
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in sw))
print(processed)

0       go jurong point, crazy.. available bugis n gre...
1                           ok lar... joking wif u oni...
2       free entry 2 wkly comp win fa cup final tkts 2...
3               u dun say early hor... u c already say...
4                 nah think goes usf, lives around though
                              ...                        
5567    2nd time tried 2 contact u. u £750 pound prize...
5568                         ü b going esplanade fr home?
5569             pity, * mood that. so...any suggestions?
5570    guy bitching acted like i'd interested buying ...
5571                                      rofl. true name
Name: 1, Length: 5572, dtype: object


In [15]:
# remove word stems using Porter stemmer

ps = nltk.PorterStemmer()
processed =  processed.apply(lambda x: ' '.join(ps.stem(term) for  term in x.split()))
print(processed)

0       go jurong point, crazy.. avail bugi n great wo...
1                             ok lar... joke wif u oni...
2       free entri 2 wkli comp win fa cup final tkt 21...
3               u dun say earli hor... u c alreadi say...
4                   nah think goe usf, live around though
                              ...                        
5567    2nd time tri 2 contact u. u £750 pound prize. ...
5568                             ü b go esplanad fr home?
5569             pity, * mood that. so...ani suggestions?
5570    guy bitch act like i'd interest buy someth els...
5571                                      rofl. true name
Name: 1, Length: 5572, dtype: object


In [16]:
from nltk.tokenize import word_tokenize

# creating a bag of words model
all_words = []
for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)

# print the total number words and 15 most common words
print("Number of words:{}".format(len(all_words)))
print('most common words:{}'.format(all_words.most_common(15)))

Number of words:8921
most common words:[('.', 4759), (',', 1939), ('?', 1550), ('!', 1397), ('...', 1146), ('u', 1138), ('&', 922), (';', 768), (':', 722), ('i', 715), ('..', 697), ('call', 644), ("'", 535), (')', 499), ('2', 478)]


In [18]:
# use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

# define a find_feature function
def find_feature(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word]=(word in words)
    return features
    
# see results
features = find_feature(processed[0])
for key,value in features.items():
    if value == True:
        print(key)

go
jurong
point
,
crazy
..
avail
bugi
n
great
world
la
e
buffet
...
cine
got
amor
wat


In [19]:
processed[0]

'go jurong point, crazy.. avail bugi n great world la e buffet... cine got amor wat...'

In [20]:
features

{'go': True,
 'jurong': True,
 'point': True,
 ',': True,
 'crazy': True,
 '..': True,
 'avail': True,
 'bugi': True,
 'n': True,
 'great': True,
 'world': True,
 'la': True,
 'e': True,
 'buffet': True,
 '...': True,
 'cine': True,
 'got': True,
 'amor': True,
 'wat': True,
 'ok': False,
 'lar': False,
 'joke': False,
 'wif': False,
 'u': False,
 'oni': False,
 'free': False,
 'entri': False,
 '2': False,
 'wkli': False,
 'comp': False,
 'win': False,
 'fa': False,
 'cup': False,
 'final': False,
 'tkt': False,
 '21st': False,
 'may': False,
 '2005.': False,
 'text': False,
 '87121': False,
 'receiv': False,
 'question': False,
 '(': False,
 'std': False,
 'txt': False,
 'rate': False,
 ')': False,
 't': False,
 '&': False,
 'c': False,
 "'": False,
 'appli': False,
 '08452810075over18': False,
 'dun': False,
 'say': False,
 'earli': False,
 'hor': False,
 'alreadi': False,
 'nah': False,
 'think': False,
 'goe': False,
 'usf': False,
 'live': False,
 'around': False,
 'though': False

In [22]:
# find features for all messages (wrap zip with lst())
messages = list(zip(processed, Y))

# define a seed for reproductibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS mesgs
featuresets = [(find_feature(text), label) for (text,label) in messages]

In [25]:
from sklearn import model_selection
training, testing = model_selection.train_test_split(featuresets,test_size = 0.25,random_state = seed)
print('training:{}'.format(len(training)))
print('testing:{}'.format(len(testing)))

training:4179
testing:1393


## 4.scikitlearn classfiers with NLTK

In [26]:
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.tree import  DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import  LogisticRegression, SGDClassifier
from sklearn.naive_bayes import  MultinomialNB
from sklearn.svm import  SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [29]:
# define models to train
names = ['K nearest neighbors', 'Decision tree', 'Random forest', 'logistic regression', 'SGD classifier', 'Naive bayes', 'SVM linear']
classifiers = [
    KNeighborsClassifier(), 
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

#list(zip) again as python 2>3
models = list(zip(names, classifiers))

print(models)

[('K nearest neighbors', KNeighborsClassifier()), ('Decision tree', DecisionTreeClassifier()), ('Random forest', RandomForestClassifier()), ('logistic regression', LogisticRegression()), ('SGD classifier', SGDClassifier(max_iter=100)), ('Naive bayes', MultinomialNB()), ('SVM linear', SVC(kernel='linear'))]


In [30]:
# wrap models in NLTK
from nltk.classify.scikitlearn import  SklearnClassifier
for name,model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100 
    print('{}:accuracy:{}'.format(name, accuracy))

K nearest neighbors:accuracy:91.60086145010769
Decision tree:accuracy:95.11844938980617
Random forest:accuracy:96.8413496051687
logistic regression:accuracy:97.70279971284997
SGD classifier:accuracy:97.77458722182341
Naive bayes:accuracy:97.91816223977028
SVM linear:accuracy:98.1335247666906


In [33]:
# ensemble method - voting classifier
from sklearn.ensemble import  VotingClassifier
# define models to train
from nltk.classify.scikitlearn import  SklearnClassifier
for name,model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model,testing) * 100 

models = list(zip(names,classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models,voting='hard', n_jobs= -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble,testing) * 100
print('ensemble method accuracy :{}'.format(accuracy))

ensemble method accuracy :97.63101220387652


In [34]:
# make class label prediction for testing set
txt_features,labels = zip(*testing)
prediction = nltk_ensemble.classify_many(txt_features)

In [35]:
# print a confusion matrix and a classification report
print(classification_report(labels,prediction))
pd.DataFrame(
confusion_matrix(labels,prediction),
index = [['actual','actual'],['ham','spam']],
columns = [['predicted','predicted'], ['ham','spam']])

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1199
           1       0.99      0.84      0.91       194

    accuracy                           0.98      1393
   macro avg       0.98      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1197,2
actual,spam,31,163
