In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy
import warnings
warnings.filterwarnings('ignore')
print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 3.5.6 |Anaconda custom (64-bit)| (default, Aug 26 2018, 16:05:27) [MSC v.1900 64 bit (AMD64)]
NLTK: 3.3
Scikit-learn: 0.20.0
Pandas: 0.23.4
Numpy: 1.15.2


## 1. Load the Dataset

In [2]:
import pandas as pd
import numpy as np

# load the dataset 
df = pd.read_table('SMSspamCollection', header=None, encoding='utf-8')

In [3]:
# Useful information about the dataset
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# Check class distribution
classes  = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


# 2. Preprocess the Data

In [5]:
# Label encoding
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
Y = enc.fit_transform(classes)
print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [6]:
# Store the SMS message data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [7]:
# Using regular expression(taken from --http://www.regexlib.com ) to replace email, urls, phone numbers, other numbers, symbols 

# Replace email address with 'emailaddr'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')


# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')


In [8]:

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace b/w terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace 
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [9]:
# Change words to lower case
processed = processed.str.lower()
print(processed[:10])

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in numbr a wkly comp to win fa cup ...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
5    freemsg hey there darling it s been numbr week...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile numbr months or more u r entit...
Name: 1, dtype: object


In [10]:
# Remove stop words from text messages
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))


In [11]:
# Stemming using the Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()) )

In [12]:
print(processed[:10])

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri numbr wkli comp win fa cup final tk...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
5    freemsg hey darl numbr week word back like fun...
6        even brother like speak treat like aid patent
7    per request mell mell oru minnaminungint nurun...
8    winner valu network custom select receivea mon...
9    mobil numbr month u r entitl updat latest colo...
Name: 1, dtype: object


In [13]:
# Tokenizing
from nltk.tokenize import word_tokenize

# creating a bag-of-words for features
all_words = []
for message in processed:
    words = word_tokenize(message)
    all_words += [w for w in words]
all_words = nltk.FreqDist(all_words)


In [14]:
# Total number of Words and the 20 most common words
print("Number of words : {}".format(len(all_words)))
print("Most common words : {}".format(all_words.most_common(20)))


Number of words : 6579
Most common words : [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266), ('like', 261), ('got', 252), ('time', 252), ('good', 248), ('want', 247)]


In [15]:
# using most common 15 words as features
word_features = list(all_words.keys())[:1500]


In [16]:
# define find_features function 
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features


In [17]:
# checking an example
print(processed[10])
features = find_features(processed[10])
for key, value in features.items():
    if value == True:
        print(key)

gonna home soon want talk stuff anymor tonight k cri enough today
na


In [18]:
# All messages
messages = list(zip(processed, Y))

# defining seed for reproducibility
seed = 1
np.random.seed = 1
np.random.shuffle(messages) 

# Creating featureset
featureset = [(find_features(text), label) for text, label in messages]

In [19]:
featureset[0]

({'shakespear': False,
  'product': False,
  'surrend': False,
  'trauma': False,
  'upload': False,
  'foot': False,
  'womdarful': False,
  'smokin': False,
  'will': False,
  'profession': False,
  'dom': False,
  'safeti': False,
  'moneysymbnumbrcal': False,
  'numbrxxxxxxxxx': False,
  'moji': False,
  'marvel': False,
  'getz': False,
  'buzz': False,
  'infra': False,
  'oredi': False,
  'wipro': False,
  'disagre': False,
  'escal': False,
  'kalisidar': False,
  'ubi': False,
  'hannaford': False,
  'frm': False,
  'relationship': False,
  'theoret': False,
  'footbl': False,
  'ganesh': False,
  'craziest': False,
  'flew': False,
  'numbrdocd': False,
  'luk': False,
  'who': False,
  'famamu': False,
  'mysteri': False,
  'magic': False,
  'pendent': False,
  'knw': False,
  'rumour': False,
  'chines': False,
  'gossip': False,
  'sian': False,
  'pa': False,
  'ym': False,
  'parent': False,
  'je': False,
  'jet': False,
  'vip': False,
  'vatian': False,
  'garden': Fa

In [20]:
# Spliting featureset in testing and training dataset
from sklearn import model_selection
train, test = model_selection.train_test_split(featureset, test_size=0.4, random_state=seed)

In [21]:
print("Train : {}".format(len(train)))
print("Test : {}".format(len(test)))

Train : 3343
Test : 2229


### 3. Scikit-Learn Classifier with NLTK


In [22]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

#training  the model on the training Data
model.train(train)

# Testing model on the Testing Data
accuracy = nltk.classify.accuracy(model, test)*100
print("SVC Accuracy: {}".format(accuracy))


SVC Accuracy: 95.02018842530283


In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define model to  Train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))
for m in models:
    print(m)

('K Nearest Neighbors', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))
('Decision Tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))
('Random Forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=

In [24]:
# Wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(train)
    accuracy = nltk.classify.accuracy(nltk_model, test)*100
    print("{}: Accuracy: {}".format(name, accuracy))

K Nearest Neighbors: Accuracy: 92.0592193808883
Decision Tree: Accuracy: 94.25751458052939
Random Forest: Accuracy: 94.75100942126514
Logistic Regression: Accuracy: 94.9304620906236
SGD Classifier: Accuracy: 94.34724091520862
Naive Bayes: Accuracy: 94.9304620906236
SVM Linear: Accuracy: 95.02018842530283


In [25]:
# Ensemble methods -VOTING CLASSIFIER
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))



In [26]:
# Voting Classifier  -----HARD
nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(train)
accuracy = nltk.classify.accuracy(nltk_ensemble, test)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))


Voting Classifier: Accuracy: 95.19964109466127


In [27]:
# Class Label Prediction for Test Set
txt_features, labels = zip(*test)

prediction = nltk_ensemble.classify_many(txt_features)

In [28]:
# Confusion Matrix And Classification Report
print(classification_report(labels, prediction))
pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1929
           1       0.91      0.72      0.80       300

   micro avg       0.95      0.95      0.95      2229
   macro avg       0.93      0.85      0.89      2229
weighted avg       0.95      0.95      0.95      2229



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1907,22
actual,spam,85,215
