In [1]:
#importing libraries
import nltk
import sklearn
import pandas as pd
import numpy as np

In [2]:
cd D:\Personal\BusinessProblems\smsspamcollection

D:\Personal\BusinessProblems\smsspamcollection


In [5]:
#loadDataset
data=pd.read_table('SMSSpamCollection.csv',header=None,encoding='utf-8')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
data.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
#Checking the balance of the dataset
data[0].value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

In [14]:
#Preprocessing the data
from sklearn.preprocessing import LabelEncoder
en=LabelEncoder()
y=en.fit_transform(data[0])

In [15]:
y[:10]

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1])

In [16]:
text=data[1]

In [17]:
text.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: 1, dtype: object

#### Data Preprocessing

In [18]:
#Now we would be replacing email addresses, URLs, phone numbers ,other numbers , symbols
#replace emails with 'emailaddress'
text_e=text.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddress')

In [19]:
#replace URLs with 'webaddress'
text_e = text_e.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

#replace money symbols with 'moneysymb' (£ = ALT + 156)
text_e = text_e.str.replace(r'£|\$', 'moneysymb')
    
#replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
text_e = text_e.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
#replace numbers with 'numbr'
text_e = text_e.str.replace(r'\d+(\.\d+)?', 'numbr')

In [21]:
#remove punctuation
text_e = text_e.str.replace(r'[^\w\d\s]', ' ')

#replace whitespace with a single space
text_e = text_e.str.replace(r'\s+', ' ')

#remove leading and trailing whitespace
text_e = text_e.str.replace(r'^\s+|\s+?$', '')

In [22]:
#lowerCase
text_e = text_e.str.lower()
print(text_e)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [23]:
#remove stop words from text messages
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
text_e = text_e.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))



In [24]:
#remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

text_e = text_e.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

#### Generating Features

- Feature engineering is the process of using domain knowledge of the data to create features for machine learning algorithms. In this project, the words in each text message will be our features. For this purpose, it will be necessary to tokenize each word. We will use the 1500 most common words as features.


In [25]:
from nltk.tokenize import word_tokenize

#creating bag-of-words
all_words = []
for msg in text_e:
    words = word_tokenize(msg)
    for word in words:
        all_words.append(word)
        
all_words = nltk.FreqDist(all_words)



In [26]:
#now we print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 6579
Most common words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [27]:
#we use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [30]:
# The find_features function will determine which of the 1500 word features are contained in the review
def find_features(msg):
    words = word_tokenize(msg)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

#demo
features = find_features(text_e[0])
for key, value in features.items():
    if value == True:
        print(key)


go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [35]:
#we now do it for all the messages
messages = list(zip(text_e, y))

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]



In [36]:
# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)



In [37]:
print(len(training))
print(len(testing))

4179
1393


#### We apply various Sklearn Algortihms

In [38]:
#we willuse sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.42067480258436


In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

#define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models =list(zip(names, classifiers))

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))



K Nearest Neighbors Accuracy: 94.68772433596554
Decision Tree Accuracy: 96.12347451543431




Random Forest Accuracy: 98.1335247666906




Logistic Regression Accuracy: 98.42067480258436
SGD Classifier Accuracy: 97.84637473079684
Naive Bayes Accuracy: 98.7078248384781
SVM Linear Accuracy: 98.42067480258436


In [43]:
#Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 98.42067480258436


In [44]:
#make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [45]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1204
           1       1.00      0.92      0.96       189

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.98      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1204,0
actual,spam,15,174
