## Import Dataset

In [1]:
import pandas as pd

# The file is tab-separated with no header row, so the two column names are
# supplied here.
# NOTE(review): the parser's default quote handling is left on; if a message
# contains an unbalanced '"', neighbouring rows could be merged -- confirm the
# row count against the dataset's documented size.
DATA_PATH = 'data/SMSSpamCollection'
COLUMN_NAMES = ['label', 'message']

dataset = pd.read_table(DATA_PATH, sep='\t', header=None, names=COLUMN_NAMES)

print(len(dataset), 'records')
dataset.head()

5572 records


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Preprocessing

### Check for null values (none are present, so no records need to be removed)

In [2]:
# Print the frame summary (row count, per-column non-null counts and dtypes);
# comparing the non-null counts with the row count reveals missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label      5572 non-null object
message    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB


### Convert the labels to binary values

In [3]:
# Encode the class labels as integers: ham -> 0, spam -> 1.
# The column is overwritten in place, so re-running this cell without
# reloading the data would map the already-numeric labels to NaN.
label_codes = {'ham': 0, 'spam': 1}
dataset['label'] = dataset['label'].map(label_codes)
dataset.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Convert the message to lower case

In [4]:
# Lower-case every message with the vectorised string accessor instead of a
# per-row Python lambda: it is the idiomatic pandas form, matches the `.str`
# usage in the punctuation step, and passes missing values through rather
# than raising AttributeError on them.
dataset['message'] = dataset['message'].str.lower()
dataset.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


### Remove punctuation from the messages

In [5]:
# Drop every character that is neither a word character nor whitespace.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, in which case this
# pattern would match nothing and the punctuation would silently remain.
# The raw string keeps `\w` / `\s` intact without invalid-escape warnings.
dataset['message'] = dataset['message'].str.replace(r'[^\w\s]', '', regex=True)
dataset.head()

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


### Tokenize the message to words

In [6]:
import nltk

# Replace each message string with the list of tokens produced by NLTK's
# word tokenizer.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# installed locally -- confirm the environment setup on a fresh machine.
dataset['message'] = dataset['message'].map(nltk.word_tokenize)
dataset.head()

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, i, dont, think, he, goes, to, usf, he, l..."


### Stem each word to normalize away word variations

In [7]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the Porter stem of every token in ``tokens``, in order."""
    return list(map(stemmer.stem, tokens))


# Each row holds a list of tokens; stem them element by element.
dataset['message'] = dataset['message'].apply(_stem_tokens)

dataset.head()

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazi, avail, onli,..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,0,"[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,0,"[nah, i, dont, think, he, goe, to, usf, he, li..."


### Transform the data into TF-IDF-weighted word occurrences

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# The vectorizer is fed one string per message, so glue each token list back
# into a single space-separated string.
dataset['message'] = dataset['message'].map(' '.join)

# Term counts per message.
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(dataset['message'])

# Re-weight the counts with TF-IDF, fitting and transforming in one call.
# Despite its name, `counts` therefore holds TF-IDF weights.
# NOTE(review): both the vocabulary and the IDF statistics are fitted on the
# full dataset before the train/test split below, so information from the
# test messages leaks into the features; fitting on the training portion only
# would give a cleaner evaluation.
transformer = TfidfTransformer()
counts = transformer.fit_transform(vector)

### Split data to Training and Test sets

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
features, labels = counts, dataset['label']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.1, random_state=69
)

# Report how many rows ended up in each partition.
for title, part in (('Training set:', X_train), ('Test set:', X_test)):
    print(title, part.shape[0])

Training set: 5014
Test set: 558


## Create and Train model

In [10]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be
# chained; the bare name on the last line displays the fitted model.
model = MultinomialNB().fit(X_train, y_train)
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Evaluate the model

In [11]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Predict on the held-out rows, then compute both metrics before reporting.
predictions = model.predict(X_test)

# Fraction of test rows whose predicted label equals the true label.
accuracy = (predictions == y_test).mean()
# Rows are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_test, predictions)

print('Accuracy:', accuracy*100,'%')
print('\nConfusion Matrix:\n', conf_mat)

Accuracy: 94.80286738351255 %

Confusion Matrix:
 [[482   0]
 [ 29  47]]
