In [146]:
import pandas as pd
import numpy as np
import nltk

In [147]:
# Load the tab-separated SMS file; column names are supplied here rather than
# read from the file.
# NOTE(review): default CSV quoting is left on. If the raw file contains
# unbalanced '"' characters, neighbouring rows could be merged into one --
# confirm the row count against the dataset's documented size.
data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['labels', 'messages'],
)

In [148]:
# Preview the loaded frame to confirm both columns parsed as expected.
data

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [149]:
import re

In [150]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [151]:
# Text normalisers used during preprocessing: a Porter stemmer and a WordNet lemmatizer.
ps, wordnet = PorterStemmer(), WordNetLemmatizer()

In [152]:
# Build the cleaned corpus: one space-joined string of stemmed tokens per message.

# Anything that is not an ASCII letter is replaced by a space before tokenising.
non_letters = re.compile('[^a-zA-Z]')

# Build the stop-word set once. Re-creating it inside the comprehension reloads
# the NLTK word list for every single token, which dominates the runtime.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# frame having a default 0..n-1 index.
for message in data['messages']:
    tokens = non_letters.sub(' ', message).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

In [153]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with the vocabulary capped at 2500 terms.
vocabulary_cap = 2500
cv = CountVectorizer(max_features=vocabulary_cap)

In [154]:
# Fit the vectorizer on the cleaned corpus and count terms per message,
# then convert the result into a dense NumPy array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [155]:
# Inspect the term-count feature matrix.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [156]:
# One-hot encode the label column: one indicator column per distinct label.
label_column = data['labels']
y = pd.get_dummies(label_column)

In [157]:
# Inspect the one-hot encoded labels.
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [158]:
# Binary target vector: 1 for 'spam', 0 for anything else.
# Derived directly from the label column rather than from the previous value of
# `y`, so re-running this cell is safe (the old form failed on a second run
# because `y` was already an ndarray), and the spam class is selected by name
# instead of by column position.
y = (data['labels'] == 'spam').astype('uint8').values

In [159]:
# Inspect the final 1-D target array.
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [160]:
from sklearn.model_selection import train_test_split

In [161]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [162]:
# Inspect the training feature matrix.
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [163]:
# Inspect the held-out feature matrix.
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [164]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the training split.
# `fit` returns the fitted estimator, which is what gets bound to the model name.
classifier = MultinomialNB()
spam_detect_model = classifier.fit(X_train, y_train)

In [165]:
# Predict a label for every message in the held-out split.
y_pred = spam_detect_model.predict(X_test)

In [166]:
# Inspect the ground-truth labels of the held-out split.
y_test

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [167]:
# Inspect the predicted labels for comparison with y_test.
y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [168]:
from sklearn.metrics import confusion_matrix
# Arguments are (true labels, predictions). Per scikit-learn's convention the
# rows of the result are the true classes and the columns the predicted classes.
confusion_mat = confusion_matrix(y_test, y_pred)
# Last expression of the cell, so the matrix is displayed.
confusion_mat

array([[946,   9],
       [  7, 153]], dtype=int64)

In [169]:
from sklearn.metrics import accuracy_score

In [170]:
# scikit-learn metrics take (y_true, y_pred): pass the ground truth first, in the
# same order as the confusion_matrix call. Accuracy is symmetric in its two
# arguments, so the reported value is unchanged.
Accuracy = accuracy_score(y_test, y_pred)

In [171]:
# Report the accuracy as a percentage rounded to two decimals.
accuracy_pct = round(Accuracy * 100, 2)
print("Accuracy is: " + str(accuracy_pct) + "%")

Accuracy is: 98.57%
