In [1]:
import pandas as pd

In [2]:
messages = pd.read_csv('SMSSpamCollection.tsv', sep='\t', names=["label","messages"])

In [3]:
messages.head(10)

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
lm = WordNetLemmatizer()
corpus = []

In [6]:
for i in range(0, len(messages)):
    review = re.sub('[a-zA-Z]',' ', messages['messages'][i])
    review = review.lower()
    review = review.split()
    
    review = [lm.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [7]:
corpus

[', .. ... ...',
 '... ...',
 "2 21 2005. 87121 ( ) & ' 08452810075 18'",
 '... ...',
 "' ,",
 "' 3 ' ! ' ? ! , £1.50",
 '. .',
 "' ( )' . *9",
 '!! £900 ! 09061701461. 341. 12 .',
 '11 ? ! 08002986030',
 "' ' , ? ' .",
 '! 100 20,000 > 11 87575. 150 / , 6 , 16+ 4',
 '! 1 £100,000 ! : : 81010 & . . 4403 1 7 18',
 "' . . .",
 '!!',
 ': , >> :// . . ? =',
 "... ' :)",
 '2 ... . .',
 '\x92 . \x92',
 '- / . 87077 87077 : , 4 /ú1.20 36504 45 16+',
 '?',
 '‘ 2',
 'ü ... ...',
 '. 3 . ?',
 '. ?',
 ". ' . . . ' .",
 '.',
 "? ? ? ' ? ?",
 "' & ; ' , ' '",
 '. . ! ?',
 "' , ' '",
 "2 . . 2! ' ! . ?",
 '.',
 '?',
 '£5/ .',
 '... ü ... 2 8',
 ", ' '",
 '',
 '... ...',
 "! ' ? ' . ' !",
 '. . .',
 '? , , ... ... ... ...',
 '07732584351 - - = + . 08000930705',
 '?',
 '! . & ;#& ; ...',
 '.. ..',
 "' .",
 ', ?',
 ", '",
 "' . ' . ' . .",
 '. .',
 '& ;#& ; , & ;#& ;',
 "'",
 '. . , " ". \' \' \' . \' . \' .',
 '. : . ? ?',
 '? @ & ; & ;',
 '! 1 2 . 09061209465 ! , 3, 3, 4 ! 420- 4-5 . 150 . !',
 ", ' 

## Creating Bag of Model

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()
y = pd.get_dummies(messages['label'])
y = y.iloc[:,1].values

In [9]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

## Train Test Split

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)

In [12]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(4179, 661) (1393, 661) (4179,) (1393,)


## Training model using naive bayes classifier

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [15]:
y_pred = spam_detect_model.predict(X_test)

In [16]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [17]:
from sklearn.metrics import confusion_matrix

In [18]:
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat

array([[1206,    2],
       [ 104,   81]], dtype=int64)

In [19]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.923905240488155