In [34]:
! pip install --user numpy==1.17



In [35]:
import pandas as pd
from sklearn import metrics
import warnings

# Suppress every warning for cleaner cell output; note that this also hides
# deprecation notices raised by later cells.
warnings.filterwarnings('ignore')




In [36]:
# Load the SMS spam collection from CSV and preview the first five rows
docs = pd.read_csv(filepath_or_buffer='SMSSpamCollection.csv')
docs.head(n=5)

Unnamed: 0,Class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
# Tally how many messages fall into each class (ham vs. spam)
ham_spam = docs.Class.value_counts()
ham_spam

ham     4825
spam     747
Name: Class, dtype: int64

In [38]:
# Print the percentage of spam messages in the dataset.
# Look the count up by its label: value_counts() orders entries by frequency, so a
# positional ham_spam[1] only happens to be 'spam', and integer-position lookups
# on a label-indexed Series are deprecated in recent pandas releases.
spam_share = ham_spam['spam'] / len(docs) * 100
print("{:.2f}%".format(spam_share))

13.41%


In [39]:
# Encode the class as a numeric target column: ham -> 0, spam -> 1
docs['label'] = docs.Class.map({'ham': 0, 'spam': 1})

# Show the final five rows to confirm the new column
docs.tail()

Unnamed: 0,Class,sms,label
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0
5571,ham,Rofl. Its true to its name,0


In [40]:
# Features are the raw message text; the target is the numeric label
X, y = docs['sms'], docs['label']

# Print the shape of X and of y, one per line
print(X.shape, y.shape, sep='\n')

(5572,)
(5572,)


In [41]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the messages for testing; random_state=1 keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [42]:
# Peek at the first five training messages
X_train.head()

710     4mths half price Orange line rental & latest c...
3740                           Did you stitch his trouser
2711    Hope you enjoyed your new content. text stop t...
3155    Not heard from U4 a while. Call 4 rude chat pr...
3748    Ü neva tell me how i noe... I'm not at home in...
Name: sms, dtype: object

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Very common words such as 'and', 'is', 'of' make little difference when
# classifying a document. These are called "stop words", and passing
# stop_words='english' tells CountVectorizer to drop them.
vect = CountVectorizer(stop_words="english")

# Learn the vocabulary from the training text. fit_transform also returns the
# document-term matrix, which is shown as this cell's output.
vect.fit_transform(X_train)

<4179x7204 sparse matrix of type '<class 'numpy.int64'>'
	with 32467 stored elements in Compressed Sparse Row format>

In [44]:
# Inspect the learned vocabulary: a dict mapping each word to its column index
# in the document-term matrix (these are positions, not occurrence counts)
vect.vocabulary_

{'4mths': 509,
 'half': 3089,
 'price': 5027,
 'orange': 4626,
 'line': 3852,
 'rental': 5310,
 'latest': 3763,
 'camera': 1527,
 'phones': 4822,
 'free': 2780,
 'phone': 4818,
 '11mths': 264,
 'mobilesdirect': 4248,
 '08000938767': 50,
 'update': 6673,
 'or2stoptxt': 4624,
 'cs': 1971,
 'did': 2169,
 'stitch': 6028,
 'trouser': 6545,
 'hope': 3260,
 'enjoyed': 2436,
 'new': 4442,
 'content': 1867,
 'text': 6321,
 'stop': 6038,
 '61610': 563,
 'unsubscribe': 6665,
 'help': 3180,
 '08712400602450p': 98,
 'provided': 5089,
 'tones2you': 6473,
 'uk': 6614,
 'heard': 3159,
 'u4': 6608,
 'rude': 5443,
 'chat': 1643,
 'private': 5040,
 '01223585334': 5,
 'cum': 1989,
 'wan': 6852,
 '2c': 374,
 'pics': 4837,
 'gettin': 2913,
 'shagged': 5628,
 'pix': 4858,
 '8552': 660,
 '2end': 378,
 'send': 5590,
 'sam': 5487,
 'xxx': 7127,
 'neva': 4438,
 'tell': 6289,
 'noe': 4477,
 'home': 3244,
 'da': 2015,
 'aft': 826,
 'wat': 6876,
 'wiskey': 7012,
 'brandy': 1389,
 'rum': 5450,
 'gin': 2927,
 'beer':

In [45]:
# List the feature names (the vocabulary words, ordered by column index).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when it exists and fall back to the old method otherwise.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()
feature_names

['00',
 '000',
 '008704050406',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07090201529',
 '07090298926',
 '07123456789',
 '07732584351',
 '07734396839',
 '07742676969',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448714184',
 '0845',
 '08450542832',
 '08452810071',
 '08452810073',
 '08452810075over18',
 '0870',
 '08700435505150p',
 '08700469649',
 '08700621170150p',
 '08

In [46]:
# Convert both splits to document-term matrices with the fitted vectorizer
# (the test-set variable keeps its original spelling because later cells reference it)
X_train_transformed, X_test_tranformed = (
    vect.transform(split) for split in (X_train, X_test)
)

In [47]:
# Now let's see what our X_train data looks like after transformation.
# toarray() expands the sparse matrix into a dense array so the counts can be printed.
print(X_train_transformed.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [48]:
# Number of features (distinct vocabulary words).
# vocabulary_ holds one entry per feature, so its length equals the number of
# feature names; this avoids get_feature_names(), which was removed in scikit-learn 1.2.
len(vect.vocabulary_)

7204

In [49]:
# Shape of X_train_transformed as (number of documents, number of features)
X_train_transformed.shape

(4179, 7204)

In [50]:
# Print X_train_transformed in its sparse form: each line shows a (row, column)
# position followed by the count stored there
print(X_train_transformed)

  (0, 50)	1
  (0, 264)	1
  (0, 509)	1
  (0, 1527)	1
  (0, 1971)	1
  (0, 2780)	2
  (0, 3089)	1
  (0, 3763)	1
  (0, 3852)	1
  (0, 4248)	1
  (0, 4624)	1
  (0, 4626)	1
  (0, 4818)	1
  (0, 4822)	1
  (0, 5027)	1
  (0, 5310)	1
  (0, 6673)	1
  (1, 2169)	1
  (1, 6028)	1
  (1, 6545)	1
  (2, 98)	1
  (2, 563)	1
  (2, 1867)	1
  (2, 2436)	1
  (2, 3180)	1
  :	:
  (4176, 3879)	1
  (4176, 4417)	1
  (4176, 5229)	1
  (4176, 6191)	1
  (4176, 7134)	1
  (4177, 254)	1
  (4177, 307)	1
  (4177, 358)	1
  (4177, 831)	1
  (4177, 2046)	1
  (4177, 2704)	1
  (4177, 3585)	1
  (4177, 3623)	1
  (4177, 4130)	1
  (4177, 4315)	1
  (4177, 4771)	1
  (4177, 5234)	1
  (4177, 5321)	1
  (4177, 5487)	1
  (4177, 5620)	1
  (4177, 6321)	1
  (4177, 6374)	1
  (4177, 6453)	1
  (4178, 1643)	1
  (4178, 5817)	1


Consider the first 4 rows of the output: (0, 50), (0, 264), (0, 509) and (0, 1527). They say that the first document (row index 0) contains the vocabulary words at column indices 50, 264, 509 and 1527. The right-hand column gives how many times each word appears in that document: these four appear once each, whereas the entry (0, 2780), for example, appears twice.

In [51]:
# Convert the X_train_transformed matrix to a DataFrame: densify it with toarray()
# and label the columns with the vectorizer's feature names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when it exists and fall back to the old method otherwise.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
pd.DataFrame(X_train_transformed.toarray(), columns=feature_names)


Unnamed: 0,00,000,008704050406,0121,01223585236,01223585334,0125698789,02,0207,02072069400,...,zed,zeros,zhong,zindgi,zoe,zoom,zouk,zyada,èn,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4177,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


This table shows how many times each word occurs in each document: rows are documents and columns are vocabulary words. In other words, it is a word-frequency table.

## Now applying Naive-Bayes

In [52]:
from sklearn.naive_bayes import BernoulliNB

# Build a Bernoulli Naive Bayes classifier and train it on the training matrix
bnb = BernoulliNB().fit(X_train_transformed, y_train)

# Hard class predictions for the held-out messages
y_pred_class = bnb.predict(X_test_tranformed)

# Per-class probabilities for the same messages
y_pred_proba = bnb.predict_proba(X_test_tranformed)

# Accuracy: fraction of test messages classified correctly
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

0.9770279971284996


In [53]:
from sklearn.naive_bayes import MultinomialNB

# Build a Multinomial Naive Bayes classifier and train it on the training matrix
mnb = MultinomialNB().fit(X_train_transformed, y_train)

# Hard class predictions for the held-out messages
y_pred_class = mnb.predict(X_test_tranformed)

# Per-class probabilities for the same messages
y_pred_proba = mnb.predict_proba(X_test_tranformed)

# Accuracy: fraction of test messages classified correctly
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

0.9877961234745154


In [54]:
# Build the confusion matrix from the true labels and the predicted classes
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)

# Unpack true negatives, false positives, false negatives and true positives;
# flattening the 2x2 matrix row by row yields them in exactly this order
TN, FP, FN, TP = confusion.ravel()

[[1201    7]
 [  10  175]]


In [55]:
# import classification_report
from sklearn.metrics import classification_report

# Summarise precision, recall, F1-score and support for each class
report = classification_report(y_test, y_pred_class)
print(report)


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1208
           1       0.96      0.95      0.95       185

    accuracy                           0.99      1393
   macro avg       0.98      0.97      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [56]:
from sklearn.pipeline import Pipeline

# Chain vectorisation and classification into one estimator that accepts raw text.
# The vectorizer here is created with default settings (no stop_words argument).
email_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('mnb', MultinomialNB()),
])

In [57]:
# Fit the whole pipeline on the raw training messages and their labels
email_clf.fit(X_train,y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('mnb', MultinomialNB())])

In [58]:
# Classify the raw test messages with the fitted pipeline
y_pred_class = email_clf.predict(X_test)

# Accuracy: fraction of test messages classified correctly
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

0.9885139985642498


In [59]:
# Try the pipeline on a single promotional message (predict expects a list of texts)
sample_message = "Buy 1 Get 1 SALE is LIVE on all of your favourite Ustraa products.Now get double swag, double nourishment, double cleansing, for the price of 1."
email_clf.predict([sample_message])

array([1], dtype=int64)

In [60]:
import pickle

# Persist the fitted pipeline to disk. The context manager ensures the file
# handle is flushed and closed even if pickling raises, instead of leaving an
# open handle behind.
with open('email_classifier.pkl', 'wb') as model_file:
    pickle.dump(email_clf, model_file)