In [58]:
# Importing the dependencies
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

In [59]:
# Load the SMS corpus from spam.csv (latin-1 encoded).
# header=None + skiprows=1 skips the first row of the file and gives the columns
# integer labels; columns 2, 3 and 4 are discarded, leaving column 0 and column 1.
dataset = (
    pd.read_csv('spam.csv', header=None, encoding='latin-1', skiprows=1)
    .drop(columns=[2, 3, 4])
)

In [60]:
# Print a (truncated) preview of the loaded frame: column 0 holds the ham/spam label,
# column 1 holds the message text.
print(dataset)

         0                                                  1
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will Ì_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [61]:
# Inspect which distinct class labels occur in the label column (column 0)
label_column = dataset[0]
labels = label_column.unique()
print(labels)

['ham' 'spam']


In [62]:
# creating dictionary for the labels
dict={'ham':0,'spam':1}

In [63]:
dataset[2]=dataset[0].map(dict)
print(dataset)

         0                                                  1  2
0      ham  Go until jurong point, crazy.. Available only ...  0
1      ham                      Ok lar... Joking wif u oni...  0
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...  1
3      ham  U dun say so early hor... U c already then say...  0
4      ham  Nah I don't think he goes to usf, he lives aro...  0
...    ...                                                ... ..
5567  spam  This is the 2nd time we have tried 2 contact u...  1
5568   ham              Will Ì_ b going to esplanade fr home?  0
5569   ham  Pity, * was in mood for that. So...any other s...  0
5570   ham  The guy did some bitching but I acted like i'd...  0
5571   ham                         Rofl. Its true to its name  0

[5572 rows x 3 columns]


In [64]:
# Hold out 20% of the messages for testing; random_state=0 keeps the split reproducible.
# Column 1 is the message text (features), column 2 the integer class id (target).
messages = dataset[1]
targets = dataset[2]
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    targets,
    test_size=0.2,
    random_state=0,
)

In [65]:
# Convert the raw training messages into a numerical TF-IDF feature matrix.
# The vectorizer is fitted on the training split only; the fitted instance is kept
# in `vectorizer` so the test split can be transformed with the same vocabulary.
# NOTE(review): X_train is rebound from raw text to the vectorized matrix, so this cell
# should not be re-run on its own without re-running the train/test split first.
vectorizer=TfidfVectorizer()
X_train=vectorizer.fit_transform(X_train)

In [66]:
# Fit a Bernoulli Naive Bayes classifier on the vectorized training messages.
# The sparse feature matrix is passed as-is (no dense conversion).
# NOTE(review): BernoulliNB models binary features; with default settings it presumably
# reduces the TF-IDF weights to word presence/absence — confirm against the sklearn docs.
bernoulli_nb = BernoulliNB()
clf = bernoulli_nb.fit(X_train, y_train)

In [67]:
# Vectorize the test messages with the vectorizer fitted on the training data
# (transform only, no refitting), then predict a class id for every test message.
# NOTE(review): X_test is rebound from raw text to the vectorized matrix, so this cell
# should not be re-run on its own without re-running the train/test split first.
X_test=vectorizer.transform(X_test)
predictions=clf.predict(X_test)

In [68]:
# Evaluate the predictions against the held-out labels.
# average=None yields one precision/recall value per class (index 0 = ham, index 1 = spam).
accuracy = metrics.accuracy_score(y_test, predictions)
conf_matrix = metrics.confusion_matrix(y_test, predictions)
report = metrics.classification_report(y_test, predictions)
per_class_precision = metrics.precision_score(y_test, predictions, average=None)
per_class_recall = metrics.recall_score(y_test, predictions, average=None)

print('Accuracy: ', accuracy)
print('Confusion Matrix: ', conf_matrix)
print('classification report is: ', report)
print('Precision: ', per_class_precision)
print('Recall: ', per_class_recall)

Accuracy:  0.9668161434977578
Confusion Matrix:  [[948   1]
 [ 36 130]]
classification report is:                precision    recall  f1-score   support

           0       0.96      1.00      0.98       949
           1       0.99      0.78      0.88       166

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Precision:  [0.96341463 0.99236641]
Recall:  [0.99894626 0.78313253]


In [69]:
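# Overall accuracy is 0.97, but the classes are imbalanced (949 ham vs 166 spam in the test set):
# spam recall is only 0.78, i.e. 36 of the 166 spam messages were classified as ham.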

In [70]:
# That's Spam SMS Detection with the Bernoulli Naive Bayes classifier

In [76]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# The training and test data have already been converted into numerical features, so that step does not need to be repeated

In [77]:
# Fit a Gaussian Naive Bayes classifier on the same training features.
# The sparse training matrix is converted to a dense array before being passed to fit.
# Note that `clf` is rebound here, replacing the previously trained Bernoulli model.
gaussian_nb = GaussianNB()
clf = gaussian_nb.fit(X_train.toarray(), y_train)

In [78]:
# Predict class ids for the test messages with the Gaussian model.
# The sparse test matrix is densified first; the temporary dense copy is released afterwards.
dense_test_features = X_test.toarray()
predictions = clf.predict(dense_test_features)
del dense_test_features

In [79]:
# Evaluate the Gaussian model's predictions against the held-out labels.
# Each entry pairs the printed caption with its metric; average=None yields one
# precision/recall value per class (index 0 = ham, index 1 = spam).
scoreboard = [
    ('Accuracy: ', metrics.accuracy_score(y_test, predictions)),
    ('Confusion Matrix: ', metrics.confusion_matrix(y_test, predictions)),
    ('classification report is: ', metrics.classification_report(y_test, predictions)),
    ('Precision: ', metrics.precision_score(y_test, predictions, average=None)),
    ('Recall: ', metrics.recall_score(y_test, predictions, average=None)),
]
for caption, value in scoreboard:
    print(caption, value)

Accuracy:  0.8968609865470852
Confusion Matrix:  [[858  91]
 [ 24 142]]
classification report is:                precision    recall  f1-score   support

           0       0.97      0.90      0.94       949
           1       0.61      0.86      0.71       166

    accuracy                           0.90      1115
   macro avg       0.79      0.88      0.82      1115
weighted avg       0.92      0.90      0.90      1115

Precision:  [0.97278912 0.60944206]
Recall:  [0.90410959 0.85542169]


In [80]:
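# That's Spam SMS Detection using the Gaussian Naive Bayes classifier.
# It scores lower than the Bernoulli model here: accuracy 0.90 vs 0.97, with spam precision dropping to 0.61.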