In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pnpreprocessor as preprocessor
%matplotlib inline

In [2]:
data = pd.read_csv("SMSSpamCollection",sep='\t' , names=['label', 'message'])
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label      5572 non-null object
message    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
#Check whether the dataset is balanced or not
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
#Sklearn to create WordVector ! 

from sklearn.feature_extraction.text import CountVectorizer
wordVector = CountVectorizer(analyzer=preprocessor.textProcessing)
finalWordVector = wordVector.fit(data['message'])
finalWordVector

CountVectorizer(analyzer=<function textProcessing at 0x000001A3A5B1F828>,
                binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [6]:
finalWordVector.vocabulary_   #Simple count of each unique words in the dataset 

{'Go': 2060,
 'jurong': 7555,
 'point': 8917,
 'crazy': 5769,
 'Available': 1110,
 'bugis': 5218,
 'n': 8336,
 'great': 6937,
 'world': 11163,
 'la': 7668,
 'e': 6217,
 'buffet': 5217,
 'Cine': 1483,
 'got': 6906,
 'amore': 4653,
 'wat': 10965,
 'Ok': 3064,
 'lar': 7701,
 'Joking': 2451,
 'wif': 11072,
 'u': 10698,
 'oni': 8590,
 'Free': 1941,
 'entry': 6331,
 '2': 423,
 'wkly': 11123,
 'comp': 5619,
 'win': 11084,
 'FA': 1833,
 'Cup': 1551,
 'final': 6557,
 'tkts': 10512,
 '21st': 443,
 'May': 2804,
 '2005': 430,
 'Text': 3953,
 '87121': 871,
 'receive': 9252,
 'questionstd': 9159,
 'txt': 10686,
 'rateTCs': 9200,
 'apply': 4731,
 '08452810075over18s': 73,
 'U': 4068,
 'dun': 6204,
 'say': 9554,
 'early': 6222,
 'hor': 7186,
 'c': 5261,
 'already': 4629,
 'Nah': 2948,
 'dont': 6123,
 'think': 10433,
 'goes': 6877,
 'usf': 10799,
 'lives': 7842,
 'around': 4777,
 'though': 10450,
 'FreeMsg': 1943,
 'Hey': 2222,
 'darling': 5864,
 '3': 543,
 'weeks': 11011,
 'word': 11150,
 'back': 4893

In [7]:
bagOfWords = finalWordVector.transform(data['message'])
bagOfWords

# Go for programming , Go for algos ----> Document
# Go - 2 /2060 = (Go, 9,708737-4 )
# for - 2
# programming - 1
# algos - 1

<5572x11425 sparse matrix of type '<class 'numpy.int64'>'
	with 50548 stored elements in Compressed Sparse Row format>

In [9]:
#Convert BOW to word and its frequencies which will act as Feature in ML model
from sklearn.feature_extraction.text import TfidfTransformer
tfidfTransform = TfidfTransformer().fit(bagOfWords)
featureData = tfidfTransform.transform(bagOfWords)

# (Go,0.09) collaborate tfidfTransformer

In [10]:
featureData

<5572x11425 sparse matrix of type '<class 'numpy.float64'>'
	with 50548 stored elements in Compressed Sparse Row format>

In [11]:
#Training model 
#Naive Bayes ....MultinomialNB(text),BinomiaLNB (Binary Classification), BernoulliNB(Geospatial data - Longitude,Latitude,graph)
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(featureData,data['label'])

In [12]:
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
model.score(featureData,data['label'])

0.9793610911701364

In [14]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(data['label'],model.predict(featureData))
cm

array([[4825,    0],
       [ 115,  632]], dtype=int64)

In [15]:
from sklearn.metrics import classification_report
print(classification_report(data['label'],model.predict(featureData)))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      4825
        spam       1.00      0.85      0.92       747

    accuracy                           0.98      5572
   macro avg       0.99      0.92      0.95      5572
weighted avg       0.98      0.98      0.98      5572



# So let say Client is fine if spam comes to the inbox but they are not fine if ham goes to spam box or Vice Versa

In [19]:
inputData = input ('Enter SMS to clasify: ')
l1 = preprocessor.textProcessing(inputData)
l2 = finalWordVector.transform(l1)
l3 = tfidfTransform.transform(l2)
prediction = model.predict(l3[0])

print(prediction)

Enter SMS to clasify: free entry in malaysian airlines
['ham']


# Check for BernoulliNB

In [31]:
#Training model 
#Naive Bayes ....MultinomialNB(text),BinomiaLNB (Binary Classification), BernoulliNB(Geospatial data - Longitude,Latitude,graph)
from sklearn.naive_bayes import BernoulliNB

model1 = BernoulliNB().fit(featureData,data['label'])

In [32]:
model1

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [33]:
model1.score(featureData,data['label'])

0.9849246231155779

In [34]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(data['label'],model1.predict(featureData))
cm

array([[4819,    6],
       [  78,  669]], dtype=int64)

In [35]:
from sklearn.metrics import classification_report
print(classification_report(data['label'],model1.predict(featureData)))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      4825
        spam       0.99      0.90      0.94       747

    accuracy                           0.98      5572
   macro avg       0.99      0.95      0.97      5572
weighted avg       0.99      0.98      0.98      5572



In [36]:
inputData = input ('Enter SMS to clasify: ')
l1 = preprocessor.textProcessing(inputData)
l2 = finalWordVector.transform(l1)
l3 = tfidfTransform.transform(l2)
prediction = model1.predict(l3[0])

print(prediction)

Enter SMS to clasify: Free entry in 2 a wkly comp to win FA Cup
['ham']
