In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
smsdata = pd.read_csv("sms_raw_NB.csv",encoding = "ISO-8859-1")

In [3]:
smsdata.shape

(5559, 2)

In [4]:
smsdata.head()

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000..."
4,spam,okmail: Dear Dave this is your final notice to...


In [5]:
import re
stop_words = []
with open("stop.txt") as f:
    stop_words = f.read()

In [6]:
#As Stopwards are in a single string, lets convert into list of single words
stop_words = stop_words.split("\n")

In [7]:
stop_words

['a',
 "a's",
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards',
 'again',
 'against',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 "aren't",
 'around',
 'as',
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'b',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'c',
 "c'mon",
 "c's",
 'came',
 'can',
 "can't",
 'cannot',
 'cant',
 'cause',
 'causes',
 'certain',
 'certainly',
 'changes',
 'clearly',
 'co',
 'com',
 'come',
 'c

In [8]:
def cleaningdata (i):
    i= re.sub("[^A-Za-z" "]+"," ",i).lower()
    i = re.sub("[0-9" "]+"," ",i)
    w= []
    for word in i.split(" "):
        if len(word)>3:
            w.append(word)
    return(" ".join(w))

In [9]:
#clear the data
smsdata["text"]= smsdata["text"].apply(cleaningdata)

In [10]:
#Removing the empty rows if any generated
smsdata.shape
smsdata = smsdata.loc[smsdata.text != " ",:]

In [11]:
#Creating a matrix of token counts for the entire text document
def split_if_words(i):
    return [word for word in i.split(" ")]

predictors = smsdata.iloc[:,1]
target = smsdata.iloc[:,0]

In [12]:
#Splitting the data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(predictors, target, test_size = 0.3, stratify = target)

In [13]:
##Convert email text into word count matric i.e bag of words
email_bow = CountVectorizer(analyzer = split_if_words).fit(smsdata["text"])
email_bow

CountVectorizer(analyzer=<function split_if_words at 0x0000023934122F78>)

In [15]:
#For all the emails doing the transformation
all_emails_matrix = email_bow.transform(smsdata["text"])
all_emails_matrix.shape

(5559, 6661)

In [16]:
train_emails_matrix = email_bow.transform(x_train)
train_emails_matrix.shape

(3891, 6661)

In [17]:
test_emails_matrix = email_bow.transform(x_test)
test_emails_matrix.shape

(1668, 6661)

In [18]:
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

In [19]:
classifier_nb = MB()
classifier_nb.fit(train_emails_matrix,y_train)
train_pred_nb =classifier_nb.predict(train_emails_matrix) 
accuracy_nb = np.mean(train_pred_nb==y_train)
accuracy_nb

0.9897198663582627

In [20]:
pd.crosstab(train_pred_nb, y_train)

type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,3345,17
spam,23,506


In [21]:
#predicting on test data
test_pred_nb = classifier_nb.predict(test_emails_matrix)
accuracy_test_nb = np.mean(test_pred_nb == y_test )
accuracy_test_nb

0.9616306954436451

In [22]:
pd.crosstab(test_pred_nb,y_test)

type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1402,22
spam,42,202


In [23]:
#Building Gaussian model
classifier_gb = GB()
classifier_gb.fit(train_emails_matrix.toarray(),y_train.values)
train_pred_gb = classifier_gb.predict(train_emails_matrix.toarray())
accuracy_gb = np.mean(train_pred_gb == y_train)
accuracy_gb

0.9113338473400154

In [24]:
pd.crosstab(train_pred_gb,y_train)

type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,3023,0
spam,345,523


In [25]:
#predicting on test data
test_pred_gb = classifier_gb.predict(test_emails_matrix.toarray())
accuracy_test_gb = np.mean(test_pred_gb == y_test)
accuracy_test_gb

0.8363309352517986

In [26]:
pd.crosstab(test_pred_gb, y_test)

type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1205,34
spam,239,190


In [27]:
###Building with TFIDF transformation
tfidf_transformer = TfidfTransformer().fit(all_emails_matrix)

In [28]:
#Preparing Tfidf for train emails
train_tfidf = tfidf_transformer.transform(train_emails_matrix)
train_tfidf.shape

(3891, 6661)

In [29]:
test_tfidf = tfidf_transformer.transform(test_emails_matrix)
test_tfidf.shape

(1668, 6661)

In [30]:
#Building Multinomial Naive Bayes model
classifer_mb_tfidf = MB()
classifer_mb_tfidf.fit(train_tfidf,y_train)
train_predmb_tfidf = classifer_mb_tfidf.predict(train_tfidf)
accuracy_mb_tfidf = np.mean(train_predmb_tfidf == y_train)
accuracy_mb_tfidf

0.9653045489591364

In [31]:
pd.crosstab(train_predmb_tfidf, y_train)

type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,3368,135
spam,0,388


In [32]:
test_predmb_tfidf = classifer_mb_tfidf.predict(test_tfidf)
accuracy_testmb_tfidf = np.mean(test_predmb_tfidf == y_test)
accuracy_testmb_tfidf

0.947841726618705

In [33]:
#Building gaussiam naive bayes model
classifier_gb_tfidf = GB()
classifier_gb_tfidf.fit(train_tfidf.toarray(),y_train.values)
train_predgb_tfidf = classifier_gb_tfidf.predict(train_tfidf.toarray())
accuracy_gb_tfidf = np.mean(train_predgb_tfidf == y_train)
accuracy_gb_tfidf

0.9113338473400154

In [34]:
pd.crosstab(train_predgb_tfidf,y_train)

type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,3023,0
spam,345,523


In [35]:
test_predgb_tfidf = classifier_gb_tfidf.predict(test_tfidf.toarray())
accuracy_testgb_tfidf = np.mean(test_predgb_tfidf == y_test)
accuracy_testgb_tfidf

0.8357314148681055

In [36]:
pd.crosstab(test_predgb_tfidf,y_test)

type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1207,37
spam,237,187
