In [1]:
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer

import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score


In [2]:
data = pd.read_csv('SMSSpamCollection.csv',sep="\t", names=['tag','text'],header=None)
data.head()

Unnamed: 0,tag,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:

data.replace('ham', 0, inplace=True)
data.replace('spam', 1,inplace=True)
data.head(5)

Unnamed: 0,tag,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.shape



(5572, 2)

In [5]:
data['text'].head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: text, dtype: object

In [6]:
 #Preprocessing 
def process(string): 

# punctuation that needs to be removed
    process = '''!;:,.?()'[]<>'''
    
# Going through the each and every character for removal of the above punctuations
  
    for x in string.lower(): 
        if x in process: 
            string = string.replace(x, "") 

# returns the string in lowercase letters 
    return string.lower()

In [7]:
test_training_dataset=data[0:5]

In [8]:
test_training_dataset

Unnamed: 0,tag,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


This is basic understanding of the first text on how the vocabulary can work. A sample output for understanding

In [9]:
def create_vocab(series):
  vocab = []
  for x in series:
    x_list = x.split()
    for i in x_list: 
      vocab.append(i)
  return list(set(vocab))

In [10]:
test_vocab = create_vocab(test_training_dataset['text'].iloc[0:2])

In [11]:
test_vocab

['Available',
 'lar...',
 'got',
 'wif',
 'Cine',
 'great',
 'Go',
 'buffet...',
 'there',
 'Joking',
 'point,',
 'Ok',
 'u',
 'wat...',
 'world',
 'e',
 'only',
 'la',
 'n',
 'jurong',
 'crazy..',
 'until',
 'in',
 'bugis',
 'amore',
 'oni...']

In [12]:
X=np.array(data['text'])
y=np.array(data['tag'])

This steps is to create vocabulary. This vocabulary contains the set of words in in both spam and ham(not spam) messages. 

In [13]:
# Define and Create a Vocabulary list and then distinguish the spam and ham(not spam)
def Voc_list(create_x,create_y):
#creat vacabulary lists through collection of words
    create_spamlist = []
    create_hamlist = []
#creat dictionary for 1 and 0 group
    spam_base = {}
    ham_base = {}
    
    for i in range(len(create_y)):
        if create_y[i] == 1:
            create_spamlist.append(create_x[i])
            for word in create_x[i]:
                if word in spam_base:
                    spam_base[word] += 1
                else:
                    spam_base[word] = 1
        
        elif create_y[i] == 0:
            create_hamlist.append(create_x[i])
            for word in create_x[i]:
                if word in ham_base:
                    ham_base[word] += 1
                else:
                    ham_base[word] = 1

    return create_spamlist,create_hamlist,spam_base,ham_base

In [14]:

from sklearn.model_selection import StratifiedKFold
accuracy = []
kfold = StratifiedKFold(n_splits=5)
for train_index, test_index in kfold.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    voc_a,voc_b,dict_a,dict_b = Voc_list(X_train,y_train)

# calculate probabilities    
    total_sum = len(X_train) #total number of words
    contri_1 = len(voc_a) / total_sum
    contri_0 = len(voc_b) / total_sum
    total_1 = sum(list(dict_a.values()))
    total_0 = sum(list(dict_b.values()))

# make predictions    
    y_pred = []
    for i in range(len(X_test)):
        prob_1 = 1  #initialize
        prob_0 = 1
        
        for words in X_test[i]:
            if words in dict_a:
                word_in_spam = dict_a[words]
            elif words not in dict_a:
                word_in_spam = 0
        prob_1 *= (word_in_spam + 1) / (total_1 + len(dict_a)) #  +1 smoothing
        
        for words in X_test[i]:
            if words in dict_b:
                word_in_ham = dict_b[words]
            elif words not in dict_b:
                word_in_ham = 0
        prob_0 *= (word_in_ham + 1) / (total_0 + len(dict_b))
        
#classify
        P1 = prob_1 * contri_1
        P0 = prob_0 * contri_0
    
        if P1 > P0:
            y_pred.append(1)
        else:
            y_pred.append(0)
            
# calculate accuracy   
    score = accuracy_score(y_test,y_pred)
    accuracy.append(score)
print(accuracy)
print('The average accuracy is:' , np.mean(accuracy))

[0.8995515695067264, 0.8905829596412556, 0.8842010771992819, 0.8877917414721723, 0.8931777378815081]
The average accuracy is: 0.891061017140189


NaiveBayes Sklearn

In [15]:
# create classifier and counter objects 
classifier = MultinomialNB()
counter = CountVectorizer()
X=np.array(data['text'])
y=np.array(data['tag'])
# perform 5-fold CV 
accuracy1 = []
kfold = StratifiedKFold(n_splits=5)
for train_index, test_index in kfold.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
# transform data     
    counter.fit(X_train)
    train_counts = counter.transform(X_train)
    test_counts = counter.transform(X_test)
# fit classifier and predict    
    classifier = MultinomialNB()
    classifier.fit(train_counts, y_train)
    y_pred = classifier.predict(test_counts)
    score = accuracy_score(y_test, y_pred)
    accuracy1.append(score)


In [16]:
#  accuracy metrics for the k-folds
print(accuracy1)

[0.9847533632286996, 0.9865470852017937, 0.9829443447037702, 0.9829443447037702, 0.9847396768402155]


In [17]:
# create table to compare average accuracy on hand-made and sklearn classifiers
info = {'Models': ['Hand-Made','Sklearn'],
       'Average Score': [np.mean(accuracy), np.mean(accuracy1)]}
df = pd.DataFrame(info,columns=['Models','Average Score'])

df


Unnamed: 0,Models,Average Score
0,Hand-Made,0.891061
1,Sklearn,0.984386
