In [1]:
import numpy as np
import pandas as pda
from sklearn.model_selection import train_test_split as tts
import re
import warnings

In [2]:
warnings.simplefilter('ignore')

In [3]:
smsSpam = pda.read_csv('SMSSpamCollection',sep = '\t', header = None, names=['Label','Messages'])


In [4]:
smsSpam['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [5]:
trainingSet,testingSet = tts(smsSpam,test_size = 0.3, random_state = 45)

In [6]:
trainingSet['Messages'] = trainingSet['Messages'].str.replace('\W', ' ')
trainingSet['Messages'] = trainingSet['Messages'].str.lower()

In [7]:
testingSet['Messages'] = testingSet['Messages'].str.replace('\W', ' ')
testingSet['Messages'] = testingSet['Messages'].str.lower()

In [8]:
trainingSet['Label'].value_counts(normalize=True)

ham     0.865897
spam    0.134103
Name: Label, dtype: float64

In [9]:
testingSet['Label'].value_counts(normalize=True)

ham     0.866029
spam    0.133971
Name: Label, dtype: float64

In [10]:
trainingSet['Messages'] = trainingSet['Messages'].str.split()

vocab = []
for msg in trainingSet['Messages']:
    for word in msg:
        vocab.append(word)
        
vocab = list(set(vocab))

In [11]:
wordCountsPerMsg = {uniqueWord: [0]*len(trainingSet['Messages']) for uniqueWord in vocab}

for idx, msg in enumerate(trainingSet['Messages']):
    for word in msg:
        wordCountsPerMsg[word][idx] +=1

In [12]:
wordCounts = pda.DataFrame(wordCountsPerMsg)

In [13]:
trainingSetFinal = pda.concat([trainingSet,wordCounts],axis = 1)

In [14]:
# MULTINOMIAL NAIVE BAYES CLASSIFIER

In [15]:
spamMsgs = trainingSetFinal[trainingSetFinal['Label']=='spam']
hamMsgs = trainingSetFinal[trainingSetFinal['Label']=='ham']

In [16]:
pSpam = len(spamMsgs)/len(trainingSetFinal)
pHam = len(hamMsgs)/len(trainingSetFinal)

In [17]:
noOfWordsPerSpamMsgs = spamMsgs['Messages'].apply(len)
noSpam = noOfWordsPerSpamMsgs.sum()
noOfWordsPerHamMsgs = hamMsgs['Messages'].apply(len)
noHam = noOfWordsPerHamMsgs.sum()
noOfVocab = len(vocab)
alpha = 1

In [18]:
paramSpam = {uniqueWord: 0 for uniqueWord in vocab}
paramHam = {uniqueWord: 0 for uniqueWord in vocab}

In [19]:
for word in vocab:
    noOfWordsGivenSpam = spamMsgs[word].sum()
    probOfWordsGivenSpam = (noOfWordsGivenSpam + alpha)/(noSpam + alpha*noOfVocab)
    paramSpam[word] = probOfWordsGivenSpam
    noOfWordsGivenHam = hamMsgs[word].sum()
    probOfWordsGivenHam = (noOfWordsGivenHam + alpha)/(noHam + alpha*noOfVocab)
    paramHam[word] = probOfWordsGivenHam

In [20]:
def classifyTestingSet(msg):
    msg = re.sub('\W', ' ', msg)
    msg = msg.lower().split()
    
    probOfSpamGivenMsg = pSpam
    probOfHamGivenMsg = pHam
    
    for word in msg:
        if word in paramSpam:
            probOfSpamGivenMsg *= paramSpam[word]
            
        if word in paramHam:
            probOfHamGivenMsg *= paramHam[word]
            
        if probOfSpamGivenMsg > probOfHamGivenMsg:
            return 'spam'
        elif probOfHamGivenMsg > probOfSpamGivenMsg:
            return 'ham'
        else:
            return 'need human classification'
                
    

In [21]:
testingSet['pred'] = testingSet['Messages'].apply(classifyTestingSet)

In [22]:
right = 0
tot = testingSet.shape[0]

for row in testingSet.iterrows():
    row = row[1]
    if(row['Label']==row['pred']):
        right +=1
        
print('Right: ', right)
print('Wrong: ',tot-right)
print('Accuracy: ',right/tot)
print('Error: ', 1 -(right/tot))

Right:  1448
Wrong:  224
Accuracy:  0.8660287081339713
Error:  0.13397129186602874


In [23]:
# Gaussian Discriminant Analysis 

In [24]:
trainingY = trainingSetFinal['Label'].copy()
trainingX = trainingSetFinal[trainingSetFinal.columns[2:]]

In [25]:
trainingY = trainingY.to_numpy()
trainingX = trainingX.to_numpy()

In [26]:
X_one = []
X_zero = []

for i in range(len(trainingY)):
    if trainingY[i] == 'spam':
        X_one.append(trainingX[i])
    elif trainingY[i] == 'ham':
        X_zero.append(trainingX[i])

In [27]:
phi = float(len(X_one)/(len(X_one)+len(X_zero)))
mu0 = np.sum(np.matrix(X_zero),axis = 0)/len(X_zero)
mu1 = np.sum(np.matrix(X_one),axis = 0)/len(X_one)
print("Phi = ", phi)
print("mu0 = ", mu0)
print("mu1 = ", mu1)

Phi =  0.1341025641025641
mu0 =  [[nan nan nan ... nan nan nan]]
mu1 =  [[nan nan nan ... nan nan nan]]


In [28]:
sigma = np.zeros((trainingX.shape[1],trainingX.shape[1]))
sigma0 = np.zeros((trainingX.shape[1],trainingX.shape[1]))
sigma1 = np.zeros((trainingX.shape[1],trainingX.shape[1]))


In [29]:
for i in range(len(trainingX)):
    if(trainingY[i] == 'spam'):
        sigma1 += np.dot(np.transpose(trainingX[i]-mu1),trainingX[i]-mu1)
    elif trainingY[i]=='ham':
        sigma0 += np.dot(np.transpose(trainingX[i]-mu0),trainingX[i]-mu0)

In [30]:
sigma = (sigma1 + sigma0)/(len(X_one)+len(X_zero))
sigma0 /= len(X_one)
sigma1 /= len(X_zero)

In [31]:
def probFunction(x,mu,sigma):
    m = len(x)
    if m == mu.shape[1] and (m,m) == sigma.shape:
        deter = np.linalg.det(sigma)
        assert deter!=0, "matrix cannot be singular"
        
        temp = 1.0/(np.power((2*np.pi),float(m)/2)*np.power(deter,1.0/2))
        xmu = np.matrix(x-mu)
        siginv = inv(sigma)
        res = np.power(np.e,-0.5*(np.dot(np.dot(xmu,siginv),np.transpose(xmu))))
        return res*temp

In [32]:
testingSet['Messages'] = testingSet['Messages'].str.split()

vocab = []
for msg in testingSet['Messages']:
    for word in msg:
        vocab.append(word)
        
vocab = list(set(vocab))

In [33]:
wordCountsPerMsg = {uniqueWord: [0]*len(testingSet['Messages']) for uniqueWord in vocab}

for idx, msg in enumerate(testingSet['Messages']):
    for word in msg:
        wordCountsPerMsg[word][idx] +=1

In [34]:
wordCounts = pda.DataFrame(wordCountsPerMsg)

In [35]:
testingSetFinal = pda.concat([testingSet,wordCounts],axis = 1)

In [36]:
testingY = testingSetFinal['Label'].copy()
testingX = testingSetFinal[testingSetFinal.columns[2:]]

In [37]:
testingY = testingY.to_numpy()
testingX = testingX.to_numpy()

In [38]:
predictedY = []
for x in testingX:
    pa = probFunction(x,np.squeeze(mu0),np.matrix(sigma0))
    pc = probFunction(x,np.squeeze(mu1),np.matrix(sigma1))
    if (pa<pc):
        predictedY.append('spam')
    else:
        predictedY.append('ham')

In [39]:
count = 0
for i in range(len(testingY)):
    if(predictedY[i]==testingY[i]):
        count = count + 1
        
accuracy = np.multiply(np.divide(count,len(testingY)),100)
print(accuracy)

In [40]:
# MULTIVARIATE BURNOULLI NAIVE BAYES CLASSIFIER

In [41]:
spam = {}
ham = {}
allWords = {}
spamWordsCount = 0
hamWordsCount = 0
totalWords = 0
spamCount = 0
hamCount = 0

In [None]:
for i,msg in enumerate(trainingX):
    if trainingY[i] == 'spam':
        spamCount+=1
        for word in msg:
            if word in allWords:
                allWords[word]=1
            else:
                allWords[word]=1
            totalWords+=1
            spamWordsCount+=1
            if word in spam:
                spam[word] = 1
            else:
                spam[word] = 1
    elif trainingY[i] == 'ham':
        hamCount+=1
        for word in msg:
            if word in allWords:
                allWords[word]=1
            else:
                allWords[word]=1
            totalWords+=1
            hamWordsCount+=1
            if word in ham:
                ham[word] = 1
            else:
                ham[word] = 1
            

In [None]:
alpha = 1

for word in allWords:
    if word in spam:
        prob = (1 + alpha)/(spamCount + alpha*len(allWords))
        spam[word] = prob
    else:
        spam[word] = (alpha/(spamCount + alpha*len(allWords)))
    if word in ham:
        prob = (1 + alpha)/(hamCount + alpha*len(allWords))
        ham[word] = prob
    else:
        ham[word] = (alpha/(hamCount + alpha*len(allWords)))

In [None]:
def classifyTestingSet(msg):
    spamProb = spamCount/len(trainingSet)
    hamProb = hamCount/len(trainingSet) 
    msg = re.sub('\W', ' ', msg)
    msg = msg.lower().split()
    
    probOfSpamGivenMsg = pSpam
    probOfHamGivenMsg = pHam
    
    for word in msg:
        if word in spam:
            probOfSpamGivenMsg *= spam[word]
        if word in ham:
            probOfHamGivenMsg *= ham[word]
            
        if probOfSpamGivenMsg > probOfHamGivenMsg:
            return 'spam'
        elif probOfHamGivenMsg > probOfSpamGivenMsg:
            return 'ham'
        else:
            return 'need human classification'

In [None]:
testingSet['pred'] = testingSet['Messages'].apply(classifyTestingSet)

In [None]:
right = 0
tot = testingSet.shape[0]

for row in testingSet.iterrows():
    row = row[1]
    if(row['Label']==row['pred']):
        right +=1
        
print('Right: ', right)
print('Wrong: ',tot-right)
print('Accuracy: ',right/tot)
print('Error: ', 1 -(right/tot))

In [None]:
# The Accuracy of Multinomial naive bayes is best in Spam Filtering