In [1]:
from os import listdir
from os.path import isfile, join
import re
import nltk
import math
from nltk.corpus import stopwords
import numpy as np
# from nltk.stem.snowball import SnowballStemmer
# from nltk.corpus import wordnet
# from nltk.stem import WordNetLemmatizer
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('snowball')

In [2]:
### Preprocessing: Label
def preProcessingLabel(labelFile = "SPAM.label"):
    labelDict = {}
    hamCount = 0
    spamCount = 0
    with open("SPAM.label") as f:
        for line in f:
            (val, key) = line.split()
            labelDict[key] = int(val)
            # only consider TRAINING label
            if 'TRAIN_' in key:
                if int(val) == 1: 
                    hamCount +=1
                else:
                    spamCount +=1
    f.close()
    return hamCount, spamCount, labelDict
### Preprocessing: building vocabulary + frequency:
def buildingVocabulary(trainingDir = 'processed_traning'):
    # read spam label to identify which doc is spam
    (hamCount, spamCount, labelDict) = preProcessingLabel()    
    # for each file in the training directory
    onlyfiles = [f for f in listdir(trainingDir) if isfile(join(trainingDir, f))]
    wordList = set()     # vocabulary
    spamWordDict = {}    # frequency in spam doc
    hamWordDict = {}     # frequency in ham doc
    stop_words = set(stopwords.words('english'))        #english stop words
    for i in onlyfiles:
        #  ignore random dir
        if i == '.DS_Store':
            continue
        inputFile = trainingDir+'/'+i
        s = open(inputFile,encoding="latin-1").read()
        tokens = s.split()
        for word in tokens:
            ### Processing word here##################
            # ignore stopwords
            if word in stop_words:
                continue
            if word in ':;.,/\(\)\[\]<>': # remove normal character
                continue
            # lemmatize word 
            lmWord = word
            ##########################################
            ### Update vocabulary#####################
            if lmWord in wordList:
                # if in ham, update ham
                if labelDict[i] == 1: # ham
                    if lmWord in hamWordDict:
                        hamWordDict[lmWord] += 1
                    else:
                        hamWordDict[lmWord] = 1
                # if in spam, update spam
                if labelDict[i] == 0: # spam
                    if lmWord in spamWordDict:
                        spamWordDict[lmWord] += 1
                    else:
                        spamWordDict[lmWord] = 1
            else:
                # add new word to word list.
                wordList.add(lmWord)
                # if in ham, update ham
                if labelDict[i] == 1: # ham
                    if lmWord in hamWordDict:
                        hamWordDict[lmWord] += 1
                    else:
                        hamWordDict[lmWord] = 1
                # if in spam, update spam
                if labelDict[i] == 0: # spam
                    if lmWord in spamWordDict:
                        spamWordDict[lmWord] += 1
                    else:
                        spamWordDict[lmWord] = 1
            ##########################################
    return wordList, spamWordDict, hamWordDict
# Naive Bayes for each file 
def NBeachFile(fileName,hamCount,spamCount,vocabulary,spamWordDict,hamWordDict):
    spamWordCount = sum(spamWordDict.values())   #count word freq in spam
    hamWordCount = sum(hamWordDict.values())     #count word freq in ham
    vocabSize = len(vocabulary)
    stop_words = set(stopwords.words('english')) #english stop words
    spamProb = math.log(spamCount/(hamCount+spamCount))
    hamProb = math.log(hamCount/(hamCount+spamCount))
    s = open(fileName,encoding="latin-1").read()
    tokens = s.split()
    for word in tokens:
            ### Processing word here##################
            # ignore stopwords
            if word in stop_words:
                continue
            if word in ':;.,/\(\)\[\]<>': # some normal character
                continue
            lmWord = word # was thinking about lemmatize word
            ##########################################
            ### compute spamscore
            freqInSpam = 0
            freqInHam = 0
            if lmWord in spamWordDict:
                freqInSpam = spamWordDict[lmWord]
            else:
                freqInSpam = 0
            if lmWord in hamWordDict:
                freqInHam = hamWordDict[lmWord]
            else:
                freqInHam = 0
            # log to avoid becoming 0
            spamProb += math.log((freqInSpam+1)/(vocabSize+spamWordCount))
            hamProb += math.log((freqInHam+1)/(vocabSize+hamWordCount))
    if hamProb > spamProb:
        return 1
    else:
        return 0
# Naive bayes implementation
def NB(testDir, vocabulary,spamWordDict,hamWordDict):
    # read spam label to identify which doc is spam
    (hamCount, spamCount, labelDict) = preProcessingLabel()
    # for each file in the testing directory
    onlyfiles = [f for f in listdir(testDir) if isfile(join(testDir, f))]
    tp = 0 # true positive
    fp = 0 # false postive
    fn = 0 # false negative
    tn = 0 # true negative
    for i in onlyfiles:
        if i == '.DS_Store':
            continue
        inputFile = testDir+'/'+i
        ### use naive bayes to get prediction
        score = NBeachFile(inputFile,hamCount,spamCount,vocabulary,spamWordDict,hamWordDict)
        ### compute stat base on labelDict
        if score == 1:
            if labelDict[i] == 1:  # label 1 as positive
                tp += 1
            else: 
                fp += 1
        else:
            if labelDict[i] == 0:  # label 0 as negative
                tn += 1
            else:
                fn += 1
    return tp,fp,fn,tn
### simple stat
def computeStat(tp,fp,fn,tn):
    print('False Positive Rate:\t', fp)
    print('False Negative Rate:\t', fn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    print('Recall:\t\t\t', recall)    
    print('Precision:\t\t',precision)
    fscore = 2*precision*recall/(precision + recall)
    print('F-score beta = 1: \t', fscore)

In [3]:
# preprocessing
(vocabulary,spamWordDict,hamWordDict) = buildingVocabulary('processed_training')
# perform NB on test data and compare with label
(tp,fp,fn,tn)=NB('processed_testing',vocabulary,spamWordDict,hamWordDict)
# output stat 
computeStat(tp,fp,fn,tn)

False Positive Rate:	 28
False Negative Rate:	 8
Recall:			 0.9911699779249448
Precision:		 0.9697624190064795
F-score beta = 1: 	 0.980349344978166


In [21]:
print('############################################')
#### Predict duc spam
print('[+] Task 3.4: spam email that past both naive bayes and SVM')
(hamCount, spamCount, labelDict) = preProcessingLabel()
spamLabel = NBeachFile('duc-spam/TEST_01327.eml',hamCount,spamCount, vocabulary,spamWordDict,hamWordDict)
print('[>>>>] Email is labeled as: ',spamLabel)

############################################
[+] Task 3.4: spam email that past both naive bayes and SVM
[>>>>] Email is labeled as:  1


In [12]:
from sklearn import svm
from collections import OrderedDict
# preprocessing label
def preProcessingLabelSVM(labelFile = "SPAM.label"):
    labelDict = {}
    hamCount = 0
    spamCount = 0
    with open("SPAM.label") as f:
        for line in f:
            (val, key) = line.split()
            labelDict[key] = int(val)
    f.close()
    return labelDict
### Preprocessing: building vocabulary + frequency:
def SVMBuildingMatrix(trainingDir='processed_training'):
    # read spam label to identify which doc is spam
    labelDict = preProcessingLabelSVM()
    # for each file in the training directory
    onlyfiles = [f for f in listdir(trainingDir) if isfile(join(trainingDir, f))]
    wordList = {}                                       # vocabulary
    trainMatrix = {}                                    # trainingMatrix
    stop_words = set(stopwords.words('english'))        # english stop words
    for i in onlyfiles:
        # see some random-file
        if i == '.DS_Store': #ignore this file
            continue
        trainMatrix[i] = {}
        inputFile = trainingDir+'/'+i
        s = open(inputFile,encoding="latin-1").read()
        tokens = s.split()
        for word in tokens:
            ### Processing word here##################
            # ignore stopwords
            if word in stop_words:
                continue
            if word in ':;.,/\(\)\[\]<>': # some normal character
                continue
            lmWord = word # was thinking about lemmatize word
            ##########################################
            ### Update vocabulary#####################
            if lmWord in wordList:
                # update vector for each document
                wordList[lmWord]+=1
                if lmWord in trainMatrix:
                    trainMatrix[i][lmWord] += 1
                else:
                    trainMatrix[i][lmWord] = 1
            else:
                # add new word to word list
                wordList[lmWord]=1
                if lmWord in trainMatrix:
                    trainMatrix[i][lmWord] += 1
                else:
                    trainMatrix[i][lmWord] = 1
                # update vector for each document
            ##########################################
    return wordList,trainMatrix

### get top attribute from wordList, by default using 1000 attribute
def getTopAttribute(wordList, n=1000, order=False):
    ### get top words 
    top = sorted(wordList.items(), key=lambda x: x[1], reverse=True)[:n]
    if order:
        return OrderedDict(top)
    return dict(top)
### Output training matrix with number of attribute
def getMatrixWithN(Matrix,wordList,n=1000):
    # get top attribute
    topDict = getTopAttribute(wordList,n,True)
    # read spam label to identify which doc is spam
    labelDict = preProcessingLabelSVM()
    # creating training Matrix
    dimension = (len(Matrix),len(topDict))   
    processedMatrix=np.zeros(dimension)
    labelTrainVector=np.zeros(len(Matrix))
    x = 0 # row
    y = 0 # column
    for doc in Matrix:
        for word in topDict:
            if word not in Matrix[doc]:
                y+=1 # move to next column
                continue
            else:
                processedMatrix[x][y] = Matrix[doc][word]
                y+=1 # move to next column
        if doc in labelDict:
            labelTrainVector[x] = labelDict[doc]
        x+=1
        y=0
    return processedMatrix,labelTrainVector
### computing stat
def SVMComputeStat(prediction, labelTestVector):
    if len(prediction) != len(labelTestVector):
        print('[-] 2 vectors are not in the same length')
        return
    tp = 0 # true positive
    fp = 0 # false postive
    fn = 0 # false negative
    tn = 0 # true negative
    vectorLength = len(prediction)
    for i in range(vectorLength):
        if prediction[i] == 1 and prediction[i] == labelTestVector[i]:
            tp += 1
        elif prediction[i] == 1 and prediction[i] != labelTestVector[i]:
            fp += 1
        elif prediction[i] == 0 and prediction[i] == labelTestVector[i]:
            tn += 1
        elif prediction[i] == 0 and prediction[i] != labelTestVector[i]:
            fn += 1
        else:
            print(i,'[-] something is wrong')
#     print('tp =',tp,', tn =',tn, ', fp =', fp, ', fn =', fn)
    print('False Positive Rate:\t', fp)
    print('False Negative Rate:\t', fn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    print('Recall:\t\t\t', recall)    
    print('Precision:\t\t',precision)
    fscore = 2*precision*recall/(precision + recall)
    print('F-score beta = 1: \t', fscore)

In [13]:
print('[+] SVM on testing data')
(wordList,trainMatrix) = SVMBuildingMatrix('processed_training')
(testWordList,testMatrix) = SVMBuildingMatrix('processed_testing') #we don't care about test wordlist
(processedTrainMatrix, labelTrainVector) = getMatrixWithN(trainMatrix,wordList,300)
(processedTestMatrix, labelTestVector) = getMatrixWithN(testMatrix,wordList,300)
clf = svm.SVC()
clf.fit(processedTrainMatrix, labelTrainVector)  
prediction = clf.predict(processedTestMatrix)
SVMComputeStat(prediction, labelTestVector)

[+] SVM on testing data
False Positive Rate:	 37
False Negative Rate:	 16
Recall:			 0.9823399558498896
Precision:		 0.9600862998921251
F-score beta = 1: 	 0.9710856519367157


In [14]:
print('[+] 5-fold Cross Validation')
### 5-fold validation here:
(wordList,trainMatrix) = SVMBuildingMatrix('processed_training')
(processedTrainMatrix, labelTrainVector) = getMatrixWithN(trainMatrix,wordList,300)
# partion training matrix and label vector
partitionMatrices = np.split(processedTrainMatrix, 5)
partitionVectors = np.split(labelTrainVector, 5)
# First fold:
print('[First fold]')
# Consider first-fold as test matrix
firstFoldMatrix = partitionMatrices[0]
firstFoldVector = partitionVectors[0]
# Consider last 4-fold as training matrix
last4Matricies    = np.concatenate((partitionMatrices[1],partitionMatrices[2],partitionMatrices[3],partitionMatrices[4]), axis=0)
last4FoldsVectors = np.concatenate((partitionVectors[1],partitionVectors[2],partitionVectors[3],partitionVectors[4]), axis=0)
# test
clf = svm.SVC()
clf.fit(last4Matricies, last4FoldsVectors)  
fivefoldprediction = clf.predict(firstFoldMatrix)
SVMComputeStat(fivefoldprediction, firstFoldVector)

print('[Second fold]')
# Consider first-fold as test matrix
firstFoldMatrix = partitionMatrices[1]
firstFoldVector = partitionVectors[1]
# Consider last 4-fold as training matrix
last4Matricies    = np.concatenate((partitionMatrices[0],partitionMatrices[2],partitionMatrices[3],partitionMatrices[4]), axis=0)
last4FoldsVectors = np.concatenate((partitionVectors[0],partitionVectors[2],partitionVectors[3],partitionVectors[4]), axis=0)
# test
clf = svm.SVC()
clf.fit(last4Matricies, last4FoldsVectors)  
fivefoldprediction = clf.predict(firstFoldMatrix)
SVMComputeStat(fivefoldprediction, firstFoldVector)

print('[Third fold]')
# Consider first-fold as test matrix
firstFoldMatrix = partitionMatrices[2]
firstFoldVector = partitionVectors[2]
# Consider last 4-fold as training matrix
last4Matricies    = np.concatenate((partitionMatrices[0],partitionMatrices[1],partitionMatrices[3],partitionMatrices[4]), axis=0)
last4FoldsVectors = np.concatenate((partitionVectors[0],partitionVectors[1],partitionVectors[3],partitionVectors[4]), axis=0)
# test
clf = svm.SVC()
clf.fit(last4Matricies, last4FoldsVectors)  
fivefoldprediction = clf.predict(firstFoldMatrix)
SVMComputeStat(fivefoldprediction, firstFoldVector)

print('[Fourth fold]')
# Consider first-fold as test matrix
firstFoldMatrix = partitionMatrices[3]
firstFoldVector = partitionVectors[3]
# Consider last 4-fold as training matrix
last4Matricies    = np.concatenate((partitionMatrices[0],partitionMatrices[1],partitionMatrices[2],partitionMatrices[4]), axis=0)
last4FoldsVectors = np.concatenate((partitionVectors[0],partitionVectors[1],partitionVectors[2],partitionVectors[4]), axis=0)
# test
clf = svm.SVC()
clf.fit(last4Matricies, last4FoldsVectors)  
fivefoldprediction = clf.predict(firstFoldMatrix)
SVMComputeStat(fivefoldprediction, firstFoldVector)

print('[Fifth fold]')
# Consider first-fold as test matrix
firstFoldMatrix = partitionMatrices[4]
firstFoldVector = partitionVectors[4]
# Consider last 4-fold as training matrix
last4Matricies    = np.concatenate((partitionMatrices[0],partitionMatrices[1],partitionMatrices[2],partitionMatrices[3]), axis=0)
last4FoldsVectors = np.concatenate((partitionVectors[0],partitionVectors[1],partitionVectors[2],partitionVectors[3]), axis=0)
# test
clf = svm.SVC()
clf.fit(last4Matricies, last4FoldsVectors)  
fivefoldprediction = clf.predict(firstFoldMatrix)
SVMComputeStat(fivefoldprediction, firstFoldVector)

[+] 5-fold Cross Validation
[First fold]
False Positive Rate:	 35
False Negative Rate:	 9
Recall:			 0.9772151898734177
Precision:		 0.9168646080760094
F-score beta = 1: 	 0.946078431372549
[Second fold]
False Positive Rate:	 21
False Negative Rate:	 9
Recall:			 0.9783132530120482
Precision:		 0.9508196721311475
F-score beta = 1: 	 0.9643705463182898
[Third fold]
False Positive Rate:	 27
False Negative Rate:	 8
Recall:			 0.9809069212410502
Precision:		 0.9383561643835616
F-score beta = 1: 	 0.9591598599766628
[Fourth fold]
False Positive Rate:	 19
False Negative Rate:	 8
Recall:			 0.9801980198019802
Precision:		 0.9542168674698795
F-score beta = 1: 	 0.967032967032967
[Fifth fold]
False Positive Rate:	 22
False Negative Rate:	 6
Recall:			 0.9853658536585366
Precision:		 0.9483568075117371
F-score beta = 1: 	 0.9665071770334929


In [18]:
#### Predict duc spam
print('[+] Task 3.4: spam email that past both naive bayes and SVM')
(wordList,trainMatrix) = SVMBuildingMatrix('processed_training')
(testWordList,testMatrix) = SVMBuildingMatrix('duc-spam') #we don't care about test wordlist
(processedTrainMatrix, labelTrainVector) = getMatrixWithN(trainMatrix,wordList,300)
(processedTestMatrix, labelTestVector) = getMatrixWithN(testMatrix,wordList,300)
clf = svm.SVC()
clf.fit(processedTrainMatrix, labelTrainVector)  
prediction = clf.predict(processedTestMatrix)
print('[>>>>] Email is labeled as: ', prediction[0])

[+] Task 3.4: spam email that past both naive bayes and SVM
[>>>>] Email is labeled as:  1.0


In [11]:
prediction

array([1.])

In [None]:
last4FoldsVectors