Data Preprocessing 
- Removal of stop words 'and', 'the', 'of', etc
- Grouping together (lemmatization) different inflected forms of a word like 'include', 'includes', etc
- Removing non-words like punctuation marks
- We will be looking only at the email content (3rd line) and not subject (1st line)

Dataset : https://aclweb.org/aclwiki/Spam_filtering_datasets
1. Ling-spam Corpus : A dataset that contains spam messages and messages from the Linguist list. 
2. Enron-spam : A collection of encrypted datasets that contain spam messages and ham messages from real users.

In [1]:
import os 
import numpy as np 
import pandas as pd
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
import fnmatch

# Ling-spam dataset

In [2]:
def make_dict(train_dir):
    """Build the word dictionary for a flat directory of mails.

    Only the third line of every file (index 2, the message body) is
    tokenised on whitespace.  Tokens that are not purely alphabetic and
    single-character tokens are discarded.

    Returns a list of ``(word, count)`` tuples holding the 3000 most
    frequent remaining words, most frequent first.
    """
    mail_paths = [os.path.join(train_dir, name) for name in os.listdir(train_dir)]

    counts = Counter()
    for path in mail_paths:
        with open(path) as handle:
            for line_no, text in enumerate(handle):
                if line_no == 2:
                    counts.update(text.split())

    # keep real words only: alphabetic and longer than a single character
    kept = Counter()
    for word, freq in counts.items():
        if word.isalpha() and len(word) != 1:
            kept[word] = freq

    ## keeping only 3000 most common words in the dict
    return kept.most_common(3000)


Feature extraction :

Let's create a feature vector matrix where the number of rows is the number of emails in the training set and the number of columns is the number of words in the dictionary.

The entry 'ij' of the matrix tells how many times the word 'j' appeared in the mail 'i', i.e. its frequency.


In [3]:
# The most important part of the whole program

def extract_feature(train_dir, vocab=None):
    """Build the word-count feature matrix for a flat directory of mails.

    Args:
        train_dir: Directory whose files are the mails.  Only the third line
            of each file (index 2, the message body) is used.
        vocab: Sequence of ``(word, count)`` tuples as returned by
            ``make_dict``; a word's position is its column index.  Words are
            expected to be unique and to sit at positions below 3000.
            Defaults to the module-level ``dictionary``.

    Returns:
        ``numpy.ndarray`` of shape ``(number_of_files, 3000)`` where entry
        ``[i, j]`` is how often word ``j`` occurs in the body of file ``i``.
        Rows follow the order returned by ``os.listdir(train_dir)``.
    """
    if vocab is None:
        vocab = dictionary

    # word -> column, built once instead of scanning the vocabulary per token
    column_of = {entry[0]: col for col, entry in enumerate(vocab)}

    files = [os.path.join(train_dir, f) for f in os.listdir(train_dir)]
    feature_matrix = np.zeros((len(files), 3000))

    for doc_num, path in enumerate(files):
        with open(path) as f:
            for line_no, line in enumerate(f):
                if line_no == 2:  ## content is in line 3
                    # count every token once instead of words.count() per token
                    for word, freq in Counter(line.split()).items():
                        col = column_of.get(word)
                        if col is not None:
                            feature_matrix[doc_num, col] = freq
                    break  # nothing after the body line is used

    return feature_matrix
    

In [None]:
# Build the vocabulary from the Ling-spam training mails; extract_feature()
# reads this module-level `dictionary` to map words to matrix columns.
dictionary = make_dict('ling-spam/train-mails/')

In [None]:
# Build the training feature matrix and its labels.  The label vector is sized
# from the directory listing instead of a hard-coded mail count, so it always
# has one entry per row of train_matrix.
train_matrix = extract_feature('ling-spam/train-mails/')

# NOTE: rows of train_matrix follow the os.listdir order used inside
# extract_feature; the labels below rely on os.listdir returning the same
# order for the same directory here.
train_files = os.listdir('ling-spam/train-mails/')

# files matching "spm*txt" are labelled 1, every other mail stays 0
train_labels = np.zeros(len(train_files))

for fileNum, file in enumerate(train_files):
    if fnmatch.fnmatch(file, "spm*txt"):
        train_labels[fileNum] = 1

In [None]:
# Build the test feature matrix and its labels.  The label vector is sized
# from the directory listing instead of a hard-coded mail count, so it always
# has one entry per row of test_matrix.
test_matrix = extract_feature('ling-spam/test-mails/')

# NOTE: rows of test_matrix follow the os.listdir order used inside
# extract_feature; the labels below rely on os.listdir returning the same
# order for the same directory here.
test_files = os.listdir('ling-spam/test-mails/')

# files matching "spm*txt" are labelled 1, every other mail stays 0
test_labels = np.zeros(len(test_files))

for fileNum, file in enumerate(test_files):
    if fnmatch.fnmatch(file, "spm*txt"):
        test_labels[fileNum] = 1

In [None]:
# Model 1 of 2: a linear SVM (model 2 is Naive Bayes).
# Supervised binary classifier - effective with a high number of features.
model1 = LinearSVC().fit(train_matrix, train_labels)
result1 = model1.predict(test_matrix)

# confusion matrix first, then the score on the test mails
print(confusion_matrix(test_labels, result1))
print(model1.score(test_matrix, test_labels))

In [None]:
# Model 2: multinomial Naive Bayes, one of the popular methods for document
# classification.  It is a supervised probabilistic classifier that assumes
# the features are independent of each other.
model2 = MultinomialNB().fit(train_matrix, train_labels)
result2 = model2.predict(test_matrix)

# confusion matrix first, then the score on the test mails
print(confusion_matrix(test_labels, result2))
print(model2.score(test_matrix, test_labels))

# Enron-spam dataset

In [None]:
# Need to make some changes for the different directory structure

def make_dict(train_dir):
    """Build the word dictionary for the Enron-spam directory tree.

    Mails are read from two levels below ``train_dir``
    (``train_dir/<folder>/<subfolder>/<mail>``) using latin-1.  Every line of
    every mail is tokenised on whitespace; tokens that are not purely
    alphabetic and single-character tokens are discarded.

    The 3000 most frequent remaining words are written to ``dict_enron.npy``
    in the current working directory and returned as a list of
    ``(word, count)`` tuples, most frequent first.
    """
    counts = Counter()

    for top_name in os.listdir(train_dir):
        top_path = os.path.join(train_dir, top_name)
        for sub_name in os.listdir(top_path):
            sub_path = os.path.join(top_path, sub_name)
            for mail_name in os.listdir(sub_path):
                mail_path = os.path.join(sub_path, mail_name)
                with open(mail_path, encoding="latin-1") as handle:
                    for text in handle:
                        counts.update(text.split())

    # keep real words only: alphabetic and longer than a single character
    kept = Counter()
    for word, freq in counts.items():
        if word.isalpha() and len(word) != 1:
            kept[word] = freq

    ## keeping only 3000 most common words in the dict
    top_words = kept.most_common(3000)
    np.save('dict_enron.npy', top_words)

    return top_words


In [None]:
def extract_feature(train_dir, vocab=None):
    """Build the feature matrix and labels for the Enron-spam directory tree.

    Mails are read from two levels below ``train_dir``
    (``train_dir/<folder>/<subfolder>/<mail>``) using latin-1, and every line
    of a mail contributes to its word counts.

    Args:
        train_dir: Root directory of the dataset.
        vocab: Sequence of ``(word, count)`` tuples as returned by
            ``make_dict``; a word's position is its column index.  Words are
            expected to be unique and to sit at positions below 3000.
            Defaults to the module-level ``dictionary``.

    Returns:
        ``(feature_matrix, train_labels)``: an array of shape
        ``(number_of_mails, 3000)`` with per-mail word counts, and a vector
        with 1 for spam and 0 otherwise.  Both are sized from the mails
        actually found rather than from a fixed dataset size.

    Raises:
        IndexError: if a mail's file name contains no dot.
    """
    if vocab is None:
        vocab = dictionary

    # word -> column, built once instead of scanning the vocabulary per token
    column_of = {entry[0]: col for col, entry in enumerate(vocab)}

    # Collect every mail path first so the arrays can be sized from the data.
    mails = []
    for top_name in os.listdir(train_dir):
        email_dir = os.path.join(train_dir, top_name)
        for sub_name in os.listdir(email_dir):
            sub_dir = os.path.join(email_dir, sub_name)
            mails.extend(os.path.join(sub_dir, f) for f in os.listdir(sub_dir))

    feature_matrix = np.zeros((len(mails), 3000))
    train_labels = np.zeros(len(mails))

    for doc_num, mail in enumerate(mails):
        with open(mail, encoding="latin-1") as m:
            word_counts = Counter(word for line in m for word in line.split())

        for word, freq in word_counts.items():
            col = column_of.get(word)
            if col is not None:
                feature_matrix[doc_num, col] = freq

        # The second-to-last dot-separated part of the file name is the class;
        # use the base name so dots in directory names cannot interfere.
        file_name = os.path.basename(mail)
        train_labels[doc_num] = int(file_name.split(".")[-2] == 'spam')  ## if spam or not

    return feature_matrix, train_labels

In [None]:
from sklearn.model_selection import train_test_split

# Build the Enron vocabulary, then the per-mail features and labels.
dictionary = make_dict('enron-spam/')

# extract_feature returns (features, labels).  Bind the labels to both names so
# that the Enron labels -- not the earlier Ling-spam `train_labels` -- are the
# ones saved and split below.
train_matrix, labels = extract_feature('enron-spam/')
train_labels = labels

np.save('enron_features.npy', train_matrix)
np.save('enron_labels.npy', train_labels)

# hold out 30% of the mails for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(train_matrix, train_labels, test_size=0.3, random_state=42)

# Training SVM as model1 and Naive Bayes as model2

# supervised binary classifiers - effective when high number of features
model1 = LinearSVC()
model1.fit(X_train, y_train)
result1 = model1.predict(X_test)
print(confusion_matrix(y_test, result1))
print(model1.score(X_test, y_test))

# one of the popular methods for doc classification, this is a supervised probabilistic classifier - assumes independence btw each feature
model2 = MultinomialNB()
model2.fit(X_train, y_train)
result2 = model2.predict(X_test)
print(confusion_matrix(y_test, result2))
# report model2's own score (the original printed model1's score a second time)
print(model2.score(X_test, y_test))