## Imports, parsing and splitting

In [7]:
%%time

import os
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

from nltk import pos_tag, sent_tokenize, word_tokenize, FreqDist
from nltk.corpus import words

from collections import Counter

import enchant
import pyphen

corpus = []
labels = []


for root, dirs, files in os.walk("lingspam_public/bare/"):
    for file in files:
    	with open(os.path.join(root, file), "r") as f:
    		corpus.append(f.read().strip())
    		if file[0] == 's': # Spam?
    			labels.append(1)
    		else:
    			labels.append(0)

X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2, random_state=4)

CPU times: user 96 ms, sys: 20 ms, total: 116 ms
Wall time: 115 ms


## Building a bag-of-words model

In [10]:
%%time
# BOW
vectorizer = CountVectorizer(stop_words="english")
# Vectorize & append labels
train = np.hstack((vectorizer.fit_transform(X_train).toarray(), np.array(y_train)[:, None]))
test  = np.hstack((vectorizer.transform(X_test).toarray(), np.array(y_test)[:, None]))

CPU times: user 1.72 s, sys: 428 ms, total: 2.15 s
Wall time: 2.15 s


## Classification function

In [11]:
# Hyper parameter tuning (e.g., using random search) would give better results as well as feature selection.
def classify(train, test, train_offset=1):

    X_train = train[:, :train.shape[1]-train_offset]
    y_train = train[:, train.shape[1]-1]

    X_test = test[:, :test.shape[1]-train_offset]
    y_test = test[:, test.shape[1]-1]
    
    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)

    mnb_preds = mnb.predict(X_test)

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)

    knn_preds = knn.predict(X_test)


    rf = RandomForestClassifier(max_depth=2, random_state=0)
    rf.fit(X_train, y_train)

    rf_preds = rf.predict(X_test)


    print("Naive Bayes Report:\n")
    print(classification_report(y_test, mnb_preds))

    print("\n\nKNN Report:\n")
    print(classification_report(y_test, knn_preds))

    print("\n\nRandom Forest Report:\n")
    print(classification_report(y_test, rf_preds))


## BOW results

In [13]:
%%time
classify(train, test)

Naive Bayes Report:

             precision    recall  f1-score   support

          0       1.00      0.99      1.00       484
          1       0.97      1.00      0.98        95

avg / total       0.99      0.99      0.99       579



KNN Report:

             precision    recall  f1-score   support

          0       0.98      0.87      0.92       484
          1       0.57      0.92      0.70        95

avg / total       0.91      0.87      0.88       579



Random Forest Report:

             precision    recall  f1-score   support

          0       0.84      1.00      0.91       484
          1       1.00      0.04      0.08        95

avg / total       0.87      0.84      0.78       579

CPU times: user 2min 19s, sys: 592 ms, total: 2min 19s
Wall time: 2min 19s


## Extracting features from the documents
### Reading spam word list

In [14]:
spam_list = []
with open("spam_word_list.txt", "r") as f:
	spam_list = [word.strip().lower() for word in f.readlines() if word != "\n"]

### Initializing dictionary and syllable counter

In [15]:
d = enchant.Dict("en_US")
pyphen.language_fallback('nl_NL_variant1')
dic = pyphen.Pyphen(lang='en_GB')

### Function to extract features (except for TF-IDF) from one document

In [16]:
def extract_features(doc):
	doc = doc.lower()
	res = []
	tokens = word_tokenize(doc)
	sents = sent_tokenize(doc)
	# Number of sentences
	res.append(len(sents))

	# Number of verbs
	tags = pos_tag(tokens)
	counts = Counter(token[1] for token in tags)
	res.append(counts["VB"])

	# Number of words that are found in the spam list
	spam_list_no = 0
	for w in spam_list:
		if w in doc:
			spam_list_no += 1

	res.append(spam_list_no)

	# Number of spelling mistakes. Currently, not sensitive to other languages.
	# Number of words that contain both numeric and alphabetical chars,
	typos = 0 
	nums = 0
	
	# Number of words with more than 3 syllables
	three_syl_no = 0
	# Avg. number of syllables,
	avg_syl_word = 0
	word_no = 0

	# Sum of TF-ISF, Term frequence-Inverse sentence frequency
	tf_isf = 0.0
	f_terms = FreqDist(tokens)

	for token in tokens:
		# Checks if this token is an English word
		# if token in words.words(): # It might be a proper word with no typos from a different language
		if not d.check(token):
			typos +=1

		syl_res = dic.inserted(token)
		syls_no = len(syl_res.split("-"))

		if syls_no > 3:
			three_syl_no += 1

		word_no += 1
		avg_syl_word += syls_no

		# Not just numbers and contains at least one digit
		if not (token.isdigit()) and any(c.isdigit() for c in token):
			nums += 1

		tf = float(f_terms[token])
		isf = 0.0
		for s in sents:
			if token in s:
				isf += 1.0

		if isf > 0.0:
			isf = (float(len(sents))) / isf
		else:
			isf = 0.0
		tf = 1.0 + np.log(tf)
		tf_isf += tf * isf


	avg_syl_word /= word_no
	res.extend((typos, nums, three_syl_no, avg_syl_word, tf_isf))

	return res

### Extracting TF-IDF

In [17]:
vv = TfidfVectorizer(stop_words='english', sublinear_tf=True)
train_tf_idf = np.sum(vv.fit_transform(X_train).toarray(), axis=1)
test_tf_idf = np.sum(vv.transform(X_test).toarray(), axis=1)

### Extracting the features from the entire corpus and building the dataset. (This takes ~ 245s)

In [18]:
%%time
train = np.hstack((np.array([extract_features(doc) for doc in X_train]), train_tf_idf[:, None], \
	np.array(y_train)[:, None]))

test = np.hstack((np.array([extract_features(doc) for doc in X_test]), test_tf_idf[:, None], \
	np.array(y_test)[:, None]))


CPU times: user 3min 24s, sys: 4.18 s, total: 3min 28s
Wall time: 3min 28s


## Results for the new feature structure
### Without TF-IDF & TF-ISF

In [20]:
classify(train, test, 3)

Naive Bayes Report:

             precision    recall  f1-score   support

        0.0       0.97      0.87      0.92       484
        1.0       0.57      0.88      0.69        95

avg / total       0.91      0.87      0.88       579



KNN Report:

             precision    recall  f1-score   support

        0.0       0.95      0.98      0.96       484
        1.0       0.88      0.73      0.80        95

avg / total       0.94      0.94      0.94       579



Random Forest Report:

             precision    recall  f1-score   support

        0.0       0.90      0.99      0.94       484
        1.0       0.93      0.44      0.60        95

avg / total       0.91      0.90      0.89       579



### With TF-IDF & TF-ISF

In [21]:
classify(train, test)

Naive Bayes Report:

             precision    recall  f1-score   support

        0.0       0.88      0.94      0.91       484
        1.0       0.55      0.38      0.45        95

avg / total       0.83      0.85      0.83       579



KNN Report:

             precision    recall  f1-score   support

        0.0       0.87      0.94      0.90       484
        1.0       0.46      0.27      0.34        95

avg / total       0.80      0.83      0.81       579



Random Forest Report:

             precision    recall  f1-score   support

        0.0       0.90      0.99      0.94       484
        1.0       0.91      0.42      0.58        95

avg / total       0.90      0.90      0.88       579

