In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
import os
import pathlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Get datasets
datadir = 'datasets/spam-detection'
pathlib.Path(datadir).mkdir(parents=True, exist_ok=True)

In [4]:
%%bash
datadir="datasets/spam-detection"

for i in 20021010_easy_ham 20021010_hard_ham 20021010_spam 20030228_easy_ham 20030228_easy_ham_2 20030228_hard_ham 20030228_spam 20030228_spam_2 20050311_spam_2 
  do
  wget https://spamassassin.apache.org/old/publiccorpus/$i.tar.bz2 -O $datadir/$i.tar.bz2
  tar xjf $datadir/$i.tar.bz2 -C $datadir
  done

--2020-06-28 08:32:31--  https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2
Resolving spamassassin.apache.org (spamassassin.apache.org)... 40.79.78.1, 95.216.24.32, 2a01:4f9:2a:185f::2
Connecting to spamassassin.apache.org (spamassassin.apache.org)|40.79.78.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1677144 (1.6M) [application/x-bzip2]
Saving to: ‘datasets/spam-detection/20021010_easy_ham.tar.bz2’

     0K .......... .......... .......... .......... ..........  3%  387K 4s
    50K .......... .......... .......... .......... ..........  6%  776K 3s
   100K .......... .......... .......... .......... ..........  9% 77.6M 2s
   150K .......... .......... .......... .......... .......... 12% 34.4M 1s
   200K .......... .......... .......... .......... .......... 15%  685K 1s
   250K .......... .......... .......... .......... .......... 18% 4.11M 1s
   300K .......... .......... .......... .......... .......... 21% 5.75M 1s
   350K .

In [5]:
def create_corpus(filedir, spam=False):
    corpus = []
    label = []
    for filename in os.listdir(filedir):
        with open(os.path.join(filedir, filename), 'r', encoding="ISO-8859-1") as f:
           corpus.append(f.read().replace("\n", ""))
        label.append(spam)
    return corpus, label
        

In [6]:
corpus = create_corpus('datasets/spam-detection/easy_ham/')
basedir = 'datasets/spam-detection'
subdir = ['easy_ham', 'easy_ham_2', 'hard_ham', 'spam', 'spam_2']

# N = sum([len(files) for r, d, files in os.walk(basedir)])
corpus = []
label = []

for sd in subdir:
    spam = False
    if "spam" in sd:
        spam = True
    print(f"Processing directory {basedir}/{sd}")
    c, l = create_corpus(f'{basedir}/{sd}', spam=spam)
    corpus += c
    label += l


Processing directory datasets/spam-detection/easy_ham
Processing directory datasets/spam-detection/easy_ham_2
Processing directory datasets/spam-detection/hard_ham
Processing directory datasets/spam-detection/spam
Processing directory datasets/spam-detection/spam_2


In [7]:
np.sum(label)

2400

In [8]:
len(corpus)

9354

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [10]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=.25, random_state=42)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score, 
    precision_recall_curve,
    roc_curve
)

In [12]:
clf = RandomForestClassifier(random_state=42, n_estimators=40, n_jobs=-1)
y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)

In [13]:
def classifier_perf(y, y_hat):
    print(confusion_matrix(y, y_hat))
    print("Accuracy:", accuracy_score(y, y_hat))
    print("Precision:", precision_score(y, y_hat))
    print("Recall:", recall_score(y, y_hat))
    print("F1:", f1_score(y, y_hat))

In [15]:
classifier_perf(y_train, y_train_pred)

[[5187   27]
 [  77 1724]]
Accuracy: 0.9851746258018532
Precision: 0.9845802398629354
Recall: 0.9572459744586341
F1: 0.9707207207207207


In [16]:
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)

In [17]:
classifier_perf(y_test, y_test_pred)

[[1734    6]
 [  12  587]]
Accuracy: 0.9923044035912784
Precision: 0.9898819561551433
Recall: 0.9799666110183639
F1: 0.9848993288590604
