In [22]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
import sys
import os
import pathlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
# Get datasets
datadir = 'datasets/spam-detection'
pathlib.Path(datadir).mkdir(parents=True, exist_ok=True)

In [94]:
%%bash
datadir="datasets/spam-detection"

for i in 20021010_easy_ham 20021010_hard_ham 20021010_spam 20030228_easy_ham 20030228_easy_ham_2 20030228_hard_ham 20030228_spam 20030228_spam_2 20050311_spam_2 
  do
  wget https://spamassassin.apache.org/old/publiccorpus/$i.tar.bz2 -O $datadir/$i.tar.bz2
  tar xjf $datadir/$i.tar.bz2 -C $datadir
  done

--2020-06-22 22:51:35--  https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2
Resolving spamassassin.apache.org (spamassassin.apache.org)... 95.216.24.32, 40.79.78.1, 2a01:4f9:2a:185f::2
Connecting to spamassassin.apache.org (spamassassin.apache.org)|95.216.24.32|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1677144 (1.6M) [application/x-bzip2]
Saving to: ‘datasets/spam-detection/20021010_easy_ham.tar.bz2’

     0K .......... .......... .......... .......... ..........  3%  121K 13s
    50K .......... .......... .......... .......... ..........  6%  249K 9s
   100K .......... .......... .......... .......... ..........  9% 13.9M 6s
   150K .......... .......... .......... .......... .......... 12%  240K 6s
   200K .......... .......... .......... .......... .......... 15% 23.2M 5s
   250K .......... .......... .......... .......... .......... 18% 23.8M 4s
   300K .......... .......... .......... .......... .......... 21%  126K 5s
   350

In [95]:
def create_corpus(filedir, spam=False):
    corpus = []
    label = []
    for filename in os.listdir(filedir):
        with open(os.path.join(filedir, filename), 'r', encoding="ISO-8859-1") as f:
           corpus.append(f.read().replace("\n", ""))
        label.append(spam)
    return corpus, label
        

In [97]:
corpus = create_corpus('datasets/spam-detection/easy_ham/')
basedir = 'datasets/spam-detection'
subdir = ['easy_ham', 'easy_ham_2', 'hard_ham', 'spam', 'spam_2']

# N = sum([len(files) for r, d, files in os.walk(basedir)])
corpus = []
label = []

for sd in subdir:
    spam = False
    if "spam" in sd:
        spam = True
    print(f"Processing directory {basedir}/{sd}")
    c, l = create_corpus(f'{basedir}/{sd}', spam=spam)
    corpus += c
    label += l


Processing directory datasets/spam-detection/easy_ham
Processing directory datasets/spam-detection/easy_ham_2
Processing directory datasets/spam-detection/hard_ham
Processing directory datasets/spam-detection/spam
Processing directory datasets/spam-detection/spam_2


In [98]:
np.sum(label)

2400

In [99]:
len(corpus)

9354

In [100]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


In [101]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

X_train, X_test, y_train, y_test = train_test_split(X, label, test_size=.25, random_state=42)

In [102]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score, 
    precision_recall_curve,
    roc_curve
)

In [103]:
clf = RandomForestClassifier(random_state=42, n_estimators=40, n_jobs=-1)
y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)

In [104]:
def classifier_perf(y, y_hat):
    print(confusion_matrix(y, y_hat))
    print("Accuracy:", accuracy_score(y, y_hat))
    print("Precision:", precision_score(y, y_hat))
    print("Recall:", recall_score(y, y_hat))
    print("F1:", f1_score(y, y_hat))

In [None]:
classifier_perf(y_train)

In [105]:
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)

In [106]:
classifier_perf(y_test, y_test_pred)

[[1735    5]
 [  16  583]]
Accuracy: 0.9910218041898247
Precision: 0.9914965986394558
Recall: 0.9732888146911519
F1: 0.9823083403538332
