In [None]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()

In [None]:
fetch_spam_data()


In [None]:
import os
a = []
for dirname, _, filenames in os.walk('/content/datasets/spam'):
    for filename in filenames:
        a.append(os.path.join(dirname, filename))

print(len(a))

3004


In [None]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [None]:
print(f"Total spam files: {len(spam_filenames)}"+"\n"+f"Total ham files: {len(ham_filenames)}")

Total spam files: 500
Total ham files: 2500


In [None]:
ham_filenames[0]

'00001.7c53336b37003a9286aba55d2945844c'

In [None]:
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [None]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [None]:
print("\033[1m" + "Ham mail:" + "\033[0m\n",ham_emails[0].get_content().strip())
print('\n'*3)
print('---'*9)
print('\n'*3)
print("\033[1m" + "Spam mail:" + "\033[0m\n",spam_emails[6].get_content().strip())

[1mHam mail:[0m
 Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at S

In [None]:
ham_emails[0].get_content().strip()

'Date:        Wed, 21 Aug 2002 10:54:46 -0500\n    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>\n    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>\n\n\n  | I can\'t reproduce this error.\n\nFor me it is very repeatable... (like every time, without fail).\n\nThis is the debug log of the pick happening ...\n\n18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}\n18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury\n18:19:04 Ftoc_PickMsgs {{1 hit}}\n18:19:04 Marking 1 hits\n18:19:04 tkerror: syntax error in expression "int ...\n\nNote, if I run the pick command by hand ...\n\ndelta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury\n1 hit\n\nThat\'s where the "1 hit" comes from (obviously).  The version of nmh I\'m\nusing is ...\n\ndelta$ pick -version\npick -- nmh-1.0.4 [compiled on fuchsia.cs.mu

In [None]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [None]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [None]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [None]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [None]:
for header, value in spam_emails[0].items():
    print(header,":",value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [None]:
for header, value in ham_emails[0].items():
    print(header,":",value)

Return-Path : <exmh-workers-admin@spamassassin.taint.org>
Delivered-To : zzzz@localhost.netnoteinc.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36	for <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)
Received : from listman.spamassassin.taint.org (listman.spamassassin.taint.org [66.187.233.211]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MBYrZ04811 for    <zzzz-exmh@spamassassin.taint.org>; Thu, 22 Aug 2002 12:34:53 +0100
Received : from listman.spamassassin.taint.org (localhost.localdomain [127.0.0.1]) by    listman.redhat.com (Postfix) with ESMTP id 8386540858; Thu, 22 Aug 2002    07:35:02 -0400 (EDT)
Delivered-To : exmh-workers@listman.spamassassin.taint.org
Received : from int-mx1.corp.spamassassin.taint.org (int-mx1.corp.spamassassin.taint.org 

In [None]:
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X.shape

(3000,)

In [None]:
y.shape

(3000,)

In [None]:
X[2500]['Subject']

'Life Insurance - Why Pay More?'

In [None]:
print(spam_emails[0].get_payload().strip())

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD>
<META content=3D"text/html; charset=3Dwindows-1252" http-equiv=3DContent-T=
ype>
<META content=3D"MSHTML 5.00.2314.1000" name=3DGENERATOR></HEAD>
<BODY><!-- Inserted by Calypso -->
<TABLE border=3D0 cellPadding=3D0 cellSpacing=3D2 id=3D_CalyPrintHeader_ r=
ules=3Dnone 
style=3D"COLOR: black; DISPLAY: none" width=3D"100%">
  <TBODY>
  <TR>
    <TD colSpan=3D3>
      <HR color=3Dblack noShade SIZE=3D1>
    </TD></TR></TD></TR>
  <TR>
    <TD colSpan=3D3>
      <HR color=3Dblack noShade SIZE=3D1>
    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso=
 --><FONT 
color=3D#000000 face=3DVERDANA,ARIAL,HELVETICA size=3D-2><BR></FONT></TD><=
/TR></TABLE><!-- End Calypso --><FONT color=3D#ff0000 
face=3D"Copperplate Gothic Bold" size=3D5 PTSIZE=3D"10">
<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=3D#ff=
0000 
face=3D"Copperplate Gothic Bold" size=3D5 PTSIZE=3D"10">
<CENTER>Why

In [None]:
common_headers = set()

for email in X:
    headers = email.keys()
    if not common_headers:
        common_headers = set(headers)
    else:
        common_headers = common_headers.intersection(headers)

print("Common Headers:")
for header in common_headers:
    print(header)


Common Headers:
From
Date
Subject


only `Subject` is common so let's create a pandas dataframe using `Subject` and `Category` being *Spam*
or *Ham* 

In [None]:
import pandas as pd

data = {'Subject': [], 'Category': []}

for email, label in zip(X, y):
    subject = email.get('Subject', '')  # Retrieve the value of the 'Subject' header
    data['Subject'].append(subject)
    data['Category'].append(int(label))

df = pd.DataFrame(data)


In [None]:
df['Subject'][0]

'Re: New Sequences Window'

In [None]:
df['Subject'] = df['Subject'].astype(str)


In [None]:
df['Subject'].dtype

dtype('O')

In [None]:
df['Subject'] = df['Subject'].fillna('').astype(str)


In [None]:
df['Subject'].dtype

dtype('O')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Subject   3000 non-null   object
 1   Category  3000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.0+ KB


In [None]:
df.head(20)

Unnamed: 0,Subject,Category
0,Re: New Sequences Window,0
1,[zzzzteana] RE: Alexander,0
2,[zzzzteana] Moscow bomber,0
3,[IRR] Klez: The Virus That Won't Die,0
4,Re: [zzzzteana] Nothing like mama used to make,0
5,Re: [zzzzteana] Nothing like mama used to make,0
6,[zzzzteana] Playboy wants to go out with a bang,0
7,Re: [zzzzteana] Nothing like mama used to make,0
8,[zzzzteana] Meaningful sentences,0
9,[SAtalk] SA CGI Configurator Scripts,0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Convert the text data into numerical features
vectorizer = CountVectorizer()
x_column = vectorizer.fit_transform(df['Subject'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_column, df['Category'], test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(solver="liblinear", max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print('Accuracy:', accuracy)


Accuracy: 0.9166666666666666


In [None]:
from sklearn.model_selection import cross_val_score

# Logistic Regression with different solvers
solvers = ["lbfgs", "liblinear", "newton-cg"]
for solver in solvers:
    log_clf = LogisticRegression(solver=solver, max_iter=1000, random_state=42)
    score = cross_val_score(log_clf, X_train, y_train, cv=3, verbose=3)
    print(f"Logistic Regression (Solver: {solver}): Mean Score = {score.mean()}")


score = cross_val_score(log_clf, X_train, y_train, cv=3, verbose=3)
print(f"Mean Score = {score.mean()}")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] END ................................ score: (test=0.910) total time=   0.1s
[CV] END ................................ score: (test=0.920) total time=   0.1s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.906) total time=   0.1s
Logistic Regression (Solver: lbfgs): Mean Score = 0.9120833333333334
[CV] END ................................ score: (test=0.909) total time=   0.0s
[CV] END ................................ score: (test=0.920) total time=   0.0s
[CV] END ................................ score: (test=0.907) total time=   0.0s
Logistic Regression (Solver: liblinear): Mean Score = 0.9120833333333334
[CV] END ................................ score: (test=0.910) total time=   0.1s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] END ................................ score: (test=0.920) total time=   0.1s
[CV] END ................................ score: (test=0.906) total time=   0.1s
Logistic Regression (Solver: newton-cg): Mean Score = 0.9120833333333334


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] END ................................ score: (test=0.910) total time=   0.1s
[CV] END ................................ score: (test=0.920) total time=   0.1s
[CV] END ................................ score: (test=0.906) total time=   0.1s
Mean Score = 0.9120833333333334


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s finished


**So if we do like this then the model is useless because subject matter varies for each mail as we use CounntVectorizer**


In [None]:
from sklearn.metrics import precision_score, recall_score

y_pred = model.predict(X_test)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 85.71%
Recall: 56.84%


**Let's use TFIDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score
# Convert the text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X_id = vectorizer.fit_transform(df['Subject'])

# Split the data into training and testing sets
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_id, df['Category'], test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(solver="liblinear", max_iter=1000, random_state=42)
model.fit(X_train_1, y_train_1)

# Evaluate the model on itself first
y_train_pr = model.predict(X_train_1)
accuracy_t = accuracy_score(y_train_1, y_train_pr)
print("Accuracy: {:.2f}%".format(100 * accuracy_t))

# Make predictions on the test set
y_pred = model.predict(X_test_1)
precision = precision_score(y_test_1, y_pred)
recall = recall_score(y_test_1, y_pred)
accuracy = accuracy_score(y_test_1, y_pred)

print("Precision: {:.2f}%".format(100 * precision))
print("Recall: {:.2f}%".format(100 * recall))
print("Accuracy: {:.2f}%".format(100 * accuracy))


Accuracy: 91.38%
Precision: 88.57%
Recall: 32.63%
Accuracy: 88.67%


The precision, recall, and accuracy metrics provide insights into the performance of the model for the given classification task.

`Precision: 88.57%`

Precision is the ratio of true positive predictions to the total number of positive predictions made by the model. In this case, it indicates that out of all the predicted positive instances, approximately 88.57% of them are actually true positive instances. A higher precision value suggests that the model has a lower rate of false positive predictions.

`Recall: 32.63%`

Recall, also known as sensitivity or true positive rate, is the ratio of true positive predictions to the total number of actual positive instances in the dataset. Here, it means that the model is able to correctly identify approximately 32.63% of the actual positive instances. A higher recall value indicates that the model has a lower rate of false negative predictions.

`Accuracy: 88.67%`

Accuracy is the ratio of correctly predicted instances to the total number of instances in the dataset. It provides an overall measure of how well the model performs on the given classification task. In this case, the model achieves an accuracy of 88.67%, which means that it correctly predicts the category of approximately 88.67% of the instances in the test set.

Overall, based on these metrics, the model shows a high precision score, indicating a low false positive rate. However, the recall score is relatively low, indicating that the model has difficulty correctly identifying all the positive instances. `So converting the apache spam dataset into a 2 column pandas dataframe is a bad idea!`