In [81]:
from urllib.request import urlretrieve
import tarfile 
import pandas as pd
import uuid
import hashlib
from sklearn.impute import SimpleImputer

In [None]:

ham_link='https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2'
spam_link='https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2'

urlretrieve(ham_link, 'easy_ham.tar.bz2')
urlretrieve(spam_link, 'spam.tar.bz2')

ham_file = tarfile.open('easy_ham.tar.bz2', 'r:bz2') 
spam_file = tarfile.open('spam.tar.bz2', 'r:bz2')

ham_file.extractall('data') 
spam_file.extractall('data')
ham_file.close() 
spam_file.close()

In [74]:
# Loading the files in the data folder
import os

ham_filenames = [name for name in sorted(os.listdir('data/easy_ham')) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir('data/spam')) if len(name) > 20]

print('Number of ham files:', len(ham_filenames))
print('Number of spam files:', len(spam_filenames))

df = pd.DataFrame(ham_filenames + spam_filenames, columns=['email_id'])

def to_uuid(id_str):
    # Create a SHA-256 hash and return the first 16 hex characters
    return hashlib.sha256(id_str.encode()).hexdigest()[:16]

# Convert all file names to UUIDs
df['email_id'] = df['email_id'].apply(to_uuid)
df.head()

# Pre-calculate the ham UUIDs for correct spam flag assignment
ham_uuids = [to_uuid(name) for name in ham_filenames]
df['is_spam'] = df['email_id'].apply(lambda x: 0 if x in ham_uuids else 1)
df.head()

contents = []

def read_file_and_append(filepath):
    with open(filepath, 'rb') as f:
        contents.append(f.read().decode('latin1'))

# Reading ham files
for filename in ham_filenames:
    read_file_and_append(os.path.join('data', 'easy_ham', filename))

# Reading spam files
for filename in spam_filenames:
    read_file_and_append(os.path.join('data', 'spam', filename))

df['content'] = contents
df.head()




Number of ham files: 2551
Number of spam files: 501


Unnamed: 0,email_id,is_spam,content
0,98e7e744d4b0ea78,0,From exmh-workers-admin@redhat.com Thu Aug 22...
1,e953def9713ad64f,0,From Steve_Burt@cursor-system.com Thu Aug 22 ...
2,a9c3be24be981dbe,0,From timc@2ubh.com Thu Aug 22 13:52:59 2002\n...
3,1874bd560ea2aaa8,0,From irregulars-admin@tb.tf Thu Aug 22 14:23:...
4,976351ae57c8b54c,0,From exmh-users-admin@redhat.com Thu Aug 22 1...


In [75]:
df['sender'] = df['content'].apply(lambda x: x.split("From " or "from")[1].split()[0] if "From " in x else None)
df['subject'] = df['content'].apply(lambda x: x.split("Subject: ")[1].split("\n")[0] if "Subject: " in x else None)
df['date'] = df['content'].apply(lambda x: x.split("Date: ")[1].split("\n")[0] if "Date: " in x else None)
df['content-type'] = df['content'].apply(lambda x: x.split("Content-Type: ")[1].split("\n")[0] if "Content-Type: " in x else None)
df['content'] = df['content'].apply(lambda x: x.split("\n\n")[1] if "\n\n" in x else None)
df.head()

Unnamed: 0,email_id,is_spam,content,sender,subject,date,content-type
0,98e7e744d4b0ea78,0,"Date: Wed, 21 Aug 2002 10:54:46 -05...",exmh-workers-admin@redhat.com,Re: New Sequences Window,"Thu, 22 Aug 2002 18:26:25 +0700",text/plain; charset=us-ascii
1,e953def9713ad64f,0,"Martin A posted:\nTassos Papadopoulos, the Gre...",Steve_Burt@cursor-system.com,[zzzzteana] RE: Alexander,"Thu, 22 Aug 2002 12:46:18 +0100",text/plain; charset=US-ASCII
2,a9c3be24be981dbe,0,Man Threatens Explosion In Moscow,timc@2ubh.com,[zzzzteana] Moscow bomber,"Thu, 22 Aug 2002 13:52:38 +0100",text/plain; charset=US-ASCII
3,1874bd560ea2aaa8,0,Klez: The Virus That Won't Die\n \nAlready the...,irregulars-admin@tb.tf,[IRR] Klez: The Virus That Won't Die,"Thu, 22 Aug 2002 09:15:25 -0400","text/plain; charset=""us-ascii"""
4,976351ae57c8b54c,0,"On Wed Aug 21 2002 at 15:46, Ulises Ponce wrote:",exmh-users-admin@redhat.com,Re: Insert signature,"Thu, 22 Aug 2002 23:36:32 +1000",


In [87]:
# Cleaning the data
imputer = SimpleImputer(strategy='most_frequent') # Replace missing values with the most frequent value as it's easier since most of the data isnt numerical yet
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns) 
df.head()

Unnamed: 0,email_id,is_spam,content,sender,subject,date,content-type
0,278ca341b7eefe55,0,"Once upon a time, Ville wrote :",rpm-list-admin@freshrpms.net,Re: RH 8 no DMA for DVD drive,"Mon, 7 Oct 2002 19:28:51 +0200",text/plain; charset=US-ASCII
1,8a7e30cfe9532356,0,">>>>> ""R"" == Robert Harley <harley@argote.ch> ...",fork-admin@xent.com,Re: Java is for kiddies,28 Aug 2002 15:06:39 -0400,text/plain; charset=us-ascii
2,6746e9cf6b8aa727,0,You around?\nC,fork-admin@xent.com,calling wayne baisley@#!,"Fri, 20 Sep 2002 23:54:00 -0500 (CDT)",TEXT/PLAIN; charset=US-ASCII
3,2b69a2b71f1a8d69,0,| \n| 0 hits here. :(\n|,spamassassin-talk-admin@lists.sourceforge.net,RE: [SAtalk] Too funny,"Wed, 28 Aug 2002 07:45:18 -0700","text/plain; charset=""us-ascii"""
4,523d25bd0a5fdcd3,1,------=_NextPart_000_0044_5A8512D6.AD3AE071\nC...,tim@paychecks4life.com,Welcome to the concept of RESIDUAL Income!,"Fri, 30 Aug 02 05:32:48 Eastern Daylight Time","multipart/mixed;boundary= ""----=_NextPart_000_..."


In [88]:
# Shuffle the dataframe without a fixed seed
df = df.sample(frac=1).reset_index(drop=True)

# Split the dataframe into 80% train and 20% test
split_ratio = 0.8
split_point = int(len(df) * split_ratio)
train = df.iloc[:split_point]
test = df.iloc[split_point:]

print("Train set head:")
train.head(10)

Train set head:


Unnamed: 0,email_id,is_spam,content,sender,subject,date,content-type
0,10bf17d18f40e8cf,0,"On Thu, Aug 22, 2002 at 10:58:34PM +0200, Robe...",fork-admin@xent.com,Re: Entrepreneurs,"Thu, 22 Aug 2002 15:04:02 -0700",text/plain; charset=us-ascii
1,a37a979152596b4d,0,URL: e59c6ca5938fc27a6995e30fc10b6482\nDate: N...,rssfeeds@jmason.org,<strong>More AmphetaDesk Reviews</strong>,"Thu, 26 Sep 2002 15:30:23 -0000",text/plain; encoding=utf-8
2,c00e62fc0906cf4d,0,\nMalte S. Stretz said:\n> 1046 is fixed. But ...,spamassassin-talk-admin@lists.sourceforge.net,Re: [SAtalk] 2.42: est release?,"Thu, 03 Oct 2002 21:23:04 +0100",
3,f02a14573ad3a5fa,1,"Mr.Vincent Nnaji,\n Standard Trust Bank Ltd,\n...",vinnet@mail.com,BUSINESS PARTNERSHIP(URGENT/CONFIDENTIAL),"Thu, 26 Sep 2002 04:22:43 +0200","text/plain; charset=""us-ascii"""
4,063c86e326ae5eec,0,"URL: http://www.newsisfree.com/click/-2,841882...",rssfeeds@jmason.org,Liddle quits Today with new BBC deal,"Tue, 01 Oct 2002 08:01:35 -0000",text/plain; encoding=utf-8
5,02e7e6e3dec9c638,0,"On Mon, 7 Oct 2002 19:28:51 +0200\nMatthias Sa...",rpm-list-admin@freshrpms.net,Re: RH 8 no DMA for DVD drive,"Mon, 7 Oct 2002 10:38:31 -0700",text/plain; charset=US-ASCII
6,a4bab1c44a1632fa,0,[Tim]\n> Another area for potentially fruitful...,,[Spambayes] all but one testing,Fri Sep 6 07:09:11 2002,
7,08a4fab127aa4341,0,"On 27 Aug 2002 at 15:00, Rohit Khare wrote:\n>...",fork-admin@xent.com,Re: DataPower announces XML-in-silicon,"Tue, 27 Aug 2002 18:29:59 -0400",text/plain; charset=US-ASCII
8,d17c1d8f663117ee,0,URL: http://jeremy.zawodny.com/blog/archives/0...,rssfeeds@jmason.org,Job Offers and My Resume,"Thu, 03 Oct 2002 08:02:41 -0000",text/plain; encoding=utf-8
9,786554e85c0f6a5e,0,I recently installed Razor v2.14 and started u...,razor-users-admin@lists.sourceforge.net,[Razor-users] Razor 2.14 - the day after,"Fri, 23 Aug 2002 09:51:23 -0400","text/plain; charset=""iso-8859-1"""
