In [1]:
# Importing dependencies

import os
import re
import email
import pandas as pd
from tqdm import tqdm

In [2]:
# Dataset locations (relative paths)
DATA_PATH = "data/trec07p/data"  # directory the individual email files are opened from
LABEL_PATH = "data/trec07p/full/index"  # space-separated index file: one label and one path per row

In [3]:
# Load the label index: values are separated by a single space and the
# file has no header row, so the columns are numbered 0 and 1.
labels = pd.read_csv(
    LABEL_PATH,
    sep=" ",
    header=None,
)

labels.head()

Unnamed: 0,0,1
0,spam,../data/inmail.1
1,ham,../data/inmail.2
2,spam,../data/inmail.3
3,spam,../data/inmail.4
4,spam,../data/inmail.5


In [4]:
# Clean the label index.
#
# This cell is written so it can be re-run safely. Applied a second time to
# its own output, a plain "spam -> 1, otherwise 0" mapping would turn every
# label into 0, and indexing the third path component would raise IndexError
# on ids that have already been shortened.


def _encode_label(value):
    """Return 1 for the string 'spam' and 0 for any other string.

    Values that are not strings (labels already encoded by a previous run
    of this cell) are passed through as plain ints.
    """
    if isinstance(value, str):
        return 1 if value == 'spam' else 0
    return int(value)


# Dropping any na values (new frame instead of in-place mutation)
labels = labels.dropna().copy()

# Naming the two columns of the DF
labels.columns = ['label', 'id']

# Changing the words spam and ham to 1 and 0
labels['label'] = labels['label'].apply(_encode_label)

# For the id column, keeping only the last path component,
# e.g. '../data/inmail.1' -> 'inmail.1' (an already-shortened id is unchanged)
labels['id'] = labels['id'].str.rsplit('/', n=1).str[-1]

labels.head()

Unnamed: 0,label,id
0,1,inmail.1
1,0,inmail.2
2,1,inmail.3
3,1,inmail.4
4,1,inmail.5


In [5]:
# Empty frame with the output schema: one row per parsed email
emails = pd.DataFrame(
    columns=[
        "label",
        "subject",
        "email_to",
        "email_from",
        "message",
    ]
)

In [6]:
def _extract_fields(raw_email):
    """Parse raw email text and return ``(subject, email_to, email_from, message)``.

    Header values are whatever the parsed message returns for "subject",
    "to" and "from" (``None`` when a header is absent).

    For a multipart email the message is the concatenation of ``str(part)``
    for every part, so each part's own headers (Content-Type, ...) remain in
    the text. For a non-multipart email it is the payload as returned by
    ``get_payload()``.
    """
    parsed = email.message_from_string(raw_email)

    if parsed.is_multipart():
        message = "".join(str(part) for part in parsed.get_payload())
    else:
        message = parsed.get_payload()

    return parsed["subject"], parsed["to"], parsed["from"], message


# Rows are collected in a plain list and the frame is built once at the end.
# Appending with `emails.loc[index] = ...` re-allocates the frame on every
# insert, which makes the loop slower and slower as the frame grows.
rows = []

# tqdm wraps the sized array directly so it can show a total and an ETA.
for label, email_id in tqdm(labels.values):
    with open(os.path.join(DATA_PATH, email_id), "r", encoding='ISO-8859-1') as f:
        email_content = f.read()

    # Parsing happens after the file has been closed.
    rows.append((label,) + _extract_fields(email_content))

# Same column order as before; rows are indexed 0..n-1 in reading order.
emails = pd.DataFrame(rows, columns=["label", "subject", "email_to", "email_from", "message"])

75419it [37:27, 33.56it/s] 


In [7]:
# Preview the first five parsed emails
emails.head(5)

Unnamed: 0,label,subject,email_to,email_from,message
0,1,"Generic Cialis, branded quality@",the00@speedy.uwaterloo.ca,"""Tomas Jacobs"" <RickyAmes@aol.com>",Content-Type: text/html;\nContent-Transfer-Enc...
1,0,Typo in /debian/README,debian-mirrors@lists.debian.org,Yan Morin <yan.morin@savoirfairelinux.com>,"Hi, i've just updated from the gulus and I che..."
2,1,authentic viagra,<the00@plg.uwaterloo.ca>,"""Sheila Crenshaw"" <7stocknews@tractionmarketin...","Content-Type: text/plain;\n\tcharset=""iso-8859..."
3,1,Nice talking with ya,opt4@speedy.uwaterloo.ca,"""Stormy Dempsey"" <vqucsmdfgvsg@ruraltek.com>","\nHey Billy, \n\nit was really fun going out t..."
4,1,or trembling; stomach cramps; trouble in sleep...,ktwarwic@speedy.uwaterloo.ca,"""Christi T. Jernigan"" <dcube@totalink.net>",Content-Type: multipart/alternative;\n ...
...,...,...,...,...,...
75414,1,the reply for your request for a job place [le...,"""Gnitpick"" <gnitpick@flax9.uwaterloo.ca>","""Sydney Car Centre"" <Merrill8783@168city.com>",Content-Type: text/html;\nContent-Transfer-Enc...
75415,1,the reply for your request for a job place [le...,"""Gnitpick"" <gnitpick@flax9.uwaterloo.ca>","""Sydney Car Centre"" <Merrill8783@168city.com>",Content-Type: text/html;\nContent-Transfer-Enc...
75416,0,"Re: [R] Me again, about the horrible documenta...",Duncan Murdoch <murdoch@stats.uwo.ca>,Philippe Grosjean <phgrosjean@sciviews.org>,"For those who are interested, I just cook a li..."
75417,0,Re: [R] RODBC problem,<r-help@stat.math.ethz.ch>,=?iso-8859-1?Q?Bernhard_Wellh=F6fer?=\n\t<Bern...,"Hello,\n\nas I wrote I call\n\n sqlFetch(chan..."


In [8]:
# Persist the parsed emails as CSV, without writing the row index
emails.to_csv(
    "data/processed_data1.csv",
    index=False,
)