# First Clean

In [1]:
# time routine
import time
start_time = time.time()

# import library
import pandas as pd
import re

In [2]:
#
#
# custom functions
#
# email reader
def read_email(email_path):
    """returns email body as a body of text"""
    
    # file manager opens email file, assigns it to variable
    with open(email_path) as email_file:
        email_body = email_file.read()
    
    # returns email body as text
    return email_body

In [3]:
%%time
# import working df
df = pd.read_csv('./data/00_original_wrangle.csv', header=0, index_col=False)

Wall time: 12.3 s


### Change Datatype to Allow String Method

In [4]:
# fill na with '' possibly here

# convert mime_vers to dtype object to run string methods
df.mime_vers = df.mime_vers.astype('str')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 19 columns):
f_dir        517401 non-null object
m_id         517401 non-null object
m_date       517401 non-null object
m_from       517401 non-null object
m_to         496355 non-null object
m_cc         135166 non-null object
m_bcc        127886 non-null object
m_subj       498214 non-null object
mime_vers    517401 non-null object
cont_type    517401 non-null object
encode       517401 non-null object
x_from       517401 non-null object
x_to         508255 non-null object
x_cc         128886 non-null object
x_bcc        175 non-null object
x_fold       517401 non-null object
x_orig       517401 non-null object
x_fname      517401 non-null object
m_body       517401 non-null object
dtypes: object(19)
memory usage: 75.0+ MB


In [6]:
# total email body chars (for process tracking)
start_chars = df.m_body.apply(len).sum()

print('Total Characters Across All Email Bodies in Corpus: {}.'.format(start_chars))

Total Characters Across All Email Bodies in Corpus: 954786906.


## Start Cleaning

### Remove Whitespace

In [7]:
# PRESENTATION
# clean leading/trailing whitespace
df = df.apply(lambda x: x.str.lstrip().str.rstrip())

### Remove Redundant Chars in f_dir

In [8]:
# PRESENTATION
# remove redundant string from f_dir
df.f_dir = df.f_dir.str.replace('./data/maildir/','')

### Remove Duplicates on Email Body

In [9]:
# PRESENTATION
# returns rows where the email body is duplicated
x = len(df[df.m_body.duplicated()])

# Drop duplicates based on body text
df = df.drop_duplicates(subset = 'm_body')

# print count
print('Email Body Duplicates: {}'.format(x))

Email Body Duplicates: 269530


### Remove non-Enron Email Addresses on m_from

In [10]:
# PRESENTATION
# Select only @enron email address
only_enron = df.m_from.str.endswith('@enron.com')

# Filter df
df = df[only_enron]

# print count
print('Emails Returned  in \'@enron.com\': {}'.format(len(df)))

Emails Returned  in '@enron.com': 191433


### Strip Email Body From RE: FW: on m_subj, m_body

In [11]:
# replace null values with blank for array
df.loc[df['m_subj'].isnull(), 'm_subj'] = ''

In [12]:
def split_by_orig_forw(text):
    
    try: 
        text = text.split('-----Original Message-----')[0]
    except:
        pass
    
    try: 
        text = text.split('----- Forwarded by')[0]
    except:
        pass

    return text

In [13]:
s = """CATCH THIS
-----Original Message-----
From: 	Ed Pierangelino <ed@apbenergy.com>@ENRON [mailto:IMCEANOTES-Ed+20Pierangelino+20+3Ced+40apbenergy+2Ecom+3E+40ENRON@ENRON.com] 
Sent:	Tuesday, May 08, 2001 7:24 AM
To:	Swerzbin, Mike
Subject:	FW:

watch your back on this one....03/04 np combo was 52.50/58 ees on the offer
yesterday...let me know if you want to look at it  cal 03 pv 59/65.

ed

----------
From: 	Joe Hale
Sent: 	Tuesday, May 08, 2001 10:18 AM
To: 	Ed Pierangelino



 - twin spires.JPG
"""
split_by_orig_forw(s)

'CATCH THIS\n'

In [14]:
s = """CATCH THIS TOO
----- Forwarded by Sara Shackleton/HOU/ECT on 07/27/2000 08:22 AM -----

=09Ryan Siurek@ENRON
=0907/26/2000 12:22 PM
=09=09=20
=09=09 To: Sara Shackleton/HOU/ECT@ECT
=09=09 cc:=20
=09=09 Subject: 00-19

fyi
---------------------- Forwarded by Ryan Siurek/Corp/Enron on 07/26/2000=20
12:22 PM ---------------------------
"""
split_by_orig_forw(s)

'CATCH THIS TOO\n'

In [15]:
%%time

# drop 'Original Message' and 'Forwarded By'
df.m_body = df.m_body.apply(split_by_orig_forw)

Wall time: 540 ms


### Remove \n Characters on m_body

In [16]:
# replace null values with blank for array
df.loc[df['m_body'].isnull(), 'm_body'] = ''

In [17]:
def remove_nn_m_body(text):
   
    try:
        text = re.sub('[\n]', ' ', text)

    except:
        pass
    
    return text

In [18]:
%%time

# replace /n with ' '
df.m_body = df.m_body.apply(remove_nn_m_body)

Wall time: 929 ms


### Add Number of Emails Sent Per Address

    # Count number of emails sent per email address 

    c = pd.DataFrame(df.groupby(['m_from']).size().sort_values(ascending=False))
    c = c.reset_index()
    c.columns = ['m_from', 'sent_counts']
    c.astype({'sent_counts' : 'int'}).dtypes

    # merge sent counts based on m_from email back to DataFrame
    df = pd.merge(df, c, how='left', on='m_from')

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191433 entries, 0 to 517400
Data columns (total 19 columns):
f_dir        191433 non-null object
m_id         191433 non-null object
m_date       191433 non-null object
m_from       191433 non-null object
m_to         185443 non-null object
m_cc         54587 non-null object
m_bcc        51268 non-null object
m_subj       191433 non-null object
mime_vers    191433 non-null object
cont_type    191433 non-null object
encode       191433 non-null object
x_from       191433 non-null object
x_to         186768 non-null object
x_cc         51653 non-null object
x_bcc        135 non-null object
x_fold       191433 non-null object
x_orig       191433 non-null object
x_fname      191433 non-null object
m_body       191433 non-null object
dtypes: object(19)
memory usage: 29.2+ MB


## Drop Duplicates and Equals '' on m_body

In [20]:
df.m_body.describe()

count     191433
unique    169286
top             
freq        9083
Name: m_body, dtype: object

In [21]:
# drop duplicates
df = df.drop_duplicates('m_body')

# drop == ''
cond = df.m_body == ''
df = df[~cond]

In [22]:
df.m_body.describe()

count                                                169285
unique                                               169285
top       Attached is the new information for the Atlant...
freq                                                      1
Name: m_body, dtype: object

## Export DataFrame

In [23]:
#
# export DataFrame for later work
#

#df.to_csv('./data/01_first_clean.csv', index=False, index_label=False)
df.head()

Unnamed: 0,f_dir,m_id,m_date,m_from,m_to,m_cc,m_bcc,m_subj,mime_vers,cont_type,encode,x_from,x_to,x_cc,x_bcc,x_fold,x_orig,x_fname,m_body
0,allen-p/_sent_mail/1,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast
1,allen-p/_sent_mail/10,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,,,Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,,,Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!
3,allen-p/_sent_mail/1000,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy, Can you send me a schedule of the sal..."
4,allen-p/_sent_mail/1001,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,,,Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.


### Ending Stats

In [24]:
# execution time
print("--- %s seconds ---" % (time.time() - start_time))

# total email body chars (for process tracking)
current_chars = df.m_body.apply(len).sum()

print('{} of {} characters removed from corpus for {} reduction.'.format(current_chars, start_chars,((start_chars-current_chars)/(start_chars+current_chars))))

--- 29.43383526802063 seconds ---
135952755 of 954786906 characters removed from corpus for 0.7507145657922492 reduction.
