# First Clean

In [1]:
# import library
import pandas as pd

In [2]:
#
#
# custom functions
#
# email reader
def read_email(email_path):
    """returns email body as a body of text"""
    
    # file manager opens email file, assigns it to variable
    with open(email_path) as email_file:
        email_body = email_file.read()
    
    # returns email body as text
    return email_body

In [3]:
# import working df
df = pd.read_csv('./data/00_original_wrangle.csv', header=0, index_col=False)

In [4]:
# check info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 19 columns):
f_dir        517401 non-null object
m_id         517401 non-null object
m_date       517401 non-null object
m_from       517401 non-null object
m_to         496355 non-null object
m_cc         135166 non-null object
m_bcc        127886 non-null object
m_subj       498214 non-null object
mime_vers    517401 non-null float64
cont_type    517401 non-null object
encode       517401 non-null object
x_from       517401 non-null object
x_to         508255 non-null object
x_cc         128886 non-null object
x_bcc        175 non-null object
x_fold       517401 non-null object
x_orig       517401 non-null object
x_fname      517401 non-null object
m_body       517401 non-null object
dtypes: float64(1), object(18)
memory usage: 75.0+ MB


In [5]:
# fill na with '' possibly here

# convert mime_vers to dtype object to run string methods
df.mime_vers = df.mime_vers.astype('str')


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 19 columns):
f_dir        517401 non-null object
m_id         517401 non-null object
m_date       517401 non-null object
m_from       517401 non-null object
m_to         496355 non-null object
m_cc         135166 non-null object
m_bcc        127886 non-null object
m_subj       498214 non-null object
mime_vers    517401 non-null object
cont_type    517401 non-null object
encode       517401 non-null object
x_from       517401 non-null object
x_to         508255 non-null object
x_cc         128886 non-null object
x_bcc        175 non-null object
x_fold       517401 non-null object
x_orig       517401 non-null object
x_fname      517401 non-null object
m_body       517401 non-null object
dtypes: object(19)
memory usage: 75.0+ MB


## Start Cleaning

In [7]:
# PRESENTATION
# clean leading/trailing whitespace
df = df.apply(lambda x: x.str.lstrip().str.rstrip())

In [8]:
# PRESENTATION
# remove redundant string from f_dir
df.f_dir = df.f_dir.str.replace('./data/maildir/','')

In [9]:
# PRESENTATION
# returns rows where the email body is duplicated
x = len(df[df.m_body.duplicated()])

# Drop duplicates based on body text
df = df.drop_duplicates(subset = 'm_body')

# print count
print('Email Body Duplicates: {}'.format(x))

Email Body Duplicates: 269530


In [10]:
# PRESENTATION
# Select only @enron email address
only_enron = df.m_from.str.endswith('@enron.com')

# Filter df
df = df[only_enron]

# print count
print('Emails Returned  in \'@enron.com\': {}'.format(len(df)))

Emails Returned  in '@enron.com': 191433


In [11]:
# PRESENTATION
# remove fowarded emails
fw_query = df.m_subj.str.startswith('FW:', na=False)

# Filter out forwarded emails
df = df[~fw_query]

# print count
print('After FW query: {}'.format(len(df)))

After FW query: 172487


In [12]:
# USER TOOL
# email text by line as list
e_string_rev = './data/maildir/' + 'dasovich-j/all_documents/1780'
print(read_email(e_string_rev))

Message-ID: <135221.1075842980646.JavaMail.evans@thyme>
Date: Tue, 26 Sep 2000 09:35:00 -0700 (PDT)
From: jeff.dasovich@enron.com
To: jalamo@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Jeff Dasovich
X-To: jalamo@enron.com
X-cc: 
X-bcc: 
X-Folder: \Jeff_Dasovich_Dec2000\Notes Folders\All documents
X-Origin: DASOVICH-J
X-FileName: jdasovic.nsf

----- Forwarded by Jeff Dasovich/NA/Enron on 09/26/2000 04:34 PM -----

	Jeff Dasovich
	Sent by: Jeff Dasovich
	09/26/2000 10:08 AM
		 
		 To: Joseph Alamo/SFO/EES
		 cc: 
		 Subject: 

Hi and good morning.  Could you please book me a ticket for Portland.  Need 
to arrive in Portland on this coming Monday morning by 8-8:30 AM, and return 
to SF at around 4:30-5:00 PM on Tuesday.  Thanks very much.  Also, could you 
book me a hotel room at the 5th Avenue place (or whatever it's called).  
Muchos gracias.


In [13]:
# Count number of emails sent per email address 

c = pd.DataFrame(df.groupby(['m_from']).size().sort_values(ascending=False))
c = c.reset_index()
c.columns = ['m_from', 'sent_counts']
c.astype({'sent_counts' : 'int'}).dtypes

m_from         object
sent_counts     int32
dtype: object

In [14]:
# merge sent counts based on m_from email back to DataFrame
df = pd.merge(df, c, how='left', on='m_from')

In [15]:
# Looking at counts for 'Forwarded by' occurence
print(len(df))
print(len(df[df.m_body.str.contains('-- Forwarded by')]))

# create condition for 'Forwarded by'
cond = df.m_body.str.contains('-- Forwarded by')

# set df to ignore email body's with condition
df = df[~cond]

# print record count
print(len(df))

172487
33735
138752


In [16]:
#
# export DataFrame for later work
#

# df.to_csv('./data/01_first_clean.csv', index=False, index_label=False)
df.head()

Unnamed: 0,f_dir,m_id,m_date,m_from,m_to,m_cc,m_bcc,m_subj,mime_vers,cont_type,encode,x_from,x_to,x_cc,x_bcc,x_fold,x_orig,x_fname,m_body,sent_counts
0,allen-p/_sent_mail/1,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast,606
1,allen-p/_sent_mail/10,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,,,Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,606
2,allen-p/_sent_mail/100,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,,,Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,606
3,allen-p/_sent_mail/1000,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",606
4,allen-p/_sent_mail/1001,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,,,Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,606


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138752 entries, 0 to 172486
Data columns (total 20 columns):
f_dir          138752 non-null object
m_id           138752 non-null object
m_date         138752 non-null object
m_from         138752 non-null object
m_to           132960 non-null object
m_cc           42401 non-null object
m_bcc          41009 non-null object
m_subj         132115 non-null object
mime_vers      138752 non-null object
cont_type      138752 non-null object
encode         138752 non-null object
x_from         138752 non-null object
x_to           134187 non-null object
x_cc           41339 non-null object
x_bcc          117 non-null object
x_fold         138752 non-null object
x_orig         138752 non-null object
x_fname        138752 non-null object
m_body         138752 non-null object
sent_counts    138752 non-null int64
dtypes: int64(1), object(19)
memory usage: 22.2+ MB
