# The class definitions

### Class: EnronEmailParser
Parser for the emails included in the [Enron Email Dataset](http://bailando.sims.berkeley.edu/enron/enron_with_categories.tar.gz).

_Note: This particular implementation treats all recipients (To, Cc and Bcc) as the same type._

In [1]:
import json
import dateutil.parser
import datetime
import pytz
import re

class EnronEmailParser:
    '''
    Parser for the emails included in the Enron Email Dataset available at
    http://bailando.sims.berkeley.edu/enron/enron_with_categories.tar.gz

    The file is parsed as soon as the object is constructed. The header lines
    are expected in this order: Message-ID, Date, From, optional To, Subject
    (possibly spanning several lines), optional Cc, Mime-Version,
    Content-Type, Content-Transfer-Encoding and optional Bcc.

    To, Cc and Bcc addresses are collected into one set of recipients; the
    sender's own address is never added to that set.
    '''
    # Loose sanity check: something@something.something without extra '@'
    EMAIL_REGEX = re.compile(r"^[^@]+@[^@]+\.[^@]+$")
    # Timezone-aware Unix epoch used to turn the Date header into seconds
    EPOCH = datetime.datetime.utcfromtimestamp(0).replace(tzinfo=pytz.utc)

    def __init__(self, filename, verbose=False):
        '''
        Parse the email stored in the given file.

        filename: path of the email file to read
        verbose: when true, print the parsed fields as a JSON string

        Raises ValueError if the file does not follow the expected format.
        '''
        self.filename = filename
        self.ts = None
        self.sender = None
        self.recipients = None
        self.subject = None
        self.num_recipients = None
        self.json_repr = None
        self.parse()
        if verbose:
            # Single-argument call form works as a Python 2 statement and as
            # a Python 3 function call
            print(str(self))

    def to_json(self):
        '''
        Return the parsed fields as a JSON-serializable dict.

        The dict is built once and cached in self.json_repr. Recipients are
        listed in sorted order so that the output is deterministic.
        '''
        if not self.json_repr:
            self.json_repr = {
                'filename': self.filename,
                'ts': self.ts,
                'sender': self.sender,
                'recipients': sorted(self.recipients),
                'subject': self.subject
            }
        return self.json_repr

    def __str__(self):
        '''
        Return the parsed fields serialized as a JSON string.
        '''
        return json.dumps(self.to_json())

    def check_prefix(self, line, prefix):
        '''
        Make sure the line starts with the given prefix and strip the prefix and return the rest

        The prefix may be given with or without its trailing colon; when it
        is given without one, a colon directly following it is removed too.
        Surrounding whitespace is stripped from the returned value.

        Raises ValueError if the line does not start with the prefix.
        '''
        if not line.startswith(prefix):
            raise ValueError('Line invalid in {}, needs prefix "{}": {}'.format(self.filename, prefix, line))
        rest = line[len(prefix):]
        # Only drop the separator colon, never a character of the value, so
        # that "To:a@b.com" (no space after the colon) is read correctly
        if not prefix.endswith(':') and rest.startswith(':'):
            rest = rest[1:]
        return rest.strip()

    def parse_recipients(self, filehandle, prefix, line=None):
        '''
        Parse the recipient email addresses (To: Cc: and Bcc:)

        Including cases in which the data spans multiple lines

        filehandle: open file positioned right after the previous header line
        prefix: header name to look for, e.g. "To:"
        line: an already-read (stripped) line to start from; when empty or
              None the next line is read from the file handle

        Returns None when a recipient header was consumed, otherwise returns
        the stripped line that was read but does not carry the prefix, so
        that the caller can process it as the next header.

        Raises ValueError when an address does not look like an email address.
        '''
        if not line:
            line = filehandle.readline().strip()
        if not line.startswith(prefix):
            return line
        line = self.check_prefix(line, prefix)
        while line:
            for recipient in line.split(','):
                recipient = recipient.strip()
                if not recipient:
                    # Empty entry, e.g. after the trailing comma of a
                    # continued line or between two consecutive commas
                    continue
                if not self.EMAIL_REGEX.match(recipient):
                    raise ValueError('Invalid recipient address {} found in: "{}"'.format(recipient, line))
                if recipient != self.sender:
                    self.recipients.add(recipient)
            # A trailing comma means the address list continues on the next line
            line = filehandle.readline().strip() if line.endswith(',') else None
        return None

    def parse(self):
        '''
        Parse the file a line at a time according to expected format.

        Fills in ts (seconds since the Unix epoch), sender, recipients,
        subject and num_recipients. The remaining header lines and the
        message body are not read.

        Raises ValueError when a header line is missing or out of order, when
        an address is invalid, or when the file ends in the middle of the
        subject.
        '''
        with open(self.filename, 'r') as f:
            # Ignore Message-ID
            self.check_prefix(f.readline(), 'Message-ID')

            # Read date and time
            datetime_object = dateutil.parser.parse(self.check_prefix(f.readline().strip(), "Date:"))
            self.ts = (datetime_object - self.EPOCH).total_seconds()

            # Read sender
            self.sender = self.check_prefix(f.readline().strip(), "From:")
            if not self.EMAIL_REGEX.match(self.sender):
                raise ValueError('Invalid sender address found: {}'.format(self.sender))

            # Assume all recipients in the to:, cc:, bcc: lists are equivalent.
            self.recipients = set()

            # To addresses, if any
            unprocessed_line = self.parse_recipients(f, "To:")

            # Subject
            subjectline = unprocessed_line if unprocessed_line else f.readline().strip()
            self.subject = self.check_prefix(subjectline, "Subject:")

            # Handle case for multiline subject: everything up to the Cc or
            # Mime-Version header belongs to the subject
            raw_line = f.readline()
            while raw_line and not raw_line.strip().startswith(('Cc:', 'Mime-Version')):
                self.subject = self.subject + raw_line.strip()
                raw_line = f.readline()
            if not raw_line:
                # readline() returns '' at end of file; without this check a
                # truncated file would keep the loop above running forever
                raise ValueError('Unexpected end of file while reading the subject in {}'.format(self.filename))
            line = raw_line.strip()

            # Cc addresses, if any
            unprocessed_line = self.parse_recipients(f, "Cc:", line)

            # Mime version - ignore
            line = unprocessed_line if unprocessed_line else f.readline()
            self.check_prefix(line, 'Mime-Version')

            # Content type - ignore
            self.check_prefix(f.readline(), 'Content-Type')

            # Content-Transfer-Encoding - ignore
            self.check_prefix(f.readline(), 'Content-Transfer-Encoding')

            # Bcc addresses, if any; the line read when there is no Bcc
            # header is not needed because parsing stops here
            self.parse_recipients(f, "Bcc:")

            # Total number of recipients
            self.num_recipients = len(self.recipients)

            # The remaining header lines (X- fields) and the message body are
            # deliberately left unread for the time being.

### Class: EnronEmailDataset

Data handler for the Enron Email Dataset

_Note1: It relies on the EnronEmailParser class to do the actual email parsing._

_Note2: It uses pandas dataframes as the data storage objects._

In [2]:
import os
import pandas as pd

class EnronEmailDataset:
    '''
    Data handler for the Enron Email Dataset

    Relies on the EnronEmailParser class to do the actual email parsing.
    Uses pandas dataframes as the data storage objects:

    emails: indexed by email_id (the file path), with the columns
            ts, sender, num_recipients and subject
    recipients: one row per (email_id, recipient) pair
    '''

    EMAIL_COLUMNS = ['ts', 'sender', 'num_recipients', 'subject']
    RECIPIENT_COLUMNS = ['email_id', 'recipient']

    def __init__(self, data_dir):
        '''
        Survey the given directory and parse every email file found in it.

        data_dir: directory whose immediate sub-directories hold the .txt
                  email files

        Raises ValueError if data_dir is empty, None or not a directory.
        '''
        if not data_dir or not os.path.isdir(data_dir):
            # format() instead of string concatenation so that a None value
            # produces the intended ValueError rather than a TypeError
            raise ValueError('{} needs to be a valid directory'.format(data_dir))
        self.data_dir = data_dir
        self.email_files = []
        self.emails = None
        self.recipients = None
        self.survey()
        self.parse()

    def survey(self):
        '''
        Walk through the data directory making a note of all the email files

        Only .txt files located directly inside the immediate sub-directories
        of the data directory are recorded. Does nothing if files have
        already been recorded.
        '''
        if self.email_files:
            return
        # next(os.walk(d)) yields (dirpath, dirnames, filenames) for the top
        # level only; the builtin next() works on both Python 2 and 3
        for subdir in next(os.walk(self.data_dir))[1]:
            subdir = os.path.join(self.data_dir, subdir)
            for email_file in next(os.walk(subdir))[2]:
                if email_file.endswith('.txt'):
                    self.email_files.append(os.path.join(subdir, email_file))
        print('Surveyed {} email files'.format(len(self.email_files)))

    def parse(self):
        '''
        Parse all the emails surveyed and store the resulting data fields in the pandas dataframes

        Does nothing if the emails have already been parsed.
        '''
        # A DataFrame has no unambiguous truth value, so compare against None
        if self.emails is not None:
            return
        emails = {}
        recipients = []
        for email_file in self.email_files:
            parsed_email = EnronEmailParser(email_file)
            emails[parsed_email.filename] = (parsed_email.ts,
                                             parsed_email.sender,
                                             parsed_email.num_recipients,
                                             parsed_email.subject)
            for recipient in parsed_email.recipients:
                recipients.append((parsed_email.filename, recipient))
        # Passing the column names to the constructor keeps the frames well
        # formed (empty, but with the expected columns) when nothing was parsed
        self.emails = pd.DataFrame(list(emails.values()),
                                   index=list(emails.keys()),
                                   columns=self.EMAIL_COLUMNS)
        self.emails.index.name = 'email_id'
        self.recipients = pd.DataFrame(recipients, columns=self.RECIPIENT_COLUMNS)
        print('Parsed {} emails'.format(len(emails)))

# Basic Setup

Having defined the basic classes that will handle the data and parsing for us, we can now start to load and parse our data. The two main tables, aka dataframes, are shown below (limited to the top 10 rows in each case).

In [3]:
# Load and parse the Enron email dataset stored under ./data; constructing the
# handler surveys the directory and parses every email file it finds
enronData = EnronEmailDataset('./data')

Surveyed 1702 email files
Parsed 1702 emails


In [4]:
# Let's take a look at the first 10 rows of the emails table
enronData.emails.head(10)

Unnamed: 0_level_0,ts,sender,num_recipients,subject
email_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
./data/4/54650.txt,993726297,j.kaminski@enron.com,1,RE: Thu evening
./data/6/173776.txt,963928140,steven.kean@enron.com,1,Re: Price Cap Media--DRAFT
./data/1/138102.txt,1005755746,john.shelk@enron.com,2,RE: Dynegy/Enron Point of Contact
./data/1/173413.txt,951069180,steven.kean@enron.com,3,Re: Trade Mission
./data/1/219048.txt,997483225,ray.alvarez@enron.com,4,CONFIDENTIAL Attached file
./data/1/175244.txt,989784180,steven.kean@enron.com,1,"Presidential Gala, May 22nd"
./data/8/174465.txt,971088360,steven.kean@enron.com,1,Letter to Lynch from Us Describing Info ISO sh...
./data/4/54588.txt,993059614,j.kaminski@enron.com,1,RE: Catch Up
./data/4/173235.txt,878727600,steven.kean@enron.com,0,Enron Management Conference - Hyatt Hill Count...
./data/1/150157.txt,961597920,steve.duffy@enron.com,1,API Question; CONFIDENTIAL AND PRIVILEGED ATTO...


In [5]:
# The recipients table is maintained separately so that the emails dataframe
# does not have to hold lists as values; show its first 10 rows
enronData.recipients.head(10)

Unnamed: 0,email_id,recipient
0,./data/1/10425.txt,kenneth.lay@enron.com
1,./data/1/10425.txt,mark.frevert@enron.com
2,./data/1/10425.txt,jeff.skilling@enron.com
3,./data/1/10425.txt,mark.schroeder@enron.com
4,./data/1/10425.txt,joseph.sutton@enron.com
5,./data/1/10425.txt,john.sherriff@enron.com
6,./data/1/106296.txt,darrell.schoolcraft@enron.com
7,./data/1/106296.txt,steven.harris@enron.com
8,./data/1/106296.txt,kevin.hyatt@enron.com
9,./data/1/106296.txt,danny.mccarty@enron.com


# Basic analysis

Let's now do some basic analysis to see how we can use this data and play with it to get some insights and information of value.

## Question 1

In the next couple of sections I am trying to answer the following question:

**Let's label an email as "direct" if there is exactly one recipient and "broadcast" if it has multiple recipients. Identify the top 3 people who received the largest number of direct emails and the person (or people) who sent the largest number of broadcast emails.**

In [6]:
# Emails with exactly one recipient are the "direct" ones
single_recipient_emails = enronData.emails[enronData.emails['num_recipients'] == 1]

# Attach every direct email to the row of its recipient
direct_pairs = enronData.recipients.merge(single_recipient_emails,
                                          left_on='email_id',
                                          right_index=True)

# Count the direct emails each recipient received, largest count first
directs = (direct_pairs[['ts', 'recipient']]
           .groupby('recipient')
           .count()
           .sort_values(by='ts', ascending=False))
directs.columns = ['direct_email_count']
directs.head()

Unnamed: 0_level_0,direct_email_count
recipient,Unnamed: 1_level_1
maureen.mcvicker@enron.com,115
vkaminski@aol.com,43
jeff.dasovich@enron.com,25
richard.shapiro@enron.com,23
elizabeth.linnell@enron.com,18


In [7]:
# Emails with more than one recipient are the "broadcast" ones
is_broadcast = enronData.emails['num_recipients'] > 1

# Count the broadcast emails each sender sent, largest count first
broadcasts = (enronData.emails.loc[is_broadcast, ['sender', 'ts']]
              .groupby('sender')
              .count()
              .sort_values(by='ts', ascending=False))
broadcasts.columns = ['broadcast_email_count']
broadcasts.head()

Unnamed: 0_level_0,broadcast_email_count
sender,Unnamed: 1_level_1
steven.kean@enron.com,252
john.shelk@enron.com,83
j.kaminski@enron.com,40
miyung.buster@enron.com,31
alan.comnes@enron.com,19


## Answer 1

Based on the outputs above, we can say:

- The top three people who received the largest number of direct emails are:
    1. Maureen McVicker (maureen.mcvicker@enron.com)
    2. V Kaminski (vkaminski@aol.com)
    3. Jeff Dasovich (jeff.dasovich@enron.com)
- The person who sent the largest number of broadcast emails is **Steven Kean**

## Question 2

In this section I am trying to answer the following question:

**Find the five emails with the fastest response times. Please include file IDs, subject, sender, recipient, and response times. (A response is defined as a message from one of the recipients to the original sender whose subject line contains all of the words from the subject of the original email, and the response time should be measured as the difference between when the original email was sent and when the response was sent.)**

In [8]:
# Step 1: treat every email as a candidate response and pair it with the
# recipient rows in which its sender appears as a recipient
received = pd.merge(enronData.emails,
                    enronData.recipients,
                    left_on='sender',
                    right_on='recipient')

# Step 2: pull in the details of the email that was received; after this join
# the "_x" columns describe the candidate response and the "_y" columns the
# original email
candidates = pd.merge(received,
                      enronData.emails,
                      left_on='email_id',
                      right_index=True)

# Keep only the columns we need, under clearer names
responses = candidates[['ts_x', 'subject_x', 'email_id', 'recipient', 'ts_y', 'sender_y', 'subject_y']]
responses = responses.rename(columns={
    'ts_x': 'response_ts',
    'subject_x': 'response_subject',
    'ts_y': 'ts',
    'sender_y': 'sender',
    'subject_y': 'subject',
})

# Derived columns used by the filters below
responses['lowercase_subject'] = responses['subject'].str.lower()
responses['response_time'] = responses['response_ts'] - responses['ts']

# Ignore originals whose subject is empty or only a reply/forward marker, and
# candidates that were not sent after the original
has_real_subject = ~responses['lowercase_subject'].isin(['', 're:', 'fwd:'])
sent_afterwards = responses['response_time'] > 0
responses = responses[has_real_subject & sent_afterwards]

# The response subject has to contain the original subject
def contains_original_subject(row):
    return row['subject'] in row['response_subject']

responses = responses[responses.apply(contains_original_subject, axis=1)]

# For each original email keep the quickest response, then rank the emails by
# that response time
responses = responses.sort_values(by=['email_id', 'response_time'], ascending=[True, True])
responses = responses.groupby('email_id').first().sort_values(by='response_time', ascending=True)

# Final presentation: useful columns only, with descriptive names
responses = responses[['subject', 'sender', 'recipient', 'response_time']]
responses.columns = ['subject', 'sender', 'recipient_responder', 'response_time_in_seconds']
responses.head(10)

Unnamed: 0_level_0,subject,sender,recipient_responder,response_time_in_seconds
email_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
./data/1/139495.txt,FW: Confidential - GSS Organization Value to ETS,rod.hayslett@enron.com,stanley.horton@enron.com,148
./data/1/228996.txt,RE: CONFIDENTIAL Personnel issue,michelle.cash@enron.com,lizzette.palmer@enron.com,236
./data/1/121747.txt,Re: CONFIDENTIAL - Residential in CA,karen.denne@enron.com,jeff.dasovich@enron.com,240
./data/4/122923.txt,RE: Eeegads...,paul.kaufman@enron.com,jeff.dasovich@enron.com,240
./data/1/201878.txt,FW: SRP SETTLEMENT PROPOSAL - PRIVILEGED AND C...,m..tholt@enron.com,stephanie.miller@enron.com,262
./data/5/228981.txt,RE: CONFIDENTIAL Personnel issue,lizzette.palmer@enron.com,michelle.cash@enron.com,322
./data/1/221669.txt,ISO-NE failure to mitigate ICAP market -- Rele...,thane.twiggs@enron.com,dana.davis@enron.com,360
./data/4/228911.txt,RE: CONFIDENTIAL Personnel issue,lizzette.palmer@enron.com,michelle.cash@enron.com,362
./data/1/153662.txt,CONFIDENTIAL AND LEGALLY PRIVILEGED,marcus.nettelton@enron.com,vicki.sharp@enron.com,543
./data/3/121748.txt,Re: CONFIDENTIAL - Residential in CA,jeff.dasovich@enron.com,karen.denne@enron.com,660


## Answer 2

Based on the outputs above, we can say that the five emails with the fastest response times are:

| id   | email_id | subject | sender | recipient_responder | response_time_in_seconds |			
| ---  | -------- | ------- | ------ | ------------------- | ------------------------ |
| 1 | ./data/1/139495.txt | FW: Confidential - GSS Organization Value to ETS | rod.hayslett@enron.com | stanley.horton@enron.com | 148 |
| 2 | ./data/1/228996.txt | RE: CONFIDENTIAL Personnel issue | michelle.cash@enron.com | lizzette.palmer@enron.com | 236 |
| 3 | ./data/1/121747.txt | Re: CONFIDENTIAL - Residential in CA | karen.denne@enron.com | jeff.dasovich@enron.com | 240 |
| 4 | ./data/4/122923.txt | RE: Eeegads... | paul.kaufman@enron.com | jeff.dasovich@enron.com | 240 |
| 5 | ./data/1/201878.txt | FW: SRP SETTLEMENT PROPOSAL - PRIVILEGED AND C... | m..tholt@enron.com | stephanie.miller@enron.com | 262 |