# 02 Add Gender

In [1]:
# time routine
import time
start_time = time.time()

# import library
import pandas as pd
import re

In [2]:
%%time
# import working df
df = pd.read_csv('./data/01_first_clean.csv', header=0, index_col=False)

Wall time: 2.83 s


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169285 entries, 0 to 169284
Data columns (total 19 columns):
f_dir        169285 non-null object
m_id         169285 non-null object
m_date       169285 non-null object
m_from       169285 non-null object
m_to         163413 non-null object
m_cc         52714 non-null object
m_bcc        50250 non-null object
m_subj       162429 non-null object
mime_vers    169285 non-null float64
cont_type    169285 non-null object
encode       169285 non-null object
x_from       169285 non-null object
x_to         164697 non-null object
x_cc         50633 non-null object
x_bcc        131 non-null object
x_fold       169285 non-null object
x_orig       169285 non-null object
x_fname      167315 non-null object
m_body       169285 non-null object
dtypes: float64(1), object(18)
memory usage: 24.5+ MB


In [4]:
# total email body chars (for process tracking)
start_chars = df.m_body.apply(len).sum()

print('Total Characters Across All Email Bodies in Corpus: {}.'.format(start_chars))

Total Characters Across All Email Bodies in Corpus: 135952755.


### Separate Columns of Interest (Emails, Internal From Field)

In [5]:
# take out columns for work
fn = df[['m_from', 'x_from']].copy()

# drop duplicates based on email address
fn = fn.drop_duplicates(subset='m_from')

# reset index for the new df
fn = fn.reset_index(drop=True)

# make both columns lowercase
fn.m_from = fn.m_from.str.lower()
fn.x_from = fn.x_from.str.lower()

In [6]:
# show example of m_from
fn.head()

Unnamed: 0,m_from,x_from
0,phillip.allen@enron.com,phillip k allen
1,ina.rangel@enron.com,ina rangel
2,critical.notice@enron.com,critical.notice@enron.com
3,rebecca.cantrell@enron.com,rebecca w cantrell
4,paul.kaufman@enron.com,paul kaufman


In [7]:
def clean_m_from(s):
    """Use this to recover names from the email addresses
       with regular expression captures. 8 cases are outlined
       below for capture. A final case (ZZ) wipes the cell.
       
       Wiped cell will be used to consider value fills from
       strings recovered from the X-From column."""
    #
    # cleans email addresses for the following cases, returning a string to place in a subsequent (name) column.
    # the following notation is used to explain cases:
    #     fname = first name
    #     lname = last name
    #     mname = middle name
    #     init = initial
    #     ffname = another word considered with first name
    #     scotts = name string contains an apostrophe
    #
    # CASE 01: fname'.lname'@enron.com (fname lname)
    # CASE 02: string@enron.com (string)
    # CASE 03: finit..lname@enron.com or finit.lname@enron.com (finit lname)
    # CASE 04: fname.minit.lname@enron.com (fname lname)
    # CASE 05: fname-ffname.lname@enron.com (fname-ffname lname)
    # CASE 06: fname.lname.enronxgate@enron.com (fname lname)
    # CASE 07: fname_lname@enron.com (fname lname)
    # CASE 08: fname.lname-llname@enron.com (fname lname-llname)
    # CASE ZZ [CREATE NULL]: @enron.com ('')
    # 

    try:

        # CASE 01
        p = '^([A-Za-z\']+)[.]([A-Za-z\']+)@enron.com$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)
                
        # CASE 02
        p = '^([A-Za-z]+)@enron.com$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]) # (string)
                    
        # CASE 03
        p = '^([A-Za-z])[.]{1,2}([A-Za-z]+)@enron.com$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (finit lname)

        # CASE 04
        p = '^([A-Za-z]+)[.][A-Za-z][.]([A-Za-z]+)@enron.com$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)
            
        # CASE 05
        p = '^([A-Za-z]+[-][A-Za-z]+)[.]([A-Za-z]+)@enron.com$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname-ffname lname)
            
        # CASE 06
        p = '^([A-Za-z]+)[.]([A-Za-z]+).enronxgate@enron.com$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)
        
        # CASE 07
        p = '^([A-Za-z\']+)[_]([A-Za-z\']+)@enron.com$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)
        
        # CASE 08
        p = '^([A-Za-z]+)[.]([A-Za-z]+[-][A-Za-z]+)@enron.com$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)

        # CASE R02
        p = '^[A-Za-z] [A-Za-z\'-]+$'
        if re.search(p, s):
            s = ''

        # CASE R01
        buzz_words = ['team', 'technology', 'security', 'enron', 'chairman', 'office', 'announcement', 'wizard', 'notice', 'address', 'hr taylor', 'coo jeff', 'infrastructure', 'ubsw', 'europe',
                      'human', 'resources', 'connection', 'ibuyit', 'users', 'livelink', 'registrar', 'global', 'business', 'compensation', 'executive', 'risk', 'analytics', 'daemon', 'information',
                      'management', 'helpdesk', 'project', 'sunrise', 'oncall', 'credit', 'union', 'notification', 'central', 'communication', 'center', 'parking', 'transportation', 'international',
                      'diversity', 'survey', 'automation', 'document', 'exec', 'iscinfra', 'public', 'relations', 'controls', 'exchange', 'cms router', 'hotline', 'admin', 'pr id', 'expertfinder',
                      'notes', 'the buzz', 'gpg dss', 'xi xi', 'enw piper', 'institute', 'agent', 'tarrif']
                      
        for word in buzz_words:
            if re.search(word, s):
                s = ''
        
        # CASE ZZ
        p = '^[A-Za-z\'-]+ [A-Za-z\'-]+$'
        if re.search(p, s):
            s
        else:
            s = ''

        return(s)
    
    except:
        
        print('EXCEPTION RETURN: ' + s)
        return(s)

### Parse Names from Email Addresses

In [8]:
# run custom function to create a new column
fn['m_from_cleaned'] = fn.m_from.apply(clean_m_from)

# change column order
fn = fn[['m_from', 'm_from_cleaned', 'x_from']]

# check head
fn.head()

Unnamed: 0,m_from,m_from_cleaned,x_from
0,phillip.allen@enron.com,phillip allen,phillip k allen
1,ina.rangel@enron.com,ina rangel,ina rangel
2,critical.notice@enron.com,,critical.notice@enron.com
3,rebecca.cantrell@enron.com,rebecca cantrell,rebecca w cantrell
4,paul.kaufman@enron.com,paul kaufman,paul kaufman


In [9]:
def clean_x_from(s):
    """Use this to recover names from the X-From column
       with regular expression captures. [ ] cases are outlined
       below for capture. A final case (ZZ) wipes the cell.
       
       Wiped cell will be used to match any possible fills between
       m_from and x_from."""
    #
    # cleans email addresses for the following cases, returning a string to place in a subsequent (name) column.
    # the following notation is used to explain cases:
    #     fname = first name
    #     lname = last name
    #     mname = middle name
    #     init = initial
    #     ffname = another word considered with first name
    #     scotts = name string contains an apostrophe
    #
    # CASE 01: 
    # CASE 02: 
    # CASE 03: 
    # CASE 04: 
    # CASE 05: 
    # CASE 06: 
    # CASE 07: 
    # CASE 08: 
    # CASE ZZ [CREATE NULL]: @enron.com ('')
    # 

    try:
        
        # CASE 01
        p = '^([A-Za-z]+) ([A-Za-z]+)$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)

        # CASE 03
        p = '^([A-Za-z]+), ([A-Za-z]+) [A-Za-z][.] {1,2}<.+$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[2]+' '+s[1]) # (fname lname)    
            
        # CASE 02
        p = '^([A-Za-z\'-]+), ([A-Za-z\'-]+) <.+$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[2]+' '+s[1]) # (fname lname)
            
            # removes 'team.' subsets of this pattern
            if re.search('team', s):
                s = ''

        # CASE 14
        p = '^[A-Za-z] ([A-Za-z]+) ([A-Za-z]+) <.+$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)                
                
        # CASE 04
        p = '^([A-Za-z]+) [A-Za-z] ([A-Za-z]+) <.+$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)

        # CASE 05
        p = '^([A-Za-z]+) ([A-Za-z]+) <.+$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)
            
        # CASE 06
        p = '^([A-Za-z]+) [A-Za-z]{2,} ([A-Za-z]+) <.+$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)

        # CASE 13
        p = '^[A-Za-z] ([A-Za-z]+) ([A-Za-z]+)$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)
            
        # CASE 07
        p = '^([A-Za-z]+) [A-Za-z]+ ([A-Za-z]+)$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[1]+' '+s[2]) # (fname lname)
        
        # CASE 08
        p = '^([A-Za-z]+), ([A-Za-z]+) [A-Za-z]+ {1,2}<.+$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[2]+' '+s[1]) # (fname lname)
            
        # CASE 09
        p = '^([A-Za-z]+), ([A-Za-z]+)$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[2]+' '+s[1]) # (fname lname)
            
        # CASE 10
        p = '^([A-Za-z]+) [A-Za-z]+., ([A-Za-z]+) <.+$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[2]+' '+s[1]) # (fname lname)
            
        # CASE 11
        p = '^([A-Za-z]+) [A-Za-z]+[.], ([A-Za-z]+) [A-Za-z][.] <.+$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[2]+' '+s[1]) # (fname lname)

        # CASE 12
        p = '^([A-Za-z]+) [A-Za-z]+[.], ([A-Za-z]+)[A-Za-z .]{0,3}$'
        if re.search(p, s):
            s = re.search(p, s)
            s = (s[2]+' '+s[1]) # (fname lname)

        # CASE R02
        p = '^[A-Za-z] [A-Za-z\'-]+$'
        if re.search(p, s):
            s = ''

        # CASE R01
        buzz_words = ['team', 'technology', 'security', 'enron', 'chairman', 'office', 'announcement', 'wizard', 'notice', 'address', 'hr taylor', 'coo jeff', 'infrastructure', 'ubsw', 'europe',
                      'human', 'resources', 'connection', 'ibuyit', 'users', 'livelink', 'registrar', 'global', 'business', 'compensation', 'executive', 'risk', 'analytics', 'daemon', 'information',
                      'management', 'helpdesk', 'project', 'sunrise', 'oncall', 'credit', 'union', 'notification', 'central', 'communication', 'center', 'parking', 'transportation', 'international',
                      'diversity', 'survey', 'automation', 'document', 'exec', 'iscinfra', 'public', 'relations', 'controls', 'exchange', 'cms router', 'hotline', 'admin', 'pr id', 'expertfinder',
                      'notes', 'the buzz', 'gpg dss', 'xi xi', 'enw piper', 'institute', 'agent', 'tarrif']
                      
        for word in buzz_words:
            if re.search(word, s):
                s = ''
        
        # CASE ZZ
        p = '^[A-Za-z\'-]+ [A-Za-z\'-]+$'
        if re.search(p, s):
            s
        else:
            s = ''
        
        return(s)
    
    except:
        
        print('EXCEPTION RETURN: ' + s.group())
        return(s)

### Parse Names from X-From Field

In [10]:
# x_from
fn['x_from_cleaned'] = fn.x_from.apply(clean_x_from)

# peek results
fn.head()

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned
0,phillip.allen@enron.com,phillip allen,phillip k allen,phillip allen
1,ina.rangel@enron.com,ina rangel,ina rangel,ina rangel
2,critical.notice@enron.com,,critical.notice@enron.com,
3,rebecca.cantrell@enron.com,rebecca cantrell,rebecca w cantrell,rebecca cantrell
4,paul.kaufman@enron.com,paul kaufman,paul kaufman,paul kaufman


### Remove Non-Names

In [11]:
# condition to catch rows where both clean columns are blank
cond = (fn.m_from_cleaned == '') & (fn.x_from_cleaned == '')

# copy new df of non-condition return
cd = fn[~cond].copy()

    # fill m_from_cleaned empty values with the value from x_from_cleaned
    cd.loc[cd['m_from_cleaned'] == '', 'm_from_cleaned'] = cd['x_from_cleaned']

    # fill x_from_cleaned empty values with the value from m_from_cleaned
    cd.loc[cd['x_from_cleaned'] == '', 'x_from_cleaned'] = cd['m_from_cleaned']

In [12]:
# create a capture column for the final fname lname
cd['clean_name'] = cd['x_from_cleaned']

# fill blanks with other column returns
cd.loc[cd['clean_name'] == '', 'clean_name'] = cd['m_from_cleaned']

### Evaluate Conflicts Between Parses

In [13]:
# return differences between the two parsed values if they both returned a value
makin_picks = cd[(cd.m_from_cleaned != cd.x_from_cleaned) & (cd.m_from_cleaned != '') & (cd.x_from_cleaned != '')]

# manually review mismatches
print(len(makin_picks))
makin_picks

32


Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name
168,buckner.thomas@enron.com,buckner thomas,"thomas, john buckner </o=enron/ou=na/cn=recipi...",john thomas,john thomas
220,jae.black@enron.com,jae black,"black, tamara jae </o=enron/ou=na/cn=recipient...",tamara black,tamara black
338,pinto.leite@enron.com,pinto leite,"leite, francisco pinto </o=enron/ou=na/cn=reci...",francisco leite,francisco leite
1061,dana.davis@enron.com,dana davis,"davis, mark dana </o=enron/ou=na/cn=recipients...",mark davis,mark davis
1096,kay.miller@enron.com,kay miller,"miller, mary kay </o=enron/ou=na/cn=recipients...",mary miller,mary miller
1113,ann.foret@enron.com,ann foret,"foret, leigh ann </o=enron/ou=na/cn=recipients...",leigh foret,leigh foret
1553,ann.matson@enron.com,ann matson,"matson, jo ann </o=enron/ou=na/cn=recipients/c...",jo matson,jo matson
1570,ann.hill@enron.com,ann hill,"hill, jo ann </o=enron/ou=na/cn=recipients/cn=...",jo hill,jo hill
1628,marie.allex@enron.com,marie allex,"allex, anne marie </o=enron/ou=na/cn=recipient...",anne allex,anne allex
2296,nell.browning@enron.com,nell browning,"browning, mary nell </o=enron/ou=na/cn=recipie...",mary browning,mary browning


In [14]:
# manual review, update where names are not equal
# 
# decision based on most gender-strong form of the name assumption
# 
# 

cd.loc[cd['m_from_cleaned'] == 'kay miller', 'clean_name'] = 'kay miller'
cd.loc[cd['m_from_cleaned'] == 'ann foret', 'clean_name'] = 'ann foret'
cd.loc[cd['m_from_cleaned'] == 'ann hill', 'clean_name'] = 'ann hill'
cd.loc[cd['m_from_cleaned'] == 'ann matson', 'clean_name'] = 'ann matson'
cd.loc[cd['m_from_cleaned'] == 'ann chance', 'clean_name'] = 'ann chance'
cd.loc[cd['m_from_cleaned'] == 'lynn schaffart', 'clean_name'] = 'lynn schaffart'

# clean ' from strings
cd.loc[cd['m_from_cleaned'] == 'chenee\' franklin', 'clean_name'] = 'chenee franklin'
cd.loc[cd['m_from_cleaned'] == '\'todd\' delahoussaye', 'clean_name'] = 'todd delahoussaye'

### Separate, Clean First Name for Scraping

In [15]:
cd['gender_query'] = cd['clean_name'].str.extract('^([A-Za-z\'-]+) [A-Za-z\'-]+$')
cd.head()

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name,gender_query
0,phillip.allen@enron.com,phillip allen,phillip k allen,phillip allen,phillip allen,phillip
1,ina.rangel@enron.com,ina rangel,ina rangel,ina rangel,ina rangel,ina
3,rebecca.cantrell@enron.com,rebecca cantrell,rebecca w cantrell,rebecca cantrell,rebecca cantrell,rebecca
4,paul.kaufman@enron.com,paul kaufman,paul kaufman,paul kaufman,paul kaufman,paul
6,stephanie.miller@enron.com,stephanie miller,stephanie miller,stephanie miller,stephanie miller,stephanie


### Remove Hyphen From Strings

In [16]:
# contains '-'
cond = cd.gender_query.str.contains('-')

# return fields
cd[cond]

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name,gender_query
73,sarah-joy.hunter@enron.com,sarah-joy hunter,sarah-joy hunter,sarah-joy hunter,sarah-joy hunter,sarah-joy
615,jan-erland.bekeng@enron.com,jan-erland bekeng,jan-erland bekeng,jan-erland bekeng,jan-erland bekeng,jan-erland
2021,xochitl-alexis.velasco@enron.com,xochitl-alexis velasco,xochitl-alexis velasco,xochitl-alexis velasco,xochitl-alexis velasco,xochitl-alexis
2355,sladana-anna.kulic@enron.com,sladana-anna kulic,"kulic, sladana-anna </o=enron/ou=na/cn=recipie...",sladana-anna kulic,sladana-anna kulic,sladana-anna
2727,seung-taek.oh@enron.com,seung-taek oh,"oh, seung-taek </o=enron/ou=na/cn=recipients/c...",seung-taek oh,seung-taek oh,seung-taek
5343,jung-suk.suh@enron.com,jung-suk suh,jung-suk suh,jung-suk suh,jung-suk suh,jung-suk
5961,jae-moo.lee@enron.com,jae-moo lee,jae-moo lee,jae-moo lee,jae-moo lee,jae-moo


In [17]:
# manual review, hyphen names return bad values from web query
#
# removing one name and learing the most gender-strong form for query
#
#

cd.loc[cd['gender_query'] == 'sarah-joy', 'gender_query'] = 'sarah'
cd.loc[cd['gender_query'] == 'jan-erland', 'gender_query'] = 'jan'
cd.loc[cd['gender_query'] == 'sladana-anna', 'gender_query'] = 'anna'
cd.loc[cd['gender_query'] == 'seung-taek', 'gender_query'] = 'seung'
cd.loc[cd['gender_query'] == 'xochitl-alexis', 'gender_query'] = 'alexis'
cd.loc[cd['gender_query'] == 'jung-suk', 'gender_query'] = 'jung'
cd.loc[cd['gender_query'] == 'jae-moo', 'gender_query'] = 'jae'

In [18]:
cd[cd.gender_query == 'estalee']

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name,gender_query
2163,estalee.russi@enron.com,estalee russi,"russi, estalee </o=enron/ou=na/cn=recipients/c...",estalee russi,estalee russi,estalee


In [19]:
len(cd.drop_duplicates('gender_query'))

1699

In [20]:
len(cd)

5930

### Web Scraping the Gender

In [21]:
# create name series
fname_series = []
fname_series = cd.drop_duplicates('gender_query').gender_query

# sort values, reset index
fname_series = fname_series.sort_values()
fname_series = fname_series.reset_index(drop=True)

In [22]:
# create list for web scrape
print(len(fname_series))
fname_list = fname_series.values.tolist()

1699


In [23]:
#
# baby name guesser function
#

def get_gender_guesser(name):
    
    # capture page response
    with HTMLSession() as s:
        
        url = b_url + name
        
        # set request var
        r = s.get(url, headers=headers)

        # parse out gender
        if r.html.search('It\'s a {}!') is not None:
            gender = r.html.search('It\'s a {}!')[0]
        else:
            gender = 'not_found'

        # record captures
        csv_writer.writerow([name, url, gender])

In [24]:
%%time
#
# Pass names to gpeters.com baby name guesser
#

# base url
b_url = 'https://www.gpeters.com/names/baby-names.php?name='

# emulate browser
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
}

# import library
from requests_html import HTML, HTMLSession
import csv
import concurrent.futures

# create output csv
csv_file = open('./data/name_scrape_guesser.csv', 'w', newline='')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['name', 'url', 'gender'])

# concurrent futures executor
with concurrent.futures.ThreadPoolExecutor() as executor:  
    future = executor.map(get_gender_guesser, fname_list)
    
# close csv
csv_file.close()

Wall time: 1min 51s


In [25]:
# import csv scrape
g_df = pd.read_csv('./data/name_scrape_guesser.csv')

# drop duplicates
g_df = g_df.drop_duplicates()

# sort values & reset index
g_df = g_df.sort_values('name').reset_index(drop=True)

# drop the URLs used to scrape
g_df = g_df[['name', 'gender']]
len(g_df)

1699

In [26]:
# correcting gender return

g_df.loc[g_df['name'] == 'todd', 'gender'] = 'boy' # todd is a boy
g_df.loc[g_df['name'] == 'gay', 'gender'] = 'girl' # todd is a boy

In [27]:
# check length
a = len(g_df[g_df.gender == 'not_found'])
b = len(g_df)
print('{} of {} not found.'.format(a, b))

# view head
g_df.head()

61 of 1699 not found.


Unnamed: 0,name,gender
0,aamir,boy
1,aaron,boy
2,abhijeet,boy
3,adam,boy
4,adarsh,boy


In [28]:
g_df[g_df.gender == 'not_found'][0:50]

Unnamed: 0,name,gender
24,albernita,not_found
40,alhamd,not_found
113,arquella,not_found
154,bessik,not_found
190,breanden,not_found
201,britaldo,not_found
220,cantekin,not_found
240,castlen,not_found
267,chonawee,not_found
359,datren,not_found


### Merge Name & Gender to DataFrame

In [29]:
# merge 
m_df = pd.merge(cd, g_df, how='left', left_on='gender_query', right_on='name')
print(len(m_df))
m_df.head()

5930


Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name,gender_query,name,gender
0,phillip.allen@enron.com,phillip allen,phillip k allen,phillip allen,phillip allen,phillip,phillip,boy
1,ina.rangel@enron.com,ina rangel,ina rangel,ina rangel,ina rangel,ina,ina,girl
2,rebecca.cantrell@enron.com,rebecca cantrell,rebecca w cantrell,rebecca cantrell,rebecca cantrell,rebecca,rebecca,girl
3,paul.kaufman@enron.com,paul kaufman,paul kaufman,paul kaufman,paul kaufman,paul,paul,boy
4,stephanie.miller@enron.com,stephanie miller,stephanie miller,stephanie miller,stephanie miller,stephanie,stephanie,girl


In [30]:
# updating values that returned 'not_found' but were verified individually

manual_updates = dict(
    [
('shemeika landry', 'girl'),
('castlen moore', 'girl'),
('geynille dillingham', 'girl'),
('albernita travis', 'girl'),
('taquesha frank', 'girl'),
('geiv dubash', 'boy'),
('luchas johnson', 'boy'),
('estalee russi', 'girl'),
('gay mayeux', 'girl'),
('musslewhite diane', 'girl'),
('sidrac flores', 'boy'),
('khymberly booth', 'girl'),
('elberg gelin', 'boy'),
('tantra invedy', 'boy'),
    ])

for k, v in manual_updates.items():
    m_df.loc[m_df['clean_name'] == k, 'gender'] = v

In [31]:
# export csv for gender key
email_gender_key = m_df[['m_from', 'name', 'gender']]
email_gender_key.head()

Unnamed: 0,m_from,name,gender
0,phillip.allen@enron.com,phillip,boy
1,ina.rangel@enron.com,ina,girl
2,rebecca.cantrell@enron.com,rebecca,girl
3,paul.kaufman@enron.com,paul,boy
4,stephanie.miller@enron.com,stephanie,girl


In [32]:
email_gender_key.to_csv('./data/email_gender_key.csv', index=False, index_label=False)

In [33]:
# merge DataFrame with the genders, join on email address
df_gender = pd.merge(df, email_gender_key, how='left', on='m_from')

In [34]:
df_gender.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169285 entries, 0 to 169284
Data columns (total 21 columns):
f_dir        169285 non-null object
m_id         169285 non-null object
m_date       169285 non-null object
m_from       169285 non-null object
m_to         163413 non-null object
m_cc         52714 non-null object
m_bcc        50250 non-null object
m_subj       162429 non-null object
mime_vers    169285 non-null float64
cont_type    169285 non-null object
encode       169285 non-null object
x_from       169285 non-null object
x_to         164697 non-null object
x_cc         50633 non-null object
x_bcc        131 non-null object
x_fold       169285 non-null object
x_orig       169285 non-null object
x_fname      167315 non-null object
m_body       169285 non-null object
name         162847 non-null object
gender       162847 non-null object
dtypes: float64(1), object(20)
memory usage: 28.4+ MB


In [35]:
#
# export DataFrame for later work
#

# df_gender.to_csv('./data/02_add_gender.csv', index=False, index_label=False)
df_gender.head()

Unnamed: 0,f_dir,m_id,m_date,m_from,m_to,m_cc,m_bcc,m_subj,mime_vers,cont_type,...,x_from,x_to,x_cc,x_bcc,x_fold,x_orig,x_fname,m_body,name,gender
0,allen-p/_sent_mail/1,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,,,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast,phillip,boy
1,allen-p/_sent_mail/10,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,,,Re:,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,phillip,boy
2,allen-p/_sent_mail/100,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,,,Re: test,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,phillip,boy
3,allen-p/_sent_mail/1000,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,,,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy, Can you send me a schedule of the sal...",phillip,boy
4,allen-p/_sent_mail/1001,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,,,Re: Hello,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,phillip,boy


In [36]:
# generate sample for review of gender accuracy
gender_sample = df_gender[['m_from', 'name', 'gender']].sample(n=100, random_state=1)

In [37]:
gender_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 155758 to 130304
Data columns (total 3 columns):
m_from    100 non-null object
name      98 non-null object
gender    98 non-null object
dtypes: object(3)
memory usage: 3.1+ KB


In [38]:
gender_sample[0:50]

Unnamed: 0,m_from,name,gender
155758,mark.dilworth@enron.com,mark,boy
143392,sherri.sera@enron.com,sherri,girl
135336,sara.shackleton@enron.com,sara,girl
21952,karen.denne@enron.com,karen,girl
145939,carol.clair@enron.com,carol,girl
52041,pete.davis@enron.com,pete,boy
30829,carolyn.centilli@enron.com,carolyn,girl
124549,elizabeth.sager@enron.com,elizabeth,girl
137366,sara.shackleton@enron.com,sara,girl
65073,tana.jones@enron.com,tana,girl


In [39]:
gender_sample[50:]

Unnamed: 0,m_from,name,gender
118051,l..nicolay@enron.com,christi,girl
152668,cara.semperger@enron.com,cara,girl
39632,kevin.cousineau@enron.com,kevin,boy
110955,mark.whitt@enron.com,mark,boy
57016,marie.heard@enron.com,marie,girl
151026,kate.symes@enron.com,kate,girl
63952,marie.heard@enron.com,marie,girl
66974,justin.boyd@enron.com,justin,girl
107198,pete.davis@enron.com,pete,boy
166209,julie.kearney@enron.com,julie,girl


### Ending Stats

In [40]:
# execution time
print("--- %s seconds ---" % (time.time() - start_time))

# total email body chars (for process tracking)
current_chars = df.m_body.apply(len).sum()

print('{} of {} characters removed from corpus for {} reduction.'.format(current_chars, start_chars,((start_chars-current_chars)/(start_chars+current_chars))))

--- 122.03231763839722 seconds ---
135952755 of 135952755 characters removed from corpus for 0.0 reduction.
