Names Corpus, Version 1.3 (1994-03-29)
Copyright (C) 1991 Mark Kantrowitz

In [1]:
# time routine
import time
start_time = time.time()

# import library
import pandas as pd
import regex as re

In [2]:
%%time
# import working df
df_full = pd.read_csv('./data/enron/00_original_wrangle.csv', header=0, index_col=False)

Wall time: 10.6 s


In [3]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 19 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   f_dir      517401 non-null  object 
 1   m_id       517401 non-null  object 
 2   m_date     517401 non-null  object 
 3   m_from     517401 non-null  object 
 4   m_to       496355 non-null  object 
 5   m_cc       135166 non-null  object 
 6   m_bcc      127886 non-null  object 
 7   m_subj     498214 non-null  object 
 8   mime_vers  517401 non-null  float64
 9   cont_type  517401 non-null  object 
 10  encode     517401 non-null  object 
 11  x_from     517401 non-null  object 
 12  x_to       508255 non-null  object 
 13  x_cc       128886 non-null  object 
 14  x_bcc      175 non-null     object 
 15  x_fold     517401 non-null  object 
 16  x_orig     517401 non-null  object 
 17  x_fname    517401 non-null  object 
 18  m_body     517401 non-null  object 
dtypes: float64(1), object(1

## Get Gender (Labels)

### Separate Columns of Interest (Emails, Internal From Field)

In [4]:
# Sender lookup table: one row per distinct From address, paired with the
# first X-From display string seen for that address, on a fresh 0..n-1 index.
# NOTE(review): de-duplication happens before lower-casing, so two addresses
# that differ only by letter case would both survive — confirm m_from is
# already lower-case in the source CSV.
fn = (
    df_full[['m_from', 'x_from']]
    .drop_duplicates(subset='m_from')
    .reset_index(drop=True)
)

# normalise both text columns to lower case for the regex parsers
fn = fn.assign(
    m_from=fn['m_from'].str.lower(),
    x_from=fn['x_from'].str.lower(),
)

In [5]:
# show example of m_from
fn.head()

Unnamed: 0,m_from,x_from
0,phillip.allen@enron.com,phillip k allen
1,ina.rangel@enron.com,ina rangel
2,1.11913372.-2@multexinvestornetwork.com,multex investor <1.11913372.-2@multexinvestorn...
3,messenger@ecm.bloomberg.com,"""bloomberg.com"" <messenger@ecm.bloomberg.com>"
4,aod@newsdata.com,"""arthur o'donnell"" <aod@newsdata.com>"


In [6]:
def clean_m_from(s):
    """Recover a 'fname lname' string from a sender e-mail address.

    The address is tested against the capture cases listed below. Whatever
    comes out is then filtered: single-initial first names, strings that
    contain a non-person buzz word, and anything that is not exactly two
    name tokens are wiped to ''.

    A wiped cell ('') is later considered for a fill from the name
    recovered from the X-From column.

    Parameters
    ----------
    s : str
        Lower-cased sender address. Any non-string value (e.g. NaN)
        returns ''.

    Returns
    -------
    str
        'fname lname', or '' when no usable personal name was found.
    """
    # notation used in the case list:
    #     fname = first name, lname = last name, minit / finit = initial,
    #     ffname / llname = second word of a hyphenated first / last name
    #
    # CASE 01: fname'.lname'@enron.com (fname lname)
    # CASE 02: string@enron.com (string)
    # CASE 03: finit..lname@enron.com or finit.lname@enron.com (finit lname)
    # CASE 04: fname.minit.lname@enron.com (fname lname)
    # CASE 05: fname-ffname.lname@enron.com (fname-ffname lname)
    # CASE 06: fname.lname.enronxgate@enron.com (fname lname)
    # CASE 07: fname_lname@enron.com (fname lname)
    # CASE 08: fname.lname-llname@enron.com (fname lname-llname)
    # CASE ZZ [CREATE NULL]: anything that is not 'word word' ('')

    # A missing / non-text cell cannot hold a name. The previous bare
    # `except:` handler crashed on such input while building its message.
    if not isinstance(s, str):
        return ''

    # Dots are escaped ([.]) so that only a literal '.' is accepted in the
    # domain and in the '.enronxgate' suffix.
    cases = (
        r"^([A-Za-z']+)[.]([A-Za-z']+)@enron[.]com$",                  # CASE 01
        r"^([A-Za-z]+)@enron[.]com$",                                  # CASE 02
        r"^([A-Za-z])[.]{1,2}([A-Za-z]+)@enron[.]com$",                # CASE 03
        r"^([A-Za-z]+)[.][A-Za-z][.]([A-Za-z]+)@enron[.]com$",         # CASE 04
        r"^([A-Za-z]+[-][A-Za-z]+)[.]([A-Za-z]+)@enron[.]com$",        # CASE 05
        r"^([A-Za-z]+)[.]([A-Za-z]+)[.]enronxgate@enron[.]com$",       # CASE 06
        r"^([A-Za-z']+)[_]([A-Za-z']+)@enron[.]com$",                  # CASE 07
        r"^([A-Za-z]+)[.]([A-Za-z]+[-][A-Za-z]+)@enron[.]com$",        # CASE 08
    )

    # Cases are applied in order; a hit replaces s with its captured groups
    # joined by a single space.
    for pattern in cases:
        hit = re.search(pattern, s)
        if hit:
            s = ' '.join(hit.groups())

    # CASE R02: a single-letter first name carries no gender signal
    if re.search(r"^[A-Za-z] [A-Za-z'-]+$", s):
        return ''

    # CASE R01: mailboxes of teams, systems and announcements, not people
    buzz_words = ('team', 'technology', 'security', 'enron', 'chairman', 'office', 'announcement', 'wizard', 'notice', 'address', 'hr taylor', 'coo jeff', 'infrastructure', 'ubsw', 'europe',
                  'human', 'resources', 'connection', 'ibuyit', 'users', 'livelink', 'registrar', 'global', 'business', 'compensation', 'executive', 'risk', 'analytics', 'daemon', 'information',
                  'management', 'helpdesk', 'project', 'sunrise', 'oncall', 'credit', 'union', 'notification', 'central', 'communication', 'center', 'parking', 'transportation', 'international',
                  'diversity', 'survey', 'automation', 'document', 'exec', 'iscinfra', 'public', 'relations', 'controls', 'exchange', 'cms router', 'hotline', 'admin', 'pr id', 'expertfinder',
                  'notes', 'the buzz', 'gpg dss', 'xi xi', 'enw piper', 'institute', 'agent', 'tarrif')
    # substring match, so a buzz word inside a longer word also wipes the cell
    if any(word in s for word in buzz_words):
        return ''

    # CASE ZZ: keep only an exact 'word word' result
    if re.search(r"^[A-Za-z'-]+ [A-Za-z'-]+$", s):
        return s
    return ''

### Parse Names from Email Addresses

In [7]:
# parse a candidate name out of every sender address
fn['m_from_cleaned'] = fn['m_from'].map(clean_m_from)

# place the parsed name directly after the address it came from
fn = fn.reindex(columns=['m_from', 'm_from_cleaned', 'x_from'])

# preview
fn.head()

Unnamed: 0,m_from,m_from_cleaned,x_from
0,phillip.allen@enron.com,phillip allen,phillip k allen
1,ina.rangel@enron.com,ina rangel,ina rangel
2,1.11913372.-2@multexinvestornetwork.com,,multex investor <1.11913372.-2@multexinvestorn...
3,messenger@ecm.bloomberg.com,,"""bloomberg.com"" <messenger@ecm.bloomberg.com>"
4,aod@newsdata.com,,"""arthur o'donnell"" <aod@newsdata.com>"


In [8]:
def clean_x_from(s):
    """Recover a 'fname lname' string from an X-From header value.

    The value is run through the ordered capture cases listed below. The
    result is then filtered: single-initial first names, strings that
    contain a non-person buzz word, and anything that is not exactly two
    name tokens are wiped to ''.

    A wiped cell ('') is later matched against the name recovered from
    the e-mail address (m_from) for possible fills.

    Parameters
    ----------
    s : str
        Lower-cased X-From value. Any non-string value (e.g. NaN)
        returns ''.

    Returns
    -------
    str
        'fname lname', or '' when no usable personal name was found.
    """
    # notation: fname / mname / lname = first / middle / last name,
    #           init = single letter, <...> = trailing angle-bracket part,
    #           xx. = a word followed by one more character before the comma
    #
    # CASE 01: fname lname                        (fname lname)
    # CASE 03: lname, fname init. <...>           (fname lname)
    # CASE 02: lname, fname <...>                 (fname lname)
    # CASE 14: init fname lname <...>             (fname lname)
    # CASE 04: fname init lname <...>             (fname lname)
    # CASE 05: fname lname <...>                  (fname lname)
    # CASE 06: fname mname lname <...>            (fname lname)
    # CASE 13: init fname lname                   (fname lname)
    # CASE 07: fname mname lname                  (fname lname)
    # CASE 08: lname, fname mname <...>           (fname lname)
    # CASE 09: lname, fname                       (fname lname)
    # CASE 10: lname xx., fname <...>             (fname lname)
    # CASE 11: lname xx., fname init. <...>       (fname lname)
    # CASE 12: lname xx., fname[up to 3 extra chars] (fname lname)
    # CASE ZZ [CREATE NULL]: anything that is not 'word word' ('')

    # A missing / non-text cell cannot hold a name. The previous bare
    # `except:` handler called .group() on the raw value and crashed.
    if not isinstance(s, str):
        return ''

    # (pattern, last_name_first): when last_name_first is True the two
    # captured groups are swapped so the output is always 'fname lname'.
    cases = (
        (r"^([A-Za-z]+) ([A-Za-z]+)$", False),                                   # CASE 01
        (r"^([A-Za-z]+), ([A-Za-z]+) [A-Za-z][.] {1,2}<.+$", True),              # CASE 03
        (r"^([A-Za-z'-]+), ([A-Za-z'-]+) <.+$", True),                           # CASE 02
        (r"^[A-Za-z] ([A-Za-z]+) ([A-Za-z]+) <.+$", False),                      # CASE 14
        (r"^([A-Za-z]+) [A-Za-z] ([A-Za-z]+) <.+$", False),                      # CASE 04
        (r"^([A-Za-z]+) ([A-Za-z]+) <.+$", False),                               # CASE 05
        (r"^([A-Za-z]+) [A-Za-z]{2,} ([A-Za-z]+) <.+$", False),                  # CASE 06
        (r"^[A-Za-z] ([A-Za-z]+) ([A-Za-z]+)$", False),                          # CASE 13
        (r"^([A-Za-z]+) [A-Za-z]+ ([A-Za-z]+)$", False),                         # CASE 07
        (r"^([A-Za-z]+), ([A-Za-z]+) [A-Za-z]+ {1,2}<.+$", True),                # CASE 08
        (r"^([A-Za-z]+), ([A-Za-z]+)$", True),                                   # CASE 09
        # the '.' before the comma is unescaped here, so it accepts any single
        # character (not only a literal dot); kept as-is to preserve matches
        (r"^([A-Za-z]+) [A-Za-z]+., ([A-Za-z]+) <.+$", True),                    # CASE 10
        (r"^([A-Za-z]+) [A-Za-z]+[.], ([A-Za-z]+) [A-Za-z][.] <.+$", True),      # CASE 11
        (r"^([A-Za-z]+) [A-Za-z]+[.], ([A-Za-z]+)[A-Za-z .]{0,3}$", True),       # CASE 12
    )

    # Cases are applied in the listed order (the order matters: e.g. CASE 13
    # must see 'init fname lname' before CASE 07 does).
    for pattern, last_name_first in cases:
        hit = re.search(pattern, s)
        if hit:
            first, second = hit[1], hit[2]
            s = (second + ' ' + first) if last_name_first else (first + ' ' + second)

    # CASE R02: a single-letter first name carries no gender signal
    if re.search(r"^[A-Za-z] [A-Za-z'-]+$", s):
        return ''

    # CASE R01: mailboxes of teams, systems and announcements, not people.
    # 'team' is in this list, which also covers the 'team' subsets of CASE 02.
    buzz_words = ('team', 'technology', 'security', 'enron', 'chairman', 'office', 'announcement', 'wizard', 'notice', 'address', 'hr taylor', 'coo jeff', 'infrastructure', 'ubsw', 'europe',
                  'human', 'resources', 'connection', 'ibuyit', 'users', 'livelink', 'registrar', 'global', 'business', 'compensation', 'executive', 'risk', 'analytics', 'daemon', 'information',
                  'management', 'helpdesk', 'project', 'sunrise', 'oncall', 'credit', 'union', 'notification', 'central', 'communication', 'center', 'parking', 'transportation', 'international',
                  'diversity', 'survey', 'automation', 'document', 'exec', 'iscinfra', 'public', 'relations', 'controls', 'exchange', 'cms router', 'hotline', 'admin', 'pr id', 'expertfinder',
                  'notes', 'the buzz', 'gpg dss', 'xi xi', 'enw piper', 'institute', 'agent', 'tarrif')
    # substring match, so a buzz word inside a longer word also wipes the cell
    if any(word in s for word in buzz_words):
        return ''

    # CASE ZZ: keep only an exact 'word word' result
    if re.search(r"^[A-Za-z'-]+ [A-Za-z'-]+$", s):
        return s
    return ''

### Parse Names from X-From Field

In [9]:
# parse a candidate name out of every X-From value
fn['x_from_cleaned'] = fn['x_from'].map(clean_x_from)

# preview
fn.head()

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned
0,phillip.allen@enron.com,phillip allen,phillip k allen,phillip allen
1,ina.rangel@enron.com,ina rangel,ina rangel,ina rangel
2,1.11913372.-2@multexinvestornetwork.com,,multex investor <1.11913372.-2@multexinvestorn...,multex investor
3,messenger@ecm.bloomberg.com,,"""bloomberg.com"" <messenger@ecm.bloomberg.com>",
4,aod@newsdata.com,,"""arthur o'donnell"" <aod@newsdata.com>",


### Remove From, X From Dual-Blanks

In [10]:
# rows where neither parser produced a name are of no use for labelling
cond = (fn.m_from_cleaned == '') & (fn.x_from_cleaned == '')

# keep everything else in an independent working frame
cd = fn[~cond].copy()

# The two fills below were indented at top level, which makes the whole cell
# fail with IndentationError; they are now ordinary top-level statements.

# fill m_from_cleaned empty values with the value from x_from_cleaned
cd.loc[cd['m_from_cleaned'] == '', 'm_from_cleaned'] = cd['x_from_cleaned']

# fill x_from_cleaned empty values with the value from m_from_cleaned
cd.loc[cd['x_from_cleaned'] == '', 'x_from_cleaned'] = cd['m_from_cleaned']

In [11]:
# final 'fname lname' column: prefer the X-From parse and fall back to the
# address parse wherever the X-From parse is blank
cd['clean_name'] = cd['x_from_cleaned'].mask(
    cd['x_from_cleaned'] == '',
    cd['m_from_cleaned'],
)

### Evaluate Conflicts Between Parses

In [12]:
# senders for which both parsers returned a name but the names disagree
makin_picks = cd.loc[
    cd['m_from_cleaned'].ne('')
    & cd['x_from_cleaned'].ne('')
    & cd['m_from_cleaned'].ne(cd['x_from_cleaned'])
]

# these rows are reviewed by hand
print(len(makin_picks))
makin_picks

32


Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name
409,buckner.thomas@enron.com,buckner thomas,"thomas, john buckner </o=enron/ou=na/cn=recipi...",john thomas,john thomas
530,jae.black@enron.com,jae black,"black, tamara jae </o=enron/ou=na/cn=recipient...",tamara black,tamara black
732,pinto.leite@enron.com,pinto leite,"leite, francisco pinto </o=enron/ou=na/cn=reci...",francisco leite,francisco leite
1953,dana.davis@enron.com,dana davis,"davis, mark dana </o=enron/ou=na/cn=recipients...",mark davis,mark davis
1994,kay.miller@enron.com,kay miller,"miller, mary kay </o=enron/ou=na/cn=recipients...",mary miller,mary miller
2016,ann.foret@enron.com,ann foret,"foret, leigh ann </o=enron/ou=na/cn=recipients...",leigh foret,leigh foret
3124,ann.matson@enron.com,ann matson,"matson, jo ann </o=enron/ou=na/cn=recipients/c...",jo matson,jo matson
3149,ann.hill@enron.com,ann hill,"hill, jo ann </o=enron/ou=na/cn=recipients/cn=...",jo hill,jo hill
3246,marie.allex@enron.com,marie allex,"allex, anne marie </o=enron/ou=na/cn=recipient...",anne allex,anne allex
5252,nell.browning@enron.com,nell browning,"browning, mary nell </o=enron/ou=na/cn=recipie...",mary browning,mary browning


In [13]:
# Manual review of the mismatched parses.
#
# Decision rule: assume the more strongly gendered form of the first name is
# the right one; for the senders below that is the address-derived name, so it
# overrides the X-From parse in clean_name.
_prefer_m_from = [
    'kay miller',
    'ann foret',
    'ann hill',
    'ann matson',
    'ann chance',
    'lynn schaffart',
]
_use_m_from = cd['m_from_cleaned'].isin(_prefer_m_from)
cd.loc[_use_m_from, 'clean_name'] = cd.loc[_use_m_from, 'm_from_cleaned']

# strip stray apostrophes from the two names that carry them
_apostrophe_fixes = {
    "chenee' franklin": 'chenee franklin',
    "'todd' delahoussaye": 'todd delahoussaye',
}
for _raw, _fixed in _apostrophe_fixes.items():
    cd.loc[cd['m_from_cleaned'] == _raw, 'clean_name'] = _fixed

### Separate, Clean First Name for Scraping

In [14]:
# first token of the two-token clean name is the first name to look up
cd['gender_query'] = cd['clean_name'].str.extract(
    r"^([A-Za-z'-]+) [A-Za-z'-]+$",
    expand=False,
)
cd.head()

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name,gender_query
0,phillip.allen@enron.com,phillip allen,phillip k allen,phillip allen,phillip allen,phillip
1,ina.rangel@enron.com,ina rangel,ina rangel,ina rangel,ina rangel,ina
2,1.11913372.-2@multexinvestornetwork.com,,multex investor <1.11913372.-2@multexinvestorn...,multex investor,multex investor,multex
7,rebecca.cantrell@enron.com,rebecca cantrell,rebecca w cantrell,rebecca cantrell,rebecca cantrell,rebecca
9,paul.kaufman@enron.com,paul kaufman,paul kaufman,paul kaufman,paul kaufman,paul


In [15]:
len(cd)

9152

### Remove Hyphen From Strings

In [16]:
# first names that contain a hyphen
cond = cd['gender_query'].str.contains('-', regex=False)

# show the affected rows
cd.loc[cond]

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name,gender_query
187,sarah-joy.hunter@enron.com,sarah-joy hunter,sarah-joy hunter,sarah-joy hunter,sarah-joy hunter,sarah-joy
1425,jan-erland.bekeng@enron.com,jan-erland bekeng,jan-erland bekeng,jan-erland bekeng,jan-erland bekeng,jan-erland
4430,xochitl-alexis.velasco@enron.com,xochitl-alexis velasco,xochitl-alexis velasco,xochitl-alexis velasco,xochitl-alexis velasco,xochitl-alexis
5361,sladana-anna.kulic@enron.com,sladana-anna kulic,"kulic, sladana-anna </o=enron/ou=na/cn=recipie...",sladana-anna kulic,sladana-anna kulic,sladana-anna
6088,seung-taek.oh@enron.com,seung-taek oh,"oh, seung-taek </o=enron/ou=na/cn=recipients/c...",seung-taek oh,seung-taek oh,seung-taek
15899,jung-suk.suh@enron.com,jung-suk suh,jung-suk suh,jung-suk suh,jung-suk suh,jung-suk
18772,jae-moo.lee@enron.com,jae-moo lee,jae-moo lee,jae-moo lee,jae-moo lee,jae-moo


In [17]:
# Manual review: hyphenated first names return bad values from the web query.
#
# Keep one half of each hyphenated name, leaving the more strongly gendered
# form for the query.
cd['gender_query'] = cd['gender_query'].replace({
    'sarah-joy': 'sarah',
    'jan-erland': 'jan',
    'sladana-anna': 'anna',
    'seung-taek': 'seung',
    'xochitl-alexis': 'alexis',
    'jung-suk': 'jung',
    'jae-moo': 'jae',
})

In [18]:
cd[cd.gender_query == 'estalee']

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name,gender_query
4965,estalee.russi@enron.com,estalee russi,"russi, estalee </o=enron/ou=na/cn=recipients/c...",estalee russi,estalee russi,estalee


### Getting Gender

In [19]:
# distinct first names to look up, alphabetically sorted on a fresh index
fname_series = (
    cd['gender_query']
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)

In [20]:
# plain Python list of the names to evaluate, plus how many there are
fname_list = fname_series.to_list()
print(fname_series.size)

2391


In [21]:
fname_series

0          aamir
1          aaron
2           abby
3       abhijeet
4          about
          ...   
2386       zofia
2387      zoltan
2388       zooey
2389         zou
2390       zulie
Name: gender_query, Length: 2391, dtype: object

### Import nltk names lists

In [22]:
def _read_name_list(path):
    """Load a one-name-per-line text file into a single-column frame.

    Current pandas rejects read_csv(sep='\\n') with a ValueError, so the file
    is read line by line instead. Blank lines are skipped and surrounding
    whitespace is stripped. The column is labelled 0, matching what
    read_csv(header=None) would produce, so names[0] keeps working.
    """
    with open(path, encoding='utf-8') as handle:
        names = [line.strip() for line in handle if line.strip()]
    return pd.DataFrame({0: names})


female_names = _read_name_list('./data/names/female.txt')
male_names = _read_name_list('./data/names/male.txt')

In [23]:
# numpy is needed for the set intersections below
import numpy as np

# first names from the corpus that also appear in each (lower-cased) name list
male_name_matches = np.intersect1d(
    fname_series.array,
    male_names[0].str.lower().array,
)
female_name_matches = np.intersect1d(
    fname_series.array,
    female_names[0].str.lower().array,
)

In [24]:
nltk_key = pd.DataFrame()

In [25]:
nltk_key = pd.DataFrame(fname_series)

In [26]:
def get_gender(name, male=None, female=None):
    """Label a first name as 'boy' or 'girl' from two name lists.

    The male list is checked first, so a name present in both lists is
    labelled 'boy'.

    Parameters
    ----------
    name : str
        Lower-cased first name to look up.
    male, female : iterable of str, optional
        Name lists to match against (compared case-insensitively). When
        omitted, column 0 of the notebook-level `male_names` /
        `female_names` frames is used.

    Returns
    -------
    str or float
        'boy', 'girl', or NaN when the name is in neither list.
    """
    def _lowered(pool):
        # non-string entries (e.g. NaN) are ignored
        return {entry.lower() for entry in pool if isinstance(entry, str)}

    if name in _lowered(male_names[0] if male is None else male):
        return 'boy'
    if name in _lowered(female_names[0] if female is None else female):
        return 'girl'
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    return np.nan

In [27]:
# export nltk_key to csv
#nltk_key.to_csv('./data/enron/nltk_gender_key.csv', index=False, index_label=False)
#nltk_key.head()

In [28]:
# label every distinct first name using the name lists
nltk_key['gender'] = nltk_key['gender_query'].map(get_gender)

In [29]:
nltk_key.groupby(['gender']).size()

gender
boy     709
girl    631
dtype: int64

In [30]:
# attach the name-list label to every sender and store it as gender_nltk
cd = (
    cd.merge(nltk_key, how='left', on='gender_query')
    .pipe(lambda merged: merged.assign(gender_nltk=merged['gender'].copy()))
    .drop('gender', axis=1)
)

### Add in gender key from web api

In [38]:
web_key = pd.read_csv('./data/enron/website_gender_key.csv')

In [40]:
cd.head()

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name,gender_query,gender_nltk
0,phillip.allen@enron.com,phillip allen,phillip k allen,phillip allen,phillip allen,phillip,boy
1,ina.rangel@enron.com,ina rangel,ina rangel,ina rangel,ina rangel,ina,girl
2,1.11913372.-2@multexinvestornetwork.com,,multex investor <1.11913372.-2@multexinvestorn...,multex investor,multex investor,multex,
3,rebecca.cantrell@enron.com,rebecca cantrell,rebecca w cantrell,rebecca cantrell,rebecca cantrell,rebecca,girl
4,paul.kaufman@enron.com,paul kaufman,paul kaufman,paul kaufman,paul kaufman,paul,boy


In [41]:
# attach the web-API label to every sender and store it as gender_web
cd = (
    cd.merge(web_key, how='left', left_on='gender_query', right_on='name')
    .pipe(lambda merged: merged.assign(gender_web=merged['gender'].copy()))
    .drop(['name', 'gender'], axis=1)
)

In [42]:
cd.head()

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name,gender_query,gender_nltk,gender_web
0,phillip.allen@enron.com,phillip allen,phillip k allen,phillip allen,phillip allen,phillip,boy,boy
1,ina.rangel@enron.com,ina rangel,ina rangel,ina rangel,ina rangel,ina,girl,girl
2,1.11913372.-2@multexinvestornetwork.com,,multex investor <1.11913372.-2@multexinvestorn...,multex investor,multex investor,multex,,
3,rebecca.cantrell@enron.com,rebecca cantrell,rebecca w cantrell,rebecca cantrell,rebecca cantrell,rebecca,girl,girl
4,paul.kaufman@enron.com,paul kaufman,paul kaufman,paul kaufman,paul kaufman,paul,boy,boy


### Combine gender keys, create a dataframe linking gender to *From:* email

In [63]:
# combined label: the web result wins, the name-list result fills its gaps
cd['gender'] = cd['gender_web'].fillna(cd['gender_nltk'])

In [64]:
cd.head()

Unnamed: 0,m_from,m_from_cleaned,x_from,x_from_cleaned,clean_name,gender_query,gender_nltk,gender_web,gender
0,phillip.allen@enron.com,phillip allen,phillip k allen,phillip allen,phillip allen,phillip,boy,boy,boy
1,ina.rangel@enron.com,ina rangel,ina rangel,ina rangel,ina rangel,ina,girl,girl,girl
2,1.11913372.-2@multexinvestornetwork.com,,multex investor <1.11913372.-2@multexinvestorn...,multex investor,multex investor,multex,,,
3,rebecca.cantrell@enron.com,rebecca cantrell,rebecca w cantrell,rebecca cantrell,rebecca cantrell,rebecca,girl,girl,girl
4,paul.kaufman@enron.com,paul kaufman,paul kaufman,paul kaufman,paul kaufman,paul,boy,boy,boy


In [65]:
cd.groupby(['gender']).size()

gender
boy     4744
girl    3458
dtype: int64

In [66]:
cd.groupby(['gender_nltk']).size()

gender_nltk
boy     4989
girl    2618
dtype: int64

In [67]:
cd.groupby(['gender_web']).size()

gender_web
boy     4618
girl    3353
dtype: int64

In [81]:
# correct some boy names that should be labelled girl
#
# Candidate rows: both keys returned a label, the two labels disagree, and the
# combined label is not 'girl'.
# NOTE(review): no cell visible in this notebook creates a 'gender_correction'
# column, so the column selection below is expected to raise KeyError on a
# fresh Restart & Run All. The execution counts jump from 67 to 81, which
# suggests the manual-review step that filled it was deleted. Restore that step
# before relying on this cell.
correct_to_girl = cd[~(cd['gender_web'] == cd['gender_nltk']) & (cd.gender_web.notna()) & (cd.gender_nltk.notna()) & ~(cd.gender == 'girl')]
correct_to_girl = correct_to_girl[['m_from', 'gender_correction']]
# left merge keeps every sender; only reviewed senders get a gender_correction value
# NOTE(review): cd is reassigned from itself, so re-running this cell merges the
# column a second time — run it once per kernel session.
cd = cd.merge(correct_to_girl, how='left', on='m_from')
# correct gender to girl
cd.loc[cd['gender_correction'] == 'girl', 'gender'] = 'girl'

In [100]:
master_key = cd[['m_from', 'gender']]

In [101]:
master_key

Unnamed: 0,m_from,gender
0,phillip.allen@enron.com,boy
1,ina.rangel@enron.com,girl
2,1.11913372.-2@multexinvestornetwork.com,
3,rebecca.cantrell@enron.com,girl
4,paul.kaufman@enron.com,boy
...,...,...
9147,emma.vine@gvsi.com,girl
9148,ben.sturgeon@gfinet.co.uk,boy
9149,6.1132.6c-af5ssclxjfagjsrr.1@mail3.travelocity...,
9150,megan.scott@enron.com,girl


In [104]:
# Attach the per-address gender label to every message; the left join keeps
# all rows of df_full.
# NOTE(review): master_key.m_from was lower-cased when the sender table was
# built, while df_full.m_from is used as loaded — confirm the raw addresses are
# already lower-case, otherwise those messages get no label. Also confirm
# master_key.m_from is unique, since duplicate keys would multiply message rows.
gender_df = pd.merge(df_full, master_key, how='left', on='m_from')

In [106]:
gender_df.groupby(['gender']).size()

gender
boy     235804
girl    174119
dtype: int64

In [108]:
# gender_df.to_csv('./data/enron/01_got_gender.csv')