In [1]:
import time
# Reference point for the total notebook run time that the last cell prints.
start_time = time.perf_counter()

In [2]:
%matplotlib inline
# Version check: print the version of every core library used by this notebook.
import numpy
import pandas
import sklearn
import matplotlib
import regex

for display_name, module in (
    ('numpy', numpy),
    ('pandas', pandas),
    ('scikit-learn', sklearn),
    ('matplotlib', matplotlib),
    ('regex', regex),
):
    print('The {} version is {}.'.format(display_name, module.__version__))

The numpy version is 1.18.1.
The pandas version is 1.0.4.
The scikit-learn version is 0.23.1.
The matplotlib version is 3.2.1.
The regex version is 2.5.80.


In [3]:
# set random state for reproducibility
random_state = 42

import numpy as np
import pandas as pd

# numpy display settings: show up to 15 items at each edge of a summarised array,
# effectively never wrap rows, and print floats with three decimals.
# (The former "default settings" lines were removed: every value they set was
# overwritten by this call, and one of them wrote a private attribute via np.core.)
np.set_printoptions(
    edgeitems=15,
    linewidth=100000,
    formatter=dict(float=lambda x: "%.3f" % x),
)

# pandas display settings (full option names instead of pattern-matched short names)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 250)

### Threadpool executor for this notebook's user functions

In [4]:
%%time

import os
# Logical CPU count as reported by the operating system.
cpu_total = os.cpu_count()
print("Number of processors: %d" % cpu_total)

def threadpool_executor(io_func=None, df=None, series=None, merge_back=False, idict=None, new_name=None):
    """Run ``io_func`` over one dataframe column on a thread pool; optionally merge its output back.

    ``io_func`` is called once per row as ``io_func(index_label, value)``. Its return
    value is ignored, so the function has to record its own result, normally by
    writing ``idict[index_label] = result`` into the same dictionary passed here.

    Parameters
    ----------
    io_func : callable
        User function taking two positional arguments: index label and cell value.
    df : pandas.DataFrame
        Frame that holds the data.
    series : str
        Name of the column in ``df`` whose values are processed.
    merge_back : bool or 'series'
        ``False`` (default): only run ``io_func`` and return ``df`` unchanged.
        ``True``: add the values collected in ``idict`` as a new column ``new_name``.
        ``'series'``: replace column ``series`` with the values collected in ``idict``.
    idict : dict
        Dictionary filled by ``io_func`` (index label -> processed value).
    new_name : str
        Name of the new column; required when ``merge_back`` is ``True``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, or a new frame with the processed values left-joined on the index.

    Raises
    ------
    SystemExit
        If the arguments are incomplete. All checks run before any work is submitted.
    """
    # Local imports keep the function self-contained (`sys` was never imported before,
    # so every validation failure used to surface as a NameError instead of the message).
    import sys
    import warnings
    import concurrent.futures

    merge_new_column = merge_back == True
    replace_series = merge_back == 'series'

    if (io_func is None) or (df is None) or (series is None):
        sys.exit("'None' received as input for either io_func, df, or series. Please assign a value and try again.")
    if (merge_back is True) and (idict is None):
        sys.exit("'merge_back is set to True. However, no dictionary has been passed to capture output. Please set the idict= argument to an initialized dictionary and try again.'")
    if merge_new_column and (idict is not None) and (new_name is None):
        # Checked up front so a long run is not thrown away at the merge step.
        sys.exit("Merge back is True, but no series or name for a new column has been received. If a new column is desired, please pass a new_name. Otherwise, please pass, 'series' to merge_back to replace the current series.")
    if replace_series and (idict is None):
        sys.exit("merge_back is set to 'series'. However, no dictionary has been passed to capture output. Please set the idict= argument to an initialized dictionary and try again.")

    column = df[series]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(io_func, idx, value) for idx, value in zip(column.index, column)]

    # Leaving the `with` block waits for all futures. Report failures instead of
    # dropping them silently, but keep the best-effort behaviour (no re-raise).
    errors = [future.exception() for future in futures]
    errors = [error for error in errors if error is not None]
    if errors:
        warnings.warn("%d of %d io_func calls raised an exception; first error: %r"
                      % (len(errors), len(futures), errors[0]))

    if merge_new_column and (idict is not None):
        # merge the processed values back to the initial dataframe as a new column
        new_frame = pd.DataFrame.from_dict(idict, orient='index')
        new_frame.columns = [new_name]
        df = pd.merge(df, new_frame, how='left', left_index=True, right_index=True, copy=False)
    if replace_series:
        # drop the old series and replace it with the new processed documents
        df = df.drop(series, axis=1)
        new_frame = pd.DataFrame.from_dict(idict, orient='index')
        new_frame.columns = [series]
        df = pd.merge(df, new_frame, how='left', left_index=True, right_index=True, copy=False)

    return df

Number of processors: 16
Wall time: 0 ns


## Import data

In [5]:
%%time
# Column names: cleaned email body (input) and the preprocessed text produced below (output).
Corpus_input = 'clean_body'
Corpus_output = 'preprocessed_body'

# Read the cleaned dataframe; the first CSV column is used as the index.
dataset_directory = './data/enron/clean_clean_by_strings.csv'
df = pd.read_csv(filepath_or_buffer=dataset_directory, index_col=0)

Wall time: 6.75 s


In [6]:
df.head(2)

Unnamed: 0,f_dir,m_id,m_date,m_from,m_to,m_cc,m_bcc,m_subj,mime_vers,cont_type,...,x_orig,x_fname,o_body,m_body,gender,n_emails_sent,n_characters_start,clean_char,n_char,clean_body
0,./data/enron/maildir/farmer-d/logistics/12,<18632438.1075840432068.JavaMail.evans@thyme>,"Tue, 11 Dec 2001 06:07:33 -0800 (PST)",rita.wynne@enron.com,"michael.olsen@enron.com, stephen.swisher@enron.com","sherry.anastas@enron.com, j..farmer@enron.com","sherry.anastas@enron.com, j..farmer@enron.com",Centana Storage Deal,1.0,text/plain; charset=us-ascii,...,FARMER-D,darren farmer 6-26-02.pst\n\n,"Message-ID: <18632438.1075840432068.JavaMail.evans@thyme>\nDate: Tue, 11 Dec 2001 06:07:33 -0800 (PST)\nFrom: rita.wynne@enron.com\nTo: michael.olsen@enron.com, stephen.swisher@enron.com\nSubject:...","Mike/Stephen,\n\nHave the two of you been able to get the deal in for the sale of the Centana storage to AEP? Please advise. I would like to have this completed for this close if possible.\n\nDa...",1.0,57,268,268,268,"Mike/Stephen,\n\nHave the two of you been able to get the deal in for the sale of the Centana storage to AEP? Please advise. I would like to have this completed for this close if possible.\n\nDa..."
1,./data/enron/maildir/guzman-m/all_documents/1429,<18190482.1075840619280.JavaMail.evans@thyme>,"Thu, 21 Dec 2000 04:07:00 -0800 (PST)",caroline.emmert@enron.com,"geir.solberg@enron.com, holden.salisbury@enron.com, mark.guzman@enron.com",virginia.thompson@enron.com,virginia.thompson@enron.com,Puget Sound Deal on October 17,1.0,text/plain; charset=us-ascii,...,GUZMAN-M,mark guzman 6-28-02.nsf\n\n,"Message-ID: <18190482.1075840619280.JavaMail.evans@thyme>\nDate: Thu, 21 Dec 2000 04:07:00 -0800 (PST)\nFrom: caroline.emmert@enron.com\nTo: geir.solberg@enron.com, holden.salisbury@enron.com, mar...","Guys,\n\nPuget is claiming that we did a real-time deal with their trader, Lisa, on \nOctober 17 HE 20. The terms are 20MW purchased from Puget at a rate of \n$115. We cannot find anything in th...",1.0,90,479,479,479,"Guys,\n\nPuget is claiming that we did a real-time deal with their trader, Lisa, on \nOctober 17 HE 20. The terms are 20MW purchased from Puget at a rate of \n$115. We cannot find anything in th..."


## Monitoring impact on labels
We track the male/female label ratio after every filtering step. Blanket filters such as duplicate removal apply the same condition to both labels but can remove more rows from one of them, so we check that the label ratio does not become significantly unbalanced as a result.

In [7]:
def monitor_label(df):
    """Print the number of rows in ``df`` and the balance of its ``gender`` label.

    Rows with ``gender == 0`` are reported as "boy" and rows with ``gender == 1``
    as "girl". A label that does not occur in ``df`` is counted as 0 instead of
    raising ``KeyError``; the B/G ratio is printed as ``nan`` when there are no
    "girl" rows, and the percentages as ``nan`` when neither label occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``gender`` column.

    Returns
    -------
    None
        The summary is printed, nothing is returned.
    """
    # Group once and look both labels up in the same result.
    counts = df.groupby(['gender']).size()
    b = int(counts.get(0, 0))
    g = int(counts.get(1, 0))
    total = b + g

    nan = float('nan')
    ratio = b / g if g else nan
    b_pct = b * 100 / total if total else nan
    g_pct = g * 100 / total if total else nan

    print('Frame Size: {} observations\nboy: {}\ngirl: {}\nB/G Ratio: {:.3f} ({:.0f}%, {:.0f}%)'.format(len(df), b, g, ratio, b_pct, g_pct))
    
monitor_label(df)

Frame Size: 170545 observations
boy: 96240
girl: 74305
B/G Ratio: 1.295 (56%, 44%)


## Remove NaN from gender, message body

In [8]:
# Keep only rows that have both a gender label and a cleaned message body.
df = df.dropna(subset=['gender', Corpus_input])
monitor_label(df)

Frame Size: 170545 observations
boy: 96240
girl: 74305
B/G Ratio: 1.295 (56%, 44%)


## Remove duplicates from cleaned email body

In [9]:
# Keep the first occurrence of every distinct cleaned email body.
df = df.drop_duplicates(subset=[Corpus_input], keep='first')

monitor_label(df)

Frame Size: 156373 observations
boy: 87147
girl: 69226
B/G Ratio: 1.259 (56%, 44%)


## Randomize dataframe

In [10]:
def resample_frame(df, random_state=None):
    """Return ``df`` with its rows shuffled and a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shuffle; it is not modified.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. With the default ``None`` the
        order differs on every call, as before.

    Returns
    -------
    pandas.DataFrame
        All rows of ``df`` in random order, with the old index discarded.
    """
    df = df.sample(n=len(df), random_state=random_state).reset_index(drop=True)
    return df
# Pass the notebook-level seed so the row order (and therefore which duplicate
# survives the later drop_duplicates on the processed text) is the same on every run.
df = resample_frame(df, random_state=random_state) # resampled

### Test samples for preprocess function setup

In [11]:
# return the largest (longest, by character count) email body for preprocessing setup;
# Series.max() on strings would return the lexicographically greatest body instead
test_sample = df.loc[df[Corpus_input].str.len().idxmax(), Corpus_input]
# rows labelled 0 through 20 (.loc slices include the end label)
test_samples = df.loc[0:20, Corpus_input]

In [12]:
# Show the first three sample bodies.
for body in test_samples.iloc[:3]:
    print(body)

Jimbo, thank you for the message.  I, too, have great, ever-lasting memories of our times together in Spain and Scottsdale. I am confident that there will be many more to come.  Please let me know how much longer you will be in Spain so that I can attempt to visit you again.  You have wonderful skills and integrity and a very big and caring heart, and you will do extremely well in whatever career life offers you. I am proud to be your father.  Love, Dad 

 
I have tracked down an old memo written by the Chief Counsel of Federal 
Affairs for Niagara Mohawk, summarizing positions taken in requests for 
rehearing of Order No. 888 to close the bypass loophole.  She discusses how 
there would be many efforts to municipalize utilities' distribution systems 
in New York and how NiMo would be meeting with the FERC Staff to tell them 
how much money they might lose if they couldn't figure out a way to recover 
these costs.  Maybe whoever is researching this should focus on FERC cases 
involving

## Stop Words & Names

In [13]:
from nltk.corpus import stopwords

# NLTK's English stop words.
stop_words = set(stopwords.words('english'))
# Do not assign to `_`: IPython uses that name for the most recent cell output.
stop_word_preview = list(stop_words)[:10]
print('10 samples from stop_words list: %s' % (stop_word_preview))

# One name per line. The file is read with plain file I/O because
# pd.read_csv(..., sep='\n') is rejected by newer pandas releases and read_csv
# converts entries such as "NA" or "null" to NaN, which would put floats into
# the name list. Surrounding whitespace is stripped and blank lines are skipped.
with open('./data/names/all_names.txt', encoding='utf-8') as names_file:
    name_lines = [line.strip() for line in names_file]
# Kept as a one-column frame (column 0) so the name stays compatible with earlier use.
names_from_file = pd.DataFrame({0: [name for name in name_lines if name]})
names_list = list(names_from_file[0].str.lower())

# Names first, then stop words; the set is what process_text filters against.
combined_words = names_list + list(stop_words)
print('10 samples from new stop_words list: %s' % (combined_words[:10]))
stop_words_names = set(combined_words)
print('stop_words count: %d\nnames from file: %d\nstop_words_names count: %d' % (len(stop_words), len(names_list), len(stop_words_names)))

10 samples from stop_words list: ['from', 'some', 'd', "mightn't", 'him', 'when', "hadn't", 'whom', 'before', 'such']
10 samples from new stop_words list: ['nedre', 'cathlene', 'jason', 'orton', 'goradia', 'mina', 'weezie', 'kanya', 'ring', 'saundra']
stop_words count: 179
names from file: 12418
stop_words_names count: 12591


### Preprocess function

In [14]:
# import regex
import regex as re
# import lemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize # creates tokenized words
# instantiate stemmer object
stemmer = WordNetLemmatizer()

# Pre-compiled patterns used by process_text; every match is replaced with a single space.
# non-word characters
rem_nword_char = re.compile(r'\W')
# runs of underscores
rem_underscore = re.compile(r'_+', flags=re.IGNORECASE)
# numbers (one digit at a time)
rem_numbers = re.compile(r'\d')
# stand-alone single word characters
rem_single_char = re.compile(r'\b\w\b', flags=re.IGNORECASE)
# runs of whitespace, collapsed to one space
rem_mult_spaces = re.compile(r'\s+', flags=re.IGNORECASE)

def process_text(doc_idx, doc):
    """Clean one document and store the result in the global ``returns_dict``.

    Steps, in order: coerce to ``str``; replace non-word characters, underscores,
    single characters and digits with spaces; lower-case; collapse whitespace;
    then tokenize, drop tokens found in ``stop_words_names`` and lemmatize the rest.

    Parameters
    ----------
    doc_idx : hashable
        Key under which the processed text is stored in ``returns_dict``.
    doc : object
        Document to process; it is converted with ``str()`` first.

    Returns
    -------
    None
        The processed text is written to ``returns_dict[doc_idx]``.
    """
    import warnings

    # Once the input is a str these steps cannot fail, so they need no try/except.
    doc = str(doc)
    doc = rem_nword_char.sub(' ', doc)   # remove characters that are not word characters
    doc = rem_underscore.sub(' ', doc)   # removes the underscore character
    doc = rem_single_char.sub(' ', doc)  # removes all single characters
    doc = rem_numbers.sub(' ', doc)      # removes numbers
    doc = doc.lower()
    doc = rem_mult_spaces.sub(' ', doc)  # reduces multiple spaces to single space

    try:
        doc = ' '.join([stemmer.lemmatize(word) for word in word_tokenize(doc) if word not in stop_words_names])
    except Exception as error:
        # Still best-effort (the regex-cleaned text is kept), but no longer silent.
        # NOTE(review): the saved notebook output still contains stop words such as
        # "the", which suggests this step was failing unnoticed - confirm the
        # required NLTK resources are installed.
        warnings.warn("process_text: stop-word removal/lemmatization failed; "
                      "keeping the regex-cleaned text. Cause: %r" % (error,))

    returns_dict[doc_idx] = doc

## Process Messages

In [15]:
%%time
# Dictionary that process_text fills: document index -> processed text.
returns_dict = {}

# Run process_text over the input column and attach the results as a new column.
df = threadpool_executor(
    io_func=process_text,
    df=df,
    series=Corpus_input,
    merge_back=True,
    idict=returns_dict,
    new_name=Corpus_output,
)

df = df.reset_index(drop=True)
monitor_label(df)

Frame Size: 156373 observations
boy: 87147
girl: 69226
B/G Ratio: 1.259 (56%, 44%)
Wall time: 6min 28s


In [16]:
len(returns_dict)

156373

## Processed Messages: Remove rows where processed message is NaN (pd.notna)

In [17]:
# Drop rows without a processed message, then renumber the index.
df = df.dropna(subset=[Corpus_output]).reset_index(drop=True)
monitor_label(df)

Frame Size: 156373 observations
boy: 87147
girl: 69226
B/G Ratio: 1.259 (56%, 44%)


## Processed Messages: Remove rows where processed message equals a blank (== '')

In [18]:
# Keep rows whose processed message is not the empty string. A boolean mask is
# used instead of df.drop(np.where(...)[0]): np.where yields positions while drop
# expects index labels, so the old form was only correct right after a reset_index.
df = df[df[Corpus_output] != '']
df = df.reset_index(drop=True)
monitor_label(df)

Frame Size: 154021 observations
boy: 85683
girl: 68338
B/G Ratio: 1.254 (56%, 44%)


## Processed Messages: Remove rows where processed message is a duplicate (.drop_duplicates)

In [19]:
# Keep the first occurrence of every distinct processed message and renumber the index.
df = df.drop_duplicates(subset=[Corpus_output], keep='first').reset_index(drop=True)
monitor_label(df)

Frame Size: 139283 observations
boy: 77155
girl: 62128
B/G Ratio: 1.242 (55%, 45%)


In [21]:
df.head(1)

Unnamed: 0,f_dir,m_id,m_date,m_from,m_to,m_cc,m_bcc,m_subj,mime_vers,cont_type,...,x_fname,o_body,m_body,gender,n_emails_sent,n_characters_start,clean_char,n_char,clean_body,preprocessed_body
0,./data/enron/maildir/derrick-j/sent_items/247,<3600822.1075845098335.JavaMail.evans@thyme>,"Fri, 6 Apr 2001 11:54:21 -0700 (PDT)",james.derrick@enron.com,jvd024@hotmail.com,,,RE:,1.0,text/plain; charset=ANSI_X3.4-1968,...,"Derrick Jr., James.pst\n\n","Message-ID: <3600822.1075845098335.JavaMail.evans@thyme>\nDate: Fri, 6 Apr 2001 11:54:21 -0700 (PDT)\nFrom: james.derrick@enron.com\nTo: jvd024@hotmail.com\nSubject: RE:\nMime-Version: 1.0\nConten...","Jimbo, thank you for the message. I, too, have great, ever-lasting memories of our times together in Spain and Scottsdale. I am confident that there will be many more to come. Please let me know...",0.0,909,1859,461,461,"Jimbo, thank you for the message. I, too, have great, ever-lasting memories of our times together in Spain and Scottsdale. I am confident that there will be many more to come. Please let me know...",jimbo thank you for the message too have great ever lasting memories of our times together in spain and scottsdale am confident that there will be many more to come please let me know how much lon...


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139283 entries, 0 to 139282
Data columns (total 27 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   f_dir               139283 non-null  object 
 1   m_id                139283 non-null  object 
 2   m_date              139283 non-null  object 
 3   m_from              139283 non-null  object 
 4   m_to                136997 non-null  object 
 5   m_cc                44984 non-null   object 
 6   m_bcc               42944 non-null   object 
 7   m_subj              133440 non-null  object 
 8   mime_vers           139283 non-null  float64
 9   cont_type           139283 non-null  object 
 10  encode              139283 non-null  object 
 11  x_from              139283 non-null  object 
 12  x_to                137591 non-null  object 
 13  x_cc                43246 non-null   object 
 14  x_bcc               117 non-null     object 
 15  x_fold              139283 non-nul

In [23]:
# Save the filtered frame, including the new preprocessed column, to CSV.
df.to_csv('./data/enron/preprocess_data.csv')

# End Section

In [24]:
# Total wall-clock time since the first cell set start_time.
end_time = time.perf_counter()
elapsed_seconds = end_time - start_time
print('Run time: %.1fs (~%dm)' % (elapsed_seconds, elapsed_seconds / 60))

Run time: 411.8s (~6m)
