In [1]:
import time
start_time = time.time()

# 03 TF-IDF to Clean Email Bodies
- Preprocess
- Calculate TF-IDF
- Ranking with Cosine Similarity

## Preprocessing
To preprocess the emails, let's create a collection of functions that will perform each process on a body of text. Then, we can tie them together in an umbrella function, and apply it to the series with `series.apply()`.

- 0.1 Isolate key, body in subFrame
- 0.2 Convert to lowercase
- 0.3 Remove stop words (nltk library)
- 0.4 Remove punctuation
 - *special case: " ' "*
- 0.5 Single characters
- 0.6 Stemming (Stemming converts words to its stem) AND/OR Lemmatisation (Reduce word to root synonym of a word; will make sure is dictionary word)
 - Can do either or both; recommend lemma then stem
- 0.7 Converting numbers

In [2]:
# import library
import numpy as np
import pandas as pd
import re
import sys
import concurrent.futures

In [3]:
# import master DataFrame mdf
df = pd.read_csv('./data/02_add_gender.csv', header=0, index_col=False)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169285 entries, 0 to 169284
Data columns (total 21 columns):
f_dir        169285 non-null object
m_id         169285 non-null object
m_date       169285 non-null object
m_from       169285 non-null object
m_to         163413 non-null object
m_cc         52714 non-null object
m_bcc        50250 non-null object
m_subj       162429 non-null object
mime_vers    169285 non-null float64
cont_type    169285 non-null object
encode       169285 non-null object
x_from       169285 non-null object
x_to         164697 non-null object
x_cc         50633 non-null object
x_bcc        131 non-null object
x_fold       169285 non-null object
x_orig       169285 non-null object
x_fname      167315 non-null object
m_body       169285 non-null object
name         162847 non-null object
gender       162847 non-null object
dtypes: float64(1), object(20)
memory usage: 27.1+ MB


In [5]:
# total email body chars (for process tracking)
start_chars = df.m_body.apply(len).sum()

print('Total characters across all email bodies in corpus: {}.'.format(start_chars))

Total characters across all email bodies in corpus: 135952755.


In [6]:
df.head(2)

Unnamed: 0,f_dir,m_id,m_date,m_from,m_to,m_cc,m_bcc,m_subj,mime_vers,cont_type,...,x_from,x_to,x_cc,x_bcc,x_fold,x_orig,x_fname,m_body,name,gender
0,allen-p/_sent_mail/1,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,,,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast,phillip,boy
1,allen-p/_sent_mail/10,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,,,Re:,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,phillip,boy


### Reduce Set on Gender Null Values, Not Found

In [7]:
len(df)

169285

In [8]:
# check gender values
df.groupby(['gender']).size()

gender
boy          92565
girl         69849
not_found      433
dtype: int64

In [9]:
# null count
len(df[df.gender.isnull()])

6438

In [10]:
# create capture condition: is null OR gender not found
cond = (df.gender.isnull()) | (df.gender == 'not_found')

# remove condition
df = df[~cond]

### Reduce Set Based on Character Count

### 0.1 Isolate key, body in subFrame

In [11]:
# create subFrame with directory, email body text
df = df[['m_body']]

In [12]:
type(df)

pandas.core.frame.DataFrame

In [13]:
# create testFrame from subFrame for method testing
#df = df.sample(n=5000, random_state=1)
#df = df.reset_index()
#tsf = sf.copy()

# show testFrame
df.head()

Unnamed: 0,m_body
0,Here is our forecast
1,Traveling to have a business meeting takes the...
2,test successful. way to go!!!
3,"Randy, Can you send me a schedule of the sal..."
4,Let's shoot for Tuesday at 11:45.


### 0.2 Convert to lowercase

In [14]:
# lowers case
def no_talls(t):

    return t.lower()

# test
print(df.m_body.head().apply(no_talls))

0                                 here is our forecast
1    traveling to have a business meeting takes the...
2                       test successful.  way to go!!!
3    randy,   can you send me a schedule of the sal...
4                    let's shoot for tuesday at 11:45.
Name: m_body, dtype: object


### 0.3 Remove stop words (nltk library)

In [15]:
# import library
import nltk # natural language toolkit
from nltk.tokenize import word_tokenize # creates tokenized words
from nltk.corpus import stopwords

# set stopwords list to english
stop_words = set(stopwords.words('english'))

In [16]:
# removes stopwords
def no_stops(t):
    
    # new text to be returned
    nt = ""
    
    # split text into list of words
    words = word_tokenize(t)
    
    for word in words:
        if word not in stop_words:
            nt = nt + ' ' + word
    
    return nt
    
# test
print(df.m_body.head().apply(no_stops))

0                                        Here forecast
1     Traveling business meeting takes fun trip . E...
2                       test successful . way go ! ! !
3     Randy , Can send schedule salary level everyo...
4                         Let 's shoot Tuesday 11:45 .
Name: m_body, dtype: object


### 0.4 Remove punctuation

In [17]:
# removes select punctuation
def no_puncs(t):
    
    # list of punctuation marks
    puncs = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n,"

    for p in puncs:
        t = t.replace(p, ' ')
    
    return t

# test
print(df.m_body.head().apply(no_puncs))

0                                 Here is our forecast
1    Traveling to have a business meeting takes the...
2                       test successful   way to go   
3    Randy    Can you send me a schedule of the sal...
4                    Let's shoot for Tuesday at 11 45 
Name: m_body, dtype: object


### 0.4.1 Remove apostrophe

In [18]:
# removes apostrophe
def no_aposts(t):
    
    t = t.replace("'", ' ')
    
    return t

# test
print(df.m_body.head().apply(no_aposts))

0                                 Here is our forecast
1    Traveling to have a business meeting takes the...
2                       test successful.  way to go!!!
3    Randy,   Can you send me a schedule of the sal...
4                    Let s shoot for Tuesday at 11:45.
Name: m_body, dtype: object


### 0.5 Single characters

In [19]:
# remove single characters
def no_singles(t):
    
    nt = ""
    
    words = word_tokenize(t)
    
    for word in words:
        if len(word) > 1:
            nt = nt + ' ' + word
            
    return nt

# test
print(df.m_body.head().apply(no_singles))

0                                 Here is our forecast
1     Traveling to have business meeting takes the ...
2                            test successful way to go
3     Randy Can you send me schedule of the salary ...
4                    Let 's shoot for Tuesday at 11:45
Name: m_body, dtype: object


### 0.6.1 Lemmatisation

In [20]:
# import library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# create instance
lemmatizer = WordNetLemmatizer()

# lemmatizer test
print(lemmatizer.lemmatize('better', pos = 'a'))

good


In [21]:
# lemmatize text
def go_lemms(t):
    
    nt = ""
    
    words = word_tokenize(t)
    
    for word in words:
        word = lemmatizer.lemmatize(word) # noun
        word = lemmatizer.lemmatize(word, pos = 'a') # adjective
        word = lemmatizer.lemmatize(word, pos = 'v') # verb
        nt = nt + ' ' + word

    return nt

# test
print(df.m_body.head().apply(go_lemms))

0                                 Here be our forecast
1     Traveling to have a business meet take the fu...
2                    test successful . way to go ! ! !
3     Randy , Can you send me a schedule of the sal...
4                  Let 's shoot for Tuesday at 11:45 .
Name: m_body, dtype: object


### 0.6.2 Stemming

In [22]:
# import library
from nltk.stem.snowball import SnowballStemmer

# create an instance
stemmer = SnowballStemmer("english")

# stemmer test
print(stemmer.stem("running"))

run


In [23]:
# stem text
def go_stems(t):

    nt = ""
    
    words = word_tokenize(t)

    for word in words:
        nt = nt + ' ' + stemmer.stem(word)

    return nt
        
# test
print(df.m_body.head().apply(go_stems))

0                                 here is our forecast
1     travel to have a busi meet take the fun out o...
2                       test success . way to go ! ! !
3     randi , can you send me a schedul of the sala...
4                  let 's shoot for tuesday at 11:45 .
Name: m_body, dtype: object


### 0.7 Converting numbers

In [24]:
# import library
from num2words import num2words

In [25]:
# convert numbers to text
def no_numbs(t):
        
    nt = ""
    
    words = word_tokenize(t)

    for word in words:
        try:
            word = num2words(int(word))
        except:
            a = 0
        nt = nt + ' ' + word

    return nt

# test
print(df.m_body.head().apply(no_numbs))

0                                 Here is our forecast
1     Traveling to have a business meeting takes th...
2                    test successful . way to go ! ! !
3     Randy , Can you send me a schedule of the sal...
4                  Let 's shoot for Tuesday at 11:45 .
Name: m_body, dtype: object


### 0.8 Preprocessing function

In [26]:
# combines preprocessing steps
def preprocess(t):

    t = no_talls(t)
    t = no_puncs(t)    
    t = no_aposts(t)
    t = no_singles(t)
    t = no_numbs(t)
    t = no_stops(t)
    t = go_lemms(t)
    t = go_stems(t)
    t = no_puncs(t)    
    t = no_numbs(t)
    
    # process numbers to text
    #t = no_talls(t)
    #t = no_numbs(t)

    # process punctuation, single characters, stopwords
    #t = no_puncs(t)
    #t = no_aposts(t)    
    #t = no_singles(t) 
    #t = no_stops(t)

    # t = no_numbs(t)
    
    # lemm & stem
    #t = go_lemms(t)
    #t = go_stems(t) 

    # cleanup set
    #t = no_numbs(t)

    # return final text
    return t

# test
print(df.m_body.head().apply(preprocess))

0                                             forecast
1     travel busi meet take fun trip especi prepar ...
2                                  test success way go
3     randi send schedul salari level everyon sched...
4                     let shoot tuesday eleven forty f
Name: m_body, dtype: object


## Preprocess Sample Into New Column

In [27]:
%%time
# create new column
df['m_body'] = df.m_body.apply(preprocess)

# concurrent futures executor
#with concurrent.futures.ThreadPoolExecutor() as executor:  
    #tsf['p_body'] = executor.map(preprocess, tsf.m_body)

# view head
df.head()

Wall time: 16min 32s


Unnamed: 0,m_body
0,forecast
1,travel busi meet take fun trip especi prepar ...
2,test success way go
3,randi send schedul salari level everyon sched...
4,let shoot tuesday eleven forty f


In [28]:
# total email body chars (for process tracking)
process_chars = df.m_body.apply(len).sum()

print('Total characters after processing: {}.'.format(process_chars))

Total characters after processing: 81459576.


## Removing Any Blanks

    #check count of entries
    print('DataFrame Entries: {}.'.format(len(df)))

    # condition set to blank body
    cond = df.m_body == ''

    print(df[cond])

    # return non-blanks to new df
    df = df[~cond]

    #check count of entries
    print('DataFrame Entries: {}.'.format(len(df)))

## Calculating TF-IDF
- Calculate *document frequency*
- Calculate TF-IDF

### 1.0 Calculate DF

In [29]:
# iterate through all words in all documents, store document id's for each word

def calculate_doc_freq(text_body, a_dict):
    
    words = word_tokenize(text_body)
    
    for i, w in enumerate(words):
        
        try:
            a_dict[w].add(i)
            
        except:
            a_dict[w] = {i}

In [30]:
%%time
DF_dict = {} # dictionary for document frequency (ie occurence of word across all documents)

# call document frequency method
df.m_body.apply(calculate_doc_freq, a_dict=DF_dict)

# prints number of words found
print('Found {} unique words.'.format(len(DF_dict)))

# replace the set of catches with a count
for i in DF_dict:
    DF_dict[i] = len(DF_dict[i])

Found 111867 unique words.
Wall time: 1min 11s


In [31]:
# count of total words included in the corpus
total_vocab_size = len(DF_dict)

# creates list of total vocab
total_vocab = [word for word in DF_dict]

# view list
print(total_vocab[:5])

['forecast', 'travel', 'busi', 'meet', 'take']


### 1.1 Calcuate TF-IDF

In [32]:
# get number of corpus
N = len(df)

# print text
print('The number of documents in current corpus is {}.'.format(N))

The number of documents in current corpus is 162414.


In [33]:
# returns the document frequency of a word
def doc_freq(word):

    c = 0
    try:
        c = DF_dict[word]
    except:
        pass
    
    return c

In [34]:
# import library
import collections
import math

In [35]:
# calculates tf_idf
def calculate_tf_idf(doc_id, text_body):
    """Calculates tf-idf and saves the score to the tf_idf dictionary as a (doc_id, word) tuple key.
       
       Make sure to initialize the 'tf_idf' dictionary outside of this function prior to call.
    
       Arguments: 
           - Document identifier
           - Body of text"""
    
    # tokenizes each word in the text body
    words = word_tokenize(text_body)
    
    # creates a collection of word counts from the text body; referenced like a dictionary
    counter = collections.Counter(words)
    
    # loop through each word in list using unique values
    for word in np.unique(words):
        
        # term frequency = count of word in document / number of words in document
        tf = counter[word]/len(words)
        
        # document frequency is called from df dictionary
        df = doc_freq(word)
        
        # inverse document frequency is calculated with log(N as number of documents in corpus / df + 1)
        # 1 is added to df during inverse incase no words exist so that a #div0 is not returned
        idf = math.log(N/(df+1))
        
        # record tf-idf score to doc_id, word key
        tf_idf[doc_id, word] = tf*idf

In [36]:
%%time

# initalize dictionary
tf_idf = {}

# loop through index and processed text items
for idx, email in df.m_body.items():
    
    # calculate tf-idf
    calculate_tf_idf(idx, email)

Wall time: 1min 33s


In [37]:
# number of tf_idf scores
print(len(tf_idf))

7117964


## Ranking with Cosine Similarity

In [38]:
# calculates cosine similarity between two text bodies
def cosine_sim(a, b):
    """Calculates cosine similarity between two text bodies.
       Arguments:
       - a: set of vectorized values
       - b: second body vectorized set"""
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim    

### Vectorizing tf-idf Across Emails, Total Words

In [42]:
def gen_vector(text_body):
    
    # tokenizes each word in the text body
    words = word_tokenize(text_body)
    
    Q = np.zeros((total_vocab_size))
    
    counter = collections.Counter(words)
    words_count = len(words)
    
    for word in np.unique(words):
        
        tf = counter[word]/words_count
        df = doc_freq(word)
        idf = math.log((N)/(df+1))
        
        try:
            idx = total_vocab.index(word)
            Q[idx] = tf*idf
        except:
            pass
    
    return Q

In [43]:
def cos_sim_bulk(corpus, query, q_val):
    """Returns a dictionary of any cosine similarity calculations that match query.
       Note: All cosine similarity are calculated within the function. 
             For accurate returns, the index should match the sequence of the corpus entries.
       Arguments:
       corpus: a collection of text bodies
       query: either of three  is >, <, =
       q_val is the value to query."""

    # initialize dictionary
    #a_dict = {}
    
    # get length of corpus
    c = len(corpus)
    
    # setup the loop cycle
    for a in range(c):
        av = gen_vector(df.iloc[a, 0])
        for b in range(c):
            # reduce output by not including 1 CS match to self
            if a == b:
                pass
            else:
                bv = gen_vector(df.iloc[b, 0])

                # calculate cosine similarity
                score = cosine_sim(av, bv)

                # evaluate the query
                if query == 'g':
                    if (score > q_val):
                        try:
                            CS_dict[a].add(b)
                        except:
                            CS_dict[a] = {b}
                # if less than
                elif query == 'l':
                    if (score < q_val):
                        try:
                            CS_dict[a].add(b)
                        except:
                            CS_dict[a] = {b}
                # if equal to
                elif query == 'e':
                    if (score == q_val):
                        try:
                            CS_dict[a].add(b)
                        except:
                            CS_dict[a] = {b}
    
    #return a_dict

In [None]:
%%time

# return CS 

CS_dict = {}

test_frame = list(df.m_body)

corpus = test_frame
query = 'g'
q_val = .95

cos_sim_bulk(corpus, query, q_val)
print(len(CS_dict))

In [46]:
def cosine_similarity(k, doc_id, text_body):
    
    # tokenizes each word in the text body
    # words = word_tokenize(text_body)
    
    print('Cosine Similarity')
    
    print('\nQuery: ', doc_id)

    d_cosines = []
    
    query_vector = gen_vector(text_body)
    
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))
    
    out = np.array(d_cosines).argsort()[-k:][::-1]
    
    print('')
    print(out)
    print('')
    
    d_cosines.sort(reverse = True)
    
    for thing in d_cosines[:k]:
        print(thing)

In [55]:
len(CS_dict)

4

In [56]:
print(CS_dict)

{1: {197}, 14: {198}, 19: {199}, 25: {200}}

In [57]:
# TEST CELL check body text
df.loc[1, 'm_body']

' travel busi meet take fun trip especi prepar present would suggest hold busi plan meet take trip without formal busi meet would even tri get honest opinion whether trip even desir necessari far busi meet think would product tri stimul discuss across differ group work often present speak other quiet wait turn meet might good hold round tabl discuss format suggest go austin play golf rent ski boat jet ski fli somewher take much time'

In [59]:
# TEST CELL check body text
df.loc[197, 'm_body']

' travel busi meet take fun trip especi prepar present would suggest hold busi plan meet take trip without formal busi meet would even tri get honest opinion whether trip even desir necessari far busi meet think would product tri stimul discuss across differ group work often present speak other quiet wait turn meet might good hold round tabl discuss format suggest go austin play golf rent ski boat jet ski fli somewher take much time'

In [61]:
# TEST CELL check body text
df.loc[19, 'm_body']

' reagan want give updat chang unit mix includ bedroom reduc number build twelv kipp flore work construct draw time pursu fha financ construct draw complet send revis bid origin bid competit still attract firm strong local presenc contact phillip'

In [60]:
# TEST CELL check body text
df.loc[199, 'm_body']

' reagan want give updat chang unit mix includ bedroom reduc number build twelv kipp flore work construct draw time pursu fha financ construct draw complet send revis bid origin bid competit still attract firm strong local presenc contact phillip'