## Import data

In [1]:
# import pandas
import pandas as pd
# viewing options
pd.set_option('max_colwidth', 100)

In [2]:
%%time
# import dataframe, drop duplicates
df = pd.read_csv('./data/enron/03_processed_body.csv', index_col=0)
df.groupby(['gender']).size()

Wall time: 6.71 s


gender
boy     235804
girl    174119
dtype: int64

## Monitoring impact on label
We want to keep an eye on the male/female ratio when filtering out our dataset (especially when blanket removing things like 'duplicates', where the filtering can be applied to either label for the same condition) so that our label ratio isn't significantly unbalanced as a result.

In [3]:
def monitor_label(df):
    b = df.groupby(['gender']).size().boy
    g = df.groupby(['gender']).size().girl
    print('Frame Size: {}, B/G Ratio: {:.3f}'.format(b+g, b/g))
    
monitor_label(df)

Frame Size: 409923, B/G Ratio: 1.354


## Remove NaN from gender, email

In [4]:
# set dataframe to not NaN gender returns
df = df[df.gender.notna()]

# return index of NaN email body
_ = df[df.p_body.isna()].index

# drop NaN body
df = df.drop(_)

monitor_label(df)

Frame Size: 409719, B/G Ratio: 1.354


## Remove duplicate emails from body

In [5]:
# set dataframe to dropped duplicates
df = df.drop_duplicates('p_body')

monitor_label(df)

Frame Size: 182823, B/G Ratio: 1.315


## Reset index
Prior to using the indices and indexes to reference data in the dataframe, we'll need to reset the index so the values line up.

In [6]:
# reindex dataframe for cosine similarity matching
df = df.reset_index(drop=True)

## Cosine similarity (cossim) to find near-duplicate items
Here we'll use cosine similarity scores across samples from our population to review for filtering criteria. Because we're working with limited memory, we'll take a sample from the dataframe, transform it with tfidf, and calculate the cossim score.    

Once the cossim matrix is returned for all documents in the corpus, we'll get a list of the indices where cossim score is greater than ~90-95% and filter the dataframe by indexing the indice returns. **This will return a subset of the sample with high cossim.**    

Finally, we'll fit the data with a Multinomial Naive Bayes model to identify features from the dataset for review.

### Use Cosine Similarity (cossim) to evaluate identical email body items

In [7]:
%%time
# import numpy
import numpy as np

# viewing options
np.set_printoptions(edgeitems=10)
np.core.arrayprint._line_width = 300

# user function
def cos_sim_this(df, p=0.95, n=10000):
    """input a dataframe, cos sim %, and sample number
       return a filtered dataframe of the cos sim results"""
    # reset index (needs full, linear index or will throw flag)
    df = df.reset_index(drop=True)
    
    # collect a sample
    sample = df.sample(n, random_state=42)

    # feature extraction - tfidf
    from sklearn.feature_extraction.text import TfidfVectorizer
    tf_vect = TfidfVectorizer()
    tfidf = tf_vect.fit_transform(sample.p_body)

    # pairwise - cosine similarity
    from sklearn.metrics.pairwise import cosine_similarity
    cos = cosine_similarity(tfidf) # return pairwise similarities between all samples in input (Note: can be sliced [0:1000])

    # make a list of array indices that match our cos % condition for dataframe indexing
    idx_list = []
    for doc_ind in np.arange(len(cos)):
        doc_tup = np.where(cos[doc_ind] > p)
        if len(doc_tup[0]) > 1:
            x = doc_tup[0]
            idx_list.append(x[x != doc_ind][0])
    idx_list = list(set(idx_list))

    # get high cossim selection from original df (by index)
    filter_df = df.loc[idx_list, :]
    
    return filter_df


# call function
initial_sample = cos_sim_this(df, p=0.9, n=20000)

Wall time: 33.2 s


## Using a model to identify features for filtering

### Vectorize input data
We'll use a Tfidf vectorizer to transform the text here because we're interested in identifying 'features' across our subsample with high cosine similarity to identify any keywords that can help with filtering out mass almost-perfect duplicates that would skew our dataset (ex. FW:, RE:, spam, etc).

### Reviewing 'forwarded by'

In [8]:
%%time

# create stopwords list
from nltk.corpus import stopwords

# set stopwords list to english
stop_words = set(stopwords.words('english'))


def check_feature_perc(s, a_df):
    ft_len = len(a_df[a_df.p_body.str.contains(s)])
    df_len = len(a_df)
    return (ft_len/df_len)*100


def get_features(df, n=10, ngrams=(1,1), transform='t', stop_words=stop_words):
    # get variables for model
    Xf = df.p_body.values
    yf = np.zeros((len(Xf)), np.int8) # dumbie label '0'

    # instantiate, fit transformer
    if transform == 't':
        from sklearn.feature_extraction.text import TfidfVectorizer
        tf_vect = TfidfVectorizer(analyzer='word', ngram_range=ngrams, stop_words=stop_words)
        X_t = tf_vect.fit_transform(Xf)
        v = tf_vect
    
    # instantiate, fit vectorizer
    if transform == 'c':
        from sklearn.feature_extraction.text import CountVectorizer
        c_vect = CountVectorizer(analyzer='word', ngram_range=ngrams, stop_words=stop_words)
        X_t = c_vect.fit_transform(Xf)
        v = c_vect

    # import model
    from sklearn.naive_bayes import MultinomialNB

    # instantiate model object
    mnb = MultinomialNB()

    # fit model on data
    mnb.fit(X_t, yf)

    # return features by coefs
    def show_most_informative_features(vectorizer, clf, n=n):
        feature_names = vectorizer.get_feature_names()
        coefs_with_fns = sorted(zip(clf.coef_[0], feature_names), reverse=True)
        top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
        print('Most Informative Features\n(coef score, % of df has feature, feature string)\n')
        for (coef_1, fn_1), (coef_2, fn_2) in top:
            p_1 = check_feature_perc(fn_1, df)
            print('{:.3f} {:.0f}% {}'.format(coef_1, p_1, fn_1))
            
            #p_2 = check_feature_perc(fn_2, df)
            #print('{:.0f}% {} {:.3f}\t\t{:.0f}% {} {:.3f}'.format(p_1, fn_1, coef_1, p_2, fn_2, coef_2))
            # print('{:.3f} {}, {:.3f} {}'.format(coef_1, fn_1, coef_2, fn_2))

    show_most_informative_features(v, mnb)
    

# call function
get_features(initial_sample, n=20, transform='t', ngrams=(4,4))

Most Informative Features
(coef score, % of df has feature, feature string)

-9.593 12% hou ect ect cc
-9.741 9% ect ect cc subject
-10.152 4% eric bass hou ect
-10.289 5% sally beck hou ect
-10.316 3% bass hou ect ect
-10.380 4% enron north america corp
-10.404 3% john arnold hou ect
-10.410 4% enron com cc subject
-10.433 2% wireless handheld www blackberry
-10.433 0% sent blackberry wireless handheld
-10.433 2% handheld www blackberry net
-10.433 2% blackberry wireless handheld www
-10.482 0% original message arnold john
-10.489 4% hou ect ect subject
-10.489 2% arnold hou ect ect
-10.494 0% message arnold john sent
-10.497 2% phillip allen hou ect
-10.508 4% beck hou ect ect
-10.610 2% eric bass enron com
-10.661 3% thanks lynn original message
Wall time: 1.92 s


### Adding to stop words list

In [9]:
# create a new stop words list that includes 'enron', 'ect', 'hou', 'phillip allen', 'eric bass', 'arnold john'
add_stops = ['enron', 'ect', 'hou', 'phillip', 'allen', 'eric', 'bass', 'arnold', 'john', 'com']
new_stops = list(stop_words) + add_stops
new_stops = set(new_stops)

# call function
get_features(initial_sample, n=20, transform='t', ngrams=(2,5), stop_words=new_stops)

Most Informative Features
(coef score, % of df has feature, feature string)

-10.796 33% original message
-11.151 19% cc subject
-11.160 0% let know
-11.595 10% would like
-11.659 9% sent monday
-11.709 8% sent tuesday
-11.740 8% please let
-11.773 7% sally beck
-11.773 0% please let know
-11.777 8% sent thursday
-11.851 6% october pm
-11.876 7% sent wednesday
-11.884 4% http www
-11.894 8% subject fw
-11.934 6% sent friday
-11.959 5% north america
-11.970 5% blair lynn
-12.013 3% michelle cash
-12.042 4% thanks lynn
-12.060 0% pm cc


## Review email phrases
Here, we're going to pass phrases back to the cossim calculator and return the % of emails that meet the cossim threshold, relative to the overall number of emails in the sample. We'll use a lower threshold to amplify the significance of each phrase.

In [10]:
def cosine_by_phrases(s, n=10000, p=0.8):
    
    first_sample = cos_sim_this(df=df, p=p, n=n)
    print('% of sample frame that meets cosim threshold: {:.0f}%\n'.format(len(first_sample)*100/n))
    
    for phrase in s:
        cond = df.p_body.str.contains(phrase)
        df_filt = df[cond]
        second_sample = cos_sim_this(df=df_filt, p=p, n=n)
        
        print("'{}' filter: {:.0f}%".format(phrase, len(second_sample)*100/n))

In [11]:
s = ['cc subject', 'original message', 'forwarded by', 'subject fw']
cosine_by_phrases(s)

% of sample frame that meets cosim threshold: 6%

'cc subject' filter: 18%
'original message' filter: 20%
'forwarded by' filter: 20%
'subject fw' filter: 35%


> From a random sample of 10,000 emails pulled from our dataset, **35%** of the sample meets the cossim threshold when filtered by the phrase **'subject fw'** versus 6% returned by a control sample.

In [12]:
cosine_by_phrases(s, p=.95)

% of sample frame that meets cosim threshold: 3%

'cc subject' filter: 10%
'original message' filter: 11%
'forwarded by' filter: 12%
'subject fw' filter: 29%


> Even when we increase the cossim threshold, the phrase **'subject fw'** still returns **29%** meeting the cossim threshold, compared to only 3% from a control sample.

## Checking the impact of filters

In [13]:
n = 20000
p = .95

# none filtered
a_sample = cos_sim_this(df=df, p=p, n=n)
print("0 filters: {:.0f}%".format(len(a_sample)*100/n))
print(monitor_label(df))
print('')

# check impact of removing 1 phrase
cond = (df.p_body.str.contains('subject fw'))
f_df = df[~cond]
a_sample = cos_sim_this(df=f_df, p=p, n=n)
print("1 filter('subject fw'): {:.0f}%".format(len(a_sample)*100/n))
print(monitor_label(f_df))
print('')

# check impact of removing 1 phrase
cond = (df.p_body.str.contains('original message'))
f_df = df[~cond]
a_sample = cos_sim_this(df=f_df, p=p, n=n)
print("1 filter('original message'): {:.0f}%".format(len(a_sample)*100/n))
print(monitor_label(f_df))
print('')

# check impact of removing 1 phrase
cond = (df.p_body.str.contains('forwarded by'))
f_df = df[~cond]
a_sample = cos_sim_this(df=f_df, p=p, n=n)
print("1 filter('forwarded by'): {:.0f}%".format(len(a_sample)*100/n))
print(monitor_label(f_df))
print('')

# check impact of removing 1 phrase
cond = (df.p_body.str.contains('cc subject'))
f_df = df[~cond]
a_sample = cos_sim_this(df=f_df, p=p, n=n)
print("1 filter('cc subject'): {:.0f}%".format(len(a_sample)*100/n))
print(monitor_label(f_df))
print('')

# check impact of removing all phrases
cond = (df.p_body.str.contains('subject fw') | df.p_body.str.contains('forwarded by') | df.p_body.str.contains('original message') | df.p_body.str.contains('cc subject'))
f_df = df[~cond]
a_sample = cos_sim_this(df=f_df, p=p, n=n)
print(monitor_label(f_df))
print("4 filters: {:.0f}%\n".format(len(a_sample)*100/n))

0 filters: 4%
Frame Size: 182823, B/G Ratio: 1.315
None

1 filter('subject fw'): 4%
Frame Size: 169560, B/G Ratio: 1.303
None

1 filter('original message'): 3%
Frame Size: 131472, B/G Ratio: 1.227
None

1 filter('forwarded by'): 4%
Frame Size: 149053, B/G Ratio: 1.327
None

1 filter('cc subject'): 4%
Frame Size: 144231, B/G Ratio: 1.321
None

Frame Size: 90085, B/G Ratio: 1.222
None
4 filters: 2%



> Though **subject fw** has the higher similarity scores when isolated, the frequency of the phrase across the dataset is low - mitigating the influence of possible duplicates.    
> 
> When we apply each filter individually, there isn't a significant reduction in the cos sim score, relative to the reduction across the dataset. When we apply all 4 filters, both the dataset size and cosim % are reduced roughly the same amount.

In [16]:
# df.to_csv('./data/enron/03_filtered_dataset.csv')