In [1]:
import time
# Reference point for the whole notebook: the last cell subtracts this value
# from a second perf_counter() reading to report the total run time.
start_time = time.perf_counter()

In [2]:
%matplotlib inline
# version check: import every dependency, then report what is installed
import numpy
import pandas
import sklearn
import matplotlib
import regex
import scipy

_version_line = 'The {} version is {}.'
for _label, _module in (
    ('numpy', numpy),
    ('pandas', pandas),
    ('scikit-learn', sklearn),
    ('matplotlib', matplotlib),
    ('regex', regex),
    ('scipy', scipy),
):
    print(_version_line.format(_label, _module.__version__))

The numpy version is 1.18.1.
The pandas version is 1.0.4.
The scikit-learn version is 0.23.1.
The matplotlib version is 3.2.1.
The regex version is 2.5.80.
The scipy version is 1.4.1.


In [3]:
# set random state for reproducibility
random_state = 42

# default numpy settings (kept as a reference for restoring compact output)
import numpy as np
# Use the public print option for the line width instead of writing to the
# private np.core.arrayprint module; both calls below go through the same API.
np.set_printoptions(edgeitems=3, linewidth=80)

# update settings: wider lines, more edge items, floats shown with 2 decimals
np.set_printoptions(edgeitems=15, linewidth=150,
    formatter=dict(float=lambda x: "%.2f" % x))

# update pandas settings
import pandas as pd
# Fully-qualified option name; the short form relies on pattern matching.
pd.set_option('display.max_colwidth', 50)
pd.options.display.max_columns = 25
pd.options.display.max_rows = 150
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# system
import sys
#sys.getsizeof

# timeit
import timeit

In [4]:
# plotting functions
import numpy as np
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from matplotlib import rcParams
# Global seaborn look for every figure in the notebook: the "whitegrid"
# style with the "notebook" scaling context.
sns.set_style("whitegrid")
sns.set_context("notebook")

def get_character_counts(df, col='m_body', new_name='n_char'):
    """Add a column holding the length of each value in `df[col]`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place (the new column is
        written onto it) and also returned.
    col : str
        Name of the column whose values are measured with ``len``.
    new_name : str
        Name of the column that receives the lengths.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, now containing `new_name`.
    """
    # A plain comprehension also handles a zero-row frame; np.vectorize(len)
    # raises ValueError there because it cannot infer an output type.
    df[new_name] = np.array([len(value) for value in df[col]], dtype=int)
    return df

def plot_ecdf(x, xlab='', ylab='', title='', color=None, xlim=None, ylim=None, logx=False, logy=False, hline=None, vline=None, alpha=1, s=1):
    """Scatter the empirical CDF of `x` using the pyplot state interface.

    The values are sorted and plotted against i/n for i = 1..n, where n is
    the number of values. `color`, `s` and `alpha` are passed to
    ``plt.scatter``; `xlab`, `ylab` and `title` label the plot; `xlim` and
    `ylim` are handed to ``plt.xlim`` / ``plt.ylim``. `logx` / `logy` switch
    the matching axis to a log scale, and `hline` / `vline`, when not None,
    add a red dashed reference line at that position.
    """
    ordered = np.sort(x)
    count = ordered.size
    proportion = np.arange(1, count + 1) / count

    plt.scatter(ordered, proportion, color=color, s=s, alpha=alpha)
    plt.xlabel(xlab, fontsize=14)
    plt.ylabel(ylab, fontsize=14)
    plt.xlim(xlim)
    plt.ylim(ylim)
    plt.title(title)

    # x axis first, then y axis
    for wants_log, set_scale in ((logx, plt.xscale), (logy, plt.yscale)):
        if wants_log == True:
            set_scale('log')

    # horizontal reference first, then vertical
    for position, draw_line in ((hline, plt.axhline), (vline, plt.axvline)):
        if position is not None:
            draw_line(position, color='r', linestyle='--')

def plot_scatter(x, y, xlab='', ylab='', title='', logx=False, logy=False, hline=None, vline=None, s=None):
    """Draw a semi-transparent scatter of `y` against `x` via pyplot.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points.
    xlab, ylab, title : str
        Axis labels and plot title.
    logx, logy : bool
        Switch the matching axis to a log scale.
    hline, vline : float or None
        When not None, add a red dashed reference line at that position.
    s : float or None
        Marker size passed to ``plt.scatter``. The default None leaves the
        size to matplotlib, which matches the previous behaviour where this
        argument was accepted but ignored.
    """
    plt.scatter(x, y, alpha=0.25, s=s)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.title(title)
    if logx:
        plt.xscale('log')
    if logy:
        plt.yscale('log')
    if hline is not None:
        plt.axhline(hline, color='r', linestyle='--')
    if vline is not None:
        plt.axvline(vline, color='r', linestyle='--')

def plot_2_2(x, y, suptitle='', hline=None, vline=None):
    """Show `y` against `x` in a 2x2 grid covering every lin/log axis combination.

    Parameters
    ----------
    x, y : array-like
        Coordinates passed to ``plot_scatter`` for each panel.
    suptitle : str
        Title drawn above the whole figure.
    hline, vline : float or None
        Optional reference lines, forwarded to every panel. The default
        None draws no line.
    """
    plt.figure(figsize=(10,10))

    # (logx, logy, title) per panel, in subplot order; the titles name the
    # scale that is actually applied to each axis.
    panels = [
        (False, False, 'Plot 1: linx, liny'),
        (False, True, 'Plot 2: linx, logy'),
        (True, False, 'Plot 3: logx, liny'),
        (True, True, 'Plot 4: logx, logy'),
    ]
    for position, (logx, logy, title) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plot_scatter(x, y, xlab='', ylab='', title=title, logx=logx, logy=logy, hline=hline, vline=vline)

    plt.suptitle(suptitle, fontsize=16)
    plt.show()

def plot_2_4(_, suptitle='', hline=None, vline=None):
    """Show histograms and ECDFs of one series under every lin/log axis combination.

    The figure is a 2x4 grid: the top row uses a linear x axis and the bottom
    row a log x axis. Within each row the panels are histogram (linear y),
    histogram (log y), ECDF (linear y), ECDF (log y).

    Parameters
    ----------
    _ : object with a pandas-style ``.plot(kind='hist', ...)`` method
        The data to plot; it is also passed to ``plot_ecdf``.
    suptitle : str
        Title drawn above the whole figure.
    hline, vline : float or None
        Optional reference lines, forwarded to the ECDF panels only. The
        default None draws no line.
    """
    plt.figure(figsize=(20,10))

    # (kind, logx, logy) per panel, in subplot order.
    panels = [
        ('hist', False, False), ('hist', False, True),
        ('ecdf', False, False), ('ecdf', False, True),
        ('hist', True, False), ('hist', True, True),
        ('ecdf', True, False), ('ecdf', True, True),
    ]
    for position, (kind, logx, logy) in enumerate(panels, start=1):
        title = 'Plot %d: %s, %s' % (position, 'logx' if logx else 'linx', 'logy' if logy else 'liny')
        plt.subplot(2, 4, position)
        if kind == 'hist':
            _.plot(kind='hist', bins=30, title=title, logx=logx, logy=logy)
        else:
            plot_ecdf(_, xlab='', ylab='', title=title, logx=logx, logy=logy, hline=hline, vline=vline)

    plt.suptitle(suptitle, fontsize=16)
    plt.show()

    
def plot_sender_stats(df, n, suptitle=''):
    """Summarise how many rows each sender contributes, as a 2x2 figure.

    Panels: (1) horizontal bars for the `n` most frequent senders,
    (2) log-scaled histogram of rows per sender, (3) pie chart of distinct
    sender addresses for gender 0 versus gender 1, (4) ECDF of rows per
    sender with the x axis cut at the 95th percentile. Descriptive
    statistics are printed after the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``m_from`` and ``gender``.
    n : int
        Number of top senders to show in the bar chart. Values larger than
        the number of distinct senders are clamped so the bar positions,
        values and labels always have the same length.
    suptitle : str
        Title drawn above the whole figure.
    """
    plt.figure(figsize=(14, 12))

    # One row per sender, most frequent first: column 0 is the address,
    # column 1 the number of rows.
    sender_counts = pd.DataFrame(df.groupby(['m_from']).size()).sort_values(0, ascending=False).reset_index()
    labels = np.array(sender_counts.iloc[:, 0])
    values = np.array(sender_counts.iloc[:, 1])
    n_top = min(n, len(values))

    plt.subplot(2, 2, 1) # plot 1: horizontal bar
    plt.barh(np.linspace(n_top, 1, n_top), values[:n_top], tick_label=labels[:n_top])

    plt.subplot(2, 2, 2) # plot 2: full histogram
    plt.hist(values, bins=30)
    plt.yscale('log')

    plt.subplot(2, 2, 3) # plot 3: pie
    pie_x = [len(df[df.gender == 0].m_from.unique()), len(df[df.gender == 1].m_from.unique())]
    pie_lab = ['boy: %d email addresses' % (pie_x[0]), 'girl: %d email addresses' % (pie_x[1])]
    plt.pie(pie_x, labels=pie_lab, autopct='%.2f%%', colors=['#347DC1', '#FF85A2'])

    plt.subplot(2, 2, 4) # plot 4: ecdf
    plot_ecdf(values, xlim=(0,stats.scoreatpercentile(values,95)))

    plt.suptitle(suptitle)
    plt.show()
    print(sender_counts.describe())
    print(stats.describe(values))
    
def plot_character_frequency(df, suptitle=''):
    """Summarise the ``n_characters_start`` column as a 1x3 figure.

    Panels: (1) pie chart of the column total for gender 0 versus gender 1,
    (2) log-scaled histogram of the column, (3) ECDF with the x axis cut at
    the 95th percentile. Descriptive statistics are printed after the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``n_characters_start`` and ``gender``.
    suptitle : str
        Title drawn above the whole figure (previously accepted but never
        drawn).
    """
    v = np.array(df.n_characters_start.values) # values
    plt.figure(figsize=(18,6))

    plt.subplot(1, 3, 1) # plot 1: pie chart
    pie_x = [df[df.gender == 0].n_characters_start.sum(),df[df.gender == 1].n_characters_start.sum()]
    pie_lab = ['boy: %.3g characters' % (pie_x[0]), 'girl: %.3g characters' % (pie_x[1])]
    plt.pie(pie_x, labels=pie_lab, autopct='%.2f%%', colors=['#347DC1', '#FF85A2'])

    plt.subplot(1, 3, 2) # plot 2: full histogram
    plt.hist(v, bins=30)
    plt.yscale('log')

    plt.subplot(1, 3, 3) # plot 3: ecdf
    plot_ecdf(v, xlim=(0,stats.scoreatpercentile(v,95)))

    plt.suptitle(suptitle)
    plt.show()
    print(df.n_characters_start.describe())
    print(stats.describe(v))
    

    
def plot_gender_character_frequency(df, suptitle='', alpha=0.1, col='m_body', new_name='n_char'):
    """Compare per-row character counts for gender 0 versus gender 1 as a 1x3 figure.

    The counts are computed by ``get_character_counts`` from `col` and stored
    in `new_name` on `df` (the caller's frame gains that column). Panels:
    (1) pie chart of total characters per gender, (2) log-scaled histogram of
    both genders side by side, (3) overlaid ECDFs, each call setting the
    x limit to its own 97th percentile.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `col`, ``gender`` and ``n_characters_start``.
    suptitle : str
        Title drawn above the whole figure (previously accepted but never
        drawn).
    alpha : float
        Currently unused; the ECDF transparencies are fixed at .5 and .25.
    col : str
        Column whose values are measured.
    new_name : str
        Column that receives the counts and is read back for plotting.
    """
    # Forward col/new_name so the column written is the one read back below;
    # without this a non-default new_name raised KeyError.
    df = get_character_counts(df, col=col, new_name=new_name)
    boy_counts = np.array(df[df.gender == 0][new_name].values) # values
    girl_counts = np.array(df[df.gender == 1][new_name].values) # values
    plt.figure(figsize=(18,6))

    plt.subplot(1, 3, 1) # plot 1: pie chart
    pie_x = [np.sum(boy_counts), np.sum(girl_counts)]
    pie_lab = ['boy: %dM characters' % (pie_x[0]/1000000), 'girl: %dM characters' % (pie_x[1]/1000000)]
    plt.pie(pie_x, labels=pie_lab, autopct='%.2f%%', colors=['#347DC1', '#FF85A2'])

    plt.subplot(1, 3, 2) # plot 2: full histogram
    plt.hist(x=[boy_counts, girl_counts], bins=10, color=['#347DC1', '#FF85A2'], rwidth=1,)
    plt.yscale('log')

    plt.subplot(1, 3, 3) # plot 3: ecdf
    plot_ecdf(boy_counts, xlim=(0,stats.scoreatpercentile(boy_counts,97)), color='#347DC1', alpha=.5, s=.1)
    plot_ecdf(girl_counts, xlim=(0,stats.scoreatpercentile(girl_counts,97)), color='#FF85A2', alpha=.25, s=.1)

    plt.suptitle(suptitle)
    plt.show()

    # NOTE(review): the printed table describes n_characters_start, not the
    # new_name column plotted above - confirm that this is intended.
    bd = df[df.gender == 0].n_characters_start.describe()
    bd.name = 'boy'
    gd = df[df.gender == 1].n_characters_start.describe()
    gd.name = 'girl'
    bg_describe = pd.concat([bd, gd], axis=1)
    print(bg_describe)

## Input dataframe

In [5]:
%%time
# NOTE: despite its name, `input_directory` holds the path of a single CSV
# file; the chunked similarity cell further down reads the same path again.
input_directory = './data/enron/clean_clean_by_strings.csv'
# The first CSV column becomes the row index.
df = pd.read_csv(input_directory, index_col=0)

Wall time: 6.78 s


In [6]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170545 entries, 0 to 170544
Data columns (total 26 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   f_dir               170545 non-null  object 
 1   m_id                170545 non-null  object 
 2   m_date              170545 non-null  object 
 3   m_from              170545 non-null  object 
 4   m_to                167231 non-null  object 
 5   m_cc                49793 non-null   object 
 6   m_bcc               46726 non-null   object 
 7   m_subj              163666 non-null  object 
 8   mime_vers           170545 non-null  float64
 9   cont_type           170545 non-null  object 
 10  encode              170545 non-null  object 
 11  x_from              170545 non-null  object 
 12  x_to                167909 non-null  object 
 13  x_cc                47096 non-null   object 
 14  x_bcc               134 non-null     object 
 15  x_fold              170545 non-nul

In [7]:
# Peek at the first two rows to sanity-check the load.
df.head(2)

Unnamed: 0,f_dir,m_id,m_date,m_from,m_to,m_cc,m_bcc,m_subj,mime_vers,cont_type,encode,x_from,...,x_bcc,x_fold,x_orig,x_fname,o_body,m_body,gender,n_emails_sent,n_characters_start,clean_char,n_char,clean_body
0,./data/enron/maildir/farmer-d/logistics/12,<18632438.1075840432068.JavaMail.evans@thyme>,"Tue, 11 Dec 2001 06:07:33 -0800 (PST)",rita.wynne@enron.com,"michael.olsen@enron.com, stephen.swisher@enron...","sherry.anastas@enron.com, j..farmer@enron.com","sherry.anastas@enron.com, j..farmer@enron.com",Centana Storage Deal,1.0,text/plain; charset=us-ascii,7bit,"Wynne, Rita </O=ENRON/OU=NA/CN=RECIPIENTS/CN=R...",...,,"\ExMerge - Farmer, Darren\Logistics",FARMER-D,darren farmer 6-26-02.pst\n\n,Message-ID: <18632438.1075840432068.JavaMail.e...,"Mike/Stephen,\n\nHave the two of you been able...",1.0,57,268,268,268,"Mike/Stephen,\n\nHave the two of you been able..."
1,./data/enron/maildir/guzman-m/all_documents/1429,<18190482.1075840619280.JavaMail.evans@thyme>,"Thu, 21 Dec 2000 04:07:00 -0800 (PST)",caroline.emmert@enron.com,"geir.solberg@enron.com, holden.salisbury@enron...",virginia.thompson@enron.com,virginia.thompson@enron.com,Puget Sound Deal on October 17,1.0,text/plain; charset=us-ascii,7bit,Caroline Emmert,...,,\mark guzman 6-28-02\Notes Folders\All documents,GUZMAN-M,mark guzman 6-28-02.nsf\n\n,Message-ID: <18190482.1075840619280.JavaMail.e...,"Guys,\n\nPuget is claiming that we did a real-...",1.0,90,479,479,479,"Guys,\n\nPuget is claiming that we did a real-..."


## Set Series Global Variables

In [8]:
# Name of the text column that is vectorised for the similarity search and
# printed when inspecting matches.
Corpus_series = 'clean_body'
# Name of the label column. It is not referenced by the cells visible in this
# part of the notebook.
Target_series = 'gender'

## Randomize dataframe

In [9]:
def resample_frame(df, random_state=None):
    """Return the rows of `df` in random order with a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shuffle; it is not modified.
    random_state : int, numpy RandomState or None
        Passed to ``DataFrame.sample``. The default None keeps the previous
        unseeded behaviour; pass a fixed value for a repeatable order.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy. The original index labels are discarded.
    """
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Use the notebook-wide seed so the row order is the same on every run.
df = resample_frame(df, random_state=random_state) # resampled

## Cosine similarity (cossim) to find near-duplicate items
Here we'll use cosine similarity scores across samples from our population to review for filtering criteria. Because we're working with limited memory, we'll take a sample from the dataframe, transform it with tfidf, and calculate the cossim score.    

Once the cossim matrix is returned for all documents in the corpus, we'll collect the indices where the cossim score is greater than ~90-95% and use those indices to filter the dataframe. **This will return a subset of the sample with high cossim.**    

**What can we do with this?**
Once we determine a threshold that reflects, "Almost duplicate," we can:
- Trim the dataset by returning indices (emails/documents) that exceed our threshold;
- Fit a model to the returns for further evaluation (parse by 'high frequency words', create stopwords, etc.).

In [10]:
%%time
# user function
def cos_sim_this(df, p=0.95, n=10000, Corpus_series=Corpus_series, po=True):
    """Find near-duplicate documents among the first `n` rows of `df`.

    The text in `df[Corpus_series]` is vectorised with a default
    ``TfidfVectorizer`` and every pair of documents is scored with cosine
    similarity. Note that the full pairwise matrix is built in memory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents; only its first `n` rows are used.
    p : float
        Similarity threshold; a pair matches when its score is greater than `p`.
    n : int
        Number of leading rows to compare.
    Corpus_series : str
        Name of the text column. The default is captured from the global of
        the same name when this function is defined.
    po : bool
        When true, print a one-line summary of the result.

    Returns
    -------
    match_groups : list of numpy.ndarray
        One array per document that has at least one match other than
        itself, listing the positions whose score against it exceeds `p`.
    match_list : list
        De-duplicated positions taken from every group after dropping the
        group's first entry.

    All positions are 0-based offsets into the sampled rows, not index
    labels of `df`.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # take the n number of observations from the top of the frame
    sample = df[:n]

    # tfidf weights, then pairwise cosine similarity between all samples
    tfidf = TfidfVectorizer().fit_transform(sample[Corpus_series])
    cos = cosine_similarity(tfidf)

    match_groups = [] # returns groups for observation
    match_list = [] # returns list for df filtering
    for scores in cos:
        hits = np.where(scores > p)[0]
        # require more than one hit, i.e. more than a single matching position
        if len(hits) > 1:
            match_groups.append(hits)
            match_list.extend(hits[1:])
    match_list = list(set(match_list)) # use set to parse down list

    # print summary against the number of rows actually compared, which is
    # smaller than n when df has fewer than n rows
    if po:
        n_docs = len(sample)
        print('%0.f of %0.f considered near duplicates at %.2f threshold.\n%.2f percent' % (len(match_list), n_docs, p, len(match_list)*100/n_docs))

    return match_groups, match_list

Wall time: 0 ns


## Compile Cosine Similarity from Dataset p=99%

In [11]:
%%time
# instantiate list capture
chk_match_groups = []
chk_match_list = []
# set p
p = 0.99
# re-read the CSV lazily in chunks of 25000 rows so each pairwise similarity
# matrix stays bounded; near-duplicates are only detected within a chunk
chunked_df = pd.read_csv(input_directory, index_col=0, chunksize=25000)
# chunk that
for chunk in chunked_df:
    print('Processing %d to %d' % (np.min(chunk.index), np.max(chunk.index)))
    match_groups, match_list = cos_sim_this(chunk, p=p, n=len(chunk), Corpus_series=Corpus_series, po=False)
    # cos_sim_this reports 0-based positions inside the chunk; translate them
    # to the chunk's index labels so results from different chunks do not
    # collide and each entry identifies one row of the CSV
    labels = chunk.index.values
    chk_match_groups.extend(labels[group] for group in match_groups)
    chk_match_list.extend(labels[np.asarray(match_list, dtype=int)])
# print outputs
print('%d of %d considered near duplicates at %.2f threshold.\n%.2f percent' % (len(chk_match_list), len(df), p, len(chk_match_list)*100/len(df)))

Processing 0 to 24999
Processing 25000 to 49999
Processing 50000 to 74999
Processing 75000 to 99999
Processing 100000 to 124999
Processing 125000 to 149999
Processing 150000 to 170544
7215 of 170545 considered near duplicates at 0.99 threshold.
4.23 percent
Wall time: 2min 45s


## Print examples from the largest group of high-score cosine similarity p=99%

In [12]:
# Pick the match group with the most members (the first one on ties).
cnts = [len(group) for group in chk_match_groups]
max_grp = chk_match_groups[int(np.argmax(cnts))]
print('Printing messages from index numbers: %s\n' % (max_grp))

# NOTE(review): `df` was shuffled and re-indexed by resample_frame earlier in
# the notebook, while the match groups come from a fresh, unshuffled read of
# the CSV - so df.loc[idx] is not guaranteed to show the rows that were
# actually compared. Confirm before relying on this printout.
for idx in max_grp: # one email body per matched index
    print('[INDEX %s]' % (idx))
    print(df.loc[idx, Corpus_series])

Printing messages from index numbers: [  145   175   181   265   271   276   378   522   552   608   740   792   808   821   826   845   856   883   906   943   964   997  1041  1059
  1285  1323  1326  1350  1410  1468  1494  1550  1742  1770  1796  1865  2039  2446  2481  2588  2608  2712  2724  2751  2782  2796  2827  2874
  3040  3101  3107  3237  3286  3331  3339  3458  3559  3602  3629  3691  3698  3802  3834  3935  3983  4008  4092  4144  4295  4390  4525  4543
  4576  4584  4645  4694  4698  4724  4848  4955  5020  5063  5119  5133  5191  5247  5295  5303  5325  5336  5400  5492  5521  5693  5699  5715
  5769  5847  6043  6163  6262  6317  6337  6431  6561  6565  6572  6666  6738  6880  6970  6991  7008  7038  7063  7076  7112  7222  7327  7395
  7677  7678  7734  7892  7925  7961  7974  8048  8073  8108  8152  8226  8356  8436  8471  8490  8622  8695  8707  8959  8969  8997  9048  9152
  9245  9341  9342  9357  9376  9401  9412  9448  9449  9493  9561  9665  9734  9828  9875  

In [13]:
# Total time since the first cell, in seconds and whole minutes.
end_time = time.perf_counter()
elapsed_seconds = end_time - start_time
print('Run time: %.1fs (~%dm)' % (elapsed_seconds, elapsed_seconds / 60))

Run time: 174.1s (~2m)
