In [2]:
## Set inline plot for jupyter ipynb
% matplotlib inline

# import the modules
from time import time
from gensim import models, corpora
from nltk.corpus import stopwords
from six import iteritems
import numpy as np
import matplotlib.pyplot as plt
import os, sys, re, pickle

In [3]:
"""
# use peter-thompson/Topic-modelling-using-LDA Pre-processing functions
def re_clean_list():
    # define a list of regular expression clean up to use
    re_list = []
    re_list.append(re.compile('>'))
    re_list.append(re.compile('(Message-ID(.*?\n)*X-FileName.*?\n)|'
                     '(To:(.*?\n)*?Subject.*?\n)|'
                     '(< (Message-ID(.*?\n)*.*?X-FileName.*?\n))'))
    re_list.append(re.compile('(.+)@(.+)')) # Remove emails
    re_list.append(re.compile('\s(-----)(.*?)(-----)\s', re.DOTALL))
    re_list.append(re.compile('''\s(\*\*\*\*\*)(.*?)(\*\*\*\*\*)\s''', re.DOTALL))
    re_list.append(re.compile('\s(_____)(.*?)(_____)\s', re.DOTALL))
    re_list.append(re.compile('\n( )*-.*'))
    re_list.append(re.compile('\n( )*\d.*'))
    re_list.append(re.compile('(\n( )*[\w]+($|( )*\n))|(\n( )*(\w)+(\s)+(\w)+(( )*\n)|$)|(\n( )*(\w)+(\s)+(\w)+(\s)+(\w)+(( )*\n)|$)'))
    re_list.append(re.compile('.*orwarded.*'))
    re_list.append(re.compile('From.*|Sent.*|cc.*|Subject.*|Embedded.*|http.*|\w+\.\w+|.*\d\d/\d\d/\d\d\d\d.*'))
    re_list.append(re.compile(' [\d:;,.]+ '))
    return re_list

def remove_clutter(text,re_list):
    # takes in the regular expression to remove clutter
    for i in range(len(re_list)):
        text = re.sub(re_list[i], ' ', text)
    return text
"""

# ============================
# regular expression patterns
# ============================
# Patterns that extract header fields into the parsed structure. Most anchor on
# the header label with a look-behind so re.findall returns only the value.
date_pattern = re.compile(r'(?<=Date:).*', re.IGNORECASE)
msgId_pattern = re.compile(r'(?<=: <)(.*?)(?=@)', re.IGNORECASE)
from_pattern = re.compile(r"(?<=From: )[\"']*(.*?)(?:\"|\'|\n|\<|\[|\\)", re.IGNORECASE)  # note: still have to clean up names
sent_pattern = re.compile(r'(?<=Sent: ).*', re.IGNORECASE)
to_pattern = re.compile(r"(?<=To: )[\"']*(.*?)(?:\"|\'|\n|\<|\[|\\)", re.IGNORECASE)
# Skip any run of leading "Fw:" / "Fwd:" markers (and surrounding blanks), then
# capture the rest of the line. This must be a group, not a character class:
# a class like [Fw: | Fwd] eats ANY leading F, w, d, ':', '|' or space and
# truncates ordinary subjects (e.g. "Draft" -> "raft").
subj_pattern = re.compile(r'(?<=Subject: )(?:[ \t]*Fwd?:)*[ \t]*(.*)', re.IGNORECASE)
# Any non-word character or digit, together with the whitespace that follows it.
clean_content_pattern = re.compile(r'[\W\d]\s*')
# Captures one line at a time; note that '|' inside the class is a literal pipe.
content_pattern = re.compile(r"[\t|\n]*(.*)[\n]")  # only can use with clear patterns below

# patterns to clean up data
# clear_content_pattern = re.compile('(?:.*:)(.*)')
# Whole header-like lines (and URLs) that should be blanked out of the body.
clear_header_pattern = re.compile(r'(?:from:.*|mime-.*|sent.*|to:.*|'
                                  r'subject:.*|received:.*|date:.*|'
                                  r'folder:.*|filename:.*|cc:.*|Message-ID:.*|'
                                  r'X-.*|status:.*|content-.*|'
                                  r'boundary[-|=].*|http.*)', re.IGNORECASE)
# A single line sandwiched between two '*' characters.
clear_star_pattern = re.compile(r'(?<=\*)[\n].*[\n](?=\*)')
# Runs of two or more decoration characters (*, -, =, _ and the literal '|').
clear_symbol_pattern = re.compile(r'[\*|\-|\=|\_]{2,}')

In [4]:
# ============================
# own pre-processing functions
# ============================
def uniq_set(list_item):
    """Return the distinct elements of ``list_item`` as a new list.

    De-duplication goes through a ``set``, so the elements must be hashable
    and the order of the returned list is arbitrary.
    """
    distinct = set()
    distinct.update(list_item)
    return list(distinct)

def clean_names(list_item):
    """Strip surrounding whitespace from every name, then drop duplicates.

    The de-duplication is delegated to ``uniq_set``, so the returned list has
    no guaranteed order.
    """
    stripped_names = []
    for raw_name in list_item:
        stripped_names.append(raw_name.strip())
    return uniq_set(stripped_names)

def parse_data(content):
    """Parse one raw e-mail into header fields plus cleaned body text.

    Parameters
    ----------
    content : str
        Full text of a single message.

    Returns
    -------
    dict
        'isValid' : bool, False when no body text is left after cleaning.
        'date'    : distinct values found after 'Date:' and 'Sent: '.
        'msgId', 'from', 'to', 'subj' : lists of extracted strings.
        'file'    : empty string here; the caller fills in the path.
        'content' : one-element list holding the cleaned body text.
    """
    c_data = {}  # stores the content data

    # 'Sent:' lines are collected alongside 'Date:' lines as message dates
    extract_date = re.findall(date_pattern, content)
    extract_date.extend(re.findall(sent_pattern, content))

    # store some of the key information
    c_data['isValid'] = True  # set data to valid
    c_data['date'] = uniq_set(extract_date)
    c_data['msgId'] = re.findall(msgId_pattern, content)
    c_data['from'] = clean_names(re.findall(from_pattern, content))
    c_data['to'] = clean_names(re.findall(to_pattern, content))
    c_data['subj'] = uniq_set(re.findall(subj_pattern, content))
    c_data['file'] = ''

    # Keep only the text in front of the first separator line. str.partition
    # returns the whole string when the marker is absent; slicing with the -1
    # that str.find returns in that case would silently chop off the final
    # character (typically the newline content_pattern needs to capture the
    # last line).
    content = content.partition('\n_____')[0]
    content = content.partition('\n*******')[0]

    # blank out header-like lines and decoration runs, then collect the lines
    content = re.sub(clear_header_pattern, '', content)
    # content = re.sub(clear_star_pattern,'',content)
    content = re.sub(clear_symbol_pattern, '', content)
    body_lines = re.findall(content_pattern, content)
    c_data['content'] = [' '.join(body_lines)]

    if c_data['content'][0] == '':  # if there is no content
        c_data['isValid'] = False  # set the validity to false
    else:
        # final clean: replace non-word characters and digits with spaces
        c_data['content'][0] = re.sub(clean_content_pattern, ' ', c_data['content'][0])
    # return the data
    return c_data

In [5]:
# ======================
# Extract the Enron data
# ======================
# change this directory to point to where the Enron data is
root_dir = '/home/hweilun/data/corpora/Enron/extract'

# Cache of the extracted message bodies. If the file exists it is loaded,
# otherwise the corpus is parsed and the cache is written; delete the file to
# force a fresh extraction.
saved_file = "raw_doc.p"
if os.path.isfile(saved_file):
    # open the pickled file
    # NOTE: pickle.load can run arbitrary code - only load a cache you created.
    print('Loading saved file ...')
    with open(saved_file, "rb") as cache_in:
        raw_doc = pickle.load(cache_in)
else:
    # structure to store the original contents
    print('Reading and extracting ...')
    raw_doc = []
    start = time()
    for sub_dir, dirs, files in os.walk(root_dir):
        for file_name in files:
            # only want to open files with NO extension!
            if '.' in file_name:
                continue
            file_path = os.path.join(sub_dir, file_name)
            # the context manager closes each handle, so descriptors are not
            # left open while the directory tree is walked
            with open(file_path, 'r') as f:
                content = f.read()
            extract_data = parse_data(content)
            extract_data['file'] = file_path  # store the file path
            if extract_data['isValid']:
                raw_doc.append(extract_data['content'][0])
    end = time()
    print('Finish extraction, time taken =  %s' % (end - start))
    # save the file under the same name that the cache check above looks for
    with open(saved_file, "wb") as cache_out:
        pickle.dump(raw_doc, cache_out)
print('Data extracted and saved in raw_doc')

Loading saved file ...
Data extracted and saved in raw_doc


In [36]:
# stop words
# NLTK's English stop-word list plus a few hand-picked extras, merged into one
# set of words to exclude.
nltk_stopwords = stopwords.words('english')
custom_stopwords = ['is','a','on','at','the','and','to','i','be']
stoplist = set(nltk_stopwords).union(custom_stopwords)

In [38]:
# extract tokens from the documents
# Self-contained two-pass build, so the cell works on a fresh kernel and the
# dictionary and corpus always come from the same token lists.
start = time()

# Pass 1: split every document once and count document frequencies in order to
# find the tokens that occur in a single document only.
split_docs = [doc.lower().split() for doc in raw_doc]
full_dictionary = corpora.Dictionary(split_docs)
freq_ids = [tokenid for tokenid, docfreq in iteritems(full_dictionary.dfs) if docfreq == 1]
freq_stopwords = [full_dictionary[tokenid] for tokenid in freq_ids]

# Pass 2: filter against ONE set (standard + custom + single-document words).
# Testing membership against the freq_stopwords list itself would rescan that
# whole list for every token of every document.
token_stoplist = stoplist.union(freq_stopwords)
tokens = [[word for word in words if word not in token_stoplist and len(word) > 1]
          for words in split_docs]
del split_docs, full_dictionary  # release the unfiltered intermediates

dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]
end = time()
print('Tokens, dict and corpus loaded in  %s' % (end - start))

KeyboardInterrupt: 

In [35]:
# peek at the first 100 entries of freq_stopwords
print(freq_stopwords[:100])

[u'vladi_pimenov', u'lrc_bridgeline', u'stephanie_panus', u'ene_ect', u'elizabeth_sager', u'technote', u'eesiisonewyork', u'jane_tholt', u'eric_linder_', u'aapending', u'lynn_blair_', u'joseph_parks_nov', u'sandra_brawner', u'mbamarketing', u'john_griffith_jan', u'phillip_love_jan', u'chris_dorland_dec', u'houstonfundamentals', u'joseph_quenet_dec', u'richard_sanders_jan', u'steven_merriss', u'sally_beck_', u'barry_tycholiz_mar', u'bradley_mckay_jun', u'jonathan_mckay_', u'larry_campbell_jun', u'bill_williams_iii', u'daren_farmer', u'techmemos', u'john_arnold_', u'jason_wolfe_nov', u'holden_salisbury_', u'sparefinders', u'dana_davis_dec', u'robin_rodrigue', u'david_delainey_', u'vladi_pimenov_', u'james_schwieger_nov', u'stacey_white_mar', u'sally_beck_nov', u'phillip_platter', u'john_hodge_', u'joseph_quenet_jun', u'randall_gay_jan', u'directaccess', u'james_steffes_', u'john_zufferlie_jun', u'jeff_dasovich_', u'errol_mclaughlin_mar', u'sally_beck_dec', u'judy_townsend', u'mangmt', u'

In [42]:
# create simple lda model (takes around 3700s)
# NOTE(review): `corpus` and `dictionary` must come from the same run of the
# tokenisation cell; the IndexError recorded under this cell looks like the
# symptom of a corpus whose ids do not match the dictionary - confirm by
# re-running that cell first.
start = time()
lda = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    update_every=1,
    chunksize=10000,
    passes=1,
    random_state=42,  # fixed seed so the topics are reproducible across runs
)
end = time()
print('lda generated in  %s' % (end - start))

IndexError: index 835562 is out of bounds for axis 1 with size 835562

In [29]:
# Persist the trained model. The context manager closes the file handle as
# soon as the dump finishes instead of leaving that to garbage collection.
with open("lda_model.p", "wb") as model_out:
    pickle.dump(lda, model_out)
print('model_saved')

model_saved


In [30]:
# show the topics of the trained model as weighted word lists (20 requested)
lda.print_topics(20)

[(0,
  u'0.012*"know" + 0.011*"get" + 0.011*"message" + 0.009*"would" + 0.009*"original" + 0.009*"time" + 0.008*"like" + 0.008*"let" + 0.007*"thanks" + 0.006*"one"'),
 (1,
  u'0.033*"tls" + 0.017*"aw" + 0.015*"fx" + 0.009*"ig" + 0.009*"id" + 0.006*"gpia" + 0.005*"jb" + 0.005*"zw" + 0.003*"pz" + 0.003*"ly"'),
 (2,
  u'0.015*"aw" + 0.010*"lc" + 0.006*"yb" + 0.005*"gb" + 0.004*"gr" + 0.003*"ig" + 0.003*"zsb" + 0.002*"gd" + 0.002*"yw" + 0.002*"cm"'),
 (3,
  u'0.123*"enron" + 0.117*"ect" + 0.061*"hou" + 0.027*"pm" + 0.022*"corp" + 0.021*"forwarded" + 0.018*"ewc" + 0.010*"carol" + 0.009*"ees" + 0.008*"na"'),
 (4,
  u'0.035*"enron_development" + 0.019*"broker" + 0.017*"wtg" + 0.009*"deal" + 0.008*"ees" + 0.008*"swap" + 0.006*"raptor" + 0.006*"ena" + 0.005*"enron" + 0.005*"ss"'),
 (5,
  u'0.018*"enron" + 0.008*"new" + 0.006*"business" + 0.005*"availability" + 0.005*"site" + 0.004*"year" + 0.004*"one" + 0.004*"program" + 0.004*"time" + 0.003*"employees"'),
 (6,
  u'0.032*"ptu" + 0.028*"kicagica

In [60]:
# sanity check of the NLTK stop-word list; raises LookupError when the
# 'stopwords' corpus has not been downloaded
print(stopwords.words('english'))

LookupError: 
**********************************************************************
  Resource u'corpora/stopwords' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/home/hweilun/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************