In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
import re
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
import gensim
from gensim import corpora
import pyLDAvis.gensim
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JZ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JZ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JZ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the Enron e-mail extract; read_csv's default delimiter is already a comma
file_path = 'C:/temp/dataset/data/enron_emails.csv'
df = pd.read_csv(file_path)

In [3]:
# Preview the first two rows to see which columns were loaded
df.head(2)

Unnamed: 0,Message-ID,From,To,Date,content,detail_content,Unnamed: 6
0,<8345058.1075840404046.JavaMail.evans@thyme>,('advdfeedback@investools.com'),('advdfeedback@investools.com'),1/29/2002 23:20,INVESTools Advisory\nA Free Digest of Trusted ...,investools advisory free digest trusted invest...,
1,<1512159.1075863666797.JavaMail.evans@thyme>,('richard.sanders@enron.com'),('richard.sanders@enron.com'),9/20/2000 19:07,----- Forwarded by Richard B Sanders/HOU/ECT o...,forwarded richard b sanders hou ect pm justin ...,


In [4]:
# (rows, columns) of the raw frame
df.shape

(14156, 7)

In [5]:
# Keep only the identifying headers and the pre-processed body text
df = df[['Message-ID', 'From', 'To', 'Date', 'detail_content']]

In [None]:
# Confirm that only the five selected columns remain
df.head(2)

In [None]:
# Count missing values per column before deciding how to handle them
df.isnull().sum()

In [6]:
# Drop every row that has a missing value in any of the kept columns
df = df.dropna()

In [None]:
# Check how many rows are left after dropping rows with missing values
df.shape

In [None]:
# Preview the frame after dropna; the index has not been reset yet at this point
df.head()

In [7]:
# Renumber the index from 0 and discard the old index labels (drop=True)
df = df.reset_index(drop=True)


In [None]:
# Preview the frame again after the index reset
df.head()

In [None]:
# Tokenization
#text =df.apply(lambda row: word_tokenize(row['detail_content']), axis=1)
#text = text.rstrip()
#text = re.sub(r'[^a-zA-Z]', ' ', text)

In [8]:
# Stop words to exclude: NLTK's English list plus extra tokens added for this corpus
stop = set(stopwords.words('english')) | {
    "to", "cc", "subject", "http", "from", "sent", "etc", "u", "www", "com",
}

In [9]:
# Define the punctuation characters to exclude.
# Stored as a set of single characters; clean() tests each character against it.
exclude = set(string.punctuation)

In [10]:
# Define lemmatizer from nltk; clean() calls lemma.lemmatize() on each token
lemma = WordNetLemmatizer()

In [23]:

# Define Porter stemmer from nltk; clean() applies it after lemmatization
porter = PorterStemmer()

In [24]:
# Define word cleaning function
def clean(text, stop):
    """Normalize one e-mail body into a space-separated string of stemmed tokens.

    The text is lowercased and split on whitespace. Each token then has the
    characters in the module-level ``exclude`` set removed, is lemmatized with
    ``lemma`` and finally stemmed with ``porter``.

    Stop words and all-digit tokens are filtered both before and after the
    punctuation is stripped. Checking only the raw token lets values such as
    "subject:" or "2002." through, because they only match the stop list or
    ``isdigit()`` once the trailing punctuation is gone. Checking the raw token
    as well keeps stop words that contain an apostrophe working.

    Args:
        text: The string to clean.
        stop: Collection of lowercase stop words to drop.

    Returns:
        The cleaned tokens joined by single spaces (empty string if none remain).
    """
    tokens = []
    for raw_token in text.lower().split():
        if raw_token in stop:
            continue
        word = ''.join(ch for ch in raw_token if ch not in exclude)
        # Re-check after stripping: the token may have become empty, a stop
        # word, or a pure number once its punctuation was removed.
        if not word or word in stop or word.isdigit():
            continue
        tokens.append(porter.stem(lemma.lemmatize(word)))
    return " ".join(tokens)

In [None]:
# Preview the cleaning pipeline (stop-word removal, lemmatization, stemming)
# on the first e-mail. `normalized` only exists as a local inside clean(), so
# referencing it here would raise NameError; call clean() instead.
cleaned_text = clean(df['detail_content'].iloc[0], stop)
print(cleaned_text)

In [25]:
# Run every e-mail body through clean() and keep each result as a list of tokens
text_clean = [clean(body, stop).split() for body in df['detail_content']]

In [26]:
# Define the dictionary: built from the token lists in text_clean and later
# used both for doc2bow() and as the id2word mapping of the LDA model
dictionary = corpora.Dictionary(text_clean)

In [27]:
#print(dictionary)

Dictionary(20388 unique tokens: ['account', 'accur', 'acquir', 'acr', 'address']...)


In [28]:
# Define corpus: convert each cleaned token list with the dictionary's doc2bow()
corpus = list(map(dictionary.doc2bow, text_clean))

In [None]:
#print(corpus)

In [37]:
# Define the LDA model: 5 topics, 5 passes over the corpus, seeded with random_state=1
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=5,
    random_state=1,
)

In [38]:
# Save the topics with their top 5 words each; the next cell prints them one per line
topics = ldamodel.print_topics(num_words=5)

In [17]:
# Print the result, one topic per line
# NOTE(review): this cell's execution count (17) is lower than the model cells
# (37/38) and its saved output shows unstemmed words, so the output appears to
# predate the stemming step in clean() - re-run to refresh it.
for topic in topics:
    print(topic)

(0, '0.017*"enron" + 0.011*"please" + 0.010*"thanks" + 0.009*"e" + 0.008*"know"')
(1, '0.036*"enron" + 0.018*"ect" + 0.008*"hou" + 0.007*"company" + 0.005*"energy"')
(2, '0.047*"td" + 0.037*"net" + 0.035*"money" + 0.032*"tr" + 0.029*"width"')
(3, '0.026*"enron" + 0.013*"message" + 0.010*"original" + 0.009*"development" + 0.008*"wj"')
(4, '0.047*"image" + 0.012*"click" + 0.012*"se" + 0.011*"ne" + 0.011*"sp"')


In [39]:
# Prepare the model for visualization.
# sort_topics=False is passed explicitly - presumably to keep pyLDAvis topic
# numbering aligned with the model's own topic order; confirm against pyLDAvis docs.
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)


In [40]:
# Display the prepared topic visualization in the notebook
pyLDAvis.display(lda_display)


In [41]:
# Define function that retrieves topic details per row
def get_topic_details(ldamodel, corpus):
    """Return the dominant topic of every document in ``corpus``.

    Args:
        ldamodel: Topic model supporting ``ldamodel[corpus]`` (yielding, per
            document, a sequence of ``(topic_id, probability)`` pairs) and
            ``ldamodel.show_topic(topic_id)`` (yielding ``(word, weight)``
            pairs).
        corpus: Documents in whatever form ``ldamodel[...]`` accepts.

    Returns:
        pandas.DataFrame with one row per document and the columns
        ``Dominant_Topic`` (int topic id), ``% Score`` (probability of that
        topic, rounded to 4 decimals) and ``Topic_Keywords`` (comma-separated
        words from ``show_topic``). A document for which the model yields no
        topics contributes no row. An empty corpus gives an empty frame that
        still has the three columns.
    """
    columns = ['Dominant_Topic', '% Score', 'Topic_Keywords']
    keywords_by_topic = {}  # show_topic() result is the same for every document
    records = []
    for doc_topics in ldamodel[corpus]:
        doc_topics = list(doc_topics)
        if not doc_topics:
            continue
        # max() returns the first highest-probability pair, matching a stable
        # descending sort followed by taking element 0.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append(
            (topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every iteration.
    return pd.DataFrame(records, columns=columns)

In [42]:
# Concatenate the model's per-document topic details with the text they were
# computed from. Note that the 'Original Text' column holds the cleaned token
# lists from text_clean, not the raw e-mail bodies.
contents = pd.DataFrame(data={'Original Text': text_clean})
topic_details = pd.concat(
    objs=[get_topic_details(ldamodel, corpus), contents],
    axis='columns',
)

In [50]:
# Preview the dominant topic, its score and keywords next to each document's tokens
topic_details.head()

Unnamed: 0,Dominant_Topic,% Score,Topic_Keywords,Original Text
0,0.0,0.9153,"enron, compani, energi, market, employe, time,...","[investool, advisori, free, digest, trust, inv..."
1,1.0,0.9622,"enron, ect, messag, hou, thank, pleas, pm, ori...","[forward, richard, b, sander, hou, ect, pm, ju..."
2,1.0,0.9784,"enron, ect, messag, hou, thank, pleas, pm, ori...","[hey, wear, target, purpl, shirt, today, mine,..."
3,0.0,0.9936,"enron, compani, energi, market, employe, time,...","[lesli, milosevich, santa, clara, avenu, alame..."
4,0.0,0.9942,"enron, compani, energi, market, employe, time,...","[rini, twait, e, th, ave, longmont, co, rtwait..."
