In [1]:
import pandas as pd
import re

In [2]:
# Load the raw e-mail table and discard its pre-existing 'clean_content'
# column; this notebook computes its own 'cleaned_content' column further down.
# The drop is chained instead of using inplace=True, and the former bare
# `df.head()` in the middle of the cell is omitted because only the last
# expression of a cell is displayed.
df = pd.read_csv('enron_emails.csv').drop(columns='clean_content')

In [3]:
# Take an independent NumPy copy of the message bodies so the cleaning
# steps below do not write into df['content'].
emails = df['content'].to_numpy(copy=True)

In [4]:
# Show the first raw message body before any cleaning is applied.
print(emails[0])

INVESTools Advisory
A Free Digest of Trusted Investment Advice

To unsubscribe from this free newsletter, please see below.

In This Issue:

1. Fried Sells 4 Stocks, Gains +46.8% in 3 Months (KM)
2. Rowe: January Index Confirms Bull Market for 2002 (ALOY)
3. Small-Cap Advisor Earns +31.6% in 2001 (LBIX)
4. Compounding Returns with Pine Trees (PCL)
5. Undervalued, High-yield Bank Puts Customers First (ASO)


*************** A Word from our Sponsor *******************
Top Wall Street Watcher Ben Zacks: +51.7%/year 5-Year Gain!

Moving with the best and brightest of Wall Street's big-
money machines earned Ben Zacks a +51.7% five-year average
annual gain. Start outperforming long-term. Get Zacks'
latest 13-stock buylist with your FREE 30-day trial:
http://www.investools.com/c/go/ZAKS/MTXTU-zaksTB1?s=S600
***********************************************************



INVESTools Advisory
By John Brobst, INVESTools.com


1. Fried Sells 4 Stocks, Locks in +46.8% 

## Remove escape characters
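Backspace (`\b`), newline (`\n`), carriage return (`\r`), tab (`\t`) and quote characters (`'`, `"`) are each replaced with a space.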

In [5]:
# Replace control characters (backspace, newline, carriage return, tab),
# backslashes and quote characters with a single space.
# The pattern is written as a raw string so that `\\` reaches the regex engine
# as an escaped literal backslash; in a non-raw literal the backslash pairs
# with the following quote instead and is never matched on its own.
escape_chars = re.compile(r'''[\b\n\r\t\\'"]''')
for i, text in enumerate(emails):
    emails[i] = escape_chars.sub(' ', text)

## Remove digits and punctuation

In [6]:
import string
# Gather every ASCII punctuation character into a set (fast membership tests)
# and print it for reference.
punctuation = {*string.punctuation}
print(punctuation)

{'-', '[', '|', '@', ':', '+', '_', '^', ')', ',', '<', '~', '*', '=', '{', '>', '?', '"', '%', '\\', "'", '/', '}', '.', ']', '&', '#', '!', '`', '$', ';', '('}


In [7]:
# Blank out every ASCII digit, one message at a time.
for idx, text in enumerate(emails):
    emails[idx] = re.sub('[0-9]', ' ', text)

In [8]:
def replace_single_char(string, remove, replace):
    """Return a copy of ``string`` with selected characters substituted.

    Args:
        string: Text to process. The parameter name hides the ``string``
            module inside this function only; it is kept so existing
            keyword callers continue to work.
        remove: Container of single characters to substitute. Membership is
            tested with ``in``, so a set gives the fastest lookups.
        replace: Text emitted in place of each matching character.

    Returns:
        A new string in which every character found in ``remove`` has been
        replaced by ``replace``; all other characters are kept as they are.
    """
    # Build the result with one join instead of repeated ``+=``, which can
    # degrade to quadratic time on long message bodies.
    return "".join(replace if ch in remove else ch for ch in string)

# Substitute a space for every punctuation character in each message.
for idx, text in enumerate(emails):
    emails[idx] = replace_single_char(text, punctuation, ' ')

## Switch to lowercase
## Remove leading and trailing whitespaces

In [9]:
# Lower-case each message and trim whitespace at both ends; from here on
# `emails` is a plain Python list.
emails = [text.lower().strip() for text in emails]

## Tokenize words

In [10]:
from nltk import word_tokenize

In [11]:
# Split every cleaned message into a list of word tokens; `emails` itself
# is left unchanged.
emails_tkd = [word_tokenize(text) for text in emails]

## Remove stopwords and special words

Special words: `http`, `https`, `image`. Tokens of three characters or fewer are dropped as well.

In [12]:
from nltk.corpus import stopwords

# English stop words from NLTK, kept in a set for fast membership checks.
stopw = set()
stopw.update(stopwords.words('english'))

# Additional tokens that are removed from every message.
specialw = (
    'http',
    'https',
    'image',
)

In [13]:
# Keep a token only when it is not a stop word, is longer than three
# characters, and is not one of the special words.
for idx, tokens in enumerate(emails_tkd):
    emails_tkd[idx] = [
        tok
        for tok in tokens
        if tok not in stopw and len(tok) > 3 and tok not in specialw
    ]

In [14]:
# Spot check: the special word 'http' should no longer be among the tokens
# of the first message.
print('http' in emails_tkd[0])

False


## Lemmatize all words
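Each token is replaced by its WordNet lemma (dictionary form). The lemmatizer is applied with its default part-of-speech tag (`'n'`, noun), so plural nouns are reduced to the singular (e.g. *boxes* → *box*), while inflected verbs such as *went* are left unchanged; normalising verb forms would require passing `pos='v'`.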

In [15]:
from nltk.stem import WordNetLemmatizer

In [16]:
# One lemmatizer instance, reused by the demo and the lemmatization loop below.
lemma = WordNetLemmatizer()

In [17]:
# Print the documentation of `lemmatize`; note that `pos` defaults to 'n' (noun).
help(lemma.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word: str, pos: str = 'n') -> str method of nltk.stem.wordnet.WordNetLemmatizer instance
    Lemmatize `word` using WordNet's built-in morphy function.
    Returns the input word unchanged if it cannot be found in WordNet.
    
    :param word: The input word to lemmatize.
    :type word: str
    :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
        `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
        for satellite adjectives.
    :param pos: str
    :return: The lemma of `word`, for the given `pos`.



In [18]:
# Compare the lemmatizer's output with an explicit verb tag, an explicit
# noun tag, and the default tag.
demo_calls = (
    ('boxes', {'pos': 'v'}),
    ('boxes', {'pos': 'n'}),
    ('went', {}),
    ('went', {'pos': 'v'}),
    ('went', {'pos': 'n'}),
)
for word, options in demo_calls:
    print(lemma.lemmatize(word, **options))

box
box
went
go
went


In [19]:
# Lemmatize every token with the lemmatizer's default part-of-speech tag.
for idx, tokens in enumerate(emails_tkd):
    emails_tkd[idx] = list(map(lemma.lemmatize, tokens))

## Stemming all words
Words are reduced to their root form

In [20]:
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer, reused for every token in the stemming step.
snow = SnowballStemmer('english')

In [21]:
# Reduce each lemmatized token to its Snowball stem.
for idx, tokens in enumerate(emails_tkd):
    emails_tkd[idx] = [snow.stem(tok) for tok in tokens]

## Convert back into strings of text

In [22]:
# Glue each token list back into one space-separated string, updating the
# list in place.
emails_tkd[:] = [' '.join(tokens) for tokens in emails_tkd]

In [23]:
# Store the cleaned text next to the original 'content' column.
df['cleaned_content'] = emails_tkd

In [24]:
# Display the frame to compare 'content' with the new 'cleaned_content' column.
df

Unnamed: 0,Message-ID,From,To,Date,content,cleaned_content
0,<8345058.1075840404046.JavaMail.evans@thyme>,('advdfeedback@investools.com'),('advdfeedback@investools.com'),2002-01-29 23:20:55,INVESTools Advisory\r\nA Free Digest of Truste...,investool advisori free digest trust invest ad...
1,<1512159.1075863666797.JavaMail.evans@thyme>,('richard.sanders@enron.com'),('richard.sanders@enron.com'),2000-09-20 19:07:00,----- Forwarded by Richard B Sanders/HOU/ECT o...,forward richard sander justin boyd richard san...
2,<26118676.1075862176383.JavaMail.evans@thyme>,('m..love@enron.com'),('m..love@enron.com'),2001-10-30 16:15:17,hey you are not wearing your target purple shi...,wear target purpl shirt today mine want look s...
3,<10369289.1075860831062.JavaMail.evans@thyme>,('leslie.milosevich@kp.org'),('leslie.milosevich@kp.org'),2002-01-30 17:54:18,Leslie Milosevich\r\n1042 Santa Clara Avenue\r...,lesli milosevich santa clara avenu alameda les...
4,<26728895.1075860815046.JavaMail.evans@thyme>,('rtwait@graphicaljazz.com'),('rtwait@graphicaljazz.com'),2002-01-30 19:36:01,"Rini Twait\r\n1010 E 5th Ave\r\nLongmont, CO 8...",rini twait longmont rtwait graphicaljazz write...
...,...,...,...,...,...,...
2085,<19039088.1075851547721.JavaMail.evans@thyme>,('andy.zipper@enron.com'),('andy.zipper@enron.com'),2001-10-22 14:00:17,"i bot 1,000/d at 3.175 apr/oct02. put it again...",digit thank
2086,<6813352.1075842016977.JavaMail.evans@thyme>,('andy.zipper@enron.com'),('andy.zipper@enron.com'),2002-01-25 17:39:38,I'm okay. How are you ?,okay
2087,<4833106.1075842022184.JavaMail.evans@thyme>,('tradersummary@syncrasy.com'),('tradersummary@syncrasy.com'),2002-02-01 16:15:17,\r\n[IMAGE]=09\r\n\r\n\r\n[IMAGE] [IMAGE][IMAG...,syncrasi texa avenu suit ouston syncrasi sale ...
2088,<3550151.1075842023814.JavaMail.evans@thyme>,('lmrig@qwest.net'),('lmrig@qwest.net'),2002-01-29 02:01:00,\r\n\r\nTransmission Expansion and Systems in ...,transmiss expans system transit confer miami f...


In [26]:
# Write the frame, including 'cleaned_content', to a new CSV file;
# index=False keeps the DataFrame index out of the file.
df.to_csv('enron_emails_clean.csv', index=False)