In [None]:
import nltk

import matplotlib.pyplot as plt
%matplotlib inline

## you may need to download a few things first
#nltk.download()

from wordcloud import WordCloud
from nltk.corpus import PlaintextCorpusReader
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
#nltk.download("stopwords")

# see https://github.com/cjhutto/vaderSentiment
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')

In [None]:
# Directory that holds the plain-text message files making up the corpus.
corpus_path = "./datasets/processed/messages/"

# Two stemmers are instantiated side by side so their results can be compared.
pstemmer, sstemmer = PorterStemmer(), SnowballStemmer("english")

# VADER sentiment scorer (relies on the 'vader_lexicon' NLTK resource).
vader_analyzer = SentimentIntensityAnalyzer()

In [None]:
# Treat every *.txt file below corpus_path as one document of the corpus.
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped dot; in a normal string literal "\." is an invalid escape sequence
# and triggers a warning on recent Python versions.
enron_corpus = PlaintextCorpusReader(corpus_path, r'.*\.txt')

In [None]:
# Preview the first n file ids of the corpus.
# Slicing stops after n entries instead of looping over the whole corpus
# and skipping everything past the n-th file.
n = 10
preview_fileids = enron_corpus.fileids()[:n]
counter = len(preview_fileids)  # number of file ids actually printed
for filename in preview_fileids:
    print(filename)

In [None]:
# Load the complete, untokenized text of a single message for a closer look
# (presumably the first document of the corpus - verify against fileids()).
first_fileid = "<1000115.1075852075775.JavaMail.evans@thyme>.txt"
raw_text = enron_corpus.raw(first_fileid)
raw_text

In [None]:
# Word-level tokenization of the message with NLTK's English tokenizer.
wt = word_tokenize(text=raw_text, language="english")
wt

In [None]:
# Sentence-level tokenization of the same message with NLTK's English tokenizer.
st = sent_tokenize(text=raw_text, language="english")
st

In [None]:
# List of typical English stopwords shipped with NLTK
# (requires the "stopwords" resource, see the download hint in the imports cell).
english_stopwords = stopwords.words("english")
english_stopwords

In [None]:
# Unique tokens of the message with English stopwords removed.
# - NLTK's stopword list is all lowercase, so each token is lowercased for the
#   comparison; otherwise capitalized stopwords ("The", "I", ...) slip through.
# - The stopwords are put into a set so each membership test is O(1).
# - sorted() gives a stable order; the iteration order of a bare set of
#   strings changes from one kernel session to the next.
stopword_set = set(english_stopwords)
wt_after_stopwords_removal = sorted(
    word for word in set(wt) if word.lower() not in stopword_set
)
wt_after_stopwords_removal

In [None]:
# Compare the Porter and Snowball stemmers on every remaining token.
# Both stemmers must receive the whole token: stemming only word[0] would
# just echo the token's first character instead of its stem.
for word in wt_after_stopwords_removal:
    print("stem of '%s' is '%s' (porter stemmer) and '%s' (snowball stemmer)"
          % (word, pstemmer.stem(word), sstemmer.stem(word)))

In [None]:
# Combine the text of the first 10 documents into one string. Each document
# is joined in (rather than assigned), so the clouds reflect all 10 documents
# and not only the last one; the newline keeps words at document boundaries
# from running together.
raw_texts = "\n".join(enron_corpus.raw(docfile)
                      for docfile in enron_corpus.fileids()[0:10])

# Generate a word cloud image
# (stopwords are passed as a set; random_state fixes the layout between runs)
wordcloud = WordCloud(max_words=10, stopwords=set(english_stopwords),
                      margin=10, random_state=1, max_font_size=100,
                      background_color="white").generate(raw_texts)

# Create the figure once, already sized; an additional bare plt.figure()
# call would leave an empty figure behind.
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# lower max_font_size a little
# (seeded as well so that re-running the cell reproduces the same picture)
wordcloud = WordCloud(max_font_size=40, random_state=1).generate(raw_texts)

plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [16]:
# Score every sentence of one message with VADER and print sentence + scores.
message_fileid = "<1000115.1075852075775.JavaMail.evans@thyme>.txt"
for sentence in enron_corpus.sents(message_fileid):
    # each sentence arrives as a list of tokens; glue them back with spaces
    whole_sentence = " ".join(sentence)
    print("SENTENCE: \n%s \n" % whole_sentence)
    polarity = vader_analyzer.polarity_scores(whole_sentence)
    # wrap in a 1-tuple so the scores are formatted as a single %s value
    print("POLARITY SCORE: \n%s \n" % (polarity,))
    print("------------------------------------------------------------------------")

SENTENCE: 
That was pretty funny .... he is doing research on NW utilities , however that is not their mandate to market to them ... so I say , then why are you doing research ... he say ' s ... " I ' m not ... who said I was "... it was about that time I started to laugh ... oh well . 

POLARITY SCORE: 
{'neg': 0.0, 'neu': 0.803, 'pos': 0.197, 'compound': 0.8957} 

------------------------------------------------------------------------
SENTENCE: 
----- Original Message ----- From : Calger , Christopher F . Sent : Wednesday , October 24 , 2001 10 : 32 PM To : Tycholiz , Barry Subject : FW : Natural Gas Origination 

POLARITY SCORE: 
{'neg': 0.0, 'neu': 0.814, 'pos': 0.186, 'compound': 0.5859} 

------------------------------------------------------------------------
SENTENCE: 
I ' m glad I ' m in the west - I know who to call about gas deals ? you and ader ! 

POLARITY SCORE: 
{'neg': 0.0, 'neu': 0.798, 'pos': 0.202, 'compound': 0.5093} 

----------------------------------------------

# About the Scoring


* The ``compound`` score is computed by summing the valence scores of each word of the text that is found in the lexicon, adjusting the sum according to the rules, and then normalizing it to be between -1 (most extreme negative) and +1 (most extreme positive). This is the most useful metric if you want a single unidimensional measure of sentiment for a given sentence. Calling it a 'normalized, weighted composite score' is accurate.
 
  It is also useful for researchers who would like to set standardized thresholds for classifying sentences as either positive, neutral, or negative.  
  Typical threshold values (used in the literature cited in the VADER README) are:

  1. **positive sentiment**: ``compound`` score >= 0.05
  2. **neutral sentiment**: (``compound`` score > -0.05) and (``compound`` score < 0.05)
  3. **negative sentiment**: ``compound`` score <= -0.05

* The ``pos``, ``neu``, and ``neg`` scores are the proportions of the text that fall into each category, so they should add up to 1 (or very close to it, given floating-point rounding). These are the most useful metrics if you want multidimensional measures of sentiment for a given sentence.
