# This notebook generates word clouds

The "Pre-Process Data" part aims to do all of the pre-processing.

In [None]:
import pandas as pd

In [None]:
# Load the four CSV tables of the email dataset, one DataFrame per file,
# in the same order as the names on the left-hand side.
aliases, email_receivers, emails, persons = map(
    pd.read_csv,
    (
        'hillary-clinton-emails/Aliases.csv',
        'hillary-clinton-emails/EmailReceivers.csv',
        'hillary-clinton-emails/Emails.csv',
        'hillary-clinton-emails/Persons.csv',
    ),
)

In [None]:
# Display the table loaded from Aliases.csv.
aliases

In [None]:
# Show the row(s) of the persons table whose Id equals 87.
persons.loc[persons['Id'] == 87]

In [None]:
# Show every email whose SenderPersonId equals 87.
emails.loc[emails['SenderPersonId'] == 87]

## Unique sender IDs

In [None]:
# Distinct values found in the SenderPersonId column.
emails['SenderPersonId'].unique()

In [None]:
# Keep only the extracted body text, dropping emails where it is missing.
emails_text = emails['ExtractedBodyText'].dropna()

In [None]:
# Write the email bodies out as plain text, one body after another.
# Series.to_csv would also put the row index, CSV quoting and (depending on the
# pandas version) a header line into the file, and all of that would then be
# part of the corpus read back by the word-cloud cells.
with open('emails-text.txt', 'w', encoding='utf-8') as corpus_file:
    corpus_file.write('\n'.join(emails_text.astype(str)))

In [None]:
# Display the email bodies held in emails_text.
emails_text

## Turning the Corpus into Text File

The email bodies are currently held in a pandas object; they need to be written out as one large .txt file.

## Basic word cloud

In [None]:
import os
from os import path

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# d = path.dirname(__file__) did not work here, so data files are resolved
# relative to the current working directory instead.
d = os.getcwd()

# Read the whole text; the context manager makes sure the file handle is closed.
with open(path.join(d, 'emails-text.txt')) as corpus_file:
    text = corpus_file.read()

# Build the word cloud, keeping at most 4000 words.
wordcloud = WordCloud(max_words=4000).generate(text)

# Show the generated image without axes.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

![title](email-raw.png)

This version still contains many uninformative, high-frequency words that should be treated as stop words, such as "will", "time", "pm", "new", "one", "year", "also", "call".

In [None]:
# this is R code
# NOTE(review): the other cells in this notebook are Python, so this cell
# presumably cannot run under the same kernel as written -- confirm.
# Builds a term-document matrix from `x_corpus` (not defined in the visible
# cells) with punctuation and numbers removed and terms lower-cased.
# The stop-word list combines the custom words below with the "SMART" and
# "english" lists; "sid" appears twice in the custom words.
tdm = TermDocumentMatrix(x_corpus,
                         control = list(removePunctuation = TRUE,
                                        stopwords = c("office","autoreply","sid","confirmed","update","followup","talk","draft",
                                                      "statement","speech","mini","fyi","notes","today","tomorrow",
                                                      "sid","memo","call","meeting","meetings",stopwords("SMART"),stopwords("english")),
                                        removeNumbers = TRUE,tolower = TRUE))

## Color with Flag

In [None]:
import os
from os import path
try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is no longer available in newer SciPy releases;
    # matplotlib's reader also returns the image as an array.
    from matplotlib.pyplot import imread
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

d = os.getcwd()

# Read the whole text; the context manager makes sure the file handle is closed.
with open(path.join(d, 'emails-text.txt')) as corpus_file:
    text = corpus_file.read()

# Read the mask / color image.
# NOTE(review): the attribution originally given here was
# http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010,
# which presumably belongs to a different image -- confirm the source of
# us-flag.jpeg.
coloring = imread(path.join(d, "us-flag.jpeg"))

# Extend a copy of the default stop words. set.add() returns None, so passing
# STOPWORDS.add("said") straight to WordCloud would hand it stopwords=None and
# modify the shared STOPWORDS set as a side effect.
flag_stopwords = set(STOPWORDS)
flag_stopwords.add("said")

wc = WordCloud(background_color="white", max_words=2000, mask=coloring,
               stopwords=flag_stopwords,
               max_font_size=40, random_state=42)
# Generate the word cloud.
wc.generate(text)

# Create the coloring function from the image.
image_colors = ImageColorGenerator(coloring)

# Figure 1: the word cloud with its default colors.
plt.imshow(wc)
plt.axis("off")
plt.figure()
# Figure 2: the same cloud recolored from the image.
# We could also give color_func=image_colors directly in the constructor.
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.figure()
# Figure 3: the mask / color image itself.
plt.imshow(coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()

![title](email-us.png)

# Processing Data

This section covers the text pre-processing pipeline (e.g., tokenization, stopword removal, stemming).

### Remove Stopwords

In [None]:
from nltk.corpus import stopwords
import nltk

In [None]:
# Fetch only the corpora used by the cells right below. A bare nltk.download()
# opens the interactive downloader, which stalls a non-interactive "Run All".
# Add 'book' to this tuple if any cell imports from nltk.book.
for resource in ('stopwords', 'reuters'):
    nltk.download(resource, quiet=True)

In [None]:
# Display the 'english' word list from the nltk.corpus stopwords reader.
stopwords.words('english')

In [None]:
# Display the words of NLTK's reuters corpus.
nltk.corpus.reuters.words()

In [None]:
import os
from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

corpus = os.getcwd()

# Read the whole text. The path is joined against `corpus`, the directory
# defined in this cell, rather than `d`, which only exists if an earlier cell
# has been run. The context manager makes sure the file handle is closed.
with open(path.join(corpus, 'emails-text.txt')) as corpus_file:
    text = corpus_file.read()

In [None]:
# Length of the text that was read in.
len(text)

### Loading Corpus

In [2]:
import os
from nltk.corpus import PlaintextCorpusReader
# Open 'emails.txt' in the working directory as an NLTK plaintext corpus and
# list the file ids the reader found.
# NOTE(review): this reads 'emails.txt', while the other cells write and read
# 'emails-text.txt' -- confirm where 'emails.txt' comes from.
corpus_root = os.getcwd()
wordlists = PlaintextCorpusReader(corpus_root, 'emails.txt')
wordlists.fileids()

['emails.txt']

In [3]:
# Words of 'emails.txt' as returned by the corpus reader.
# NOTE(review): this rebinds `emails_text`, which an earlier cell assigned from
# the ExtractedBodyText column of the emails table.
emails_text = wordlists.words('emails.txt')

In [7]:
# Total number of words in the corpus.
len(emails_text)

759839

In [6]:
# Number of distinct words in the corpus.
len(set(emails_text))

41353

In [None]:
# Same distinct-word count as the cell above, kept here with its recorded result.
len(set(emails_text)) # 41353 distinct words

### Frequency Distribution

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Import FreqDist directly: `from nltk.book import *` loads the NLTK book's
# sample texts and floods the namespace just to obtain this one class.
from nltk import FreqDist

# Count how often each word occurs in emails_text.
fdist = FreqDist(emails_text)

In [None]:
# The 50 most frequent entries of the frequency distribution, with their counts.
fdist.most_common(50)

In [None]:
# NOTE(review): a recorded figure kept as a bare string; presumably a length
# measured in an earlier run -- confirm what it counts.
"total length: 3649670"

In [None]:
# Cumulative frequency plot of the 50 most common entries.
fdist.plot(50, cumulative=True)

In [None]:
import os
from os import path
try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is no longer available in newer SciPy releases;
    # matplotlib's reader also returns the image as an array.
    from matplotlib.pyplot import imread
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

d = os.getcwd()

# Read the whole text; the context manager makes sure the file handle is closed.
with open(path.join(d, 'emails-text.txt')) as corpus_file:
    text = corpus_file.read()

# Read the mask / color image.
# NOTE(review): the attribution originally given here was
# http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010,
# which presumably belongs to a different image -- confirm the source of
# us-flag.jpeg.
coloring = imread(path.join(d, "us-flag.jpeg"))

# Preprocess the text a little bit.
text = text.replace("Secretary's", "Secretary")
# NOTE(review): presumably carried over from another example; it has no effect
# unless the text contains "LUKE'S".
text = text.replace("LUKE'S", "Luke")

# Custom stop words on top of the wordcloud defaults, built on a copy so the
# shared STOPWORDS set is left untouched.
# NOTE(review): "int" and "ext" are presumably leftovers from a movie-script
# example -- confirm they are wanted for this corpus.
# This name shadows any `stopwords` imported from nltk.corpus earlier.
stopwords = STOPWORDS.copy()
stopwords.add("int")
stopwords.add("ext")
stopwords.add("said")

# Pass the set itself. set.add() returns None, so stopwords=STOPWORDS.add("said")
# would hand WordCloud stopwords=None and silently ignore the custom words.
wc = WordCloud(background_color="white", max_words=2000, mask=coloring,
               stopwords=stopwords,
               max_font_size=40, random_state=42)
# Generate the word cloud.
wc.generate(text)

# Create the coloring function from the image.
image_colors = ImageColorGenerator(coloring)

# Figure 1: the word cloud with its default colors.
plt.imshow(wc)
plt.axis("off")
plt.figure()
# Figure 2: the same cloud recolored from the image.
# We could also give color_func=image_colors directly in the constructor.
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.figure()
# Figure 3: the mask / color image itself.
plt.imshow(coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()