# This notebook generates word clouds

The "Processing Data" section further down does all of the text pre-processing.

In [1]:
import pandas as pd
import os
from os import path
import nltk
from nltk.book import FreqDist

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
# Load the four CSV tables from the hillary-clinton-emails directory.
aliases = pd.read_csv('hillary-clinton-emails/Aliases.csv')
email_receivers = pd.read_csv('hillary-clinton-emails/EmailReceivers.csv')
# Fixed: the directory name previously contained a stray backtick
# ('hillary-cl`inton-emails'), which does not match the other three paths.
emails = pd.read_csv('hillary-clinton-emails/Emails.csv')
persons = pd.read_csv('hillary-clinton-emails/Persons.csv')

### Creating the text

In [3]:
# Keep only the emails that actually have an extracted body.
emails_text = emails['ExtractedBodyText'].dropna()
# Dump the bodies to disk so later cells can read them back as plain text.
# NOTE(review): to_csv writes the Series index next to each body by default,
# so row numbers presumably end up in emails.txt as well - confirm this is wanted.
emails_text.to_csv('emails.txt')

### Finding Frequencies

In [4]:
# emails.txt was written to the current working directory by the cell above.
d = os.getcwd()
# Read the whole export into one string; the context manager closes the
# file handle, which the bare open(...).read() never did.
with open(path.join(d, 'emails.txt')) as emails_file:
    email = emails_file.read()

In [5]:
# Split the raw text into tokens (this rebinds `email` from a string to a
# token list, so the cell is meant to run once per kernel session).
email = nltk.word_tokenize(email)
# Count how often each token occurs.
fdist = FreqDist(email)

In [7]:
# Show the 200 most frequent tokens together with their counts.
fdist.most_common(200)

[(',', 36559),
 ('.', 28273),
 ('the', 28261),
 ("''", 18540),
 ('to', 16899),
 ('and', 13901),
 ('of', 13800),
 ('a', 10679),
 ('in', 9711),
 ('that', 6373),
 ('is', 5887),
 ("'s", 5733),
 ('for', 5727),
 ('on', 5049),
 ('I', 5028),
 ('``', 4902),
 (':', 4161),
 ('with', 3956),
 ('you', 3735),
 ('it', 3232),
 ('as', 3086),
 ('The', 3004),
 ('be', 2961),
 ('have', 2956),
 ('was', 2821),
 ('are', 2560),
 (')', 2474),
 ('will', 2424),
 ('this', 2419),
 ('he', 2405),
 ('(', 2397),
 ('at', 2378),
 ('not', 2327),
 ('from', 2322),
 ('we', 2265),
 ('by', 2198),
 ('has', 2175),
 ('his', 2123),
 ('?', 2060),
 ('an', 1783),
 (';', 1628),
 ('but', 1579),
 ('they', 1554),
 ('would', 1535),
 ('who', 1514),
 ('about', 1513),
 ('do', 1505),
 ('am', 1495),
 ('pm', 1467),
 ('or', 1465),
 ('-', 1403),
 ("n't", 1360),
 ('their', 1348),
 ('can', 1287),
 ('said', 1230),
 ('had', 1217),
 ('Obama', 1209),
 ('Secretary', 1172),
 ('all', 1161),
 ('more', 1156),
 ('our', 1150),
 ('--', 1142),
 ('been', 1102),
 

In [14]:
# Inspect the class of the frequency-distribution object.
type(fdist)

nltk.probability.FreqDist

In [17]:
# Inspect what .items() returns (relevant for what gets passed to WordCloud.fit_words later).
type(fdist.items())

dict_items

## Turning the Corpus into a Text File

The corpus currently lives in a pandas DataFrame; it needs to be written out as one large .txt file.

## Basic

In [None]:
import os
from os import path

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# path.dirname(__file__) did not work here (the original author's finding),
# so the current working directory is used instead.
d = os.getcwd()

# Read the whole text; the context manager closes the file handle afterwards.
with open(path.join(d, 'emails.txt')) as text_file:
    text = text_file.read()

# Build a cloud of up to 4000 words directly from the raw text.
wordcloud = WordCloud(max_words=4000).generate(text)

# Display the generated image without axes.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## Not Processed

In [18]:
import os
from os import path

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# path.dirname(__file__) did not work here (the original author's finding),
# so the current working directory is used instead.
d = os.getcwd()

# Read the whole text; the context manager closes the file handle afterwards.
# `text` is not used by this cell's cloud, but it stays bound as before.
with open(path.join(d, 'emails.txt')) as text_file:
    text = text_file.read()

# Build the cloud from the raw token counts, so punctuation and function words
# are all included. The stopwords argument only affects generate()/process_text,
# not fit_words, so stopwords=None has no effect here.
# fit_words expects a mapping of word -> frequency in current wordcloud
# releases; passing fdist.items() (a dict_items view) fails there.
# NOTE(review): wordcloud < 1.3 expected a list of (word, count) tuples instead -
# confirm the installed version.
wordcloud = WordCloud(max_words=200, stopwords=None).fit_words(dict(fdist))

# Display the generated image without axes.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## Stopwords Removed, No Stemming

In [19]:
import os
from os import path

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# path.dirname(__file__) did not work here (the original author's finding),
# so the current working directory is used instead.
d = os.getcwd()

# Read the whole text; the context manager closes the file handle afterwards.
with open(path.join(d, 'emails.txt')) as text_file:
    text = text_file.read()

# generate() tokenises the raw text itself; with no stopwords argument the
# library's built-in STOPWORDS list is applied. Only the 200 most frequent
# remaining words are drawn.
wordcloud = WordCloud(max_words=200).generate(text)

# Display the generated image without axes.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## Stopwords Removed, Stemming

In this version there are still lots of uninformative words left that should be treated as stop words, such as "will", "time", "pm", "new", "one", "year", "also" and "call".

In [None]:
# this is R code
# NOTE(review): this is R, not Python, so it presumably cannot run in this
# notebook's kernel; x_corpus is not defined anywhere in the visible notebook.
# Builds a term-document matrix from x_corpus with punctuation and numbers
# removed, text lower-cased, and a custom stop-word list combined with the
# "SMART" and "english" stop-word lists.
# Note: "sid" appears twice in the custom stop-word vector.
tdm = TermDocumentMatrix(x_corpus,
                         control = list(removePunctuation = TRUE,
                                        stopwords = c("office","autoreply","sid","confirmed","update","followup","talk","draft",
                                                      "statement","speech","mini","fyi","notes","today","tomorrow",
                                                      "sid","memo","call","meeting","meetings",stopwords("SMART"),stopwords("english")),
                                        removeNumbers = TRUE,tolower = TRUE))

## Color with Flag

In [None]:
import os
from os import path
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

d = os.getcwd()

# Read the whole text; the context manager closes the file handle afterwards.
# NOTE(review): earlier cells write and read 'emails.txt'; confirm that
# 'emails-text.txt' really exists in the working directory.
with open(path.join(d, 'emails-text.txt')) as text_file:
    text = text_file.read()

# Read the mask / colour image as an array.
# scipy.misc.imread was removed from SciPy, so matplotlib's reader is used.
# NOTE(review): the attribution below looks carried over from a wordcloud
# example and may not describe us-flag.jpeg - confirm the image source.
# taken from http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010
coloring = plt.imread(path.join(d, "us-flag.jpeg"))

# set.add() returns None and mutates the library-wide STOPWORDS set, so build
# a separate set that includes "said" and pass that instead.
cloud_stopwords = set(STOPWORDS) | {"said"}

wc = WordCloud(background_color="white", max_words=2000, mask=coloring,
               stopwords=cloud_stopwords,
               max_font_size=40, random_state=42)
# generate word cloud
wc.generate(text)

# create coloring from image
image_colors = ImageColorGenerator(coloring)

# Figure 1: the cloud with its default colours.
plt.imshow(wc)
plt.axis("off")
plt.figure()
# Figure 2: the same cloud recoloured from the image.
# (color_func=image_colors could also be passed directly to the constructor.)
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.figure()
# Figure 3: the mask image itself, for reference.
plt.imshow(coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()

![title](email-us.png)

# Processing Data

Text pre-processing pipeline: tokenization, stopword removal, stemming, etc.

### Remove Stopwords

In [None]:
from nltk.corpus import stopwords
import nltk

In [None]:
# Fetch NLTK data. NOTE(review): called with no arguments this presumably opens
# NLTK's interactive downloader, which blocks a Run-All; consider naming the
# needed resources explicitly - confirm which ones the notebook requires.
nltk.download()

In [None]:
# Display NLTK's English stop-word list.
stopwords.words('english')

In [None]:
# Peek at the words of NLTK's Reuters corpus (not used by any other visible cell).
nltk.corpus.reuters.words()

In [None]:
import os
from os import path
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# The unused `from scipy.misc import imread` was dropped from this cell:
# that function was removed from SciPy, so the import made the cell fail.

corpus = os.getcwd()

# Read the whole text from the directory assigned just above. The original
# joined against `d`, which is only defined if a different cell ran first.
# NOTE(review): earlier cells write and read 'emails.txt'; confirm that
# 'emails-text.txt' really exists in the working directory.
with open(path.join(corpus, 'emails-text.txt')) as text_file:
    text = text_file.read()

In [None]:
# Number of characters in the loaded text.
len(text)

### Loading Corpus

In [None]:
import os
from nltk.corpus import PlaintextCorpusReader
# Treat the working directory as a corpus root and expose emails.txt through
# NLTK's plain-text corpus reader.
corpus_root = os.getcwd()
wordlists = PlaintextCorpusReader(corpus_root, 'emails.txt')
# List the file ids the reader picked up.
wordlists.fileids()

In [None]:
# Word tokens of emails.txt as produced by the corpus reader.
# NOTE: this rebinds emails_text, which an earlier cell bound to the pandas
# Series of email bodies.
emails_text = wordlists.words('emails.txt')

In [None]:
# Total number of tokens in the corpus.
len(emails_text)

In [None]:
# Number of distinct tokens in the corpus.
len(set(emails_text))

In [None]:
# Same computation as the previous cell, kept with the recorded result.
len(set(emails_text)) # 41353 distinct words

### Frequency Distribution

In [None]:
%matplotlib inline

In [None]:
# Import only FreqDist from its defining module. The original
# `from nltk.book import *` pulled every name of nltk.book into the namespace
# (including the sample texts it loads) just to obtain this one class.
from nltk.probability import FreqDist

# Frequency distribution over the corpus-reader tokens.
# NOTE: this rebinds fdist, which an earlier cell built from word_tokenize output.
fdist = FreqDist(emails_text)

In [None]:
# Show the 50 most frequent tokens together with their counts.
fdist.most_common(50)

In [None]:
"total length: 3649670"

In [None]:
# Cumulative frequency plot of the 50 most common tokens.
fdist.plot(50, cumulative=True)

In [None]:
import os
from os import path
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

d = os.getcwd()

# Read the whole text; the context manager closes the file handle afterwards.
# NOTE(review): earlier cells write and read 'emails.txt'; confirm that
# 'emails-text.txt' really exists in the working directory.
with open(path.join(d, 'emails-text.txt')) as text_file:
    text = text_file.read()

# Read the mask / colour image as an array.
# scipy.misc.imread was removed from SciPy, so matplotlib's reader is used.
# NOTE(review): the attribution below looks carried over from a wordcloud
# example and may not describe us-flag.jpeg - confirm the image source.
# taken from http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010
coloring = plt.imread(path.join(d, "us-flag.jpeg"))

# Light normalisation of the text before counting.
text = text.replace("Secretary's", "Secretary")
# NOTE(review): this replacement looks like a leftover from a movie-script
# example and probably never matches the email text - confirm and drop if so.
text = text.replace("LUKE'S", "Luke")

# Extra stop words on top of the library defaults. "int" and "ext" were
# described as movie-script specific in the original; "said" is added here
# because STOPWORDS.add("said") returns None, so the original call passed
# stopwords=None and never used this set.
# NOTE: this name shadows nltk.corpus.stopwords imported in an earlier cell.
stopwords = STOPWORDS.copy()
stopwords.add("int")
stopwords.add("ext")
stopwords.add("said")

wc = WordCloud(background_color="white", max_words=2000, mask=coloring,
               stopwords=stopwords,
               max_font_size=40, random_state=42)
# generate word cloud
wc.generate(text)

# create coloring from image
image_colors = ImageColorGenerator(coloring)

# Figure 1: the cloud with its default colours.
plt.imshow(wc)
plt.axis("off")
plt.figure()
# Figure 2: the same cloud recoloured from the image.
# (color_func=image_colors could also be passed directly to the constructor.)
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")
plt.figure()
# Figure 3: the mask image itself, for reference.
plt.imshow(coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()