1. Generate a word cloud based on the raw corpus -- I recommend using the Python word_cloud library. With the help of nltk (already available in your Anaconda environment), implement a standard text pre-processing pipeline (e.g., tokenization, stopword removal, stemming, etc.) and generate a new word cloud. Briefly discuss the pros and cons (if any) of the two word clouds you generated.

http://bbengfort.github.io/tutorials/2016/05/19/text-classification-nltk-sckit-learn.html

## Loading the Text File
The text file was created in the "Creating WordCloud" notebook

In [1]:
import pandas as pd
import nltk

In [2]:
from nltk.book import FreqDist

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
import os
from os import path

# Data files are resolved relative to the notebook's working directory.
d = os.getcwd()

# Read each corpus inside a context manager so the file handle is closed
# as soon as its text has been loaded.
with open(path.join(d, 'emails.txt')) as corpus_file:
    email_text = corpus_file.read()
with open(path.join(d, 'emails-sample.txt')) as sample_file:
    email_sample_text = sample_file.read()

## Entire Pre-Processing Pipeline

In [13]:
"""
Preprocessing text and html (Tokenizing words and sentences, clean HTML, clean text, removing stopwords, stemming and lemmatization)
__author__ : Triskelion user@Kaggle (Thanks: Abhishek Thakur & Foxtrot user@Kaggle)
"""

# -*- coding: utf-8 -*-

from nltk import clean_html
from nltk import SnowballStemmer
from nltk import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

# Tokenizing (Document to list of sentences. Sentence to list of words.)
def tokenize(str):
    '''Split text into sentences, then each sentence into lowercase word tokens.

    Apostrophes are removed before sentence splitting. Within each sentence
    only runs of word characters are kept (so punctuation is dropped); the
    remainder is lowercased and handed to ``word_tokenize``.

    Args:
        str: The raw document text. The name shadows the builtin but is kept
            so that existing keyword callers continue to work.

    Returns:
        A list with one entry per sentence, each entry being a list of tokens.
    '''
    # re.LOCALE was dropped: Python 3.6+ rejects it for str patterns with a
    # ValueError, and str patterns already match Unicode word characters.
    word_runs = re.compile(r'\w+', flags=re.UNICODE)
    return [word_tokenize(" ".join(word_runs.findall(t)).lower())
            for t in sent_tokenize(str.replace("'", ""))]

#Removing stopwords. Takes list of words, outputs list of words.
def remove_stopwords(l_words, lang='english'):
    '''Return the words of *l_words* that are not stopwords for *lang*.

    Args:
        l_words: Iterable of word tokens.
        lang: Name of the NLTK stopword list to load.

    Returns:
        A new list holding the surviving words in their original order and
        casing; only the comparison is done on the lowercased word.
    '''
    # stopwords.words() returns a list, which made every membership test a
    # linear scan. Converting it to a set once keeps the result identical
    # while making each lookup constant-time.
    l_stopwords = set(stopwords.words(lang))
    return [w for w in l_words if w.lower() not in l_stopwords]
		
		
#Stem all words with stemmer of type, return encoded as "encoding"
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    '''Stem or lemmatize every word and return the results encoded.

    Args:
        words_l: Iterable of word tokens.
        type: One of "PorterStemmer", "SnowballStemmer", "LancasterStemmer"
            or "WordNetLemmatizer". Any other value (including False)
            disables stemming.
        lang: Language handed to SnowballStemmer; the other choices do not
            use it.
        encoding: Codec used to encode each stemmed word.

    Returns:
        ``words_l`` itself, untouched, when stemming is disabled. Otherwise a
        new list in which every stemmed word has been passed through
        ``.encode(encoding)``.
    '''
    known_types = ("PorterStemmer", "SnowballStemmer", "LancasterStemmer", "WordNetLemmatizer")
    if type is False or type not in known_types:
        return words_l

    # Pick the single-word transform once, then apply it uniformly.
    if type == "PorterStemmer":
        normalise = PorterStemmer().stem
    elif type == "SnowballStemmer":
        normalise = SnowballStemmer(lang).stem
    elif type == "LancasterStemmer":
        normalise = LancasterStemmer().stem
    else:
        # "WordNetLemmatizer" #TODO: context
        normalise = WordNetLemmatizer().lemmatize

    return [normalise(word).encode(encoding) for word in words_l]

#The preprocess pipeline. Returns as lists of tokens or as string. If stemmer_type = False or not supported then no stemming.		
def preprocess_pipeline(str, lang="english", stemmer_type="PorterStemmer", return_as_str=False,
                        do_remove_stopwords=False, do_clean_html=False):
    '''Tokenize, optionally drop stopwords, and stem a document.

    Args:
        str: Raw document text (name kept for backward compatibility).
        lang: Language used for stopword removal and passed on to the stemmer.
        stemmer_type: Stemmer name understood by ``stemming``; False or an
            unsupported name skips stemming.
        return_as_str: When True, return one space-joined string; otherwise
            return a list of token lists, one per sentence.
        do_remove_stopwords: When True, remove stopwords from each sentence
            before stemming.
        do_clean_html: Accepted for compatibility; this function never reads it.

    Returns:
        A single string or a list of per-sentence token lists, depending on
        ``return_as_str``.
    '''
    processed = []
    for sentence in tokenize(str):
        words = remove_stopwords(sentence, lang) if do_remove_stopwords else sentence
        # Forward lang so that SnowballStemmer honours the requested language
        # instead of always falling back to its "english" default.
        words = stemming(words, stemmer_type, lang)
        # stemming() yields bytes when it stems, but hands back the original
        # str tokens when stemming is disabled; only bytes need decoding.
        words = [w.decode("utf-8") if isinstance(w, bytes) else w for w in words]
        processed.append(" ".join(words) if return_as_str else words)
    return " ".join(processed) if return_as_str else processed

#test_sentence = "User-Testing Tester Tests! She had me at 'hello'?!? But then <abbr>ESPN</abbr> fainted... and Eighty cars drove past."
#print "\nOriginal:\n", test_sentence
#print "\nPorter:\n", preprocess_pipeline(test_sentence, "english", "PorterStemmer", True, False, True)
#print "\nLancaster:\n", preprocess_pipeline(test_sentence, "english", "LancasterStemmer", True, False, True)
#print "\nWordNet:\n", preprocess_pipeline(test_sentence, "english", "WordNetLemmatizer", True, False, True)
#print "\nStopword Tokenized Lancaster:\n", preprocess_pipeline(test_sentence, "english", "LancasterStemmer", False, True, True)
#print "\nOnly cleaning (HTML+Text):\n", preprocess_pipeline(test_sentence, "english", False, True, False, True)

## Seeing the result of the pre-process pipeline

In [5]:
test_sentence = "User-Testing Tester Tests! She had me at 'hello'?!? But then fainted... and Eighty cars drove past."

In [6]:
# Print the raw sentence, then the pipeline output for each configuration.
# Keyword arguments spell out what the positional flags mean.
print("\nOriginal:\n", test_sentence)
for heading, stemmer_name, as_text, drop_stopwords in (
    ("\nPorter:\n", "PorterStemmer", True, False),
    ("\nLancaster:\n", "LancasterStemmer", False, False),
    ("\nWordNet:\n", "WordNetLemmatizer", False, False),
    ("\nStopword Tokenized Lancaster:\n", "LancasterStemmer", True, True),
):
    print(heading, preprocess_pipeline(test_sentence, lang="english", stemmer_type=stemmer_name,
                                       return_as_str=as_text, do_remove_stopwords=drop_stopwords,
                                       do_clean_html=True))
# print ("\nOnly cleaning (HTML+Text):\n", preprocess_pipeline(test_sentence, "english", False, True, False, True))


Original:
 User-Testing Tester Tests! She had me at 'hello'?!? But then fainted... and Eighty cars drove past.
[b'user', b'test', b'tester', b'test']
['user', 'test', 'tester', 'test']
[b'she', b'had', b'me', b'at', b'hello']
['she', 'had', 'me', 'at', 'hello']
[b'but', b'then', b'faint', b'and', b'eighti', b'car', b'drove', b'past']
['but', 'then', 'faint', 'and', 'eighti', 'car', 'drove', 'past']

Porter:
 user test tester test she had me at hello but then faint and eighti car drove past
[b'us', b'test', b'test', b'test']
['us', 'test', 'test', 'test']
[b'she', b'had', b'me', b'at', b'hello']
['she', 'had', 'me', 'at', 'hello']
[b'but', b'then', b'faint', b'and', b'eighty', b'car', b'drov', b'past']
['but', 'then', 'faint', 'and', 'eighty', 'car', 'drov', 'past']

Lancaster:
 [['us', 'test', 'test', 'test'], ['she', 'had', 'me', 'at', 'hello'], ['but', 'then', 'faint', 'and', 'eighty', 'car', 'drov', 'past']]




[b'user', b'testing', b'tester', b'test']
['user', 'testing', 'tester', 'test']
[b'she', b'had', b'me', b'at', b'hello']
['she', 'had', 'me', 'at', 'hello']
[b'but', b'then', b'fainted', b'and', b'eighty', b'car', b'drove', b'past']
['but', 'then', 'fainted', 'and', 'eighty', 'car', 'drove', 'past']

WordNet:
 [['user', 'testing', 'tester', 'test'], ['she', 'had', 'me', 'at', 'hello'], ['but', 'then', 'fainted', 'and', 'eighty', 'car', 'drove', 'past']]
[b'us', b'test', b'test', b'test']
['us', 'test', 'test', 'test']
[b'hello']
['hello']
[b'faint', b'eighty', b'car', b'drov', b'past']
['faint', 'eighty', 'car', 'drov', 'past']

Stopword Tokenized Lancaster:
 us test test test hello faint eighty car drov past


## Turning Tokens back into a large string of text

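We will use Porter, because Lancaster is more aggressive and truncates words such as "Obama" to "obam", and "Obama" is an important word in this corpus. The Lancaster run is shown first for comparison; the Porter run follows. The sample takes 10 seconds to run.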

In [7]:
# Lancaster run on the sample with stopwords removed; return_as_str=True, so the result is one space-joined string.
email_sample_ppd = preprocess_pipeline(email_sample_text, lang="english", stemmer_type="LancasterStemmer", return_as_str=True, do_remove_stopwords=True, do_clean_html=False)

[b'democr', b'vot']
['democr', 'vot']
[b'ohio', b'vot', b'support', b'53', b'40', b'perc', b'giv', b'peopl', b'opt', b'govern', b'heal', b'ins', b'plan']
['ohio', 'vot', 'support', '53', '40', 'perc', 'giv', 'peopl', 'opt', 'govern', 'heal', 'ins', 'plan']
[b'independ', b'vot', b'support', b'publ', b'opt', b'55', b'38', b'perc']
['independ', 'vot', 'support', 'publ', 'opt', '55', '38', 'perc']
[b'afgh', b'unit', b'stat', b'right', b'thing', b'fight', b'afgh', b'ohio', b'vot', b'say', b'48', b'43', b'perc']
['afgh', 'unit', 'stat', 'right', 'thing', 'fight', 'afgh', 'ohio', 'vot', 'say', '48', '43', 'perc']
[b'vot', b'support', b'51', b'40', b'perc', b'recommend', b'top', b'u', b'command', b'afgh', b'40', b'000', b'u', b'troop', b'sent']
['vot', 'support', '51', '40', 'perc', 'recommend', 'top', 'u', 'command', 'afgh', '40', '000', 'u', 'troop', 'sent']
[b'55', b'perc', b'ohio', b'say', b'wil', b'see', b'larg', b'numb', b'am', b'troop', b'afgh', b'two', b'year', b'less', b'hard', b'mand

In [None]:
3 * 20 = 60

In [8]:
# Porter run on the sample with stopwords removed; return_as_str=True, so the result is one space-joined string.
email_sample_ppd = preprocess_pipeline(email_sample_text, lang="english", stemmer_type="PorterStemmer", return_as_str=True, do_remove_stopwords=True, do_clean_html=False)

[b'democrat', b'vote']
['democrat', 'vote']
[b'ohio', b'voter', b'support', b'53', b'40', b'percent', b'give', b'peopl', b'option', b'govern', b'health', b'insur', b'plan']
['ohio', 'voter', 'support', '53', '40', 'percent', 'give', 'peopl', 'option', 'govern', 'health', 'insur', 'plan']
[b'independ', b'voter', b'support', b'public', b'option', b'55', b'38', b'percent']
['independ', 'voter', 'support', 'public', 'option', '55', '38', 'percent']
[b'afghanistan', b'unit', b'state', b'right', b'thing', b'fight', b'afghanistan', b'ohio', b'voter', b'say', b'48', b'43', b'percent']
['afghanistan', 'unit', 'state', 'right', 'thing', 'fight', 'afghanistan', 'ohio', 'voter', 'say', '48', '43', 'percent']
[b'voter', b'support', b'51', b'40', b'percent', b'recommend', b'top', b'u', b'command', b'afghanistan', b'40', b'000', b'u', b'troop', b'sent']
['voter', 'support', '51', '40', 'percent', 'recommend', 'top', 'u', 'command', 'afghanistan', '40', '000', 'u', 'troop', 'sent']
[b'55', b'percent',

### Put String to Text File

In [9]:
# Persist the processed sample. The context manager closes the file even if
# the write raises.
with open("email_sample_processed.txt", "w") as text_file:
    text_file.write(email_sample_ppd)

In [10]:
# Rebinds email_sample_ppd: the space-joined string produced by the pipeline
# is split back into a list of tokens.
email_sample_ppd = nltk.tokenize.word_tokenize(email_sample_ppd)

In [11]:
fdist1 = FreqDist(email_sample_ppd)

## Most common words in the Sample

In [12]:
fdist1.most_common(100)

[('state', 192),
 ('pm', 117),
 ('china', 115),
 ('depart', 105),
 ('obama', 93),
 ('would', 87),
 ('u', 81),
 ('2009', 79),
 ('percent', 71),
 ('call', 69),
 ('meet', 65),
 ('secretari', 64),
 ('democrat', 64),
 ('presid', 64),
 ('new', 63),
 ('offici', 61),
 ('work', 59),
 ('time', 59),
 ('rate', 59),
 ('offic', 57),
 ('polici', 53),
 ('clinton', 51),
 ('one', 48),
 ('need', 48),
 ('also', 47),
 ('us', 46),
 ('like', 45),
 ('countri', 45),
 ('republican', 45),
 ('30', 44),
 ('among', 44),
 ('elect', 43),
 ('black', 42),
 ('well', 42),
 ('good', 41),
 ('women', 41),
 ('nation', 41),
 ('econom', 41),
 ('1', 41),
 ('h', 41),
 ('year', 41),
 ('10', 40),
 ('govern', 40),
 ('world', 38),
 ('public', 38),
 ('chang', 37),
 ('white', 37),
 ('foreign', 37),
 ('approv', 37),
 ('global', 36),
 ('12', 36),
 ('00', 36),
 ('get', 35),
 ('much', 35),
 ('2', 34),
 ('w', 34),
 ('said', 34),
 ('gov', 34),
 ('11', 34),
 ('even', 34),
 ('see', 33),
 ('3', 32),
 ('take', 32),
 ('issu', 32),
 ('4', 32),
 (

We will eliminate words like would, u, 2009, call, meet, etc.

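We want to process the full corpus so that the counts are more accurate. Running the pipeline on the whole file in one go ran for over 5 minutes and then timed out, so the full text is processed in smaller files below.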

## Full Text

In [12]:
# Load the four corpus chunk files, closing each handle as soon as its
# contents have been read.
with open(path.join(d, 'emails1.txt')) as chunk_file:
    email_text1 = chunk_file.read()
with open(path.join(d, 'emails2.txt')) as chunk_file:
    email_text2 = chunk_file.read()
with open(path.join(d, 'emails3.txt')) as chunk_file:
    email_text3 = chunk_file.read()
with open(path.join(d, 'emails4.txt')) as chunk_file:
    email_text4 = chunk_file.read()

### Running Pre-process for full text

In [None]:
split -l 200000 filename # run it in bash

In [14]:
# Porter-stem chunk 1 with stopwords removed; the result is one space-joined string.
email_ppd1 = preprocess_pipeline(email_text1, lang="english", stemmer_type="PorterStemmer", return_as_str=True, do_remove_stopwords=True, do_clean_html=False)

In [15]:
# Porter-stem chunk 2 with stopwords removed; the result is one space-joined string.
email_ppd2 = preprocess_pipeline(email_text2, lang="english", stemmer_type="PorterStemmer", return_as_str=True, do_remove_stopwords=True, do_clean_html=False)

In [16]:
# Porter-stem chunk 3 with stopwords removed; the result is one space-joined string.
email_ppd3 = preprocess_pipeline(email_text3, lang="english", stemmer_type="PorterStemmer", return_as_str=True, do_remove_stopwords=True, do_clean_html=False)

In [17]:
# Porter-stem chunk 4 with stopwords removed; the result is one space-joined string.
email_ppd4 = preprocess_pipeline(email_text4, lang="english", stemmer_type="PorterStemmer", return_as_str=True, do_remove_stopwords=True, do_clean_html=False)

In [18]:
# Join the processed chunks with a space. Plain "+" concatenation would fuse
# the last token of one chunk with the first token of the next, because the
# pipeline's string output is joined on single spaces without a trailing one.
email_ppd = " ".join([email_ppd1, email_ppd2, email_ppd3, email_ppd4])

In [None]:
# Rebinds email_ppd: the combined space-joined string is split back into a
# list of tokens.
email_ppd = nltk.tokenize.word_tokenize(email_ppd)
# Frequency distribution over the tokens of the full corpus.
fdist1 = FreqDist(email_ppd)

In [25]:
fdist1.most_common(200)

[('state', 3254),
 ('pm', 2137),
 ('call', 1608),
 ('would', 1537),
 ('secretari', 1409),
 ('time', 1378),
 ('1', 1341),
 ('work', 1294),
 ('offic', 1269),
 ('obama', 1262),
 ('u', 1259),
 ('said', 1241),
 ('presid', 1233),
 ('one', 1232),
 ('depart', 1225),
 ('new', 1106),
 ('meet', 1074),
 ('also', 1010),
 ('hous', 1009),
 ('like', 999),
 ('w', 967),
 ('get', 964),
 ('2010', 957),
 ('american', 954),
 ('us', 945),
 ('year', 943),
 ('10', 933),
 ('2', 930),
 ('govern', 915),
 ('30', 908),
 ('2009', 908),
 ('say', 896),
 ('want', 887),
 ('h', 869),
 ('see', 851),
 ('peopl', 831),
 ('fyi', 830),
 ('need', 820),
 ('nation', 817),
 ('go', 814),
 ('4', 812),
 ('gov', 793),
 ('know', 789),
 ('right', 774),
 ('8', 762),
 ('polit', 753),
 ('clinton', 751),
 ('make', 743),
 ('3', 731),
 ('support', 730),
 ('think', 722),
 ('talk', 721),
 ('secur', 718),
 ('parti', 717),
 ('00', 677),
 ('first', 669),
 ('could', 667),
 ('issu', 660),
 ('today', 657),
 ('polici', 655),
 ('back', 637),
 ('two', 6

## Stemming, Stopwords

In [24]:
import os
from os import path

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# d = path.dirname(__file__), DIDN'T WORK.
d = os.getcwd() # alternative method to get the directory name.

# Read the whole text, closing the file once it has been loaded.
# NOTE(review): `text` is not used by the word cloud below, which is built
# from fdist1 -- confirm whether this read is still needed.
with open(path.join(d, 'emails.txt')) as raw_file:
    text = raw_file.read()

# Build the cloud from the 200 most common tokens and their counts.
# NOTE(review): recent wordcloud releases document fit_words() as taking a
# dict of word -> frequency; if this call fails, try
# dict(fdist1.most_common(200)) -- confirm against the installed version.
wordcloud = WordCloud(max_words=200).fit_words(fdist1.most_common(200))

# Open a plot of the generated image.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## Customized Stop-words
Specific to Hillary Clinton's emails, removing words that are common in everyday speech

In [None]:
fdist1.most_common(200)

In [34]:
# Corpus-specific stopwords. Entries must be lowercase because the
# pre-processing pipeline lowercases every token, so 'PM' could never match
# the 'pm' token that appears in the frequency list.
hillary_stopwords = ['would', 'pm', 'said', 'also', 'like', 'say', 'want', 'see', 'need', 'go']

In [35]:
# Keep every token that is not a custom stopword. Building the list in one
# pass yields the same result as copying it and calling list.remove() once
# per stopword occurrence, without rescanning the list for each removal.
custom_stopwords = set(hillary_stopwords)
filtered_email_ppd = [word for word in email_ppd if word not in custom_stopwords]

In [None]:
# filtered_email_ppd = nltk.tokenize.word_tokenize(filtered_email_ppd)
fdist1 = FreqDist(filtered_email_ppd)

In [None]:
import os
from os import path

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# d = path.dirname(__file__), DIDN'T WORK.
d = os.getcwd() # alternative method to get the directory name.

# Read the whole text, closing the file once it has been loaded.
# NOTE(review): `text` is not used by the word cloud below, which is built
# from fdist1 -- confirm whether this read is still needed.
with open(path.join(d, 'emails.txt')) as raw_file:
    text = raw_file.read()

# Build the cloud from the 200 most common tokens and their counts.
# NOTE(review): recent wordcloud releases document fit_words() as taking a
# dict of word -> frequency; if this call fails, try
# dict(fdist1.most_common(200)) -- confirm against the installed version.
wordcloud = WordCloud(max_words=200).fit_words(fdist1.most_common(200))

# Open a plot of the generated image.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## Eliminate more tokens

We will eliminate more tokens once we know which words are key.