http://bbengfort.github.io/tutorials/2016/05/19/text-classification-nltk-sckit-learn.html

## Loading the Text File
The text file was created in the "Creating WordCloud" notebook

In [33]:
import pandas as pd
import nltk

In [15]:
from nltk.book import FreqDist

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [1]:
import os
from os import path

d = os.getcwd()

email_text = open(path.join(d, 'emails.txt')).read()
email_sample_text = open(path.join(d, 'emails-sample.txt')).read()

## Entire Pre-Processing Pipeline

In [2]:
"""
Preprocessing text and html (Tokenizing words and sentences, clean HTML, clean text, removing stopwords, stemming and lemmatization)
__author__ : Triskelion user@Kaggle (Thanks: Abhishek Thakur & Foxtrot user@Kaggle)
"""

# -*- coding: utf-8 -*-

from nltk import clean_html
from nltk import SnowballStemmer
from nltk import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

# Tokenizing (Document to list of sentences. Sentence to list of words.)
def tokenize(str):
	'''Tokenizes into sentences, then strips punctuation/abbr, converts to lowercase and tokenizes words'''
	return 	[word_tokenize(" ".join(re.findall(r'\w+', t,flags = re.UNICODE | re.LOCALE)).lower()) 
			for t in sent_tokenize(str.replace("'", ""))]

#Removing stopwords. Takes list of words, outputs list of words.
def remove_stopwords(l_words, lang='english'):
	l_stopwords = stopwords.words(lang)
	content = [w for w in l_words if w.lower() not in l_stopwords]
	return content
		
		
#Stem all words with stemmer of type, return encoded as "encoding"
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
	supported_stemmers = ["PorterStemmer","SnowballStemmer","LancasterStemmer","WordNetLemmatizer"]
	if type is False or type not in supported_stemmers:
		return words_l
	else:
		l = []
		if type == "PorterStemmer":
			stemmer = PorterStemmer()
			for word in words_l:
				l.append(stemmer.stem(word).encode(encoding))
		if type == "SnowballStemmer":
			stemmer = SnowballStemmer(lang)
			for word in words_l:
				l.append(stemmer.stem(word).encode(encoding))
		if type == "LancasterStemmer":
			stemmer = LancasterStemmer()
			for word in words_l:
				l.append(stemmer.stem(word).encode(encoding))
		if type == "WordNetLemmatizer": #TODO: context
			wnl = WordNetLemmatizer()
			for word in words_l:
				l.append(wnl.lemmatize(word).encode(encoding))
		return l

#The preprocess pipeline. Returns as lists of tokens or as string. If stemmer_type = False or not supported then no stemming.		
def preprocess_pipeline(str, lang="english", stemmer_type="PorterStemmer", return_as_str=False, 
						do_remove_stopwords=False, do_clean_html=False):
	l = []
	words = []
	sentences = tokenize(str)
	for sentence in sentences:
		if do_remove_stopwords:
			words = remove_stopwords(sentence, lang)
		else:
			words = sentence
		words = stemming(words, stemmer_type)
		print(words)
		words = [word.decode("utf-8") for word in words]
		print(words)
		if return_as_str:
			l.append(" ".join(words))
		else:
			l.append(words)
	if return_as_str:
		return " ".join(l)
	else:
		return l

#test_sentence = "User-Testing Tester Tests! She had me at 'hello'?!? But then <abbr>ESPN</abbr> fainted... and Eighty cars drove past."
#print "\nOriginal:\n", test_sentence
#print "\nPorter:\n", preprocess_pipeline(test_sentence, "english", "PorterStemmer", True, False, True)
#print "\nLancaster:\n", preprocess_pipeline(test_sentence, "english", "LancasterStemmer", True, False, True)
#print "\nWordNet:\n", preprocess_pipeline(test_sentence, "english", "WordNetLemmatizer", True, False, True)
#print "\nStopword Tokenized Lancaster:\n", preprocess_pipeline(test_sentence, "english", "LancasterStemmer", False, True, True)
#print "\nOnly cleaning (HTML+Text):\n", preprocess_pipeline(test_sentence, "english", False, True, False, True)

## Seeing the result of the pre-process pipeline

In [3]:
test_sentence = "User-Testing Tester Tests! She had me at 'hello'?!? But then fainted... and Eighty cars drove past."

In [10]:
print("\nOriginal:\n", test_sentence)
print ("\nPorter:\n", preprocess_pipeline(test_sentence, "english", "PorterStemmer", True, False, True))
print ("\nLancaster:\n", preprocess_pipeline(test_sentence, "english", "LancasterStemmer", False, False, True))
print ("\nWordNet:\n", preprocess_pipeline(test_sentence, "english", "WordNetLemmatizer", False, False, True))
print ("\nStopword Tokenized Lancaster:\n", preprocess_pipeline(test_sentence, "english", "LancasterStemmer", True, True, True))
# print ("\nOnly cleaning (HTML+Text):\n", preprocess_pipeline(test_sentence, "english", False, True, False, True))


Original:
 User-Testing Tester Tests! She had me at 'hello'?!? But then fainted... and Eighty cars drove past.
[b'user', b'test', b'tester', b'test']
['user', 'test', 'tester', 'test']
[b'she', b'had', b'me', b'at', b'hello']
['she', 'had', 'me', 'at', 'hello']
[b'but', b'then', b'faint', b'and', b'eighti', b'car', b'drove', b'past']
['but', 'then', 'faint', 'and', 'eighti', 'car', 'drove', 'past']

Porter:
 user test tester test she had me at hello but then faint and eighti car drove past
[b'us', b'test', b'test', b'test']
['us', 'test', 'test', 'test']
[b'she', b'had', b'me', b'at', b'hello']
['she', 'had', 'me', 'at', 'hello']
[b'but', b'then', b'faint', b'and', b'eighty', b'car', b'drov', b'past']
['but', 'then', 'faint', 'and', 'eighty', 'car', 'drov', 'past']

Lancaster:
 [['us', 'test', 'test', 'test'], ['she', 'had', 'me', 'at', 'hello'], ['but', 'then', 'faint', 'and', 'eighty', 'car', 'drov', 'past']]
[b'user', b'testing', b'tester', b'test']
['user', 'testing', 'tester', 't

## Turning Tokens back into a large string of text

Let's use Porter because Lancaster is more aggressive and truncates words like Obama into obam. Obama is an important word. The sample takes 10 seconds to run.

In [5]:
email_sample_ppd = preprocess_pipeline(email_sample_text, "english", "LancasterStemmer", True, True, False) # does not return as string, instead returns

[b'democr', b'vot']
['democr', 'vot']
[b'ohio', b'vot', b'support', b'53', b'40', b'perc', b'giv', b'peopl', b'opt', b'govern', b'heal', b'ins', b'plan']
['ohio', 'vot', 'support', '53', '40', 'perc', 'giv', 'peopl', 'opt', 'govern', 'heal', 'ins', 'plan']
[b'independ', b'vot', b'support', b'publ', b'opt', b'55', b'38', b'perc']
['independ', 'vot', 'support', 'publ', 'opt', '55', '38', 'perc']
[b'afgh', b'unit', b'stat', b'right', b'thing', b'fight', b'afgh', b'ohio', b'vot', b'say', b'48', b'43', b'perc']
['afgh', 'unit', 'stat', 'right', 'thing', 'fight', 'afgh', 'ohio', 'vot', 'say', '48', '43', 'perc']
[b'vot', b'support', b'51', b'40', b'perc', b'recommend', b'top', b'u', b'command', b'afgh', b'40', b'000', b'u', b'troop', b'sent']
['vot', 'support', '51', '40', 'perc', 'recommend', 'top', 'u', 'command', 'afgh', '40', '000', 'u', 'troop', 'sent']
[b'55', b'perc', b'ohio', b'say', b'wil', b'see', b'larg', b'numb', b'am', b'troop', b'afgh', b'two', b'year', b'less', b'hard', b'mand

In [39]:
email_sample_ppd = preprocess_pipeline(email_sample_text, "english", "PorterStemmer", True, True, False) # does not return as string, instead returns

[b'democrat', b'vote']
['democrat', 'vote']
[b'ohio', b'voter', b'support', b'53', b'40', b'percent', b'give', b'peopl', b'option', b'govern', b'health', b'insur', b'plan']
['ohio', 'voter', 'support', '53', '40', 'percent', 'give', 'peopl', 'option', 'govern', 'health', 'insur', 'plan']
[b'independ', b'voter', b'support', b'public', b'option', b'55', b'38', b'percent']
['independ', 'voter', 'support', 'public', 'option', '55', '38', 'percent']
[b'afghanistan', b'unit', b'state', b'right', b'thing', b'fight', b'afghanistan', b'ohio', b'voter', b'say', b'48', b'43', b'percent']
['afghanistan', 'unit', 'state', 'right', 'thing', 'fight', 'afghanistan', 'ohio', 'voter', 'say', '48', '43', 'percent']
[b'voter', b'support', b'51', b'40', b'percent', b'recommend', b'top', b'u', b'command', b'afghanistan', b'40', b'000', b'u', b'troop', b'sent']
['voter', 'support', '51', '40', 'percent', 'recommend', 'top', 'u', 'command', 'afghanistan', '40', '000', 'u', 'troop', 'sent']
[b'55', b'percent',

### Put String to Text File

In [11]:
text_file = open("email_sample_processed.txt", "w")
text_file.write(email_sample_ppd)
text_file.close()

In [40]:
email_sample_ppd = nltk.tokenize.word_tokenize(email_sample_ppd)

In [41]:
fdist1 = FreqDist(email_sample_ppd)

In [43]:
fdist1.most_common(100)

[('state', 192),
 ('pm', 117),
 ('china', 115),
 ('depart', 105),
 ('obama', 93),
 ('would', 87),
 ('u', 81),
 ('2009', 79),
 ('percent', 71),
 ('call', 69),
 ('meet', 65),
 ('democrat', 64),
 ('secretari', 64),
 ('presid', 64),
 ('new', 63),
 ('offici', 61),
 ('work', 59),
 ('rate', 59),
 ('time', 59),
 ('offic', 57),
 ('polici', 53),
 ('clinton', 51),
 ('one', 48),
 ('need', 48),
 ('also', 47),
 ('us', 46),
 ('countri', 45),
 ('republican', 45),
 ('like', 45),
 ('among', 44),
 ('30', 44),
 ('elect', 43),
 ('well', 42),
 ('black', 42),
 ('h', 41),
 ('women', 41),
 ('nation', 41),
 ('good', 41),
 ('econom', 41),
 ('year', 41),
 ('1', 41),
 ('10', 40),
 ('govern', 40),
 ('world', 38),
 ('public', 38),
 ('chang', 37),
 ('approv', 37),
 ('foreign', 37),
 ('white', 37),
 ('12', 36),
 ('global', 36),
 ('00', 36),
 ('much', 35),
 ('get', 35),
 ('said', 34),
 ('w', 34),
 ('gov', 34),
 ('11', 34),
 ('even', 34),
 ('2', 34),
 ('see', 33),
 ('4', 32),
 ('take', 32),
 ('3', 32),
 ('issu', 32),
 (

We will eliminate words like would, u, 2009, call, meet, etc.

We want the full list to be more accurate. Don't know how long this will take... over 5 minutes, then timeout.

### Running Pre-process for full text

In [None]:
email_ppd = preprocess_pipeline(email_text, "english", "PorterStemmer", True, True, False) # does not return as string, instead returns

[b'1', b'b6', b'thursday', b'march', b'3', b'2011', b'9', b'45', b'pm', b'h', b'latest', b'syria', b'aid', b'qaddafi', b'sid', b'hrc', b'memo', b'syria', b'aid', b'libya', b'030311', b'docx', b'hrc', b'memo', b'syria', b'aid', b'libya', b'030311', b'docx', b'march', b'3', b'2011', b'hillari', b'2', b'thx', b'4', b'h', b'hrod17', b'clintonemail', b'com', b'friday', b'march', b'11', b'2011', b'1', b'36', b'pm', b'huma', b'abedin', b'fw', b'h', b'latest', b'syria', b'aid', b'qaddafi', b'sid', b'hrc', b'memo', b'syria', b'aid', b'libya', b'030311', b'docx', b'pi', b'print']
['1', 'b6', 'thursday', 'march', '3', '2011', '9', '45', 'pm', 'h', 'latest', 'syria', 'aid', 'qaddafi', 'sid', 'hrc', 'memo', 'syria', 'aid', 'libya', '030311', 'docx', 'hrc', 'memo', 'syria', 'aid', 'libya', '030311', 'docx', 'march', '3', '2011', 'hillari', '2', 'thx', '4', 'h', 'hrod17', 'clintonemail', 'com', 'friday', 'march', '11', '2011', '1', '36', 'pm', 'huma', 'abedin', 'fw', 'h', 'latest', 'syria', 'aid', 'q

## Eliminate more tokens

We will eliminate more tokens once we know which words are key.