## Snooping through emails

In [8]:
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import feature_extraction

import gensim

# Keep rendered DataFrames compact: cap the rows and columns shown and use the
# HTML table representation in the notebook.
for option, value in (
    ('display.max_rows', 10),
    ('display.notebook_repr_html', True),
    ('display.max_columns', 10),
):
    pd.set_option(option, value)

# Render matplotlib figures inline in the notebook output and apply the
# 'ggplot' style sheet to them.
%matplotlib inline
plt.style.use('ggplot')

In [9]:
# Read the email table, naming its data column "message", and renumber the
# rows with a fresh 0..n-1 index (the previous index is discarded).
emails = (
    pd.read_csv("../datasets/emails.csv", names=["message"])
    .reset_index(drop=True)
)

emails.head()

Unnamed: 0,message
0,water pipe replacement cost todd water pipe re...
1,cell phone reimbursement commissioner reimburs...
2,travel fri. 3/28 todd first class friday s fli...
3,romer 1 roston 01774 released msnbc.com state ...
4,flight information: friday march 28th travel j...


### Use scikit-learn's CountVectorizer to build a document-term matrix

In [10]:
# Learn a vocabulary from the message text (English stop words removed) and
# encode every email as a row of term counts.
vectorizer = feature_extraction.text.CountVectorizer(stop_words='english')
documents = vectorizer.fit_transform(emails["message"])

### Build the corpus and use LdaModel to extract topics from it

In [11]:
# Wrap the sparse document-term matrix as a gensim corpus. Each row is one
# document, hence documents_columns=False.
corpus = gensim.matutils.Sparse2Corpus(documents, documents_columns=False)

# Map feature index -> term by inverting the fitted vocabulary. This gives the
# same mapping as enumerating get_feature_names(), but that method was removed
# in scikit-learn 1.2, whereas vocabulary_ exists on old and new releases alike.
id2word = {index: term for term, index in vectorizer.vocabulary_.items()}

In [13]:
# Fit a 10-topic LDA model on the bag-of-words corpus. LDA training is
# stochastic, so random_state is pinned to make the extracted topics
# reproducible across kernel restarts and re-runs.
model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=id2word,
    random_state=42,
)

In [14]:
# Display the fitted topics: each entry pairs a topic id with a weighted
# list of terms for that topic.
model.print_topics()

[(0,
  u'0.008*"com" + 0.008*"state" + 0.006*"health" + 0.006*"msnbc" + 0.005*"http" + 0.005*"school" + 0.005*"www" + 0.004*"education" + 0.004*"program" + 0.004*"care"'),
 (1,
  u'0.009*"board" + 0.008*"alaska" + 0.007*"executive" + 0.006*"2009" + 0.006*"com" + 0.005*"10" + 0.005*"business" + 0.004*"11" + 0.004*"council" + 0.004*"new"'),
 (2,
  u'0.026*"alaska" + 0.022*"state" + 0.015*"com" + 0.010*"msnbc" + 0.009*"gas" + 0.006*"oil" + 0.005*"www" + 0.005*"governor" + 0.005*"http" + 0.004*"pipeline"'),
 (3,
  u'0.024*"alaska" + 0.018*"roston" + 0.018*"state" + 0.018*"crivella" + 0.017*"presented" + 0.017*"released" + 0.017*"west" + 0.016*"com" + 0.013*"msnbc" + 0.013*"governor"'),
 (4,
  u'0.009*"sent" + 0.008*"pra" + 0.007*"state" + 0.007*"device" + 0.007*"cellular" + 0.007*"blackberry" + 0.006*"need" + 0.005*"com" + 0.005*"letter" + 0.004*"know"'),
 (5,
  u'0.058*"com" + 0.030*"deliberative" + 0.030*"msnbc" + 0.021*"process" + 0.016*"http" + 0.016*"www" + 0.015*"posted" + 0.015*"pub

## Bonus

- Repeat the analysis on the CEO's email inbox
  - (You'll need to extract dates, subject line, recipients, and message content from the ceo dataset)