In [None]:
# import libraries
from time import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# heart of DS
import numpy as np
import pandas as pd
from scipy import sparse

# gensim for topic modeling
from gensim import matutils
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# scikit-learn for text feature extraction & manifold learning 
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# -- visualizations --
# for manual reading
import ipywidgets as widgets
from IPython.display import display

from wordcloud.wordcloud import WordCloud

# for interactive plots
from bokeh.io import output_notebook 
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool

# for general plots
import seaborn as sns 
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
%matplotlib inline

# for LDA visualizations
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

## Load Data

In [None]:
# Path of the CSV file that holds the corpus (empty placeholder: set it before running).
data = ''

# Load the corpus into a DataFrame.
df = pd.read_csv(data)

# Report the corpus size, then preview the first rows as the cell output.
print('Number of documents:', len(df))
df.head()

## Set Constants

In [None]:
# set constants to remain dataset-agnostic for the remainder of the notebook
TXT_COL = 'text'  # name of the DataFrame column that holds the document text
SNS_PALETTE = 'winter_r'  # palette name handed to the seaborn plots further down

## Drop NaNs and Duplicates

In [None]:
# Track how many rows each cleaning step removes from the corpus.
n_docs = len(df)
print('Number of docs:', n_docs)

# Step 1: discard rows whose text is missing.
df = df.dropna(subset=[TXT_COL])
n_after_nans = len(df)
nans_dropped = n_docs - n_after_nans
print('Number of NaNs removed:', nans_dropped)

# Step 2: collapse rows that share the exact same text.
df = df.drop_duplicates(subset=[TXT_COL])
dupes_dropped = n_after_nans - len(df)
print('Number of duplicates removed:', dupes_dropped)
print()
print('Final number of docs:', len(df))

## Read Random Samples

In [None]:
# Interactive reader: every click on "Next" shows one randomly chosen document of `df`.
index = -1  # row position of the document currently shown; -1 means nothing shown yet
dataset_size = df.shape[0]

# text fields
index_t = widgets.Text(description='Index:', value=str(index))
text_ta = widgets.Textarea(description='Text:', layout=widgets.Layout(width='100%', height='200px'))

# buttons
next_b = widgets.Button(description='Next')

def clear_fields():
    """Blank the text area before the next document is loaded."""
    text_ta.value = ''

def next_doc(b):
    """Button callback: show a uniformly random document from `df`.

    Parameters
    ----------
    b : the widget that triggered the callback (unused).
    """
    global index

    # clear text fields
    clear_fields()

    n_rows = df.shape[0]
    if n_rows == 0:
        # nothing to sample from
        return

    # np.random.randint excludes its upper bound, so pass the row count itself;
    # using n_rows - 1 would never show the last document and fails on a one-row frame
    index = np.random.randint(0, n_rows)

    # update text areas
    index_t.value = str(index)
    text_ta.value = df.iloc[index][TXT_COL]

next_b.on_click(next_doc)

display(index_t)
display(text_ta)
display(next_b)

## Transform text into vectors of term frequencies
* [CountVectorizer Documentation](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [None]:
# Baseline vectorizer: every token of one or more word characters, case preserved.
# Its vocabulary size is compared below against the preprocessed one.
tf_unprepped_vec = CountVectorizer(
    ngram_range=(1, 1),
    token_pattern=r'(?u)\b\w+\b',
    lowercase=False,
)

# Extend scikit-learn's English stop words with two extra filler words.
ENGLISH_STOP_WORDS = list(set(ENGLISH_STOP_WORDS) | {'said', 'got'})

# Preprocessing vectorizer used for the rest of the notebook.
tf_vec = CountVectorizer(
    ngram_range=(1, 1),
    strip_accents='unicode',
    lowercase=True,
    token_pattern=r'(?u)\b\w{3,}\b',  # words with three characters or more. Be sure we're not throwing out any important words!
    min_df=10,    # drop terms that appear in fewer than 10 documents
    max_df=0.95,  # drop terms that appear in more than 95% of the documents
    stop_words=ENGLISH_STOP_WORDS,
)

# Learn each vocabulary and build the matching document x term frequency matrix.
texts_to_vectorize = df[TXT_COL]
X_unprepped = tf_unprepped_vec.fit_transform(texts_to_vectorize)
X = tf_vec.fit_transform(texts_to_vectorize)

# Summarize corpus size and how much the preprocessing shrank the vocabulary.
n_unprepped_terms = X_unprepped.shape[1]
n_prepped_terms = X.shape[1]
print('Number of docs:', X_unprepped.shape[0])
print('Size of unpreprocessed vocab:', n_unprepped_terms)
print('Size of preprocessed vocab:', n_prepped_terms)
print()
print('Vocab trimmed by simple preprocessing:', (n_unprepped_terms - n_prepped_terms))

## Visualize docs as term-frequency feature-vectors

In [None]:
# Preview the first documents as rows of term counts, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use the newer accessor when it exists.
if hasattr(tf_vec, 'get_feature_names_out'):
    features = tf_vec.get_feature_names_out()
else:
    features = tf_vec.get_feature_names()
# Densify only the five rows being shown: converting the whole sparse matrix just to call
# .head() can exhaust memory on large corpora, and `.A` is deprecated on SciPy sparse matrices.
pd.DataFrame(X[:5].toarray(), columns=features)

## Feature Sparsity
* As we observe the DataFrame above, we notice many zeros for the many words in our vocabulary. This can be troublesome for modeling in terms of finding correlation between the sparse features and our desired outputs.
* A few things that can trim or densify our feature vectors include:
    * Further Preprocessing 
        * (e.g., stop word removal, lemma/stemming, and/or frequency filtering with min_df and max_df in the Vectorizer class)
    * Feature Selection 
        * (e.g., [chi-squared feature selection](http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html) to find the features that are more present in one class over the others)
    * Different Feature Spaces 
        * (e.g., [word embeddings](https://medium.com/data-science-group-iitr/word-embedding-2d05d270b285) represent words in vector lengths of 50-200. By taking the sum or average of the embeddings from a document, we are still left with a 50-200 dimension feature space instead of the above's ~13k feature dimensions)
    
## What do we see vs. what an ML model sees?

In [None]:
# draw one random document and strip its surrounding whitespace before printing it
sample = df.sample(n=1)[TXT_COL].iloc[0].strip()
print(sample)

In [None]:
# Show the sampled document the way the model receives it: only its in-vocabulary
# terms and their counts, with no word order.
# transform sample with our vectorizer
sample_X = tf_vec.transform([sample])
# get sample's features that have nonzero term frequencies
nonzero_fts = tf_vec.inverse_transform(sample_X)[0]
# get sample's nonzero term frequencies
nonzero_tf = sample_X[sample_X.nonzero()]

# NOTE(review): assumes inverse_transform() and nonzero() report the terms in the same
# column order, so each count lines up with its term -- confirm.
pd.DataFrame(nonzero_tf, columns=nonzero_fts)

## Bag of Words
* As we see in the DataFrame above, there is no order to our features. This is yet another thing that can be costly in modeling, especially when things such as adjectives, other descriptors, and word order play a big role in our ultimate task. Some examples:
    * **"this flappy bird game makes me so angry" --> Bad Review**
    * **"angry birds helps me take my anger out of the day" --> Good Review**
* *Similar* words/features, but *different* word orders can make all the difference!
* **Some ideas to make our life easier:**
    * **ngrams** (We can increase the range of words to be considered as a single feature!)
        - ngram range 2-3 for one of the above examples --> \['this flappy', 'flappy bird', ..., 'me so angry'\]
        - **WARNING**: If using a range, say 1-2, the feature space grows quickly: a unigram vocab of size N admits up to **N^K** distinct K-grams (in practice, only the K-grams that actually occur in the corpus), so the feature engineering ideas from above should be applied when modeling comes into play.
    * **Dependency Parse Grams** (e.g., birds_angry, birds_helps, helps_me, ...)
    * **ngram embedding** sums/averages can come into play as well if ngram 'angry_birds' is embedded in a video game name space, or in between both good and bad review-word spaces
    * In the realm of Deep Learning, Recurrent Neural Nets (RNNs) and Convolutional Neural Nets (CNNs) are the heavy hitters for tackling the problem of word order. 
        - [Deep Learning Methods for Text Classification](https://medium.com/jatana/report-on-text-classification-using-cnn-rnn-han-f0e887214d5f)

## Get Distributions
* Are most of the documents around the same size in terms of token count? 
    * [Box-n-Whiskers](https://www.khanacademy.org/math/statistics-probability/summarizing-quantitative-data/box-whisker-plots/a/box-plot-review)
* What are the top & bottom features summed?
* Hey, how about a word cloud?

In [None]:
# get doc lengths
# axis=1 sums across the columns (one total per document); ravel() flattens the result to 1-D.
# np.asarray works whether the sparse sum comes back as np.matrix or as a plain ndarray.
doc_lengths = np.asarray(X.sum(axis=1)).ravel()

# set top_n to observe top n sum/avg features
top_n = 25

# get feature names; get_feature_names() was removed in scikit-learn 1.2,
# so use the newer accessor when it exists
if hasattr(tf_vec, 'get_feature_names_out'):
    features = np.asarray(tf_vec.get_feature_names_out())
else:
    features = np.array(tf_vec.get_feature_names())

# get feature sums: axis=0 sums down the rows (one total per vocabulary term)
feature_sums = np.asarray(X.sum(axis=0)).ravel()

# sort the term totals once (ascending) and slice both ends of that ordering
sorted_sum_indices = np.argsort(feature_sums)

# get top n common features:
    # [::-1] reverses the sorted indices to order by DESC
    # [:top_n] grabs the first n of them
top_n_sum_indices = sorted_sum_indices[::-1][:top_n]
top_n_sums = feature_sums[top_n_sum_indices]
top_n_sums_features = features[top_n_sum_indices]

# get top n rare features
bottom_n_sum_indices = sorted_sum_indices[:top_n]
bottom_n_sums = feature_sums[bottom_n_sum_indices]
bottom_n_sums_features = features[bottom_n_sum_indices]

# ---- plots ----

# plot document length distributions: full range on the left, clipped at the 95th percentile on the right
fig, (ax, ax1) = plt.subplots(1,2, figsize=(14,8))
sns.boxplot(y=doc_lengths, ax=ax)
ax.set_title('Document Length Distributions')
sns.boxplot(y=doc_lengths, ax=ax1)

ax1.set_title('Document Length Distributions Zoomed')
ax1.set_ylim(top=np.percentile(doc_lengths,95), bottom=-5)
plt.show()

# plot common and rare feature counts
fig2, (ax2, ax3) = plt.subplots(1,2, figsize=(14,8))
sns.barplot(x=top_n_sums, y=top_n_sums_features, palette=SNS_PALETTE, ax=ax2)
ax2.set_title('%d most common features' % top_n)

sns.barplot(x=bottom_n_sums, y=bottom_n_sums_features, palette=SNS_PALETTE, ax=ax3)
ax3.set_title('%d least common features' % top_n)
plt.show()

# plot word cloud of most common features
token_freqs = dict(zip(top_n_sums_features, top_n_sums))
wc_generator = WordCloud(background_color='whitesmoke',colormap='winter_r')
wc = wc_generator.generate_from_frequencies(token_freqs)
plt.figure(figsize=(10,5))
plt.axis('off')
plt.imshow(wc, interpolation='bilinear')
plt.show()

## Do we come to any new observations?
* **short vs. medium vs. long documents**
    * Is some actor on some big monologue? Did someone's cat get too lonely (sitting on the keyboard)? Are there multiple addendums?
    * This could drive questions such as:
        * "Where does this garbage come from??"
        * "How should we treat addendums? Should they be filtered out, kept, or demand our immediate attention?" 
        * "Should we only focus on documents of a certain size? Is there initial insight to document length as a feature for future modeling?"
* **common vs rare terms**
    * Are these terms expected?
    * Do any of the terms seem weird to exist?

## Observe Outliers

In [None]:
# grab the documents in the top quartile by length, i.e. those longer than the
# 75th-percentile document length (the threshold is truncated to an int)
doc_len_thresh = int(np.percentile(doc_lengths, 75))
# boolean mask over rows; relies on doc_lengths having one entry per row of df, in order
fourth_quartile_data = df[doc_lengths > doc_len_thresh]
print('Number of documents with lengths greater than {}: {}'.format(doc_len_thresh, fourth_quartile_data.shape[0]))

In [None]:
# Interactive reader over the long documents: every click on "Next" shows one
# randomly chosen row of `fourth_quartile_data`.
index = -1  # row position of the document currently shown; -1 means nothing shown yet
dataset_size = fourth_quartile_data.shape[0]

# text fields
index_t = widgets.Text(description='Index:', value=str(index))
text_ta = widgets.Textarea(description='Text:', layout=widgets.Layout(width='100%', height='200px'))

# buttons
next_b = widgets.Button(description='Next')

def clear_fields():
    """Blank the text area before the next document is loaded."""
    text_ta.value = ''

def next_doc(b):
    """Button callback: show a uniformly random document from `fourth_quartile_data`.

    Parameters
    ----------
    b : the widget that triggered the callback (unused).
    """
    global index

    # clear text fields
    clear_fields()

    n_rows = fourth_quartile_data.shape[0]
    if n_rows == 0:
        # nothing to sample from
        return

    # np.random.randint excludes its upper bound, so pass the row count itself;
    # using n_rows - 1 would never show the last document and fails on a one-row frame
    index = np.random.randint(0, n_rows)

    # update text areas
    index_t.value = str(index)
    text_ta.value = fourth_quartile_data.iloc[index][TXT_COL]

next_b.on_click(next_doc)

display(index_t)
display(text_ta)
display(next_b)

## Topic Modeling

### LDA Priors:
**A lower prior _(alpha and/or beta)_ causes further sparsity in the posterior distributions, forcing:**
* topics to be more exclusive to documents (**alpha**)
* words to be more exclusive to topics (**beta**)

### alpha:
**document-to-topic sparsity ("how few topics to allow in a document")**
* **When observing the dataset, how many topics do you come across per document?**
    * Is the document something like a tweet? If so, there might only be one topic discussed in that document. This would suggest a lower, sparser **alpha** to force the topic suggestion to one topic per document.
    * Is the document something like a news article? There might be more topics discussed within. This would suggest a higher, denser **alpha** to force the topic suggestion to multiple topics per document.
    * In Gensim's LdaModel, set parameter **alpha='auto'** to start. Observe the topic assignment to documents, check the lda_model.alpha approximations, and update accordingly.


### beta: 
**topic-to-word sparsity ("how few words to allow in a topic")**
* **When observing the dataset, how many words do you come across that you think should belong to a topic?**
    * Is the corpus vocabulary small? If so, there might be more uniqueness in the words. This would suggest a lower, sparser **beta** to force the words to participate in fewer topics.
    * Is the corpus vocabulary large? There might be more general terms that would overlap in certain topics. This would suggest a higher, denser **beta** to force the words to participate in more topics.
    * In Gensim's LdaModel, set parameter **eta='auto'** to start. Observe the vocab overlap, check the lda_model.eta approximations, and update accordingly.

**How to Choose Number of Topics?** 
* [Coherence](http://qpleple.com/topic-coherence-to-evaluate-topic-models/)

**Helpful Resources**
* [Meet the Maker: David Blei](http://videolectures.net/mlss09uk_blei_tm/)
* [What is Topic Coherence?](https://rare-technologies.com/what-is-topic-coherence/)
* [Gensim's LDA Model](https://radimrehurek.com/gensim/models/ldamodel.html)
* [Gensim's Coherence Model](https://radimrehurek.com/gensim/models/coherencemodel.html)
* [Explanation of Topic Coherence](http://qpleple.com/topic-coherence-to-evaluate-topic-models/)
* [Step-by-step for pure Gensim Topic Modeling sans scikit-learn](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)
* [Guided LDA](https://github.com/vi3k6i5/GuidedLDA)

In [None]:
# convert scipy sparse matrix to gensim sparse matrix
# (documents_columns=False because the rows of X are the documents)
if not sparse.issparse(X):
    # fail here with a clear message instead of a NameError on gen_X further down
    raise TypeError('X must be a scipy sparse document x term matrix')
gen_X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)

# create variables that gensim requires
word2id = tf_vec.vocabulary_
id2word = dict((v,k) for k,v in word2id.items())
gensim_dict = Dictionary.from_corpus(gen_X, id2word=id2word)

# Train one LDA model per candidate topic count, score each with UMass coherence,
# and remember the candidate with the highest mean coherence.
best_lda = None
min_cvs = []
max_cvs = []
mean_cvs = []
num_topics_list = list(range(2,60,8))
for num_topics in num_topics_list:
    t0 = time()
    lda_model = LdaModel(corpus=gen_X, num_topics=num_topics, id2word=id2word,
                         alpha='auto', eta='auto',
                         iterations=300, eval_every=None, random_state=13)
    t1 = time()  # only the training time is reported, not the coherence computation
    cm = CoherenceModel(model=lda_model, corpus=gen_X, coherence='u_mass')
    cv_per_topic = cm.get_coherence_per_topic()
    mean_cv = np.mean(cv_per_topic)
    mean_cvs.append(mean_cv)
    min_cvs.append(np.min(cv_per_topic))
    max_cvs.append(np.max(cv_per_topic))
    print("Mean UMASS Coherence={} with num_topics={} in {} sec".format(mean_cv, num_topics, (t1 - t0)))
    # one code path for the first and the later candidates, so the record always carries
    # the same keys (it used to be stored under 'cv_per_topics' on the first iteration only)
    if best_lda is None or mean_cv > best_lda['mean_cv']:
        best_lda = {'mean_cv':mean_cv, 'cv_per_topic':cv_per_topic, 'num_topics':num_topics}

# plot CVs: one long-format frame per metric, stacked so seaborn can color by 'Metric'
plt.figure(figsize=(10,6))
max_df = pd.DataFrame({'Coherence':max_cvs, 'Number of Topics':num_topics_list, 'Metric':['Max Coherence']*len(max_cvs)})
min_df = pd.DataFrame({'Coherence':min_cvs, 'Number of Topics':num_topics_list, 'Metric':['Min Coherence']*len(min_cvs)})
mean_df = pd.DataFrame({'Coherence':mean_cvs, 'Number of Topics':num_topics_list, 'Metric':['Mean Coherence']*len(mean_cvs)})
coherence_df = pd.concat([max_df, min_df, mean_df])
sns.pointplot(x='Number of Topics', y='Coherence', hue='Metric', data=coherence_df, palette=SNS_PALETTE, alpha=0.5)
plt.title('Max/Min/Mean Coherence Scores per Number of Topics')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Value')
plt.show()

In [None]:
# Refit LDA using the topic count that scored the best mean coherence in the sweep.
num_topics = best_lda['num_topics']
lda_model = LdaModel(
    corpus=gen_X,
    id2word=id2word,
    num_topics=num_topics,
    alpha='auto',
    eta='auto',
    iterations=300,
    eval_every=None,
    random_state=13,
)

# topic assignments (with probabilities) for every document of the corpus
docs_topics = lda_model.get_document_topics(gen_X)

# topic probabilities per word
topics_terms = lda_model.get_topics()

# cell output: the 10 highest-weighted words of every topic
lda_model.print_topics(num_topics=num_topics, num_words=10)

In [None]:
# inspect the alpha prior the model ended up with (it was created with alpha='auto')
lda_model.alpha

In [None]:
# inspect the eta prior the model ended up with (it was created with eta='auto')
lda_model.eta

In [None]:
# collapse the eta values into a single average for a quick read
np.mean(lda_model.eta)

## Organize Texts and Topics into DataFrame

In [None]:
def format_topics_sentences(lda_model, corpus, texts):
    """Build a per-document table of dominant topic, its weight, its keywords and the text.

    Parameters
    ----------
    lda_model : topic model. Indexing it with `corpus` must yield, for each document,
        a sequence of (topic_id, probability) pairs, and `show_topic(topic_id)` must
        return (word, probability) pairs.
    corpus : the documents, in whatever representation `lda_model` accepts.
    texts : the original texts, in the same order as `corpus`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dominant_Topic', 'Perc_Contribution' (rounded to 4 decimals) and
        'Topic_Keywords', followed by an unnamed column (label 0) holding `texts`.
        A document for which the model reports no topic gets NaN / NaN / ''
        so that the rows stay aligned with `texts`.
    """
    keywords_by_topic = {}  # topic id -> keyword string, built once per topic
    records = []

    # Collect one record per document and build the frame once at the end:
    # appending to a DataFrame inside the loop is quadratic, and DataFrame.append
    # no longer exists in pandas >= 2.0.
    for doc_topics in lda_model[corpus]:
        doc_topics = list(doc_topics)
        if not doc_topics:
            records.append((np.nan, np.nan, ''))
            continue

        # dominant topic = highest probability (the first one wins on ties)
        topic_num, prob_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ', '.join(
                word for word, _ in lda_model.show_topic(topic_num))
        records.append((topic_num, round(prob_topic, 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# one row per document: dominant topic, its weight, the topic keywords and the text
df_topic_sents_keywords = format_topics_sentences(lda_model=lda_model, corpus=gen_X, texts=df[TXT_COL].values)

# Format
# reset_index() turns the row index into the first column, which becomes 'Document_No' below
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

## Visualize Topics via pyLDAvis
**Other LDA Visualization Tools:**
* [Stanford's Termite](http://vis.stanford.edu/papers/termite)

In [None]:
# build the pyLDAvis view from the fitted model, the gensim corpus and its dictionary;
# leaving lda_vis as the last expression renders it as the cell output
lda_vis = pyLDAvis.gensim.prepare(lda_model, gen_X, gensim_dict)
lda_vis

## Visualizations with t-SNE
* t-SNE reduces the dimensionality of our data while preserving the structure of local neighborhoods. 
    * This gives us an _idea_ of how our data groups together.
* [Notes from the Maker: Laurens van der Maaten](https://lvdmaaten.github.io/tsne) **<-- Seriously, his FAQ tips are pretty helpful.**
* [Famous Distill Article on Interpreting t-SNE](https://distill.pub/2016/misread-tsne/)

**Scalable t-SNE in python:**
* [FIt-SNE](https://github.com/KlugerLab/FIt-SNE)
* [LargeVis](https://github.com/lferry007/LargeVis)

**WARNING: t-SNE TAKES A WHILE**
* Sample of 10000 can take from 10-20 minutes.

In [None]:
%%time

# set perplexity (nearest neighbors)
perplexity = 30

# largest number of documents to embed; bigger corpora are subsampled to keep the runtime bearable
max_tsne_samples = 10000

# snag a sample if conditions are crazy
if X.shape[0] > max_tsne_samples:
    print('Dataset size is big and I don\'t want to wait forever. Grabbing random sample.\n')
    # Draw WITHOUT replacement so no document is embedded twice (randint could repeat rows),
    # and seed the draw so the sample, and therefore the plots below, are reproducible.
    sample_indices = np.random.RandomState(13).choice(X.shape[0], size=max_tsne_samples, replace=False)
else:
    sample_indices = np.arange(X.shape[0])

print('Fitting T-SNE on X with {} samples and {} features.\n'.format(sample_indices.shape[0],X.shape[1]))

tsne = TSNE(n_components=2, perplexity=perplexity,
            n_iter=2000, n_iter_without_progress=200,
            random_state=13, init='pca', method='barnes_hut')
# densify only the sampled rows; toarray() replaces the deprecated `.A` shorthand
tsne_X = tsne.fit_transform(X[sample_indices].toarray())

## Visualize Data via T-SNE

In [None]:
## visualize with Bokeh
# Route Bokeh output to the notebook; without this call show() writes a standalone
# HTML file instead of rendering the plot inline.
output_notebook()

# One glyph per sampled document. The hover text lists the document's in-vocabulary terms,
# read from the rows of X that were embedded (no need to vectorize the texts again).
bokeh_X = ColumnDataSource(
        data=dict(
            x =       tsne_X[:,0],
            y =       tsne_X[:,1],
            Indices = list(range(tsne_X.shape[0])),  # position within the sample, not within df
            Text =    [', '.join(ft_vec) for ft_vec in
                       tf_vec.inverse_transform(X[sample_indices])]
        )
    )

hover_tsne = HoverTool(names=['X'], tooltips=[('Text', '@Text'), ('Index Location','@Indices')])
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'box_zoom', 'reset', 'save']
plot_tsne = figure(plot_width=600, plot_height=600, tools=tools_tsne, title='t-SNE of Documents')
plot_tsne.circle('x', 'y', size=10, fill_color='blue',
                 alpha=0.5, line_width=0, source=bokeh_X, name='X')

show(plot_tsne)

## Visualize Data w/ Topics via T-SNE

In [None]:
# dominant topic id of every sampled document
topics = df_dominant_topic.iloc[sample_indices]['Dominant_Topic'].values

# Spread the topic ids over the whole colormap. Feeding raw ids to the colormap does not:
# floats >= 1 all clip to its last color and small ints pick nearly identical neighbouring
# entries. Scaling to [0, 1] first gives every topic its own color for either dtype.
topic_norm = Normalize(vmin=0, vmax=max(lda_model.num_topics - 1, 1))
topic_rgba = plt.cm.jet(topic_norm(topics.astype(float)))

## visualize with Bokeh
bokeh_X = ColumnDataSource(
        data=dict(
            x =       tsne_X[:,0],
            y =       tsne_X[:,1],
            Indices = list(range(tsne_X.shape[0])),  # position within the sample, not within df
            # RGBA floats in [0, 1] -> '#rrggbb' hex strings (alpha channel dropped)
            colors =  ['#%02x%02x%02x' % (int(r),int(g),int(b))
                       for r,g,b,_
                       in 255*topic_rgba],
            # in-vocabulary terms of each sampled document, read from the embedded rows of X
            Text =    [', '.join(ft_vec) for ft_vec in
                       tf_vec.inverse_transform(X[sample_indices])]
        )
    )

hover_tsne = HoverTool(names=['X'], tooltips=[('Text', '@Text'), ('Index Location','@Indices')])
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'box_zoom', 'reset', 'save']
plot_tsne = figure(plot_width=600, plot_height=600, tools=tools_tsne, title='t-SNE of Document with Topic Labels')
plot_tsne.circle('x', 'y', size=10, fill_color='colors',
                 alpha=0.5, line_width=0, source=bokeh_X, name='X')

show(plot_tsne)

## Conclusion
EDA doesn't solve our problem, but serves as a wonderful set of tools to help us:
* observe
* question
* research
* hypothesize
* experiment
* analyze
* conclude about the value of the data at hand.

It helps us become familiar with the data, and helps us become confident about next steps. 

And if the conclusion is to move towards modeling the data, we'll be ready!