In [1]:
import os
# Work from the parent directory so the local `util` package (imported in a
# later cell) is importable. Guarded so that re-running this cell does not
# climb one more directory level each time it is executed.
if not os.path.isdir('util'):
    os.chdir('..')
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from time import time
import lda

In [3]:
from util.plot_embedding import plot_embedding

In [4]:
from sklearn.datasets import fetch_20newsgroups

# Strip headers, footers and quoted replies so only the message body is kept.
remove = ('headers', 'footers', 'quotes')

# Fetch both splits with the same stripping options.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset_name, remove=remove)
    for subset_name in ('train', 'test')
)

In [5]:
# Number of documents in the train and test splits, one per line.
print(len(newsgroups_train.target), len(newsgroups_test.target), sep='\n')

11314
7532


In [6]:
# Cleaned training posts as plain strings: each post is lower-cased, split on
# whitespace, and only the tokens made up entirely of letters are kept
# (a token containing a digit or punctuation mark is dropped as a whole).
news = [
    ' '.join(token for token in raw.lower().split() if token.isalpha())
    for raw in newsgroups_train.data
]
# Integer newsgroup id of every post, in the same order as `news`.
labels = list(newsgroups_train.target)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

n_topics = 20  # number of LDA topics
n_iter = 500   # number of sampling iterations

# vectorizer: ignore English stop words and words that appear in fewer than
# 5 documents (min_df is a document-frequency cut-off, not a raw count)
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(news)

# train an LDA model; the fit is stochastic, so the seed is pinned to make the
# topics (and every result derived from them below) reproducible across runs
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=0)
X_topics = lda_model.fit_transform(cvz)

INFO:lda:n_documents: 11314
INFO:lda:vocab_size: 12176
INFO:lda:n_words: 616447
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -7817484
INFO:lda:<10> log likelihood: -5777165
INFO:lda:<20> log likelihood: -5522489
INFO:lda:<30> log likelihood: -5423408
INFO:lda:<40> log likelihood: -5370560
INFO:lda:<50> log likelihood: -5339491
INFO:lda:<60> log likelihood: -5317378
INFO:lda:<70> log likelihood: -5300659
INFO:lda:<80> log likelihood: -5289006
INFO:lda:<90> log likelihood: -5278211
INFO:lda:<100> log likelihood: -5270574
INFO:lda:<110> log likelihood: -5264278
INFO:lda:<120> log likelihood: -5259601
INFO:lda:<130> log likelihood: -5256433
INFO:lda:<140> log likelihood: -5253385
INFO:lda:<150> log likelihood: -5248674
INFO:lda:<160> log likelihood: -5247226
INFO:lda:<170> log likelihood: -5246047
INFO:lda:<180> log likelihood: -5243154
INFO:lda:<190> log likelihood: -5241173
INFO:lda:<200> log likelihood: -

In [8]:
# Sanity check: one row of topic weights per labelled document.
print(X_topics.shape, len(labels), sep='\n')

(11314, 20)
11314


In [9]:
threshold = 0.5
# Keep only documents whose strongest topic weight exceeds the threshold.
# The mask is always computed from the full fitted matrix and the original
# targets, never from X_topics/labels themselves: this cell overwrites those
# names, so filtering them in place would make a re-run produce a shorter
# `_idx` that no longer lines up with `news` in the next cell.
_idx = np.amax(lda_model.doc_topic_, axis=1) > threshold  # idx of doc that above the threshold
X_topics = lda_model.doc_topic_[_idx]
labels = [l for l, keep in zip(newsgroups_train.target, _idx) if keep]

In [10]:
# First 200 characters of each cleaned post, restricted to the documents
# selected by the `_idx` mask.
news_snippets = [text[:200] for position, text in enumerate(news) if _idx[position]]

In [11]:
# Human-readable newsgroup name for each retained document's integer label.
topics = [
    newsgroups_train.target_names[label_id]
    for label_id in labels
]

In [12]:
# After filtering, the three collections must still have matching lengths.
print(X_topics.shape, len(labels), len(news_snippets), sep='\n')

(4101, 20)
4101
4101


In [13]:
from sklearn import manifold
# Project the document-topic weights down to 2-D with t-SNE (fixed seed,
# PCA initialisation) and report how long the fit took.
tsne_model = manifold.TSNE(
    n_components=2,
    init='pca',
    angle=.99,
    random_state=0,
    verbose=1,
)
t0 = time()
tsne_lda = tsne_model.fit_transform(X_topics)
elapsed = time() - t0
print('sklearn took {} seconds'.format(elapsed))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 4101 samples in 0.004s...
[t-SNE] Computed neighbors for 4101 samples in 0.664s...
[t-SNE] Computed conditional probabilities for sample 1000 / 4101
[t-SNE] Computed conditional probabilities for sample 2000 / 4101
[t-SNE] Computed conditional probabilities for sample 3000 / 4101
[t-SNE] Computed conditional probabilities for sample 4000 / 4101
[t-SNE] Computed conditional probabilities for sample 4101 / 4101
[t-SNE] Mean sigma: 0.077350
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.185852
[t-SNE] Error after 1000 iterations: 0.793792
sklearn took 67.00283527374268 seconds


In [14]:
import bokeh.plotting as bp
from bokeh.plotting import save, show
from bokeh.models import HoverTool
from bokeh.io import output_notebook

# How many of the highest-weighted words are shown for each topic.
n_top_words = 5

# Palette of 20 hex colours, stored as a NumPy array so that it can be indexed
# with a whole sequence of integer ids at once.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78",
    "#2ca02c", "#98df8a", "#d62728", "#ff9896",
    "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
    "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7",
    "#bcbd22", "#dbdb8d", "#17becf", "#9edae5",
])

In [15]:
# Configure bokeh so that show() renders plots inline in the notebook output.
output_notebook()

In [16]:
# Index of the highest-weighted LDA topic for every retained document.
_lda_keys = list(X_topics.argmax(axis=1))

topic_word = lda_model.topic_word_  # all topic words

# CountVectorizer.get_feature_names() was deprecated and later removed from
# scikit-learn; use its replacement when it exists and fall back otherwise so
# the cell works on both old and new versions.
if hasattr(cvectorizer, 'get_feature_names_out'):
    vocab = list(cvectorizer.get_feature_names_out())
else:
    vocab = cvectorizer.get_feature_names()
vocab_array = np.array(vocab)  # built once instead of once per topic

# For each topic, join its n_top_words highest-weighted words (most important
# first) into a single space-separated summary string.
topic_summaries = []
for topic_dist in topic_word:
    top_word_ids = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    topic_summaries.append(' '.join(vocab_array[top_word_ids]))

In [17]:
title = '20 newsgroups LDA viz'
num_example = len(X_topics)

# NOTE(review): bokeh >= 3.0 renamed plot_width/plot_height to width/height;
# adjust these two keywords if the environment is upgraded.
# The tool is spelled 'save' here because 'previewsave' is a deprecated alias.
plot_lda = bp.figure(plot_width=900, plot_height=700,
                     title=title,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, save",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# One row per retained document: hover text, newsgroup name, 2-D t-SNE
# position, and a colour chosen by the document's newsgroup label.
data_dict = {'content': news_snippets,
             'topic_key': topics,
             'x': tsne_lda[:, 0],
             'y': tsne_lda[:, 1],
             'color': colormap[labels][:num_example]}

mySource = bp.ColumnDataSource(data_dict)

plot_lda.circle(x='x', y='y', color='color', source=mySource)

# Select the centroid of each LDA cluster as the coordinates to plot crucial words.
# A topic that is dominant for no retained document has no centroid; it stays
# NaN instead of being averaged over an empty selection (which only produces a
# RuntimeWarning and the same NaN).
lda_labels = np.array(_lda_keys)
topic_coord = np.full((X_topics.shape[1], 2), np.nan)
for i in range(X_topics.shape[1]):
    mask = np.argwhere(lda_labels == i).flatten()
    if mask.size:
        topic_coord[i] = np.average(tsne_lda[mask], axis=0)

# plot crucial words, skipping topics without a centroid
for i in range(X_topics.shape[1]):
    if np.isnan(topic_coord[i]).any():
        continue
    plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]], text_font_size='10px')

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
#save(plot_lda, '{}.html'.format(title))
show(plot_lda)

In [18]:
# Newsgroup names; the integer ids stored in `labels` index into this list.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [19]:
# Top words of each LDA topic, in topic-index order.
# Notice that the topic summaries don't have a direct mapping to each label:
# LDA topic i is not the same thing as newsgroup i.
topic_summaries

['armenian turkish israel israeli jews',
 'space nasa launch satellite data',
 'information send list mail anonymous',
 'president government people new american',
 'hockey team game new nhl',
 'god jesus bible christian church',
 'time like think probably just',
 'file entry use line program',
 'people think does believe just',
 'medical use health patients water',
 'key encryption use chip government',
 'use window db windows image',
 'drive disk hard scsi dos',
 'university information research new center',
 'gun right law state people',
 'car like new used bike',
 'use card does know like',
 'just like know think going',
 'good year team game think',
 'said went came took started']