In [1]:
import os
os.chdir('..')
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from time import time
import lda
import torch

In [86]:
from util.plot_embedding import plot_embedding, plot_embedding_subplot

In [4]:
from sklearn.datasets import fetch_20newsgroups

# we only want to keep the body of the documents!
remove = ('headers', 'footers', 'quotes')

# fetch train and test data
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)

In [5]:
print(len(newsgroups_train.target))
print(len(newsgroups_test.target))

11314
7532


In [6]:
# a list of cleaned news in string format
# only keep letters & make them all lower case
news = [' '.join(filter(str.isalpha, raw.lower().split())) for raw in newsgroups_train.data]
labels = list(newsgroups_train.target)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

n_topics = 20 # number of topics
n_iter = 500 # number of iterations

# vectorizer: ignore English stopwords & words that occur less than 5 times
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(news)

# train an LDA model
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)

INFO:lda:n_documents: 11314
INFO:lda:vocab_size: 12176
INFO:lda:n_words: 616447
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -7817484
INFO:lda:<10> log likelihood: -5772893
INFO:lda:<20> log likelihood: -5508281
INFO:lda:<30> log likelihood: -5411484
INFO:lda:<40> log likelihood: -5361775
INFO:lda:<50> log likelihood: -5329155
INFO:lda:<60> log likelihood: -5307600
INFO:lda:<70> log likelihood: -5290209
INFO:lda:<80> log likelihood: -5278155
INFO:lda:<90> log likelihood: -5267968
INFO:lda:<100> log likelihood: -5261749
INFO:lda:<110> log likelihood: -5253143
INFO:lda:<120> log likelihood: -5247496
INFO:lda:<130> log likelihood: -5242551
INFO:lda:<140> log likelihood: -5240071
INFO:lda:<150> log likelihood: -5238536
INFO:lda:<160> log likelihood: -5234051
INFO:lda:<170> log likelihood: -5234704
INFO:lda:<180> log likelihood: -5230868
INFO:lda:<190> log likelihood: -5230773
INFO:lda:<200> log likelihood: -

In [8]:
print(X_topics.shape)
print(len(labels))

(11314, 20)
11314


In [9]:
threshold = 0.5
_idx = np.amax(X_topics, axis=1) > threshold  # idx of doc that above the threshold
X_topics = X_topics[_idx]
labels = [l for i,l in enumerate(labels) if _idx[i]]

In [10]:
# Get the first 200 characters of each news article
news_snippets = [n[:200] for i,n in enumerate(news) if _idx[i]]

In [11]:
topics = [newsgroups_train.target_names[l] for l in labels]

In [12]:
print(X_topics.shape)
print(len(labels))
print(len(news_snippets))

(4109, 20)
4109
4109


In [13]:
from sklearn import manifold
tsne_model = manifold.TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
t0 = time()
tsne_lda = tsne_model.fit_transform(X_topics)
print('sklearn took {} seconds'.format(time()-t0))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 4109 samples in 0.005s...
[t-SNE] Computed neighbors for 4109 samples in 0.598s...
[t-SNE] Computed conditional probabilities for sample 1000 / 4109
[t-SNE] Computed conditional probabilities for sample 2000 / 4109
[t-SNE] Computed conditional probabilities for sample 3000 / 4109
[t-SNE] Computed conditional probabilities for sample 4000 / 4109
[t-SNE] Computed conditional probabilities for sample 4109 / 4109
[t-SNE] Mean sigma: 0.082482
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.285706
[t-SNE] Error after 1000 iterations: 0.796428
sklearn took 60.82903003692627 seconds


In [14]:
import bokeh.plotting as bp
from bokeh.plotting import save, show
from bokeh.models import HoverTool
from bokeh.io import output_notebook

n_top_words = 5 # number of keywords we show

# 20 colors
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

In [15]:
output_notebook()

In [16]:
_lda_keys = []
for i in range(X_topics.shape[0]):
    _lda_keys +=  X_topics[i].argmax(),
    
topic_summaries = []
topic_word = lda_model.topic_word_  # all topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
    topic_summaries.append(' '.join(topic_words)) # append!

In [17]:
title = '20 newsgroups LDA viz'
num_example = len(X_topics)

plot_lda = bp.figure(plot_width=900, plot_height=700,
                     title=title,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

data_dict = {'content': news_snippets,
             'topic_key': topics, 
             'x': tsne_lda[:, 0], 
             'y': tsne_lda[:, 1],
             'color': colormap[labels][:num_example]}

mySource = bp.ColumnDataSource(data_dict)

plot_lda.circle(x='x', y='y', color='color', source=mySource)

# Select the centroid of each LDA cluster as the coordinates to plot crucial words
lda_labels = np.array(_lda_keys)
topic_coord = np.zeros((X_topics.shape[1], 2))
for i in range(X_topics.shape[1]):
    mask = np.argwhere(lda_labels == i).flatten()
    topic_coord[i] = np.average(tsne_lda[mask], axis=0)
    
# plot crucial words
for i in range(X_topics.shape[1]):
      plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]], text_font_size='10px')

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
#save(plot_lda, '{}.html'.format(title))
show(plot_lda)

In [18]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [19]:
topic_summaries # Notice that the topic summaries don't have a direct mapping to each label

['space nasa launch data satellite',
 'db president think going know',
 'does use windows know card',
 'armenian turkish israel israeli jews',
 'new university american national money',
 'drive hard disk drives dos',
 'window image use available using',
 'god jesus does believe bible',
 'like just think know people',
 'medical health use patients study',
 'car just like good bike',
 'said went did people told',
 'team game play good players',
 'people think just like make',
 'used using use science time',
 'information send list mail anonymous',
 'file use program entry output',
 'gun people government right law',
 'hockey new la period nhl',
 'key encryption chip use government']

In [20]:
# Test set

In [22]:
test_news = [' '.join(filter(str.isalpha, raw.lower().split())) for raw in newsgroups_test.data]
test_labels = list(newsgroups_test.target)

# vectorizer: ignore English stopwords & words that occur less than 5 times
test_cvz = cvectorizer.transform(test_news)

# train an LDA model
X_test = lda_model.transform(test_cvz, max_iter=n_iter)

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [23]:
print(X_test.shape)
print(len(test_labels))

(7532, 20)
7532


In [24]:
threshold = 0.5
_idx = np.amax(X_test, axis=1) > threshold  # idx of doc that above the threshold
X_test = X_test[_idx]
test_labels = [l for i,l in enumerate(test_labels) if _idx[i]]

In [25]:
print(X_test.shape)
print(len(test_labels))

(2168, 20)
2168


In [27]:
# Get the first 200 characters of each news article
news_snippets = [n[:200] for i,n in enumerate(test_news) if _idx[i]]
topics = [newsgroups_test.target_names[l] for l in labels]

In [28]:
tsne_model = manifold.TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
t0 = time()
test_tsne_lda = tsne_model.fit_transform(X_test)
print('sklearn took {} seconds'.format(time()-t0))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2168 samples in 0.003s...
[t-SNE] Computed neighbors for 2168 samples in 0.196s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2168
[t-SNE] Computed conditional probabilities for sample 2000 / 2168
[t-SNE] Computed conditional probabilities for sample 2168 / 2168
[t-SNE] Mean sigma: 0.095851
[t-SNE] KL divergence after 250 iterations with early exaggeration: 55.560898
[t-SNE] Error after 1000 iterations: 0.615389
sklearn took 31.697781801223755 seconds


In [32]:
_lda_keys = []
for i in range(X_test.shape[0]):
    _lda_keys +=  X_test[i].argmax(),
    
topic_summaries = []
topic_word = lda_model.topic_word_  # all topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
    topic_summaries.append(' '.join(topic_words)) # append!

In [33]:
title = '20 newsgroups LDA viz'
num_example = len(X_test)

plot_lda = bp.figure(plot_width=900, plot_height=700,
                     title=title,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

data_dict = {'content': news_snippets,
             'topic_key': topics, 
             'x': test_tsne_lda[:, 0], 
             'y': test_tsne_lda[:, 1],
             'color': colormap[test_labels][:num_example]}

mySource = bp.ColumnDataSource(data_dict)

plot_lda.circle(x='x', y='y', color='color', source=mySource)

# Select the centroid of each LDA cluster as the coordinates to plot crucial words
lda_labels = np.array(_lda_keys)
topic_coord = np.zeros((X_test.shape[1], 2))
for i in range(X_test.shape[1]):
    mask = np.argwhere(lda_labels == i).flatten()
    topic_coord[i] = np.average(test_tsne_lda[mask], axis=0)
    
# plot crucial words
for i in range(X_test.shape[1]):
      plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]], text_font_size='10px')

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
#save(plot_lda, '{}.html'.format(title))
show(plot_lda)



In [31]:
len(_lda_keys)

4109

In [34]:
print(X_topics.shape)
print(len(labels))
print(tsne_lda.shape)

(4109, 20)
4109
(4109, 2)


In [64]:
X_topics = torch.from_numpy(X_topics)
X_topics = X_topics.type(torch.FloatTensor)
labels = torch.from_numpy(np.array(labels))

TypeError: expected np.ndarray (got Tensor)

In [74]:
import pickle
with open('20news_lda_train_tsne.pkl', 'wb') as f:
    pickle.dump([X_topics, labels, tsne_lda], f)

In [75]:
print(X_test.shape)
print(len(test_labels))
print(test_tsne_lda.shape)

torch.Size([2168, 20])
2168
(2168, 2)


In [52]:
X_test = torch.from_numpy(X_test)
X_test = X_test.type(torch.FloatTensor)
test_labels = torch.from_numpy(np.array(test_labels))

In [82]:
from core.DataEmbeddingGraph import DataEmbeddingGraph

num_datasets = 100
all_test_data = []
for i in range(num_datasets):
    num_samples = np.random.randint(200, 500)
    _idx = np.random.choice(range(len(test_labels)), num_samples, replace=False)
    r_inputs = X_test[_idx]
    r_labels = test_labels[_idx]

    # Package into graph block
    G = DataEmbeddingGraph(r_inputs, r_labels, 'spectral')
    all_test_data.append(G)

In [84]:
with open('20news_lda_test_tsne.pkl', 'wb') as f:
    pickle.dump([all_test_data], f)

In [90]:
from util.evaluation_metrics import evaluate_embedding_metrics

In [None]:
trustworthiness, one_nn, five_nn, time_elapsed = evaluate_embedding_metrics(all_test_data, 'tsne')
print("Trust = {:.4f}, 1-NN = {:.4f}, 5-NN = {:.4f}, time to compute = {:.2f}s".format(trustworthiness, one_nn, five_nn, time_elapsed))