In [1]:
import os
os.chdir('..')
os.getcwd()

'/home/leowyaoyang/fyp-graph-clustering'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from time import time
import torch

In [3]:
import lda

In [4]:
from util.plot_embedding import plot_embedding, plot_embedding_subplot

In [5]:
from sklearn.datasets import fetch_20newsgroups

# we only want to keep the body of the documents!
remove = ('headers', 'footers', 'quotes')

# fetch train and test data
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [6]:
print(len(newsgroups_train.target))
print(len(newsgroups_test.target))

11314
7532


In [7]:
# a list of cleaned news in string format
# only keep letters & make them all lower case
news = [' '.join(filter(str.isalpha, raw.lower().split())) for raw in newsgroups_train.data]
labels = list(newsgroups_train.target)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

n_topics = 20 # number of topics
n_iter = 500 # number of iterations

# vectorizer: ignore English stopwords & words that occur less than 5 times
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(news)

# train an LDA model
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)

INFO:lda:n_documents: 11314
INFO:lda:vocab_size: 12176
INFO:lda:n_words: 616447
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -7817484
INFO:lda:<10> log likelihood: -5761604
INFO:lda:<20> log likelihood: -5489959
INFO:lda:<30> log likelihood: -5400827
INFO:lda:<40> log likelihood: -5354927
INFO:lda:<50> log likelihood: -5325837
INFO:lda:<60> log likelihood: -5307682
INFO:lda:<70> log likelihood: -5293821
INFO:lda:<80> log likelihood: -5283104
INFO:lda:<90> log likelihood: -5272458
INFO:lda:<100> log likelihood: -5265636
INFO:lda:<110> log likelihood: -5260454
INFO:lda:<120> log likelihood: -5256858
INFO:lda:<130> log likelihood: -5251894
INFO:lda:<140> log likelihood: -5249783
INFO:lda:<150> log likelihood: -5245249
INFO:lda:<160> log likelihood: -5242031
INFO:lda:<170> log likelihood: -5239764
INFO:lda:<180> log likelihood: -5237793
INFO:lda:<190> log likelihood: -5235507
INFO:lda:<200> log likelihood: -

In [9]:
print(X_topics.shape)
print(len(labels))

(11314, 20)
11314


In [10]:
threshold = 0.5
_idx = np.amax(X_topics, axis=1) > threshold  # idx of doc that above the threshold
X_topics = X_topics[_idx]
labels = [l for i,l in enumerate(labels) if _idx[i]]

In [11]:
# Get the first 200 characters of each news article
news_snippets = [n[:200] for i,n in enumerate(news) if _idx[i]]

In [12]:
topics = [newsgroups_train.target_names[l] for l in labels]

In [13]:
print(X_topics.shape)
print(len(labels))
print(len(news_snippets))

(3862, 20)
3862
3862


In [14]:
from sklearn import manifold
tsne_model = manifold.TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
t0 = time()
tsne_lda = tsne_model.fit_transform(X_topics)
print('sklearn took {} seconds'.format(time()-t0))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 3862 samples in 0.004s...
[t-SNE] Computed neighbors for 3862 samples in 0.737s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3862
[t-SNE] Computed conditional probabilities for sample 2000 / 3862
[t-SNE] Computed conditional probabilities for sample 3000 / 3862
[t-SNE] Computed conditional probabilities for sample 3862 / 3862
[t-SNE] Mean sigma: 0.090505
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.786564
[t-SNE] Error after 1000 iterations: 0.709394
sklearn took 54.85776448249817 seconds


In [15]:
import bokeh.plotting as bp
from bokeh.plotting import save, show
from bokeh.models import HoverTool
from bokeh.io import output_notebook

n_top_words = 5 # number of keywords we show

# 20 colors
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

In [16]:
output_notebook()

In [17]:
_lda_keys = []
for i in range(X_topics.shape[0]):
    _lda_keys +=  X_topics[i].argmax(),
    
topic_summaries = []
topic_word = lda_model.topic_word_  # all topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
    topic_summaries.append(' '.join(topic_words)) # append!

In [18]:
title = '20 newsgroups LDA viz'
num_example = len(X_topics)

plot_lda = bp.figure(plot_width=900, plot_height=700,
                     title=title,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

data_dict = {'content': news_snippets,
             'topic_key': topics, 
             'x': tsne_lda[:, 0], 
             'y': tsne_lda[:, 1],
             'color': colormap[labels][:num_example]}

mySource = bp.ColumnDataSource(data_dict)

plot_lda.circle(x='x', y='y', color='color', source=mySource)

# Select the centroid of each LDA cluster as the coordinates to plot crucial words
lda_labels = np.array(_lda_keys)
topic_coord = np.zeros((X_topics.shape[1], 2))
for i in range(X_topics.shape[1]):
    mask = np.argwhere(lda_labels == i).flatten()
    topic_coord[i] = np.average(tsne_lda[mask], axis=0)
    
# plot crucial words
for i in range(X_topics.shape[1]):
      plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]], text_font_size='10px')

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
#save(plot_lda, '{}.html'.format(title))
show(plot_lda)

In [19]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [20]:
topic_summaries # Notice that the topic summaries don't have a direct mapping to each label

['space nasa launch satellite lunar',
 'drive disk card windows dos',
 'information university send list anonymous',
 'armenian turkish israel jews israeli',
 'price new like good sell',
 'just like think good know',
 'people does think say believe',
 'god jesus christian bible believe',
 'said went people took did',
 'db available image software version',
 'know like just does good',
 'time use power using test',
 'key encryption use chip government',
 'car bike just new used',
 'file use window program set',
 'gun law state right police',
 'president going think money make',
 'medical use health study patients',
 'la period year power second',
 'team game hockey play players']

# Test set

In [22]:
test_news = [' '.join(filter(str.isalpha, raw.lower().split())) for raw in newsgroups_test.data]
test_labels = list(newsgroups_test.target)

# vectorizer: ignore English stopwords & words that occur less than 5 times
test_cvz = cvectorizer.transform(test_news)

X_test = lda_model.transform(test_cvz, max_iter=n_iter)

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [23]:
print(X_test.shape)
print(len(test_labels))

(7532, 20)
7532


In [24]:
threshold = 0.5
_idx = np.amax(X_test, axis=1) > threshold  # idx of doc that above the threshold
X_test = X_test[_idx]
test_labels = [l for i,l in enumerate(test_labels) if _idx[i]]

In [25]:
print(X_test.shape)
print(len(test_labels))

(2009, 20)
2009


In [26]:
# Get the first 200 characters of each news article
news_snippets = [n[:200] for i,n in enumerate(test_news) if _idx[i]]
topics = [newsgroups_test.target_names[l] for l in labels]

In [27]:
tsne_model = manifold.TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
t0 = time()
test_tsne_lda = tsne_model.fit_transform(X_test)
print('sklearn took {} seconds'.format(time()-t0))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2009 samples in 0.002s...
[t-SNE] Computed neighbors for 2009 samples in 0.224s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2009
[t-SNE] Computed conditional probabilities for sample 2000 / 2009
[t-SNE] Computed conditional probabilities for sample 2009 / 2009
[t-SNE] Mean sigma: 0.105892
[t-SNE] KL divergence after 250 iterations with early exaggeration: 52.507931
[t-SNE] Error after 1000 iterations: 0.509053
sklearn took 26.584518432617188 seconds


In [28]:
_lda_keys = []
for i in range(X_test.shape[0]):
    _lda_keys +=  X_test[i].argmax(),
    
topic_summaries = []
topic_word = lda_model.topic_word_  # all topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1] # get!
    topic_summaries.append(' '.join(topic_words)) # append!

In [29]:
topic_summaries

['space nasa launch satellite lunar',
 'drive disk card windows dos',
 'information university send list anonymous',
 'armenian turkish israel jews israeli',
 'price new like good sell',
 'just like think good know',
 'people does think say believe',
 'god jesus christian bible believe',
 'said went people took did',
 'db available image software version',
 'know like just does good',
 'time use power using test',
 'key encryption use chip government',
 'car bike just new used',
 'file use window program set',
 'gun law state right police',
 'president going think money make',
 'medical use health study patients',
 'la period year power second',
 'team game hockey play players']

In [30]:
title = '20 newsgroups LDA viz'
num_example = len(X_test)

plot_lda = bp.figure(plot_width=900, plot_height=700,
                     title=title,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

data_dict = {'content': news_snippets,
             'topic_key': topics, 
             'x': test_tsne_lda[:, 0], 
             'y': test_tsne_lda[:, 1],
             'color': colormap[test_labels][:num_example]}

mySource = bp.ColumnDataSource(data_dict)

plot_lda.circle(x='x', y='y', color='color', source=mySource)

# Select the centroid of each LDA cluster as the coordinates to plot crucial words
lda_labels = np.array(_lda_keys)
topic_coord = np.zeros((X_test.shape[1], 2))
for i in range(X_test.shape[1]):
    mask = np.argwhere(lda_labels == i).flatten()
    topic_coord[i] = np.average(test_tsne_lda[mask], axis=0)
    
# plot crucial words
for i in range(X_test.shape[1]):
      plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]], text_font_size='10px')

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
#save(plot_lda, '{}.html'.format(title))
show(plot_lda)



In [None]:
len(_lda_keys)

In [None]:
print(X_topics.shape)
print(len(labels))
print(tsne_lda.shape)

In [None]:
X_topics = torch.from_numpy(X_topics)
X_topics = X_topics.type(torch.FloatTensor)
labels = torch.from_numpy(np.array(labels))

In [None]:
import pickle
with open('20news_lda_train_tsne.pkl', 'wb') as f:
    pickle.dump([X_topics, labels, tsne_lda], f)

In [None]:
print(X_test.shape)
print(len(test_labels))
print(test_tsne_lda.shape)

In [None]:
X_test = torch.from_numpy(X_test)
X_test = X_test.type(torch.FloatTensor)
test_labels = torch.from_numpy(np.array(test_labels))

In [None]:
from core.DataEmbeddingGraph import DataEmbeddingGraph

num_datasets = 100
all_test_data = []
for i in range(num_datasets):
    num_samples = np.random.randint(200, 500)
    _idx = np.random.choice(range(len(test_labels)), num_samples, replace=False)
    r_inputs = X_test[_idx]
    r_labels = test_labels[_idx]

    # Package into graph block
    G = DataEmbeddingGraph(r_inputs, r_labels, 'spectral')
    all_test_data.append(G)

In [None]:
with open('20news_lda_test_tsne.pkl', 'wb') as f:
    pickle.dump([all_test_data], f)

In [46]:
import os
parent_dir = os.path.abspath('..')
filename = parent_dir + '/data/20news_lda_test_tsne.pkl'

In [47]:
import pickle
with open(filename, 'rb') as f:
    [all_test_data] = pickle.load(f)

In [48]:
from util.evaluation_metrics import evaluate_embedding_metrics
trustworthiness, one_nn, five_nn, time_elapsed = evaluate_embedding_metrics(all_test_data, 'tsne')
print("Trust = {:.4f}, 1-NN = {:.4f}, 5-NN = {:.4f}, time to compute = {:.2f}s".format(trustworthiness, one_nn, five_nn, time_elapsed))

Trust = 0.9873, 1-NN = 0.3561, 5-NN = 0.4001, time to compute = 5.38s


In [42]:
for i in range(20):
    mask = np.argwhere(np.array(test_labels) == i).flatten()
    centroid = np.average(X_test[mask], axis=0)
    print(np.argmax(centroid))

6
9
1
1
1
14
4
13
13
18
19
12
11
17
5
7
15
3
6
7


In [43]:
for i in range(20):
    mask = np.argwhere(np.array(test_labels) == i).flatten()
    centroid = np.average(X_topics[mask], axis=0)
    print(np.argmax(centroid))

6
1
6
1
6
1
6
6
1
10
1
10
1
6
1
1
1
1
1
1


In [32]:
X_test

array([[1.00258009e-02, 6.49196323e-01, 4.12415443e-03, ...,
        1.65626769e-02, 1.97129232e-04, 2.27672881e-04],
       [5.49273325e-06, 2.34167992e-03, 2.99297278e-03, ...,
        3.12961190e-04, 1.45098564e-03, 4.29339228e-04],
       [5.22906693e-04, 5.75961152e-01, 3.26490572e-03, ...,
        1.53504326e-03, 2.90283494e-03, 3.68227191e-02],
       ...,
       [3.36489384e-03, 9.15562122e-02, 5.93954298e-01, ...,
        3.49890806e-02, 5.60624376e-03, 1.51548026e-05],
       [4.28937079e-04, 5.27654155e-01, 1.69638049e-03, ...,
        9.18740391e-04, 2.38101125e-04, 1.03941828e-04],
       [3.52803620e-03, 9.03895431e-03, 6.39381066e-04, ...,
        5.95077616e-04, 3.44087263e-04, 3.81209637e-03]])