In [1]:
import os
# Move the working directory up one level (presumably to the project root so that the
# project-local `util` / `core` packages imported later resolve — confirm).
# NOTE(review): the path is relative, so every re-run of this cell climbs one more level.
os.chdir('..')
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from time import time
import torch

In [3]:
import lda

In [4]:
from util.plot_embedding import plot_embedding, plot_embedding_subplot

In [5]:
from sklearn.datasets import fetch_20newsgroups

# Parts of each post to strip, so that only the message body remains.
remove = ('headers', 'footers', 'quotes')

# Training split.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=remove,
)
# Held-out test split, stripped the same way.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    remove=remove,
)

In [6]:
# Number of documents in each split (train first, then test).
for split in (newsgroups_train, newsgroups_test):
    print(len(split.target))

11314
7532


In [7]:
# Clean every training post: lower-case it, split on whitespace and keep only
# the tokens that consist entirely of alphabetic characters.
news = [
    ' '.join(token for token in post.lower().split() if token.isalpha())
    for post in newsgroups_train.data
]
# Class id of every training post, as a list aligned with `news`.
labels = list(newsgroups_train.target)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

n_topics = 20  # number of LDA topics
n_iter = 500   # number of training iterations

# Bag-of-words counts: drop English stop words and any term that appears in
# fewer than 5 documents (min_df is a document frequency, not a raw count).
cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = cvectorizer.fit_transform(news)

# Train the LDA model and get the document-topic matrix (one row per document).
# The seed is fixed, like the t-SNE cells, so that topic indices, the keyword
# summaries and the pickled artefacts are reproducible across kernel restarts.
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=0)
X_topics = lda_model.fit_transform(cvz)

INFO:lda:n_documents: 11314
INFO:lda:vocab_size: 12176
INFO:lda:n_words: 616447
INFO:lda:n_topics: 20
INFO:lda:n_iter: 500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -7817484
INFO:lda:<10> log likelihood: -5769535
INFO:lda:<20> log likelihood: -5509527
INFO:lda:<30> log likelihood: -5417295
INFO:lda:<40> log likelihood: -5370800
INFO:lda:<50> log likelihood: -5340037
INFO:lda:<60> log likelihood: -5316506
INFO:lda:<70> log likelihood: -5302747
INFO:lda:<80> log likelihood: -5289565
INFO:lda:<90> log likelihood: -5279488
INFO:lda:<100> log likelihood: -5274963
INFO:lda:<110> log likelihood: -5269197
INFO:lda:<120> log likelihood: -5265240
INFO:lda:<130> log likelihood: -5260573
INFO:lda:<140> log likelihood: -5257788
INFO:lda:<150> log likelihood: -5255488
INFO:lda:<160> log likelihood: -5252418
INFO:lda:<170> log likelihood: -5249118
INFO:lda:<180> log likelihood: -5248181
INFO:lda:<190> log likelihood: -5245181
INFO:lda:<200> log likelihood: -

In [9]:
# Sanity check: the document-topic matrix and the label list should have the same length.
print(X_topics.shape, len(labels), sep='\n')

(11314, 20)
11314


In [10]:
threshold = 0.5
# Boolean mask over documents: True where the largest topic weight exceeds the threshold.
_idx = X_topics.max(axis=1) > threshold
# Keep only the masked documents and their labels. Both names are overwritten,
# so this cell must not be re-run without re-creating X_topics/labels first.
X_topics = X_topics[_idx]
labels = [label for pos, label in enumerate(labels) if _idx[pos]]

In [11]:
# First 200 characters of every cleaned training post that passed the threshold mask.
news_snippets = [
    text[:200]
    for pos, text in enumerate(news)
    if _idx[pos]
]

In [12]:
# Newsgroup name for every retained training document, looked up by class id.
topics = [
    newsgroups_train.target_names[class_id]
    for class_id in labels
]

In [13]:
# After filtering, the matrix, labels and snippets should all have the same length.
for size in (X_topics.shape, len(labels), len(news_snippets)):
    print(size)

(3944, 20)
3944
3944


In [14]:
from sklearn import manifold

# Project the filtered training document-topic vectors to 2-D with t-SNE (seeded).
tsne_model = manifold.TSNE(
    n_components=2,
    init='pca',
    angle=.99,
    random_state=0,
    verbose=1,
)
t0 = time()
tsne_lda = tsne_model.fit_transform(X_topics)
# Wall-clock duration of the fit.
print('sklearn took {} seconds'.format(time() - t0))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 3944 samples in 0.003s...
[t-SNE] Computed neighbors for 3944 samples in 0.555s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3944
[t-SNE] Computed conditional probabilities for sample 2000 / 3944
[t-SNE] Computed conditional probabilities for sample 3000 / 3944
[t-SNE] Computed conditional probabilities for sample 3944 / 3944
[t-SNE] Mean sigma: 0.083223
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.157070
[t-SNE] Error after 1000 iterations: 0.760193
sklearn took 60.41336703300476 seconds


In [15]:
import bokeh.plotting as bp
from bokeh.plotting import save, show
from bokeh.models import HoverTool
from bokeh.io import output_notebook

n_top_words = 5 # number of top keywords used to summarise each topic

# Palette of 20 hex colours, indexed by an integer id in [0, 20) when colouring points.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

In [16]:
# Configure Bokeh to render figures inline in the notebook.
output_notebook()

In [17]:
# Dominant (highest-weight) LDA topic index for every retained training document.
train_lda_keys = list(X_topics.argmax(axis=1))

topic_word = lda_model.topic_word_  # per-topic word distributions

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement when it exists and fall back to the old name on older releases.
if hasattr(cvectorizer, 'get_feature_names_out'):
    vocab = cvectorizer.get_feature_names_out()
else:
    vocab = cvectorizer.get_feature_names()
vocab_array = np.array(vocab)  # built once instead of once per topic

# One space-separated string of the n_top_words most probable words per topic,
# most probable first.
topic_summaries = []
for topic_dist in topic_word:
    top_word_ids = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_summaries.append(' '.join(vocab_array[top_word_ids]))

In [37]:
title = '20 newsgroups LDA viz'

# Interactive scatter plot of the 2-D t-SNE embedding of the training documents.
plot_lda = bp.figure(plot_width=900, plot_height=700,
                     title=title,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# One entry per retained training document; all columns must have the same length.
# NOTE(review): points are coloured by true newsgroup label here, whereas the
# test-set figure colours by dominant LDA topic — confirm which is intended.
data_dict = {'content': news_snippets,
             'topic_key': topics,
             'x': tsne_lda[:, 0],
             'y': tsne_lda[:, 1],
             'color': colormap[labels]}

mySource = bp.ColumnDataSource(data_dict)

plot_lda.circle(x='x', y='y', color='color', source=mySource)

# Use the centroid of each LDA cluster as the position of its keyword label.
# A topic with no documents above the threshold has no centroid; it stays NaN
# instead of triggering a mean-of-empty-slice warning.
train_lda_labels = np.array(train_lda_keys)
topic_coord = np.full((X_topics.shape[1], 2), np.nan)
for i in range(X_topics.shape[1]):
    mask = np.flatnonzero(train_lda_labels == i)
    if mask.size:
        topic_coord[i] = np.average(tsne_lda[mask], axis=0)

# Plot the keyword summary of every topic that has at least one document.
for i in range(X_topics.shape[1]):
    if np.isnan(topic_coord[i]).any():
        continue  # empty topic: nothing to anchor the text to
    plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]], text_font_size='10px')

# Hover tooltip: article snippet plus its newsgroup name.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
#save(plot_lda, '{}.html'.format(title))
show(plot_lda)



In [19]:
# Newsgroup names; position i is the name used for class id i.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [20]:
# Top keywords per LDA topic. Note that the topic indices have no direct
# mapping to the newsgroup class ids.
topic_summaries

['israel israeli people rights state',
 'just like think good really',
 'know like does just read',
 'armenian turkish armenians people war',
 'space nasa launch satellite lunar',
 'key encryption government chip use',
 'god jesus bible christian believe',
 'drive card disk use dos',
 'does people think just believe',
 'said went know just people',
 'president going people think money',
 'window db use using windows',
 'team game hockey games play',
 'available software information ftp anonymous',
 'file entry number program use',
 'car good new like just',
 'use power used water current',
 'gun law right police state',
 'medical use health number patients',
 'university new information april san']

# Test set

In [21]:
# Clean the test posts exactly like the training posts: lower-case, split on
# whitespace, keep only purely alphabetic tokens.
test_news = [
    ' '.join(token for token in post.lower().split() if token.isalpha())
    for post in newsgroups_test.data
]
test_labels = list(newsgroups_test.target)

# Count terms with the vectorizer fitted on the training set (transform only, no re-fit).
test_cvz = cvectorizer.transform(test_news)

# Infer the document-topic mixture of each test post with the trained LDA model.
X_test = lda_model.transform(test_cvz, max_iter=n_iter)

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [22]:
# Sanity check: test document-topic matrix and test labels should have the same length.
print(X_test.shape, len(test_labels), sep='\n')

(7532, 20)
7532


In [23]:
threshold = 0.5
# Boolean mask over test documents: True where the largest topic weight exceeds the threshold.
# This replaces the training mask previously stored in `_idx`.
_idx = X_test.max(axis=1) > threshold
# Keep only the masked test documents and their labels (both names are overwritten).
X_test = X_test[_idx]
test_labels = [label for pos, label in enumerate(test_labels) if _idx[pos]]

In [24]:
# Sizes after filtering; both should report the same number of documents.
for size in (X_test.shape, len(test_labels)):
    print(size)

(2082, 20)
2082


In [25]:
# First 200 characters of each test article that passed the threshold mask.
# NOTE: `news_snippets` and `topics` replace the training-set values of the same names.
news_snippets = [n[:200] for i, n in enumerate(test_news) if _idx[i]]
# Newsgroup name of each retained test article. This must be built from
# `test_labels`: the training `labels` list has a different length and belongs
# to different documents, so it would not line up with the test plot columns.
topics = [newsgroups_test.target_names[l] for l in test_labels]

In [26]:
# Fit a fresh, identically configured t-SNE on the filtered test document-topic vectors.
tsne_model = manifold.TSNE(
    n_components=2,
    init='pca',
    angle=.99,
    random_state=0,
    verbose=1,
)
t0 = time()
test_tsne_lda = tsne_model.fit_transform(X_test)
# Wall-clock duration of the fit.
print('sklearn took {} seconds'.format(time() - t0))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2082 samples in 0.002s...
[t-SNE] Computed neighbors for 2082 samples in 0.153s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2082
[t-SNE] Computed conditional probabilities for sample 2000 / 2082
[t-SNE] Computed conditional probabilities for sample 2082 / 2082
[t-SNE] Mean sigma: 0.100650
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.196217
[t-SNE] Error after 1000 iterations: 0.547289
sklearn took 30.79921317100525 seconds


In [27]:
# Dominant (highest-weight) LDA topic index for every retained test document.
test_lda_keys = list(X_test.argmax(axis=1))

In [28]:
title = '20 newsgroups LDA viz'

# Interactive scatter plot of the 2-D t-SNE embedding of the test documents.
plot_lda = bp.figure(plot_width=900, plot_height=700,
                     title=title,
                     tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# One entry per retained test document; all columns must have the same length.
# Points are coloured by their dominant LDA topic.
data_dict = {'content': news_snippets,
             'topic_key': topics,
             'x': test_tsne_lda[:, 0],
             'y': test_tsne_lda[:, 1],
             'color': colormap[test_lda_keys]}

mySource = bp.ColumnDataSource(data_dict)

plot_lda.circle(x='x', y='y', color='color', source=mySource)

# Use the centroid of each LDA cluster as the position of its keyword label.
# A topic with no test documents above the threshold has no centroid; it stays
# NaN instead of triggering a mean-of-empty-slice warning.
test_lda_labels = np.array(test_lda_keys)
topic_coord = np.full((X_test.shape[1], 2), np.nan)
for i in range(X_test.shape[1]):
    mask = np.flatnonzero(test_lda_labels == i)
    if mask.size:
        topic_coord[i] = np.average(test_tsne_lda[mask], axis=0)

# Plot the keyword summary of every topic that has at least one document.
for i in range(X_test.shape[1]):
    if np.isnan(topic_coord[i]).any():
        continue  # empty topic: nothing to anchor the text to
    plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]], text_font_size='10px')

# Hover tooltip: article snippet plus its newsgroup name.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
#save(plot_lda, '{}.html'.format(title))
show(plot_lda)



In [30]:
# Number of retained test documents that received a dominant-topic index.
len(test_lda_keys)

2082

In [31]:
# Final check of the training artefacts before they are pickled.
print(X_topics.shape, len(labels), tsne_lda.shape, sep='\n')

(3944, 20)
3944
(3944, 2)


In [32]:
# Convert the training labels to a torch tensor for export.
# NOTE: `labels` changes type here (it was a Python list in the earlier cells).
labels = torch.from_numpy(np.array(labels))

In [33]:
import pickle

# Bundle the filtered training topic matrix, its labels and the t-SNE
# coordinates into one list and write it to disk.
train_bundle = [X_topics, labels, tsne_lda]
with open('20news_train_tsne.pkl', 'wb') as f:
    pickle.dump(train_bundle, f)

In [34]:
# Final check of the test artefacts before they are pickled.
for size in (X_test.shape, len(test_labels), test_tsne_lda.shape):
    print(size)

(2082, 20)
2082
(2082, 2)


In [35]:
# Convert the test labels to a torch tensor for export.
# NOTE: `test_labels` changes type here (it was a Python list in the earlier cells).
test_labels = torch.from_numpy(np.array(test_labels))

In [36]:
# Bundle the filtered test topic matrix, its labels and the t-SNE coordinates
# into one list and write it to disk.
test_bundle = [X_test, test_labels, test_tsne_lda]
with open('20news_test_tsne.pkl', 'wb') as f:
    pickle.dump(test_bundle, f)

In [38]:
# Persist the trained LDA model together with the per-topic keyword summaries
# and the newsgroup names.
metadata_bundle = [lda_model, topic_summaries, newsgroups_train.target_names]
with open('20news_metadata.pkl', 'wb') as f:
    pickle.dump(metadata_bundle, f)

In [None]:
from core.DataEmbeddingGraph import DataEmbeddingGraph

num_datasets = 100

# Dedicated seeded generator, so the sampled evaluation sets are the same on
# every run, matching the fixed random_state used for t-SNE in this notebook.
rng = np.random.RandomState(0)

# Build `num_datasets` random subsets of the filtered test documents.
all_test_data = []
for i in range(num_datasets):
    # Subset size drawn from [200, 500); indices sampled without replacement.
    num_samples = rng.randint(200, 500)
    _idx = rng.choice(len(test_labels), num_samples, replace=False)
    r_inputs = X_test[_idx]
    r_labels = test_labels[_idx]

    # Package into graph block
    # NOTE(review): the meaning of 'spectral' is defined by DataEmbeddingGraph,
    # which is not visible from this notebook.
    G = DataEmbeddingGraph(r_inputs, r_labels, 'spectral')
    all_test_data.append(G)

In [None]:
# Persist the randomly sampled test graph datasets.
# NOTE(review): the payload is a one-element list wrapping `all_test_data`
# (a list inside a list) — confirm that loaders expect this nesting.
with open('20news_lda_test_tsne.pkl', 'wb') as f:
    pickle.dump([all_test_data], f)

In [None]:
# For each of the 20 class ids, average the topic vectors of the test documents
# in that class and print the index of the strongest topic in that average.
test_label_array = np.array(test_labels)
for class_id in range(20):
    members = np.argwhere(test_label_array == class_id).flatten()
    centroid = X_test[members].mean(axis=0)
    print(centroid.argmax())

In [None]:
# For each newsgroup class, average the topic vectors of the TRAINING documents
# in that class and print the index of the strongest topic in that average.
# The mask must come from the training `labels`: rows of X_topics line up with
# `labels`, not with `test_labels`.
train_label_array = np.array(labels)
for i in range(len(newsgroups_train.target_names)):
    mask = np.argwhere(train_label_array == i).flatten()
    centroid = np.average(X_topics[mask], axis=0)
    print(np.argmax(centroid))

In [None]:
# Display the filtered test document-topic matrix.
X_test