In [1]:
import os

# Switch the working directory to the parent of the directory the kernel
# started in (presumably the project root, so that the `core` package imported
# further down resolves -- TODO confirm).
#
# The starting directory is recorded only the first time this cell runs, and
# the chdir is anchored on it.  A bare os.chdir('..') climbs one more level on
# every re-execution of the cell, silently breaking later relative paths.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
os.getcwd()

'/Users/signapoop/Desktop/fyp-graph-clustering'

In [2]:
%matplotlib inline
import pickle
import numpy as np
import matplotlib.pyplot as plt
import nltk
from sklearn.manifold import TSNE

from core.DimReduction import DimReduction

In [3]:
import bokeh.plotting as bp
from bokeh.plotting import save, show
from bokeh.models.glyphs import Text
from bokeh.models import HoverTool
from bokeh.io import output_notebook
from bokeh.transform import linear_cmap
from bokeh.palettes import Spectral6

# Configure bokeh so that the show(...) calls below render inline in the
# notebook output area.
output_notebook()

In [4]:
# Stop words used by the loading cell below to filter vocabulary entries.
# NOTE(review): words() is called without a language argument, which per the
# NLTK corpus reader presumably pools the stop-word lists of every bundled
# language -- confirm this is intended rather than words('english').
STOP_WORDS = nltk.corpus.stopwords.words()

In [5]:
# Load up to `max_samples` word vectors from the .vec text file.
# The words in the file are arranged in descending order of popularity, so the
# first rows that pass the filter are the most frequent words.
filename = '/Users/signapoop/Desktop/data/wiki-news-300d-1M.vec'
max_samples = 4000
n_dim = 300
labels = []
i = 0  # number of rows of X filled so far
X = np.zeros((max_samples, n_dim))

# Build a set once so each membership test below is O(1), regardless of how
# STOP_WORDS itself is stored.
stop_words = set(STOP_WORDS)

# The encoding is given explicitly (the .vec file is expected to be UTF-8 text)
# so decoding does not depend on the platform's default locale.
with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        if i == max_samples:
            break
        # Strip the trailing newline (and any trailing blank) before splitting
        # so the token count reflects the actual number of fields.
        all_features = line.rstrip().split(' ')
        label = all_features[0]
        # Skip rows that are not "<word> <n_dim floats>" (this also drops a
        # header line), non-alphabetic tokens, stop words and words of at most
        # two characters.
        if (len(all_features) != n_dim + 1
                or not label.isalpha()
                or label.lower() in stop_words
                or len(label) <= 2):
            continue

        labels.append(label)
        X[i] = [float(x) for x in all_features[1:]]
        i += 1

# If fewer than `max_samples` words qualified, drop the unused all-zero rows so
# that X and labels always have the same length.
X = X[:i]

In [6]:
# Quick sanity check of the feature matrix dimensions
# (rows = word slots, columns = n_dim).
X.shape

(4000, 300)

In [7]:
# Shuffle X and labels with one shared permutation so row k of X still belongs
# to labels[k].  (Shuffling X in place on its own, e.g. np.random.shuffle(X),
# would desynchronise the two.)
#
# The permutation comes from a seeded generator so the train/test split below
# is the same on every Restart & Run All; the seed matches the random_state
# used for t-SNE further down.
rng = np.random.RandomState(23)
idx = rng.permutation(len(labels))
X = X[idx]
labels = [labels[j] for j in idx]

In [8]:
# The first n_train shuffled rows form the training split; everything after
# them is held out as the test split.  Labels are sliced at the same boundary.
n_train = 3000
X_train, X_test = X[:n_train], X[n_train:]
labels_train, labels_test = labels[:n_train], labels[n_train:]

In [9]:
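# Disabled alternative, kept for reference: 2-component t-SNE through the
# project's DimReduction wrapper. The next cell calls sklearn's TSNE directly.
# dim_red = DimReduction(n_components=2)
# X_emb = dim_red.fit_transform(X_train, 'tsne')
# print(X_emb.shape)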

In [10]:
# Embed the training vectors into 3 components with t-SNE.  random_state is
# fixed so the embedding is repeatable for a given X_train.
embedder = TSNE(
    n_components=3,
    perplexity=15,
    init='pca',
    n_iter=2500,
    random_state=23,
    verbose=1,
)
X_emb = embedder.fit_transform(X_train)
print(X_emb.shape)

[t-SNE] Computing 46 nearest neighbors...
[t-SNE] Indexed 3000 samples in 0.022s...
[t-SNE] Computed neighbors for 3000 samples in 4.203s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3000
[t-SNE] Computed conditional probabilities for sample 2000 / 3000
[t-SNE] Computed conditional probabilities for sample 3000 / 3000
[t-SNE] Mean sigma: 0.565958
[t-SNE] KL divergence after 250 iterations with early exaggeration: 102.063530
[t-SNE] Error after 2500 iterations: 1.953880
(3000, 3)


In [None]:
# Restore a previously pickled [inputs, labels, embedding] triple for the
# training split.
# NOTE(review): running this cell overwrites the labels_train and X_emb
# computed by the cells above; X_ is loaded but X_train is left untouched, so
# the three may no longer correspond to each other.
# NOTE(review): the "Save data" cell writes 'fasttext_train_tsne.pkl' relative
# to the working directory, not to this absolute path -- confirm the file here
# is the intended one.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/Users/signapoop/Desktop/data/fasttext_train_tsne.pkl', 'rb') as f:
    X_, labels_train, X_emb = pickle.load(f)

In [11]:
# Restrict the plot to the first n_plot embedded training points and their labels.
n_plot = 500
X_plot, labels_plot = X_emb[:n_plot], labels_train[:n_plot]

In [12]:
# Draw each training word as a text glyph at its first two embedding
# coordinates.  When a third coordinate exists it is mapped linearly onto the
# Spectral6 palette and used as the text colour.
title = 'fast text visualisation'

plot_fig = bp.figure(
    plot_width=800,
    plot_height=600,
    title=title,
    tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)

# Columns and glyph options shared by the coloured and the plain variant.
glyph_data = dict(x=X_plot[:, 0], y=X_plot[:, 1], text=labels_plot)
glyph_style = dict(text="text", text_font_size='10px')

# Use the third dimension as the color map
if X_plot.shape[1] == 3:
    colors = X_plot[:, 2]
    mapper = linear_cmap(field_name='colors', palette=Spectral6,
                         low=min(colors), high=max(colors))
    glyph_data['colors'] = colors
    glyph_style['text_color'] = mapper

source = bp.ColumnDataSource(glyph_data)
plot_fig.text("x", "y", source=source, **glyph_style)

show(plot_fig)

## On test data

In [13]:
# Embed the test vectors with a fresh t-SNE using the same settings as for the
# training split.  It is fitted on X_test alone, so these coordinates come from
# a separate fit and are not in the same space as X_emb.
embedder = TSNE(
    n_components=3,
    perplexity=15,
    init='pca',
    n_iter=2500,
    random_state=23,
    verbose=1,
)
test_X_emb = embedder.fit_transform(X_test)
print(test_X_emb.shape)

[t-SNE] Computing 46 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.005s...
[t-SNE] Computed neighbors for 1000 samples in 0.510s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 0.549858
[t-SNE] KL divergence after 250 iterations with early exaggeration: 133.171921
[t-SNE] Error after 2500 iterations: 2.040648
(1000, 3)


In [16]:
# Restrict the plot to the first n_plot embedded test points and their labels.
n_plot = 500
test_X_plot, test_labels_plot = test_X_emb[:n_plot], labels_test[:n_plot]

In [17]:
# Draw each test word as a text glyph at its first two embedding coordinates.
# When a third coordinate exists it is mapped linearly onto the Spectral6
# palette and used as the text colour.
title = 'fast text visualisation'

plot_fig = bp.figure(
    plot_width=800,
    plot_height=600,
    title=title,
    tools="pan, wheel_zoom, box_zoom, reset, hover, previewsave",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)

# Columns and glyph options shared by the coloured and the plain variant.
glyph_data = dict(x=test_X_plot[:, 0], y=test_X_plot[:, 1], text=test_labels_plot)
glyph_style = dict(text="text", text_font_size='10px')

# Use the third dimension as the color map
if test_X_plot.shape[1] == 3:
    colors = test_X_plot[:, 2]
    mapper = linear_cmap(field_name='colors', palette=Spectral6,
                         low=min(colors), high=max(colors))
    glyph_data['colors'] = colors
    glyph_style['text_color'] = mapper

source = bp.ColumnDataSource(glyph_data)
plot_fig.text("x", "y", source=source, **glyph_style)

show(plot_fig)

## Save data

In [18]:
# Persist the training split as a pickled list in the order
# [raw vectors, labels, t-SNE embedding]; the load cell earlier in this
# notebook unpacks the same three positions.
#
# The former `labels = []` line was removed: it was never used here and it
# wiped the global `labels` list that the shuffle/split cells rely on, so
# re-running those cells after saving produced empty data.
#
# NOTE(review): the file is written relative to the current working directory.
print(X_train.shape)
inputs = X_train
with open('fasttext_train_tsne.pkl', 'wb') as f:
    pickle.dump([inputs, labels_train, X_emb], f)

(3000, 300)


In [19]:
# Persist the test split as a pickled list in the order
# [raw vectors, labels, t-SNE embedding].
#
# The former `labels = []` line was removed: it was never used here and it
# wiped the global `labels` list that the shuffle/split cells rely on, so
# re-running those cells after saving produced empty data.
#
# NOTE(review): the file is written relative to the current working directory.
print(X_test.shape)
inputs = X_test
with open('fasttext_test_tsne.pkl', 'wb') as f:
    pickle.dump([inputs, labels_test, test_X_emb], f)

(1000, 300)
