In [1]:
import numpy as np
import pandas as pd

# Input CSV (presumably a Hacker News posts export, going by the file name).
# The bare file name is resolved relative to the notebook's working directory.
SOURCE_FILE = 'HN_posts_year_to_Sep_26_2016.csv'

# Load the full file into a DataFrame; later cells read its 'title' column.
hn = pd.read_csv(SOURCE_FILE)

In [2]:
import pykka

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model over the post titles: English stop words are removed and any
# term that appears in more than half of the titles (max_df=0.5) is ignored.
vectoriser = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    min_df=1,
    use_idf=True,
)
# Learn the vocabulary and IDF weights, and build the title/term matrix.
tfidf_matrix = vectoriser.fit_transform(hn['title'])


In [3]:
from Actors import Word_to_Vec

# Start W2V Actor
# One Word_to_Vec actor is started and wrapped in a proxy; this single proxy
# is what gets handed to the Title_to_Vec resolvers further down.
w2v = Word_to_Vec.start().proxy()


In [4]:
from Actors import Title_to_Vec
import time

POOL_SIZE = 8  # number of Title_to_Vec actors the titles are spread across
hn_small = hn.head(5000)  # only the first 5000 posts are vectorised

# Start resolvers: each one receives the fitted vectoriser and the shared W2V proxy.
resolvers = [Title_to_Vec.start(vectoriser, w2v).proxy() for _ in range(POOL_SIZE)]
start = time.time()
# Distribute work round-robin over the pool (not blocking): every resolve()
# call on a proxy hands back a pending result that is collected below.
hosts = [
    resolvers[i % len(resolvers)].resolve(title)
    for i, title in enumerate(hn_small['title'])
]

# Gather results (blocking). The pairs are materialised into a list because
# later cells both iterate over title_to_vectors and slice it; a bare zip()
# is a one-shot, non-sliceable iterator on Python 3.
title_to_vectors = list(zip(hn_small['title'], pykka.get_all(hosts)))
# Elapsed wall-clock seconds; the call form of print works on Python 2 and 3.
print(time.time() - start)

22.4271318913


In [5]:
# Clean up: stop every actor still registered with pykka, i.e. the W2V actor
# and the resolver pool started above. The trailing semicolon keeps the
# notebook from echoing the call's return value.
pykka.ActorRegistry.stop_all();

In [6]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: without a fixed seed every run yields a different
# embedding, so the plots and clusters built from it could not be reproduced.
TSNE_SEED = 0

# Embed the title vectors into three dimensions.
# NOTE: despite its name, X_2d holds 3-D coordinates (n_components=3); the
# name is kept because the later cells read it.
tnse_model = TSNE(n_components=3, method="barnes_hut", random_state=TSNE_SEED)
X_2d = tnse_model.fit_transform([t[1] for t in title_to_vectors])

In [11]:
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.graph_objs import *
import colorlover as cl

# connected=True makes the notebook load plotly.js from the CDN instead of
# embedding the library in the notebook file.
init_notebook_mode(connected=True)

PLOT_LIMIT = 500  # only the first PLOT_LIMIT embedded titles are drawn

# One marker per title, positioned by its three t-SNE coordinates.
trace = Scatter3d(
    x = X_2d[:PLOT_LIMIT, 0],
    y = X_2d[:PLOT_LIMIT, 1],
    z = X_2d[:PLOT_LIMIT, 2],
    mode = "markers",
    marker=dict(
        # No marker colour is set in this cell, so the colorscale is inert here.
        colorscale='Viridis',
        size=2,
        symbol='circle',
        line=dict(width=1),
        opacity=0.8
    ),
    # The post title is attached to each point as its text.
    text = [t[0] for t in title_to_vectors[:PLOT_LIMIT]]
)

iplot({
    "data": [trace],
    # The title is derived from PLOT_LIMIT so it cannot drift from the slice above.
    "layout": Layout(title="HN (first %d posts)" % PLOT_LIMIT)
})

In [24]:
from sklearn.cluster import KMeans

# k-means starts from random centroids; pin the seed so that the cluster
# labels (and the colours derived from them) are repeatable between runs.
KMEANS_SEED = 0

# Partition the embedded points into 50 clusters, keeping the best of 50
# initialisations.
# NOTE(review): the n_jobs argument is no longer accepted by KMeans in
# scikit-learn >= 1.0; drop it when upgrading.
kmeans = KMeans(
    n_clusters=50,
    n_init=50,
    max_iter=2000,
    n_jobs=-1,
    random_state=KMEANS_SEED,
).fit(X_2d)


In [25]:
PLOT_LIMIT = 500  # only the first PLOT_LIMIT embedded titles are drawn

# Same scatter as before, with every marker coloured by its k-means cluster.
trace = Scatter3d(
    x = X_2d[:PLOT_LIMIT, 0],
    y = X_2d[:PLOT_LIMIT, 1],
    z = X_2d[:PLOT_LIMIT, 2],
    mode = "markers",
    marker=dict(
        # kmeans.labels_ has one entry per clustered point; slice it like the
        # coordinates so the colour array lines up one-to-one with the markers
        # that are actually plotted.
        color = np.multiply(kmeans.labels_[:PLOT_LIMIT], 10),
        colorscale='Viridis',
        size=2,
        symbol='circle',
        line=dict(width=1),
        opacity=0.8
    ),
    # The post title is attached to each point as its text.
    text = [t[0] for t in title_to_vectors[:PLOT_LIMIT]]
)

iplot({
    "data": [trace],
    # The title is derived from PLOT_LIMIT so it cannot drift from the slice above.
    "layout": Layout(title="HN (first %d posts)" % PLOT_LIMIT)
})