In [2]:
import numpy as np
import pandas as pd

# File name of the Hacker News posts CSV; a bare name, so it is resolved against
# the notebook's current working directory.
SOURCE_FILE = 'HN_posts_year_to_Sep_26_2016.csv'

# Load the whole CSV into a DataFrame. Later cells read its 'title' column.
hn = pd.read_csv(SOURCE_FILE)

In [3]:
import pykka

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF model over every post title in the dataset.
#   max_df=0.5  -> ignore terms that occur in more than half of the titles
#   min_df=1    -> keep terms that occur in at least one title
#   stop_words  -> drop scikit-learn's built-in English stop-word list
vectoriser = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    min_df=1,
    use_idf=True,
)
# Learn the vocabulary/IDF weights and build the sparse title-by-term matrix.
tfidf_matrix = vectoriser.fit_transform(hn['title'])


In [4]:
from Actors import Word_to_Vec

# Start W2V Actor
# Word_to_Vec is defined in the project-local Actors module. start() launches the
# actor and proxy() wraps it so other code can call it; the resulting proxy is
# handed to every Title_to_Vec resolver in a later cell.
# NOTE(review): presumably the actor loads a word2vec model on start-up, which
# may be slow -- confirm in the Actors module.
w2v = Word_to_Vec.start().proxy()


In [7]:
from Actors import Title_to_Vec
import time

# Number of Title_to_Vec resolver actors to run side by side.
POOL_SIZE = 8
# Only the first 25000 posts are vectorised to keep the run time manageable.
hn_small = hn.head(25000)

# Start resolvers: each one receives the fitted TF-IDF vectoriser and the shared
# W2V actor proxy.
resolvers = [Title_to_Vec.start(vectoriser, w2v).proxy() for _ in range(POOL_SIZE)]
start = time.time()
# Distribute work by mapping titles to resolvers round-robin (not blocking):
# each proxy call is queued and its pending result is collected in `hosts`.
hosts = []
for i, title in enumerate(hn_small['title']):
    hosts.append(resolvers[i % len(resolvers)].resolve(title))

# Gather results (blocking) and pair every title with its vector.
# The pairs are materialised into a list: on Python 3 a bare zip() is a one-shot
# iterator, so re-running any downstream cell would silently see no data.
title_to_vectors = list(zip(hn_small['title'], pykka.get_all(hosts)))
# Elapsed wall-clock seconds; the call form of print works on Python 2 and 3.
print(time.time() - start)

98.7575619221


In [8]:
# Clean up
# Stops every actor registered with pykka, i.e. the Title_to_Vec resolvers and
# the W2V actor alike. The trailing ';' keeps the call's return value out of the
# cell output.
pykka.ActorRegistry.stop_all(); # Remember to start the W2V actor if rerunning!

In [214]:
def _is_usable_vector(vec):
    """Return True when `vec` is a finite vector that is not at the origin."""
    arr = np.asarray(vec, dtype=float)
    return bool(arr.size and np.isfinite(arr).all() and arr.any())


sample = pd.DataFrame(title_to_vectors, columns=["title", "vector"])
# Remove only the points that sit exactly at the origin (or are not finite).
# The previous test, np.sum(x) > 0, also threw away every valid vector whose
# components happen to sum to zero or less.
sample = sample[sample["vector"].map(_is_usable_vector)]
# Take a small random subset for visualising. The seed makes the subset
# reproducible, and min() avoids a ValueError when fewer than 50 rows remain.
sample = sample.sample(n=min(50, len(sample)), random_state=0)

In [215]:
from sklearn.manifold import TSNE

# Project the sampled title vectors into three dimensions with Barnes-Hut t-SNE.
# random_state pins the embedding so that re-running the notebook gives the same
# layout.
tnse_model = TSNE(n_components=3, method="barnes_hut", random_state=0)
# Column 1 (by position) holds the title vectors; .iloc replaces the deprecated
# .ix indexer. NOTE: despite its name, X_2d has three columns (n_components=3);
# the name is kept because later cells refer to it.
X_2d = tnse_model.fit_transform(sample.iloc[:, 1].tolist())

In [209]:
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.graph_objs import *
import colorlover as cl

init_notebook_mode(connected=True) # inject plotly.js into the notebook

# 3-D scatter of the t-SNE embedding; each marker carries its post title as
# hover text.
trace = Scatter3d(
    x = X_2d[:, 0],
    y = X_2d[:, 1],
    z = X_2d[:, 2],
    mode = "markers",
    marker=dict(
        colorscale='Viridis',
        size=2,
        symbol='circle',
        line=dict(
            width=1
        ),
        opacity=0.8
    ),
    # Column 0 (by position) holds the titles; .iloc replaces the deprecated
    # .ix indexer.
    text = sample.iloc[:, 0].tolist()
    )

iplot({
        "data": [trace],
        # The plotted rows are a random sample of titles, not "the first 500
        # posts", so the title reports the real number of points shown.
        "layout": Layout(title="HN (%d sampled post titles)" % len(sample))
    })

In [216]:
from sklearn.cluster import KMeans
# Number of clusters to look for in the embedded title coordinates.
CLUSTERS = 10
# Cluster the t-SNE coordinates. random_state pins the centroid initialisation so
# that cluster labels (and therefore the plot colours below) are stable between
# runs.
# NOTE(review): n_jobs was removed from KMeans in newer scikit-learn releases;
# drop the argument when upgrading.
kmeans = KMeans(n_clusters=CLUSTERS, n_jobs=-1, max_iter=1000, random_state=0).fit(X_2d)


In [226]:

# Same 3-D scatter as before, now coloured by the KMeans cluster label of each
# point.
trace = Scatter3d(
    x = X_2d[:, 0],
    y = X_2d[:, 1],
    z = X_2d[:, 2],
    mode = "markers",
    marker=dict(
        # Labels are scaled by 10 before being mapped onto the Viridis scale.
        color = np.multiply(kmeans.labels_, 10),
        colorscale='Viridis',
        size=2,
        symbol='circle',
        line=dict(
            width=1
        ),
        opacity=0.8
    ),
    # Column 0 (by position) holds the titles; .iloc replaces the deprecated
    # .ix indexer.
    text = sample.iloc[:, 0].tolist()
    )

iplot({
        "data": [trace],
        # The plotted rows are a random sample of titles, not "the first 500
        # posts", so the title reports the real number of points shown.
        "layout": Layout(title="HN (%d sampled post titles)" % len(sample))
    })

In [212]:
# Display the fitted cluster centres: one row per cluster, one column per
# embedding dimension.
# NOTE(review): this cell's execution count (212) is lower than that of the cell
# that fits `kmeans` (216), so the saved output may come from an earlier fit --
# re-run this cell to refresh it.
kmeans.cluster_centers_

array([[  7.06930286e-05,   9.49362232e-05,   3.00939439e-05],
       [  9.02704703e-06,  -5.68670344e-05,  -7.75417243e-05],
       [ -4.28169298e-05,  -1.25836709e-04,   5.38268972e-05],
       [ -1.74900398e-06,  -1.03101656e-05,   8.84626163e-05],
       [ -1.90572884e-05,   1.02346515e-04,  -5.76309377e-05],
       [ -1.19494021e-04,  -5.22498328e-05,  -9.00319393e-05],
       [ -1.45428620e-06,  -1.76019587e-04,   6.82475079e-06],
       [  6.82050102e-05,  -4.47791770e-05,  -1.54758472e-05],
       [ -5.66788139e-05,  -8.95585821e-05,  -9.04722909e-05],
       [  1.26422298e-05,   5.39783489e-05,   1.09040572e-04]])

In [229]:
import random 
# Build two plotly traces per cluster: the member points (scatter3d) and a
# translucent hull around them (mesh3d), both in the same colour.
data = []
clusters = []
annotations = []  # kept for compatibility; no per-cluster annotations are built
# Dedicated, seeded generator so the cluster colours are the same on every run.
colour_rng = random.Random(0)
for i in range(CLUSTERS):
    name = "Cluster %d" % i
    colour = "#%06x" % colour_rng.randint(0, 0xFFFFFF)
    # Boolean mask of the points assigned to cluster i. This replaces the
    # np.in1d / reshape / np.where / double-index chain with the direct idiom.
    member_mask = kmeans.labels_ == i
    x = X_2d[member_mask, 0]
    y = X_2d[member_mask, 1]
    z = X_2d[member_mask, 2]
    # Titles of the member points, looked up by row position in `sample`.
    text = sample.iloc[np.flatnonzero(member_mask), 0].values

    trace = dict(
        name = name,
        x = x, y = y, z = z,
        text = text,
        type = "scatter3d",
        mode = 'markers',
        marker = dict( size=2, line=dict(width=0), color=colour )
    )
    data.append( trace )

    cluster = dict(
        color=colour,
        opacity = 0.3,
        type = "mesh3d",
        x = x, y = y, z = z,
        name = name,
        alphahull=7
    )
    clusters.append( cluster )
    
# Appearance shared by the x, y and z axes: white grid and zero lines drawn on a
# light grey background panel.
axis_style = dict(
    gridcolor='rgb(255, 255, 255)',
    zerolinecolor='rgb(255, 255, 255)',
    showbackground=True,
    backgroundcolor='rgb(230, 230,230)',
)

# Every axis gets its own copy of the style so the three dicts stay independent.
scene = {axis: dict(axis_style) for axis in ("xaxis", "yaxis", "zaxis")}
# Fixed aspect ratio: z is squashed to 70% of x and y.
scene["aspectratio"] = dict(x=1, y=1, z=0.7)
scene["aspectmode"] = 'manual'

# Figure layout; no annotations are attached.
layout = dict(title="HN Post Titles", scene=scene)

# Assemble the figure: the point traces come first, then the cluster hull meshes.
fig = {"data": data + clusters, "layout": layout}
# Render inline in the notebook, with plotly's figure validation switched off.
iplot(fig, validate=False, filename='pandas-3d-scatter-hn-titles')