In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [2]:
!rm -rf metastore_db/

It would be bit cleaner to use nltk to do the tokenization, but we don't have nltk installed in our cluster.

In [3]:
def tokenize(s):
    import re
    stopwords = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'])
    word_regex = '^[a-z][a-z\'-]+[a-z]$'
    s = s.lower()
    arr = s.split()
    terms = []
    for term in arr:
        if re.match(word_regex, term) != None and len(term) > 3 and term not in stopwords:
            terms.append(term)
    return terms


In [4]:
test_strings = ['the quick brown fox jumps over the brown fence.',
              'the boy paints a tall fence brown!',
              'basketball players are tall.',
              'quick basketball players jump high']

In [5]:
tokens = sc.parallelize(test_strings).map(tokenize)

In [6]:
tokens.collect()

[['quick', 'brown', 'jumps', 'brown'],
 ['paints', 'tall', 'fence'],
 ['basketball', 'players'],
 ['quick', 'basketball', 'players', 'jump', 'high']]

In [7]:
vocab = tokens.flatMap(lambda words: words).distinct()
vocab.collect()

['quick',
 'fence',
 'players',
 'jump',
 'basketball',
 'high',
 'brown',
 'jumps',
 'paints',
 'tall']

In [8]:
from collections import Counter
import numpy as np

#sc.broadcast shares an immutable object throughout the cluster
broadcastVocab = sc.broadcast(vocab.collect())

def bow_vectorize(tokens):
    word_counts = Counter(tokens)
    vector = [word_counts[v] if v in word_counts else 0 for v in broadcastVocab.value]
    return np.array(vector)

In [9]:
tokens.map(bow_vectorize).collect()

[array([1, 0, 0, 0, 0, 0, 2, 1, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0, 0, 1, 1]),
 array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0]),
 array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0])]

In [10]:
broadcastVocab.value

['quick',
 'fence',
 'players',
 'jump',
 'basketball',
 'high',
 'brown',
 'jumps',
 'paints',
 'tall']

In [11]:
term_freq = tokens.map(lambda terms: Counter(terms))

In [12]:
doc_freq = term_freq.flatMap(lambda counts: counts.keys()).map(lambda keys: (keys, 1)).reduceByKey(lambda a, b: a + b)
doc_freq.collect()

[('quick', 2),
 ('fence', 1),
 ('players', 2),
 ('jump', 1),
 ('basketball', 2),
 ('high', 1),
 ('brown', 1),
 ('jumps', 1),
 ('paints', 1),
 ('tall', 1)]

In [13]:
total_docs = term_freq.count()
total_docs

4

In [14]:
import math

idf = doc_freq.map(lambda tup: (tup[0], math.log(float(total_docs)/ (1 + tup[1])))).collect()
idf

[('quick', 0.28768207245178085),
 ('fence', 0.6931471805599453),
 ('players', 0.28768207245178085),
 ('jump', 0.6931471805599453),
 ('basketball', 0.28768207245178085),
 ('high', 0.6931471805599453),
 ('brown', 0.6931471805599453),
 ('jumps', 0.6931471805599453),
 ('paints', 0.6931471805599453),
 ('tall', 0.6931471805599453)]

In [15]:
broadcast_idf = sc.broadcast(idf)

In [16]:
def tfidf_vectorize(tokens):
    word_counts = Counter(tokens)
    doc_length = sum(word_counts.values())
    
    vector = [ word_counts.get(word[0], 0) * word[1] / float(doc_length) for word in broadcast_idf.value ]
    return np.array(vector)

In [17]:
tfidf = tokens.map(tfidf_vectorize)
tfidf.collect()

[array([ 0.07192052,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.34657359,  0.1732868 ,  0.        ,  0.        ]),
 array([ 0.        ,  0.23104906,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.23104906,  0.23104906]),
 array([ 0.        ,  0.        ,  0.14384104,  0.        ,  0.14384104,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]),
 array([ 0.05753641,  0.        ,  0.05753641,  0.13862944,  0.05753641,
         0.13862944,  0.        ,  0.        ,  0.        ,  0.        ])]

In [18]:
bow = tokens.map(bow_vectorize).cache()
bow.collect()

[array([1, 0, 0, 0, 0, 0, 2, 1, 0, 0]),
 array([0, 1, 0, 0, 0, 0, 0, 0, 1, 1]),
 array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0]),
 array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0])]

In [19]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt

In [20]:
clusters = KMeans.train(tfidf, 2, maxIterations=10, initializationMode="random")

In [21]:
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

In [22]:
WSSSE = tfidf.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

Within Set Sum of Squared Error = 0.667330168411981


In [23]:
clusters.centers

[array([ 0.0191788 ,  0.07701635,  0.06712582,  0.04620981,  0.06712582,
         0.04620981,  0.        ,  0.        ,  0.07701635,  0.07701635]),
 array([ 0.07192052,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.34657359,  0.1732868 ,  0.        ,  0.        ])]

In [24]:
top_n = 3
print([idf[idx][0] for idx in [np.argsort(x)[::-1][:top_n] for x in clusters.centers][0]])
print([idf[idx][0] for idx in [np.argsort(x)[::-1][:top_n] for x in clusters.centers][1]])

['tall', 'paints', 'fence']
['brown', 'jumps', 'quick']


In [25]:
sc.stop()