In [183]:
import time
from pprint import pprint

# Create PySpark context
from pyspark import SparkContext, SQLContext, Row
# Spark context with master 'local' and application name 'pyspark'.
sc = SparkContext(master='local', appName='pyspark')

## Load Book Data into a Spark RDD

In [184]:
# Dataset selection: the second assignment overrides the first, so the
# small sample file is the one actually loaded.
book_filename = "BX-Books.csv"
book_filename = "books-mini.csv"

book_raw_data = sc.textFile(book_filename)

# Parsed form of the first line (fields 1 and 2 with double quotes removed);
# used below to drop the header row.
book_raw_data_header = [
    field.replace("\"", "")
    for field in book_raw_data.take(1)[0].split(';')[1:3]
]

# Every line -> [field 1, field 2] with double quotes stripped.
# NOTE(review): judging by the sample output these two fields are the title
# and the author -- confirm against the CSV header.
parsed_fields = book_raw_data.map(
    lambda line: [field.replace("\"", "") for field in line.split(";")[1:3]]
)
data_rows = parsed_fields.filter(lambda fields: fields != book_raw_data_header)

# Join the two fields with one space, split the result on spaces to get a
# flat token list, then pair every token list with its position.
tokenized = data_rows.map(
    lambda fields: ("%s %s" % (fields[0], fields[1])).split(" ")
)
book_raw_data = tokenized.zipWithIndex()
book_raw_data.take(2)

[([u'Classical', u'Mythology', u'Mark', u'P.', u'O.', u'Morford'], 0),
 ([u'Clara', u'Callan', u'Richard', u'Bruce', u'Wright'], 1)]

## SQL Table for lookup

In [185]:
sqlContext = SQLContext(sc)

# Each (tokens, index) pair becomes a Row so that books can be looked up by
# index through SQL under the table name "books".
book_table = book_raw_data.map(lambda pair: Row(idx=pair[1], title=pair[0]))
schema_books = sqlContext.createDataFrame(book_table)
schema_books.registerTempTable("books")

# example: fetch a single book by its index
book = sqlContext.sql("SELECT * FROM books WHERE idx = 400 LIMIT 1")
book.collect()

[Row(idx=400, title=[u"Alice's", u'Adventures', u'in', u'Wonderland', u'and', u'Through', u'the', u'Looking', u'Glass', u'Lewis', u'Carroll'])]

In [186]:
# RDD holding only the token list of every book (first column of each row).
title_rows = sqlContext.sql("SELECT title FROM books")
titles = title_rows.rdd.map(lambda record: record[0])
titles.take(1)

[[u'Classical', u'Mythology', u'Mark', u'P.', u'O.', u'Morford']]

## Create TFIDF Model

In [187]:
from pyspark.mllib.feature import HashingTF, IDF

# Term-frequency vectors built with HashingTF's default settings.
hashingTF = HashingTF()
# Cached because the same RDD feeds both IDF.fit and idf.transform below.
tf = hashingTF.transform(titles).cache()

# Fit the inverse-document-frequency weights, then rescale the term counts.
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

## Generate LSH Model

In [188]:
from pyspark_lsh import lsh
# Parameters passed to lsh.run below:
# p : integer, larger than the largest value in data.
# m : integer, number of bins for hashing.
# n : integer, number of rows to split the signatures into.
# b : integer, number of bands. Each band will have (n / b) elements.
# c : integer, minimum allowable cluster size.
# With n = 50 and b = 10, each band holds 5 elements.
# NOTE(review): the hashed vectors in this notebook have 1048576 dimensions,
# so p = 1000 may not satisfy "larger than the largest value in data" if that
# refers to feature indices -- confirm against the pyspark_lsh documentation.
lsh_model = lsh.run(tfidf, p = 1000, m = 100, n = 50, b = 10, c = 5)

## Filter Buckets

In [189]:
# Keep only the entries of lsh_model.scores whose second element (the score)
# exceeds 20, and bring them to the driver.
high_scoring = lsh_model.scores.filter(lambda entry: entry[1] > 20)
buckets_to_check = high_scoring.collect()
buckets_to_check

[[32, 45.799693773357461],
 [4, 391.70507534087301],
 [6, 20.807461685823753],
 [8, 70.809365250332746],
 [10, 362.85477638830855],
 [16, 27.90909090909091],
 [18, 21.430256676173194],
 [20, 52.836774274343924],
 [26, 137.35868748984092],
 [30, 134.54889207369865],
 [3, 23.558127355960174],
 [35, 25.109014560318908],
 [13, 30.076869617310788],
 [17, 112.53723713897934],
 [19, 27.675226714128318],
 [23, 71.729652649726546],
 [25, 20.366054340396449],
 [27, 116.63559025570666],
 [29, 436.83751625564366],
 [39, 103.38084737407289]]

## Compare Titles within All Buckets

In [190]:
# Union-find parent map (a dict, despite the name): book index -> parent index.
global_merge_list = dict()

def parent(rep, v):
    """Return the representative (root) of ``v`` in the union-find map ``rep``.

    ``rep`` maps every element to its parent; a root maps to itself. All
    elements visited on the way to the root are re-pointed directly at the
    root (path compression), so ``rep`` is mutated in place.

    The walk is iterative so that a long parent chain cannot exceed Python's
    recursion limit. ``rep`` must describe a forest (no cycles).

    Raises:
        KeyError: if ``v`` or one of its ancestors is missing from ``rep``.
    """
    # First pass: follow parent links up to the root.
    root = v
    while rep[root] != root:
        root = rep[root]

    # Second pass: point every node on the walked path straight at the root.
    while v != root:
        next_v = rep[v]
        rep[v] = root
        v = next_v

    return root

def merge(rep, L):
    """Union the two endpoints of every edge in ``L`` inside ``rep``.

    ``rep`` is a union-find parent map (element -> parent) that is updated in
    place and also returned. Endpoints not yet present are first registered
    as their own root; afterwards the root of the second endpoint is attached
    beneath the root of the first.
    """
    for u, v in L:
        rep.setdefault(u, u)
        rep.setdefault(v, v)
        root_u = parent(rep, u)
        root_v = parent(rep, v)
        rep[root_v] = root_u
    return rep

def jaccard(a, b):
    """Return the Jaccard similarity of sets ``a`` and ``b`` as a float.

    The value is ``len(a & b) / len(a | b)``, between 0.0 (disjoint) and 1.0
    (identical). Two empty sets are treated as identical and yield 1.0
    instead of raising ZeroDivisionError.
    """
    shared = len(a.intersection(b))
    union_size = len(a) + len(b) - shared
    if union_size == 0:
        # Both sets are empty: define the similarity as 1.0.
        return 1.0
    # float() keeps this a true division under Python 2 as well.
    return float(shared) / union_size


In [191]:
# For every high-scoring bucket: fetch the books assigned to it, compare all
# pairs of token sets, and union the pairs whose Jaccard similarity exceeds
# 0.85 into the global union-find map.
for (bucket_idx, score) in buckets_to_check:
    start = time.time()

    # Entries of buckets_vectors whose first element is this bucket id; the
    # second element of each entry is used as the book index.
    members = lsh_model.buckets_vectors \
                       .filter(lambda entry: entry[0] == bucket_idx) \
                       .collect()
    book_ids = [member[1] for member in members]

    # One SQL lookup per bucket instead of one Spark job per book.
    titles_by_idx = {}
    if book_ids:
        id_list = ", ".join(str(book_id) for book_id in book_ids)
        rows = sqlContext.sql("select * from books where idx in (%s)" % id_list).collect()
        titles_by_idx = {row.idx: row.title for row in rows}

    # Keep the bucket's own ordering so that merge pairs are oriented exactly
    # as before, and build each token set once rather than once per pair.
    # A book id missing from the table raises KeyError here.
    books_in_bucket = [(book_id, set(titles_by_idx[book_id])) for book_id in book_ids]

    to_merge = []
    for pos, (gidx1, words1) in enumerate(books_in_bucket):
        for gidx2, words2 in books_in_bucket[pos + 1:]:
            if jaccard(words1, words2) > 0.85:
                to_merge.append((gidx1, gidx2))

    merge(global_merge_list, to_merge)

    # Seconds spent on this bucket.
    print(time.time() - start)

pprint(global_merge_list)

2.04501390457
4.74173903465
0.988847017288
1.82256293297
3.73498296738
1.12818312645
1.11541390419
1.54771590233
3.44589614868
3.1401848793
1.44131398201
1.11329102516
1.72583198547
2.37709593773
1.10212802887
1.84017014503
0.981561183929
2.14570498466
4.44558501244
2.47338795662
{308: 308,
 405: 308,
 500: 563,
 501: 553,
 502: 502,
 503: 567,
 504: 504,
 505: 502,
 506: 568,
 507: 567,
 508: 504,
 509: 553,
 510: 502,
 530: 530,
 540: 540,
 553: 553,
 554: 530,
 563: 563,
 567: 567,
 568: 568,
 571: 568,
 576: 576,
 591: 576,
 593: 540,
 599: 553,
 604: 568}


In [192]:
# Show one representative book per merged group: the roots of the union-find
# map are the keys that map to themselves.
# items() and print(...) behave the same under Python 2 and Python 3.
for merge1, merge2 in global_merge_list.items():
    if merge1 == merge2:
        book = sqlContext.sql("SELECT * FROM books WHERE idx = %s LIMIT 1" % merge1)
        print(book.collect())

[Row(idx=530, title=[u'Harry', u'Potter', u'and', u'the', u'Chamber', u'of', u'Secrets', u'J.', u'K.', u'Rowling'])]
[Row(idx=540, title=[u'J.', u'K.', u'Rowling:', u'The', u'Wizard', u'Behind', u'Harry', u'Potter', u'Marc', u'Shapiro'])]
[Row(idx=553, title=[u'Harry', u'Potter', u'and', u'the', u"Sorcerer's", u'Stone', u'(Book', u'1)', u'J.', u'K.', u'Rowling'])]
[Row(idx=563, title=[u'Harry', u'Potter', u'and', u'the', u"Sorcerer's", u'Stone', u'(Harry', u'Potter', u'(Paperback))', u'J.', u'K.', u'Rowling'])]
[Row(idx=308, title=[u'The', u'Perfect', u'Storm', u':', u'A', u'True', u'Story', u'of', u'Men', u'Against', u'the', u'Sea', u'Sebastian', u'Junger'])]
[Row(idx=567, title=[u'Harry', u'Potter', u'and', u'the', u'Prisoner', u'of', u'Azkaban', u'(Book', u'3)', u'J.', u'K.', u'Rowling'])]
[Row(idx=568, title=[u'Harry', u'Potter', u'and', u'the', u'Order', u'of', u'the', u'Phoenix', u'(Book', u'5)', u'J.', u'K.', u'Rowling'])]
[Row(idx=576, title=[u'The', u'Science', u'of', u'Harry'

In [181]:
# Spot-check a single book by index (521 here).
book = sqlContext.sql("SELECT * FROM books WHERE idx = %s LIMIT 1" % 521)
# print(...) with one argument behaves the same under Python 2 and Python 3.
print(book.collect())

[Row(idx=521, title=[u'Harry', u'Potter', u'y', u'el', u'c\xe1liz', u'de', u'fuego', u'J.', u'K.', u'Rowling'])]


1

----

----

## Thoughts

* Try KMeans from mllib, to avoid the pyspark_lsh dependency

----

----

## Scrap

In [37]:
# Scrap: inspect the buckets_vectors attribute of the LSH model (an RDD).
lsh_model.buckets_vectors

PythonRDD[103] at RDD at PythonRDD.scala:43

In [None]:
# Scrap: pull every entry of lsh_model.buckets to the driver.
# This cell has no execution count, i.e. it was not run in the saved notebook.
lsh_model.buckets.collect()

In [71]:
# Scrap: hash a two-token document to see what a HashingTF vector looks like.
test_tf = HashingTF()
test_tf.transform(["james", "q"])

SparseVector(1048576, {1026665: 1.0, 1034736: 1.0})

In [51]:
# Scrap: length of the first term-frequency vector.
len(tf.take(1)[0])

1048576

In [None]:
# Scrap (no execution count, i.e. not run in the saved notebook).
# NOTE(review): numpy is imported here but not used in this cell.
import numpy as np
# Apply the built-in max to every TF-IDF vector, then take the largest result.
# Presumably meant to find the largest weight in the data -- confirm.
maxvals = tfidf.map(max)

maxvals.max()

In [182]:
# Shut down the Spark context.
# NOTE(review): this cell's execution count (182) is lower than the first
# cell's (183), so it was last run before the cells above were re-executed.
sc.stop()