In [1]:
import os
# Bootstrap PySpark inside this kernel by executing Spark's own interactive
# shell script. Per the banner it prints, this leaves a SparkSession bound to
# `spark`; later cells also rely on the `sc` name it sets up.
sparkFile = os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py')
# Read the script inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(sparkFile, "rb") as shell_script:
    shell_source = shell_script.read()
exec(compile(shell_source, sparkFile, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.2.0
      /_/

Using Python version 3.5.4 (default, Oct 27 2017 11:48:53)
SparkSession available as 'spark'.


# The dataset

In [2]:
# Load the vocabulary: one term per line, surrounding whitespace removed.
# The `with` block closes the file as soon as the list has been built.
with open("data/vocab.nips.txt") as vocab_file:
    vocabulary = [line.strip() for line in vocab_file]

# Sanity check: does the vocabulary contain any of a few very common words?
print('the' in vocabulary or 'a' in vocabulary or 'to' in vocabulary)
# Peek at the first few terms.
print(vocabulary[:20])

False
['a2i', 'aaa', 'aaai', 'aapo', 'aat', 'aazhang', 'abandonment', 'abbott', 'abbreviated', 'abcde', 'abe', 'abeles', 'abi', 'abilistic', 'abilities', 'ability', 'abl', 'able', 'ables', 'ablex']


There are no stop words: none of the common words checked above ('the', 'a', 'to') appears in the vocabulary.

# Locality-sensitive hashing

## Shingling: the documents as sets

In [3]:
# Sparse characteristic matrix: one (document id, word rows) pair per document.
C = (
    sc.textFile("data/docword.nips.txt")
    # Tokenise every line on whitespace.
    .map(lambda record: record.split())
    # Keep only three-field records; anything else (presumably the header
    # lines of the file -- confirm against the data) is dropped.
    .filter(lambda fields: len(fields) == 3)
    # Key by the first field and shift the second field to a 0-based index;
    # the third field is not used.
    .map(lambda fields: (fields[0], int(fields[1]) - 1))
    # Collect all row indices that share the same key.
    .groupByKey()
)

C is an RDD that represents the characteristic matrix of the dataset in sparse form. Each pair (K, V) describes column K (one document): V holds the indices of the rows (words) whose entries in that column are non-zero.

## Minhash: getting document signatures

In [4]:
import random

# Coefficients of the 100 linear hash functions h(x) = (a*x + b) mod N used
# for min-hashing: A holds the multipliers `a`, B the offsets `b`.
# They are drawn from a dedicated, seeded generator so that re-running the
# notebook reproduces the same signatures (and hence the same candidates).
HASH_SEED = 42
_hash_rng = random.Random(HASH_SEED)
A = _hash_rng.sample(range(1, 1500), 100)
B = _hash_rng.sample(range(1, 1500), 100)

def min_hash(a, b, sig, n_buckets=None):
    """Return the min-hash of ``sig`` under h(x) = (a*x + b) mod n_buckets.

    Parameters
    ----------
    a, b : int
        Multiplier and offset of the linear hash function.
    sig : iterable of int
        Values to hash (the notebook passes the word-row indices of one
        document).
    n_buckets : int, optional
        Modulus of the hash function. When omitted it falls back to
        ``len(vocabulary)``, which is the modulus this notebook has always
        used, so existing three-argument calls behave as before.

    Returns
    -------
    int
        The smallest hashed value.

    Raises
    ------
    ValueError
        If ``sig`` is empty (raised by ``min``).
    """
    if n_buckets is None:
        # Resolve the modulus once instead of once per hashed element.
        n_buckets = len(vocabulary)
    # A generator is enough: only the minimum is needed, not the full list.
    return min((a * x + b) % n_buckets for x in sig)

def get_signature(p, hash_params=tuple(zip(A, B))):
    """Compute the min-hash signature of one document.

    Parameters
    ----------
    p : tuple
        A ``(doc, words)`` pair: the document key and an iterable of the
        values to hash for that document.
    hash_params : sequence of (a, b) pairs, optional
        Coefficients of the hash functions, one pair per signature entry.
        Defaults to the pairs formed from the module-level ``A`` and ``B``
        as they are when this function is defined.

    Returns
    -------
    tuple
        ``(doc, signature)`` where ``signature`` is a list holding one
        min-hash value per ``(a, b)`` pair.
    """
    # NOTE: the default for `hash_params` is bound at definition time on
    # purpose. Spark serialises this function lazily, when an action runs, and
    # by then a later cell has re-bound the global name `B` to an RDD. Looking
    # `B` up at call time would put an RDD inside the closure, which Spark
    # refuses to serialise.
    doc, words = p
    # Materialise once so the values can be scanned for every hash function,
    # even if `words` happens to be a one-shot iterator.
    rows = list(words)
    signature = [min_hash(a, b, rows) for a, b in hash_params]
    return (doc, signature)

# Signature matrix: apply get_signature to every (document, rows) record of C.
# `map` is a lazy transformation, so nothing is computed at this point.
M = C.map(get_signature)

## LSH: getting candidate pairs

In [5]:
def chunk(l, n):
    """Split ``l`` into ``n`` consecutive bands and yield each as a frozenset.

    Every band holds ``len(l) // n`` consecutive elements. When ``len(l)`` is
    not a multiple of ``n`` the leftover elements are yielded as one extra,
    shorter band; when ``l`` has fewer than ``n`` elements each element forms
    its own band. An empty ``l`` yields nothing.

    Parameters
    ----------
    l : iterable
        Elements to split; materialised into a list first.
    n : int
        Number of bands to aim for. Must be positive.

    Yields
    ------
    frozenset
        The elements of one band. Being a set, it does not preserve the order
        or the multiplicity of the elements inside the band.

    Raises
    ------
    ValueError
        If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer")
    items = list(l)
    # Step and slice width must be the same quantity. Stepping by len // n
    # while slicing n elements is only right when len == n * n; otherwise
    # bands overlap or elements are silently dropped. The floor of 1 avoids a
    # zero step when there are fewer elements than bands.
    width = max(1, len(items) // n)
    for start in range(0, len(items), width):
        yield frozenset(items[start:start + width])
        
def hash_bands(p):
    """Turn one ``(doc, signature)`` record into its per-band bucket keys.

    The signature is cut into bands with ``chunk(sig, 10)``. Each band
    contributes one ``((band index, hash of the band), doc)`` entry, so records
    whose band at the same index hashes to the same value share a key.

    Returns the list of those entries, in band order.
    """
    doc, sig = p
    keyed = []
    for band_no, band in enumerate(chunk(sig, 10)):
        bucket = (band_no, hash(band))
        keyed.append((bucket, doc))
    return keyed
    
# Banded buckets: one ((band index, band hash), doc) record per band of every
# signature in M. Note that from here on the name `B` refers to this RDD and
# no longer to the list of hash offsets defined in the min-hash cell.
B = M.flatMap(hash_bands)