# Locality-Sensitive Hashing

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

## Question 2 (minhashing):
Consider the following matrix. Find minhash values for the following ordering of the rows (R4, R6, R1, R3, R5, R2).

**Recall**: The minhash value of a column is index of the first 1 entry under a permutation of the rows.

In [2]:
data = {'C1': [0, 1, 0, 0, 1, 0],
        'C2': [1, 0, 1, 0, 0, 1],
        'C3': [1, 1, 0, 1, 1, 0],
        'C4': [0, 1, 1, 0, 0, 0]}

df = pd.DataFrame(data)
print df

   C1  C2  C3  C4
0   0   1   1   0
1   1   0   1   1
2   0   1   0   1
3   0   0   1   0
4   1   0   1   0
5   0   1   0   0


In [3]:
permute = {0:4, 1:6, 2:1, 3:3, 4:5, 5:2}

for _ in range(1,5):
    s = df["C{}".format(_)]
    new = [s[3], s[5], s[0],s[2], s[4], s[1]]
    idx = new.index(1)
    print "MinHash of column {0} is row {1}".format(_, permute[idx])

MinHash of column 1 is row 5
MinHash of column 2 is row 6
MinHash of column 3 is row 4
MinHash of column 4 is row 3


## Question 3 (signatures):

In [4]:
data = {'C1': [1,2,3,4,5,6],
        'C2': [2,3,1,1,2,1],
        'C3': [1,4,2,3,5,6],
        'C4': [1,2,3,1,1,4],
        'C5': [2,3,1,2,1,1],
        'C6': [5,2,3,4,5,1],
        'C7': [4,2,2,4,1,4]}

df = pd.DataFrame(data)
print df

   C1  C2  C3  C4  C5  C6  C7
0   1   2   1   1   2   5   4
1   2   3   4   2   3   2   2
2   3   1   2   3   1   3   2
3   4   1   3   1   2   4   4
4   5   2   5   1   1   5   1
5   6   1   6   4   1   1   4


## Question 4 (shingles):
Find the set of 2-shingles for the "document" ABRACADABRA and also for the "document" BRICABRAC.

Answer the following questions:

How many 2-shingles does ABRACADABRA have? How many 2-shingles does BRICABRAC have? How many 2-shingles do they have in common? What is the Jaccard similarity between the two documents"?

In [5]:
def getKshingles(string,k):
    shingles = set()
    for idx in xrange(k,len(string)):
        shingles.add(string[idx - k:idx])
    return shingles

abra = getKshingles("ABRACADABRA", 2)
print abra
bric = getKshingles("BRICABRAC", 2)
print bric

inter = abra.intersection(bric)
print inter

union = abra.union(bric)
print union

print "Jaccard similiaryt is {}".format(len(inter)*1.0/len(union))

set(['AC', 'AB', 'AD', 'CA', 'DA', 'RA', 'BR'])
set(['AB', 'CA', 'RA', 'BR', 'IC', 'RI'])
set(['CA', 'AB', 'RA', 'BR'])
set(['AC', 'AB', 'AD', 'CA', 'DA', 'RA', 'BR', 'IC', 'RI'])
Jaccard similiaryt is 0.444444444444


## Question 6 (5 is omitted):
For the following points, find whether they are closer to (0,0) or (100,40) under the L1 and L2 norms:

(53,15) (63,8) (53,18) (54,8)

In [6]:
def distance(pt1, pt2, norm=2):
    differences = sum(abs(coord1-coord2)**norm for coord1, coord2 in zip(pt1,pt2))
    return np.power(differences, 1./norm)

for pt1 in ((53,15), (63,8), (53,18), (54,8)):
    print pt1
    for norm in range(1,3):
        pt2 = ((0,0), (100,40))
        one = distance(pt1, pt2[0], norm=norm)
        two = distance(pt1, pt2[1], norm=norm)
        if one < two:
            print pt1, "assigned to", pt2[0], "under L{} norm".format(norm)
        else:
            print pt1, "assigned to", pt2[1], "under L{} norm".format(norm)
    print '*' * 10

(53, 15)
(53, 15) assigned to (0, 0) under L1 norm
(53, 15) assigned to (100, 40) under L2 norm
**********
(63, 8)
(63, 8) assigned to (100, 40) under L1 norm
(63, 8) assigned to (100, 40) under L2 norm
**********
(53, 18)
(53, 18) assigned to (100, 40) under L1 norm
(53, 18) assigned to (100, 40) under L2 norm
**********
(54, 8)
(54, 8) assigned to (0, 0) under L1 norm
(54, 8) assigned to (0, 0) under L2 norm
**********
