In [1]:
# import packages
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.sparse import csc_matrix
np.random.seed(seed=2019)

#  load the data
FILE = '../data/user_movie.npy'
df = pd.DataFrame(np.load(FILE), columns = ['user','movie'])

In [159]:
import sys
import numpy as np
import time
from scipy import sparse

start = time.time()

print("Loading data...")
data = df

# the function used to calculate jaccard similarity with original user-movie sets
def jaccard_sim_set(s1, s2):
    intersection = s1.intersection(s2)
    union = s1.union(s2)
    return (len(intersection)/len(union))

# the function used to calculate approximate jaccard similarity with signature matrix
def jaccard_sim_sig(u1, u2):
    sim = float(np.count_nonzero(u1==u2))/len(u1)
    return sim

# the function used to get the indexes to the duplicate columns of a numpy array
def column_index(mat):
    index = np.lexsort(mat)
    diff = np.any(mat.T[index[1:]]!=mat.T[index[:-1]], axis = 1)
    edge = np.where(diff)[0] + 1
    re = np.split(index, edge)
    re = [g for g in re if len(g)>=2]
    return re

print("Constructing user-movie sets...")
n = np.max(data[:,0]) + 1 # the number of users

# for each user, construct a set that contains the movies this user has rated
user_movie = []
for i in range(n):
    user_movie.append(set())
for u, m in data:
    user_movie[u].add(m)

# construct the sparse matrix
data = sparse.csc_matrix(([1]*data.shape[0], (data[:,1], data[:,0])))

print("Constructing signature matrix...")
signature = 100 # the number of signatures
sig_mat = np.zeros([signature, data.shape[1]])

# set seed when constructing signature matrix
np.random.seed(int(sys.argv[1]))
for i in range(signature):
    row_sel = np.random.permutation(data.shape[0])
    data_perm = data[row_sel,:]
    for j in range(data.shape[1]):
        loc = data_perm.indices[data_perm.indptr[j]:data_perm.indptr[j+1]].min()
        sig_mat[i,j] = loc

print("Running LSH...")
row_start = 0
r = 5 # the number of rows within a band
all_buckets = []

# for each band, get the buckets that contain at least 2 users
while (row_start + r)<=signature:
    buckets = column_index(sig_mat[row_start:row_start+r,:])
    
    for value in buckets:
        if len(value)>1:
            all_buckets.append(value)
    
    row_start += r

# remove the duplicate buckets
all_buckets_tuple = map(tuple, all_buckets)
all_buckets = set(all_buckets_tuple)
all_buckets = list(all_buckets)

# for candidate pairs in each bucket, calculate similarity using signature matrix
sim_pairs = set()
for bucket in all_buckets:
    for i in range(len(bucket)-1):
        for j in range(i+1, len(bucket)):
            sim = jaccard_sim_sig(sig_mat[:,bucket[i]], sig_mat[:,bucket[j]])
            if sim>0.5:
                sim_pairs.add((bucket[i], bucket[j]))

# sort the similar pairs
sim_pairs_list = list(sim_pairs)
sim_pairs_list.sort()

# test whether these similar pairs are ture
sim_pairs_true = []
for pair in sim_pairs_list:
    sim = jaccard_sim_set(user_movie[pair[0]], user_movie[pair[1]])
    if sim>0.5:
        sim_pairs_true.append(pair)

f = open("results.txt", "w")
for pair in sim_pairs_true:
    f.write(str(pair[0]) + "," + str(pair[1]) + "\n")
f.close()

end = time.time()

print("Find " + str(len(sim_pairs_true)) + " similar pairs.")
print("Need " + str((end - start)/60) + " minutes.")



Loading data...
Constructing user-movie sets...


TypeError: '(slice(None, None, None), 0)' is an invalid key

In [42]:
m_by_u = df.groupby('user')['movie'].apply(list)

In [43]:
m_by_u

user
0    [1, 0, 2, 3]
1          [0, 2]
2          [2, 3]
Name: movie, dtype: object

In [44]:
n_user = len(df.user.unique())
n_movie = len(df.movie.unique())

In [45]:
n_sig = 3

In [46]:
mat = csc_matrix(([1]*df.shape[0], (df.iloc[:,1], df.iloc[:,0])))

In [88]:
mat.toarray()

array([[1, 1, 0],
       [1, 0, 0],
       [1, 1, 1],
       [1, 0, 1]], dtype=int64)

In [109]:
def minhashing(mat, n_sig = n_sig):
    sig_matrix = np.array([])
    for i in range(n_sig):
        seq = np.random.permutation(n_movie)
        perm = mat[seq]
        sig = seq[perm.argmax(axis=0)]
        if sig_matrix.any():
            # return the index of the first occurrence of 1
            sig_matrix = np.vstack((sig_matrix,sig))
        else:
            sig_matrix = sig
    return sig_matrix

In [110]:
sig_matrix = minhashing(mat,n_sig)

In [112]:
sig_matrix

array([[3, 0, 3],
       [3, 0, 3],
       [3, 0, 3]])

In [113]:
s = np.sum(sig_matrix[0*2:1*2,:], axis =0)

In [114]:
s

array([6, 0, 6])

In [127]:
bands = 3
rows = 1
def LSH_bucket(sig_matrix):
    bucket_list = defaultdict(set)
    if bands * rows != sig_matrix.shape[0]:
        print('#bands * #rows is not equal to #signatures')
    else:
        for b in range(bands):
            s = np.sum(sig_matrix[b*rows:(b+1)*rows,:], axis =0)
            for index, x in np.ndenumerate(s):
                bucket_list[x].add(index[0])
        return bucket_list.items()

In [131]:
def calculate_jsim(a,b):
    intersect = set(a) & set(b)
    union = set(a) | set(b)
    jsim = len(intersect)/len(union)
    return jsim

In [128]:
bucket_list = LSH_bucket(sig_matrix)

In [129]:
bucket_list

dict_items([(3, {0, 2}), (0, {1})])

In [150]:
unique_pairs = set()
sim_pairs = set()
for key, value in bucket_list:
    if len(value) > 1 :#and len(value) < 100: # remove the duplicate pairs
        for i in value:
            for j in value:
                if i < j and ((i,j) not in unique_pairs):
                    unique_pairs.add((i,j))
                    sigi = sig_matrix[:,i].ravel().tolist()
                    sigj = sig_matrix[:,j].ravel().tolist()
                    jsim = calculate_jsim(sigi, sigj)
                    if jsim > 0.5:
                        sim_pairs.add((i,j))

In [152]:
sim_pairs

{(0, 2)}

In [141]:
for key, value in bucket_list:
    print(key,value,len(value))
    for i in value:
        print(i)

3 {0, 2} 2
0
2
0 {1} 1
1


In [143]:
sigi = sig_matrix[:,1].ravel().tolist()[0]

In [158]:
sig_matrix[:,1].tolist()

[0, 0, 0]

In [148]:
sig_matrix

array([[3, 0, 3],
       [3, 0, 3],
       [3, 0, 3]])

In [154]:
true_pairs = []
for pair in sim_pairs:
    jsim = calculate_jsim(m_by_u[pair[0]], m_by_u[pair[1]])
    if jsim>0.4:
        true_pairs.append(pair)
true_pairs

[(0, 2)]

In [1]:
10%4

2