In [None]:
import numpy as np
import pandas as pd
import random
from scipy.sparse import coo_matrix,csc_matrix,csr_matrix,lil_matrix 
import timeit

# Load the rating records. The cells below read column 0 as the user id,
# column 1 as the movie id and column 2 as the rating.
data = np.load('user_movie_rating.npy')


In [None]:
# Quick look at the loaded array: its dimensions, numpy's abbreviated repr
# of the whole array, and the first ten rows.
print(data.shape, data, data[:10], sep="\n")

In [99]:
# Work on the first 1 million rating rows so the dense conversions stay small.
user_ids, movie_ids, ratings = data[:1000000, :3].T

# The same three columns for the complete array; too large to turn into a dense array.
user_ids_full, movie_ids_full, ratings_full = data[:, :3].T

# Report the size of each selection. Note that len() counts rating rows here,
# not distinct users.
print("Number of selected users with user_ids is: ", len(user_ids))
print("Number of selected users with user_ids_full is: ", len(user_ids_full))

# Rating matrices with movie ids as row index and user ids as column index,
# for the 1M-row subset in column-compressed and row-compressed form.
sparse_colMatrix = csc_matrix((ratings, (movie_ids, user_ids)))
sparse_rowMatrix = csr_matrix((ratings, (movie_ids, user_ids)))

# The same two layouts for the full data set (kept sparse, never densified).
sparse_colMatrix_full = csc_matrix((ratings_full, (movie_ids_full, user_ids_full)))
sparse_rowMatrix_full = csr_matrix((ratings_full, (movie_ids_full, user_ids_full)))

print(data)

Number of selected users with user_ids is:  1000000
Number of selected users with user_ids_full is:  65225506
[[     1     30      3]
 [     1    157      3]
 [     1    173      4]
 ...
 [103703  17622      2]
 [103703  17627      4]
 [103703  17764      4]]


In [103]:
# Dense copies of the two subset matrices.
rowMatrix, colMatrix = sparse_rowMatrix.toarray(), sparse_colMatrix.toarray()

# Number of zero entries in the first column of the dense matrix.
print((rowMatrix[:, 0] == 0).sum())

17771


In [72]:
np.random.seed(1702)
"""
This block creates signature matrices for two examples: one from the book and one from the lecture slides
"""

# Lecture-slide example. Each inner list is one column of the final matrix,
# so the literal is transposed before it is stored as a sparse matrix.
example_matrix1 = csc_matrix(np.array([[1, 1, 0, 0, 0, 1, 1],
                                       [0, 0, 1, 1, 1, 0, 0],
                                       [1, 0, 0, 0, 0, 1, 1],
                                       [0, 1, 1, 1, 1, 0, 0]]).T)
# The slide permutations are 1-based; shift them to start at 0 like the book example.
test_perm1 = np.array([[1, 3, 7, 6, 2, 5, 4],
                       [4, 2, 1, 3, 6, 7, 5],
                       [3, 4, 7, 6, 1, 2, 5]]).T - 1

# Book example (chapter 3.3.5); its hash values are already 0-based.
example_matrix2 = csc_matrix(np.array([[1, 0, 0, 1, 0],
                                       [0, 0, 1, 0, 0],
                                       [0, 1, 0, 1, 1],
                                       [1, 0, 1, 1, 0]]).T)
test_perm2 = np.array([[1, 2, 3, 4, 0],
                       [1, 4, 2, 0, 3]]).T

def examples(example_matrix, permutation):
    """Compute and print the minhash signature matrix of a small example.

    Parameters
    ----------
    example_matrix : sparse matrix
        Characteristic matrix; must provide ``toarray()`` and ``shape``.
    permutation : 2-D array-like
        One row per matrix row and one column per hash function;
        ``permutation[r][i]`` is the value hash function ``i`` gives row ``r``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_hash_functions, n_columns)``. A column that
        has no non-zero entry keeps ``inf`` in every row.
    """
    dense = example_matrix.toarray()
    print("Example Matrix:\n", dense)
    print("Hash functions:\n", permutation)

    n_hashes = len(permutation[0])
    signatureMatrix = np.full((n_hashes, example_matrix.shape[1]), np.inf)
    print("\nSignature Matrix:\n", signatureMatrix)

    for r in range(dense.shape[0]):
        hashes = permutation[r]
        # Only columns with a non-zero entry in this row can be updated.
        for c in np.flatnonzero(dense[r]):
            for i in range(n_hashes):
                candidate = int(hashes[i])
                # Keep the smallest hash value seen so far for this column.
                if candidate < signatureMatrix[i, c]:
                    signatureMatrix[i, c] = candidate

    print("\nSignature Matrix:\n", signatureMatrix)

    return signatureMatrix

    
    
# Run the lecture-slide example; examples() also prints its inputs and the result.
S_test = examples(example_matrix1, test_perm1) #S_test is the resulting signature matrix

Example Matrix:
 [[1 0 1 0]
 [1 0 0 1]
 [0 1 0 1]
 [0 1 0 1]
 [0 1 0 1]
 [1 0 1 0]
 [1 0 1 0]]
Hash functions:
 [[0 3 2]
 [2 1 3]
 [6 0 6]
 [5 2 5]
 [1 5 0]
 [4 6 1]
 [3 4 4]]

Signature Matrix:
 [[inf inf inf inf]
 [inf inf inf inf]
 [inf inf inf inf]]

Signature Matrix:
 [[0. 1. 0. 1.]
 [1. 0. 3. 0.]
 [1. 0. 1. 0.]]


In [105]:
# Fix the global NumPy seed so the shuffled permutations drawn in minhashing() are reproducible.
np.random.seed(1702)

"""
This block creates signature matrices for the actual data
"""

m = 0.01 #fraction of the rows that minhashing() scans: it loops over the first int(m * n_rows) rows only.
         #The author cites the book (top of page 88) for the resulting signature matrix still being valid.
         #Scanning fewer rows shortens the calculation of the signature matrix by roughly a factor 1/m.

# Start of the wall-clock measurement that minhashing() reports when it finishes.
begin = timeit.default_timer()

def minhashing(data_matrix, n_permutations=100, row_fraction=None, verbose=True):
    """Build a minhash signature matrix from random row permutations.

    Each hash function is a random permutation of the row indices, drawn with
    ``np.random.shuffle`` from NumPy's global random state (seed it with
    ``np.random.seed`` for reproducible signatures). For every scanned row,
    the columns with a non-zero entry get their signature lowered to that
    row's permuted index wherever it is smaller than the current value.

    Only the stored non-zeros of each row are visited, so the cost grows with
    the number of non-zeros rather than with ``rows * columns``.

    Parameters
    ----------
    data_matrix : scipy sparse matrix or array-like
        Matrix whose columns are the sets to be signed. It is viewed as CSR
        internally; the argument itself is not modified.
    n_permutations : int, optional
        Number of random permutations, i.e. rows of the signature matrix.
    row_fraction : float, optional
        Fraction in ``[0, 1]`` of the rows to scan; the first
        ``int(row_fraction * n_rows)`` rows are used. When omitted, the
        notebook-level variable ``m`` is used.
    verbose : bool, optional
        When true, print the input, the permutation matrix, the signature
        matrix before and after the scan, and the elapsed time.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_permutations, n_columns)``. Columns without
        a non-zero entry in the scanned rows keep ``inf``.

    Raises
    ------
    ValueError
        If ``row_fraction`` lies outside ``[0, 1]``.
    """
    started = timeit.default_timer()

    if row_fraction is None:
        # Existing calls pass only the matrix and rely on the notebook-level setting.
        row_fraction = m
    if not 0 <= row_fraction <= 1:
        raise ValueError("row_fraction must lie in [0, 1], got %r" % (row_fraction,))

    n_rows, n_cols = data_matrix.shape

    if verbose:
        print("Original (sparse) Matrix:\n", data_matrix, "with shape:", data_matrix.shape)

    # Column i holds the i-th random permutation of the row indices.
    random_perm_matrix = np.zeros((n_rows, n_permutations))
    random_perm = np.arange(n_rows)
    for i in range(n_permutations):
        np.random.shuffle(random_perm)
        random_perm_matrix[:, i] = random_perm
    if verbose:
        print('The random matrix is\n', random_perm_matrix, "with shape:", random_perm_matrix.shape)

    # Start from +inf so the first value seen for a column always wins.
    signatureMatrix = np.full((n_permutations, n_cols), np.inf)
    if verbose:
        print("\nSignature Matrix:\n", signatureMatrix, "with shape:", signatureMatrix.shape)

    # CSR gives direct access to the column indices stored for each row.
    rows_csr = csr_matrix(data_matrix)
    indptr, indices, values = rows_csr.indptr, rows_csr.indices, rows_csr.data

    for r in range(int(row_fraction * n_rows)):
        lo, hi = indptr[r], indptr[r + 1]
        # Explicitly stored zeros must not count as set members.
        cols = indices[lo:hi][values[lo:hi] != 0]
        if cols.size == 0:
            continue
        # Lower every hash row of the touched columns in one vectorised step.
        row_hashes = random_perm_matrix[r][:, np.newaxis]
        signatureMatrix[:, cols] = np.minimum(signatureMatrix[:, cols], row_hashes)

    if verbose:
        print("\nSignature Matrix:\n", signatureMatrix)

    elapsed = timeit.default_timer() - started
    if verbose:
        print('The total time it took was ', elapsed/60, ' minutes')

    return signatureMatrix

# Build the signature matrix for the full row-compressed rating matrix.
S = minhashing(sparse_rowMatrix_full) #S is the resulting signature matrix




  (1, 95)	3
  (1, 155)	4
  (1, 441)	3
  (1, 575)	4
  (1, 853)	4
  (1, 1189)	5
  (1, 1215)	4
  (1, 1264)	4
  (1, 1499)	3
  (1, 1717)	3
  (1, 1718)	3
  (1, 1803)	5
  (1, 2181)	3
  (1, 2326)	2
  (1, 2681)	3
  (1, 3400)	2
  (1, 3670)	5
  (1, 3690)	4
  (1, 3881)	5
  (1, 4756)	4
  (1, 4799)	1
  (1, 5268)	4
  (1, 5691)	3
  (1, 5884)	5
  (1, 6742)	4
  :	:
  (17770, 100304)	2
  (17770, 100391)	3
  (17770, 100507)	3
  (17770, 100560)	2
  (17770, 100610)	3
  (17770, 100664)	3
  (17770, 100666)	3
  (17770, 100701)	2
  (17770, 100720)	3
  (17770, 100731)	3
  (17770, 100764)	4
  (17770, 101076)	3
  (17770, 101298)	3
  (17770, 101859)	3
  (17770, 101923)	2
  (17770, 102017)	3
  (17770, 102037)	3
  (17770, 102107)	1
  (17770, 102256)	3
  (17770, 102580)	4
  (17770, 103055)	3
  (17770, 103108)	3
  (17770, 103344)	4
  (17770, 103382)	3
  (17770, 103442)	3
Example Matrix:
   (1, 95)	3
  (1, 155)	4
  (1, 441)	3
  (1, 575)	4
  (1, 853)	4
  (1, 1189)	5
  (1, 1215)	4
  (1, 1264)	4
  (1, 1499)	3
  (1, 1717)	3

In [45]:
# Show the dimensions of the signature matrix.
print(S.shape)
# TODO: a per-column loop over S was started here but never given a body:
#     for i in range(0, S.shape[1]):
# A loop header without a body is a SyntaxError, so it stays commented out
# until the per-column work is written.

(100, 103704)


In [69]:
# Banding parameters.
b, r = 5, 20  # b = number of bands, r = number of rows per band
t = 0.50  # threshold

# One bucket per column of the signature matrix.
n_buckets = S.shape[1]
print(n_buckets, S.shape, sep="\n")

def split_vector(signature, b, r):
    """Split a signature matrix into consecutive bands of ``r`` rows.

    The bands are cut from ``signature`` itself, so the function works for any
    matrix passed in rather than only for a notebook-level variable.

    Parameters
    ----------
    signature : sequence or numpy.ndarray
        Signature matrix (or vector); it is sliced along its first axis.
    b : int
        Intended number of bands. Kept for interface compatibility and not
        enforced: the number of bands returned is ``ceil(len(signature) / r)``.
    r : int
        Number of rows per band; must be at least 1.

    Returns
    -------
    list
        Slices of ``signature`` in order. The last band is shorter than ``r``
        when ``len(signature)`` is not a multiple of ``r``.

    Raises
    ------
    ValueError
        If ``r`` is smaller than 1.
    """
    if r < 1:
        raise ValueError("r must be a positive number of rows per band")
    n_rows = len(signature)
    return [signature[start:start + r] for start in range(0, n_rows, r)]

# Cut the signature matrix into bands and show the shape of the first band.
split_S = split_vector(S, b, r)
print(split_S[0].shape)

103704
(100, 103704)
(20, 103704)
