In [28]:
#Import all the neccessary libraries
import numpy as np
from scipy.sparse import csr_matrix,csc_matrix
import timeit
np.random.seed(1702)

In [27]:
#Load in the large data file and inspect the data 
data = np.load('user_movie_rating.npy')

#Split the data into three lists of ratings, movies and users respectively. 
user_ids_full = data[:,0]
movie_ids_full = data[:,1]
ratings_full = data[:,2]

#Creating a sparse row matrix 
sparse_rowMatrix_full = csr_matrix((ratings_full, (movie_ids_full, user_ids_full)))

#Remove the first column of the sparse row matrix, since it adds a column too many 
sparse_rowMatrix_full = sparse_rowMatrix_full[:,1:]
#Check the amount of users
print("Number of selected users with user_ids_full is: ", np.shape(sparse_rowMatrix_full[0,:])[1])

Number of selected users with user_ids_full is:  103703


In [25]:
#This code block creates a signature matrix for the data 

def minhashing(data_matrix,row_fraction,num_permutations):

    #Make an empty (zeros) matrix for the 100 random permutations
    random_perm_matrix = np.zeros((data_matrix.shape[0], num_permutations)) 
    random_perm = np.arange(data_matrix.shape[0])
    for i in range(num_permutations):
        #Make 100 random permutations of the columns, and place them into the random permutation matrix
        np.random.shuffle(random_perm)
        random_perm_matrix[:,i] = random_perm
    print('The random matrix is\n',random_perm_matrix, "with shape:", random_perm_matrix.shape)

    #Make the signature matrix, which is initially filled with only inf-values
    signatureMatrix = np.full((random_perm_matrix.shape[1], data_matrix.shape[1]), np.inf)
    #Loop through the a set fraction of the total rows of the data we will use 
    for r in range(0, int(row_fraction*data_matrix.shape[0])): 
        random_perm_matrix_row = random_perm_matrix[r]
            
        #We first get the relevant row of the data, and then only loop over the non-zero elements! 
        data_matrix_row = data_matrix.getrow(r)
        data_col_indices = data_matrix_row.indices #indices of the non-zero elements
        
        #Loop over the non-zero elements
        for idx in data_col_indices:
            signatureMatrix[:,idx] = np.minimum(signatureMatrix[:,idx],random_perm_matrix_row)              

    print("\nSignature Matrix:\n", signatureMatrix)
    
    return np.array(signatureMatrix,dtype=np.int16)

In [24]:
def split_vector(signature, b, r):
    # code splitting signature in b parts
    subvecs = []
    for i in range(0, S.shape[0],r):
        subvecs.append(signature[i : i+r])
    return np.array(subvecs,dtype=np.int32)

#Function to generate a random hash function
def generate_hash_function(size, total_users):
    a = np.random.randint(1, 1000, size)
    b = np.random.randint(1, 1000, size)
    return lambda x: tuple((a * x + b) % total_users)

def hashing(hash_functions,usersTotal):

    #Hashing the bands to various buckets 
    hash_table = {}
    hash_counter = 0 
    candidate_pairs = []
    for u in range(usersTotal):  # Iterate over all users
        for band_idx in range(b):
            current_band = split_S[band_idx, :, u]

            #Apply the hash function to the band
            hashed_value = hash_functions[band_idx](tuple(current_band))

            if hashed_value not in hash_table:
                hash_table[hashed_value] = [(u, band_idx)]
            else:
                for stored_pair in hash_table[hashed_value]:
                    stored_u, stored_band_idx = stored_pair
                    if np.array_equal(current_band, split_S[stored_band_idx, :, stored_u]):
                        candidate_pairs.append((u, stored_u))
                        hash_counter += 1

                #Add the current pair to the hash table
                hash_table[hashed_value].append((u, band_idx))

    #Finally, we sort the candidate pairs such that we start off with the smallest u1 
    candidate_pairs = np.array(candidate_pairs)
    candidate_pairs[:,0], candidate_pairs[:,1] = candidate_pairs[:,1], candidate_pairs[:,0].copy()
    sorted_idx = np.argsort(candidate_pairs[:,0])
    candidate_pairs = candidate_pairs[sorted_idx]
    print('The number of candidate pairs found is:',np.shape(candidate_pairs)[0])

    return candidate_pairs

In [16]:
def jaccard_similarity(candidate_pairs,data=sparse_rowMatrix_full):
    #In this function we use numpy to vectorize the Jaccard Similarity calculations for all candidate pairs
    #Using numpy drastically improves the efficiency of the code
    #The memory cost of converting sparse row matrices with datatype int64 costs a lot of RAM, so for that reason we convert the dtype to int8 first 
    users1 = data[:,candidate_pairs[:,0]].astype(np.int8).toarray()
    users2 = data[:,candidate_pairs[:,1]].astype(np.int8).toarray()

    one_gave_rating = np.logical_and(users1,users2) #The logical AND operator represents the intersection
    both_gave_rating = np.logical_or(users1,users2) #The logical OR operator represents the union 

    num_pairs = len(candidate_pairs[:,0])
    simPairs = []
    simPairs_value = []
    with open('js.txt','a') as file:
        for i in range(num_pairs):
            nom = np.sum(one_gave_rating[:,i])
            denom = np.sum(both_gave_rating[:,i])
            total = nom/denom
            if total > 0.5 and total not in simPairs_value:
                simPairs.append(candidate_pairs[i].tolist())
                simPairs_value.append(total)
                file.write(f"{candidate_pairs[i,0]}, {candidate_pairs[i,1]}\n")
    print('Accepted user pairs: \n',simPairs)
    print('Jaccard Similarity value: \n',simPairs_value)
    print('Number of similar user pairs:',len(simPairs_value))
    return simPairs,simPairs_value


Accepted user pairs: 
 [[413, 29806], [508, 102068], [3583, 33755], [5352, 42740], [6876, 36382], [7529, 58750], [7623, 36824], [7623, 58327], [8295, 33755], [8295, 49458], [9017, 61540], [9771, 55104], [10611, 53266], [11367, 74356], [15161, 51236], [15161, 35418], [15161, 53266], [16193, 18866], [20163, 63688], [20443, 74190], [25620, 49458], [25620, 96075], [26478, 85466], [26478, 38030], [28633, 30250], [30524, 55032], [31818, 58327], [32305, 35418], [32305, 47929], [33755, 98667], [33755, 101403], [33755, 100548], [33755, 54719], [33755, 98687], [33755, 73537], [33755, 91157], [33755, 51236], [33755, 91351], [33755, 46936], [33755, 85466], [33755, 63973], [33755, 68130], [35418, 53266], [36041, 85466], [36382, 91579], [37358, 68130], [38030, 85466], [38030, 58327], [38030, 99432], [46364, 74190], [46552, 85466], [47112, 63973], [47112, 100548], [47112, 73537], [47929, 100548], [47929, 81869], [48991, 81962], [49458, 96075], [49458, 63973], [51236, 101403], [51236, 85466], [51236, 

In [23]:
def apply_jaccardSim():

    b = 20  #Number of bands we want to use to split the signature matrix in        
    r = 9   #The rows of values per band 
    m = 0.1 #fraction of rows that we want to pick a random permutation from. According to the book (top of page 88), the
            #resulting signature matrix should still be valid. This also increases speed of the calculation of the signature matrix 
            #by a factor 1/m, which helps a lot. 

    sigMatrix = minhashing(sparse_rowMatrix_full,m,int(b*r))
    split_sigMatrix = split_vector(sigMatrix,b,r)
    dim_bandedMatrix = np.shape(split_sigMatrix)
    print('The signature matrix split into bands has shape',dim_bandedMatrix)

    hash_functions = [generate_hash_function(r, dim_bandedMatrix[2]) for _ in range(b)]
    candidate_pairs = hashing(hash_functions,dim_bandedMatrix[2])
    jaccard_similarity(candidate_pairs)    

apply_jaccardSim()

The random matrix is
 [[12137.  7382. 14975. ... 10237.  5995. 17203.]
 [12514. 14136. 17167. ...  5079. 17416. 16927.]
 [13725. 14087. 12637. ... 14096.  4122. 14938.]
 ...
 [12472.  8508.  4043. ... 15336. 14476.  6994.]
 [ 1877.  5676.  8428. ... 15009.  7281. 16299.]
 [16452.  6678.  3521. ... 11189.   847.  9936.]] with shape: (17771, 180)

Signature Matrix:
 [[ 113.  113.  113. ...  149.  113.  113.]
 [ 153.  153.  153. ...  228. 1108.  153.]
 [   8.    8.    8. ... 2149.   86.    8.]
 ...
 [  42.  232.  228. ...   42.  232.   42.]
 [ 221.  780.  189. ...  274.  353. 1193.]
 [ 178.  938.  123. ...  119.  498. 1402.]]
The signature matrix split into bands has shape (20, 9, 103703)
Accepted user pairs: 
 [[413, 29806], [508, 102068], [3583, 33755], [5352, 42740], [6876, 36382], [7529, 58750], [7623, 36824], [7623, 58327], [8295, 33755], [8295, 49458], [9017, 61540], [9771, 55104], [10611, 53266], [11367, 74356], [15161, 51236], [15161, 35418], [15161, 53266], [16193, 18866], [20163