In [12]:
#Import all the neccessary libraries
import numpy as np
from scipy.sparse import csr_matrix,csc_matrix
import timeit
np.random.seed(1702)

In [3]:
#Load in the large data file and inspect the data 
data = np.load('user_movie_rating.npy')

#Split the data into three lists of ratings, movies and users respectively. 
user_ids_full = data[:,0]
movie_ids_full = data[:,1]
ratings_full = data[:,2]

#Creating a sparse row matrix 
sparse_rowMatrix_full = csr_matrix((ratings_full, (movie_ids_full, user_ids_full)))

#Remove the first column of the sparse row matrix, since it adds a column too many 
sparse_rowMatrix_full = sparse_rowMatrix_full[:,1:]
#Check the amount of users
print("Number of selected users with user_ids_full is: ", np.shape(sparse_rowMatrix_full[0,:])[1])

Number of selected users with user_ids_full is:  103703


In [4]:
def minhashing(data_matrix,row_fraction,num_permutations):
    #This function creates a signature matrix for the data 

    #Make an empty (zeros) matrix for the 100 random permutations
    random_perm_matrix = np.zeros((data_matrix.shape[0], num_permutations)) 
    random_perm = np.arange(data_matrix.shape[0])
    for i in range(num_permutations):
        #Make 100 random permutations of the columns, and place them into the random permutation matrix
        np.random.shuffle(random_perm)
        random_perm_matrix[:,i] = random_perm
    print('The random matrix is\n',random_perm_matrix, "with shape:", random_perm_matrix.shape)

    #Make the signature matrix, which is initially filled with only inf-values
    signatureMatrix = np.full((random_perm_matrix.shape[1], data_matrix.shape[1]), np.inf)
    #Loop through the a set fraction of the total rows of the data we will use 
    for r in range(0, int(row_fraction*data_matrix.shape[0])): 
        random_perm_matrix_row = random_perm_matrix[r]
            
        #We first get the relevant row of the data, and then only loop over the non-zero elements! 
        data_matrix_row = data_matrix.getrow(r)
        data_col_indices = data_matrix_row.indices #indices of the non-zero elements
        
        #Loop over the non-zero elements
        for idx in data_col_indices:
            signatureMatrix[:,idx] = np.minimum(signatureMatrix[:,idx],random_perm_matrix_row)              

    print("\nSignature Matrix:\n", signatureMatrix)
    
    return np.array(signatureMatrix,dtype=np.int16)

In [8]:
def split_vector(signature, b, r):
    #Code splitting signature in b parts
    subvecs = []
    for i in range(0, signature.shape[0],r):
        subvecs.append(signature[i : i+r])
    return np.array(subvecs,dtype=np.int32)

def generate_hash_function(size, total_users):
    #Function to generate a random hash function
    a = np.random.randint(1, 1000, size)
    b = np.random.randint(1, 1000, size)
    return lambda x: tuple((a * x + b) % total_users)

def hashing(hash_functions,usersTotal,b,split_S):

    #Hashing the bands to various buckets 
    hash_table = {}
    hash_counter = 0 
    candidate_pairs = []
    for u in range(usersTotal):  # Iterate over all users
        for band_idx in range(b):
            current_band = split_S[band_idx, :, u]

            #Apply the hash function to the band
            hashed_value = hash_functions[band_idx](tuple(current_band))

            if hashed_value not in hash_table:
                hash_table[hashed_value] = [(u, band_idx)]
            else:
                for stored_pair in hash_table[hashed_value]:
                    stored_u, stored_band_idx = stored_pair
                    if np.array_equal(current_band, split_S[stored_band_idx, :, stored_u]):
                        candidate_pairs.append((u, stored_u))
                        hash_counter += 1

                #Add the current pair to the hash table
                hash_table[hashed_value].append((u, band_idx))

    #Finally, we sort the candidate pairs such that we start off with the smallest u1 
    candidate_pairs = np.array(candidate_pairs)
    candidate_pairs[:,0], candidate_pairs[:,1] = candidate_pairs[:,1], candidate_pairs[:,0].copy()
    sorted_idx = np.argsort(candidate_pairs[:,0])
    candidate_pairs = candidate_pairs[sorted_idx]
    print('The number of candidate pairs found is:',np.shape(candidate_pairs)[0])

    return candidate_pairs

In [6]:
def jaccard_similarity(candidate_pairs,data=sparse_rowMatrix_full):
    #In this function we use numpy to vectorize the Jaccard Similarity calculations for all candidate pairs
    #Using numpy drastically improves the efficiency of the code
    #The memory cost of converting sparse row matrices with datatype int64 costs a lot of RAM, so for that reason we convert the dtype to int8 first 
    users1 = data[:,candidate_pairs[:,0]].astype(np.int8).toarray()
    users2 = data[:,candidate_pairs[:,1]].astype(np.int8).toarray()

    one_gave_rating = np.logical_and(users1,users2) #The logical AND operator represents the intersection
    both_gave_rating = np.logical_or(users1,users2) #The logical OR operator represents the union 

    num_pairs = len(candidate_pairs[:,0])
    simPairs = []
    simPairs_value = []
    with open('js.txt','w') as file:
        for i in range(num_pairs):
            nom = np.sum(one_gave_rating[:,i])
            denom = np.sum(both_gave_rating[:,i])
            total = nom/denom
            if total > 0.5 and total not in simPairs_value:
                simPairs.append(candidate_pairs[i].tolist())
                simPairs_value.append(total)
                file.write(f"{candidate_pairs[i,0]}, {candidate_pairs[i,1]}\n")
    print('Accepted user pairs: \n',simPairs)
    print('Jaccard Similarity value: \n',simPairs_value)
    print('Number of similar user pairs:',len(simPairs_value))
    return simPairs,simPairs_value


In [13]:
def apply_jaccardSim():

    b = 15  #Number of bands we want to use to split the signature matrix in        
    r = 9   #The rows of values per band 
    m = 0.1 #fraction of rows that we want to pick a random permutation from. According to the book (top of page 88), the
            #resulting signature matrix should still be valid. This also increases speed of the calculation of the signature matrix 
            #by a factor 1/m, which helps a lot. 

    sigMatrix = minhashing(sparse_rowMatrix_full,m,int(b*r))
    split_sigMatrix = split_vector(sigMatrix,b,r)
    dim_bandedMatrix = np.shape(split_sigMatrix)
    print('The signature matrix split into bands has shape',dim_bandedMatrix)

    hash_functions = [generate_hash_function(r, dim_bandedMatrix[2]) for _ in range(b)]
    candidate_pairs = hashing(hash_functions,dim_bandedMatrix[2],b,split_sigMatrix)
    jaccard_similarity(candidate_pairs)    

apply_jaccardSim()

The random matrix is
 [[ 4057. 13863.  2109. ...  3391.  9959.  4308.]
 [ 8204.  1951.  8701. ... 11303. 14741.  9367.]
 [14864.  9137.  8224. ... 12215.  5385. 16173.]
 ...
 [ 9448. 10150.   178. ...  4648. 14730.  8901.]
 [14711.   928.  2745. ...  9221. 14720. 17470.]
 [10581.  7405.  6882. ... 13583.  6419.  9592.]] with shape: (17771, 135)

Signature Matrix:
 [[  46.  119.  105. ...  105.  119.  119.]
 [ 422.  680.  198. ...  252.  680. 2498.]
 [ 290.  290.  314. ...  290. 2427.  314.]
 ...
 [  90.  138.  306. ...  253.  944.  236.]
 [ 569.  294.  422. ...  362.  385.  589.]
 [ 370.  849.  370. ...  370.   71.  790.]]
The signature matrix split into bands has shape (15, 9, 103703)
The number of candidate pairs found is: 306959
Accepted user pairs: 
 [[413, 44622], [4649, 70717], [8295, 49458], [9017, 75288], [12038, 33755], [12895, 47929], [15161, 51236], [16193, 64420], [17097, 92294], [18866, 75159], [18866, 33145], [20163, 85466], [20163, 47929], [20443, 77806], [20788, 74190],