In [1]:
import numpy as np
import pandas as pd
import random
from scipy.sparse import coo_matrix,csc_matrix,csr_matrix,lil_matrix 

In [2]:
# Load the ratings array from disk. Later cells read column 0 as user ids,
# column 1 as movie ids and column 2 as ratings.
data = np.load('user_movie_rating.npy')

In [3]:
# Sanity check of the loaded array: its shape, then its first ten rows.
print(data.shape, data[:10], sep='\n')

(65225506, 3)
[[  1  30   3]
 [  1 157   3]
 [  1 173   4]
 [  1 175   5]
 [  1 191   2]
 [  1 197   3]
 [  1 241   3]
 [  1 295   4]
 [  1 299   3]
 [  1 329   4]]


In [4]:
# Work on the first 1,000,000 rating rows only, to keep memory use down.
# Column 0 holds the user id, column 1 the movie id, column 2 the rating.
user_ids, movie_ids, ratings = data[:1000000, :3].T

# Rough user count: the user id found on the last row of the subset.
print(user_ids[-1])

# The same (user, movie) -> rating entries stored in two sparse layouts:
# compressed-column and compressed-row.
sparse_colMatrix = csc_matrix((ratings, (user_ids, movie_ids)))
sparse_rowMatrix = csr_matrix((ratings, (user_ids, movie_ids)))

1611


In [17]:
# Seed NumPy's global RNG before anything else: every np.random.shuffle call
# below draws from it, so a fixed seed makes the permutations repeatable.
np.random.seed(1702)

# Dense copies of the two sparse rating matrices.
rowMatrix = sparse_rowMatrix.toarray()
colMatrix = sparse_colMatrix.toarray()
print(rowMatrix.shape)

# Storage for the permutations: one row per permutation, one column per row
# of rowMatrix.
random_perm_matrix = np.zeros((100, rowMatrix.shape[0]))
print(random_perm_matrix.shape)

# random_perm is reshuffled in place on every pass, so each stored permutation
# is a shuffle of the previous one rather than of the identity ordering.
random_perm = np.arange(rowMatrix.shape[0])
print(random_perm)
for i in range(100):
    np.random.shuffle(random_perm)
    random_perm_matrix[i] = random_perm
    # spot check: show every sixth permutation
    if not i % 6:
        print(random_perm)

print('The random matrix is\n',random_perm_matrix)

# Number of non-zero entries in column 0 of rowMatrix.
test = np.count_nonzero(rowMatrix[:, 0])
print(test)

(1612, 17771)
(100, 1612)
[   0    1    2 ... 1609 1610 1611]
[1534  823 1062 ... 1053 1220  770]
[ 153  915  361 ... 1077  141  650]
[1431 1236 1290 ...  371 1574  741]
[1354  715  472 ... 1210  189  684]
[ 399  980  137 ... 1226 1068   17]
[  52  335  684 ...  211  640 1272]
[ 995  697 1121 ...   35  575 1564]
[1109  798  887 ... 1134  533 1092]
[1079 1608 1068 ...  607  888  260]
[ 956  378  620 ... 1170 1556   80]
[ 416 1600  101 ... 1414 1137  524]
[1375  413   79 ...  929  781  200]
[1373 1147  860 ...  958  222  718]
[1001  710 1606 ... 1274  509 1447]
[ 358 1043  617 ...  507 1235 1128]
[1535  612  195 ...  765  197 1147]
[1151  284 1426 ... 1051 1105   94]
The random matrix is
 [[1534.  823. 1062. ... 1053. 1220.  770.]
 [1562.  658. 1586. ...  177. 1192.  575.]
 [ 145.  681.  201. ...  867.  777.  962.]
 ...
 [ 358.  713. 1268. ...  891.  873. 1095.]
 [1132.  795.  764. ... 1297.  186.  129.]
 [ 405. 1082. 1115. ...  120.  439.  313.]]
0


In [19]:
import timeit

# Build the min-hash signature matrix.
#
# signatureMatrix[i, k] holds, for permutation i and column k of rowMatrix,
# (row index + 1) of the FIRST row -- visiting rows in the order given by
# random_perm_matrix[i] -- whose entry in column k is non-zero. A value of 0
# means column k has no non-zero entry at all.
#
# The previous triple loop overwrote a column's signature on every later hit
# and only stopped once every column was filled, so the stored value depended
# on when the other columns happened to fill up. Here each column keeps its
# first hit only, and the work per permutation is vectorized.

#generate random elements:
print(rowMatrix)

# Floats are kept (np.zeros default) so the matrix has the same dtype as before.
signatureMatrix = np.zeros((len(random_perm_matrix), rowMatrix.shape[1]))
print(signatureMatrix)

# Loop-invariant: which cells are non-zero, and which columns have any hit.
occupied = rowMatrix != 0
rated_columns = occupied.any(axis=0)

begin = timeit.default_timer()
for i in range(len(random_perm_matrix)):
    #loop over the i rows of the signature matrix
    print('We are in row number {}'.format(i))

    # random_perm_matrix stores the row indices as floats; cast back to ints
    # so they can be used for indexing.
    permutation = random_perm_matrix[i].astype(int)

    # Reorder the rows into visiting order. argmax over a boolean column
    # returns the position of its first True, i.e. the first visited row
    # with a non-zero entry in that column.
    first_position = occupied[permutation].argmax(axis=0)

    # Columns without any non-zero entry would report position 0 from argmax;
    # leave those at 0 by writing only the columns that really have a hit.
    signatureMatrix[i, rated_columns] = permutation[first_position[rated_columns]] + 1

end = timeit.default_timer() - begin
print('The total time it took was {:.1f} minutes'.format(end/60))
print(signatureMatrix)
sig_shape = np.shape(signatureMatrix)
ratio = np.count_nonzero(signatureMatrix)/(sig_shape[0]*sig_shape[1])
print('The ratio of non-zero elements to the total amount of elements is {:.3f}'.format(ratio))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
We are in row number 0
We are in row number 1
We are in row number 2
We are in row number 3
We are in row number 4
We are in row number 5
We are in row number 6
We are in row number 7
We are in row number 8
We are in row number 9
We are in row number 10
We are in row number 11
We are in row number 12
We are in row number 13
We are in row number 14
We are in row number 15
We are in row number 16
We are in row number 17
We are in row number 18
We are in row number 19
We are in row number 20
We are in row number 21
We are in row number 22
We are in row number 23
We are in row number 24
We are in row number 25
We are in row number 26
We are in row number 27
We are in row number 28
We are in row number 29
We are in row 