In [21]:
import numpy as np
import pandas as pd
import random
from scipy.sparse import coo_matrix,csc_matrix,csr_matrix,lil_matrix 

In [22]:
data = np.load('user_movie_rating.npy')

In [23]:
print(data.shape)
print(data[0:10])

(65225506, 3)
[[  1  30   3]
 [  1 157   3]
 [  1 173   4]
 [  1 175   5]
 [  1 191   2]
 [  1 197   3]
 [  1 241   3]
 [  1 295   4]
 [  1 299   3]
 [  1 329   4]]


In [24]:
#read in the first 1 million elements to reduce the size
user_ids = data[:1000000,0]
movie_ids = data[:1000000,1]
ratings = data[:1000000,2]

#how many users? 
print(user_ids[-1])
# creating sparse matrix, only use part of the elements to save memory 
sparse_colMatrix = csc_matrix((ratings, (user_ids, movie_ids)))
sparse_rowMatrix = csr_matrix((ratings, (user_ids, movie_ids)))

1611


In [68]:
#Important: set the random seed at the top here so we get consistent results from np.random.shuffle
np.random.seed(1702)

rowMatrix = sparse_rowMatrix.toarray()
rowMatrix = np.delete(rowMatrix,(0),axis=0)

colMatrix = sparse_colMatrix.toarray()
print(rowMatrix.shape)

#create random permutations of the array
random_perm_matrix = np.zeros((100,len(rowMatrix)))
print(random_perm_matrix.shape)
random_perm = np.arange(len(rowMatrix))
print(random_perm)
for i in range(100):
    np.random.shuffle(random_perm)
    random_perm_matrix[i,:] = random_perm
    #test:
    if i % 6 == 0:
        print(random_perm)

print('The random matrix is\n',random_perm_matrix)

(1611, 17771)
(100, 1611)
[   0    1    2 ... 1608 1609 1610]
[ 321  624  901 ... 1053 1220  770]
[   3 1444  935 ... 1583  177 1475]
[ 459  997 1185 ...  659  915 1188]
[ 278 1563  804 ...  780 1183 1257]
[ 690 1306  916 ...  389 1188  662]
[  35 1001 1424 ...  426  460 1306]
[ 752  479   84 ... 1209 1436 1343]
[ 904 1012  635 ...  281 1597  357]
[ 439 1608 1315 ...  461  892 1378]
[1247  615  782 ... 1314 1416  366]
[1337 1471 1309 ...  634  798  334]
[ 313 1464    6 ...  589  219  231]
[1456  953  296 ...  197  245  240]
[ 770  440 1469 ... 1200  815  826]
[1537 1333 1309 ... 1448 1350 1094]
[1078  627  758 ...  401   99  666]
[ 458 1398  708 ... 1381  635 1023]
The random matrix is
 [[ 321.  624.  901. ... 1053. 1220.  770.]
 [ 281.  729.  826. ...  584.  798. 1233.]
 [1146. 1264.  606. ... 1005. 1158.  220.]
 ...
 [ 558. 1492.  494. ... 1241. 1378.  292.]
 [ 150.  981.  494. ... 1224. 1081.  764.]
 [ 337. 1466. 1116. ... 1576.   21. 1019.]]


In [76]:
#generate random elements:
print(rowMatrix)

signatureMatrix = np.zeros((100,len(rowMatrix[0])))
print(signatureMatrix)

for i in range(100):
    print(i)
    permutation = random_perm_matrix[i]
    
    for j in range(len(permutation)):
        #obtain a row number to check 
        index = int(permutation[j])
        
        #loop through the row decided by permutation j
        for k in range(len(rowMatrix[0])):
            if rowMatrix[index,k] != 0: 
                signatureMatrix[i,k] = j+1

print(signatureMatrix)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0
1457.0
4629949.0
11153699.0
19590538.0
1
478.0
4895731.0
11204488.0
19172790.0
2
430.0
4812114.0
11404374.0
19464216.0
3
795.0
5044458.0
11171511.0
19296026.0
4
507.0
4873120.0
11430130.0
19021619.0
5
688.0
4678277.0
11468164.0
19465194.0
6
506.0
4792051.0
11724728.0
19017834.0
7
717.0
4688362.0
11561028.0
19066071.0
8
316.0
4895578.0
11414886.0
19508892.0
9
982.0
4636627.0
11424978.0
19319057.0
10
347.0
4795182.0
11659598.0
18885351.0
11
347.0
4519821.0
11673840.0
19525591.0
12
679.0
4962165.0
11728288.0
18946224.0
13
716.0
4906938.0
11295709.0
19241412.0
14
1148.0
4846003.0
11782092.0
18938819.0
15
625.0
4863170.0
11508717.0
19075684.0
16
462.0
4669994.0
11501714.0
19323975.0
17
640.0
5127566.0
11582043.0
18788