In [5]:
import numpy as np
import pandas as pd
import random
from scipy.sparse import coo_matrix,csc_matrix,csr_matrix,lil_matrix 
import timeit
np.random.seed(1702) 


In [6]:
#Load in the large data file
data = np.load('user_movie_rating.npy')

In [7]:
print(data.shape)
print(data)
print(data[0:10])

unique_values_first_column = np.unique(data[:, 0])
num_unique_values_first_column = len(unique_values_first_column)
print(num_unique_values_first_column)

(65225506, 3)
[[     1     30      3]
 [     1    157      3]
 [     1    173      4]
 ...
 [103703  17622      2]
 [103703  17627      4]
 [103703  17764      4]]
[[  1  30   3]
 [  1 157   3]
 [  1 173   4]
 [  1 175   5]
 [  1 191   2]
 [  1 197   3]
 [  1 241   3]
 [  1 295   4]
 [  1 299   3]
 [  1 329   4]]
103703


In [8]:
#read in the first 1 million elements to reduce the size

user_ids = data[:1000000,0]
movie_ids = data[:1000000,1]
ratings = data[:1000000,2]

#and also the full size, but this will be to large to make an array
user_ids_full = data[:,0]
print(user_ids_full[0])
movie_ids_full = data[:,1]
ratings_full = data[:,2]

#how many users selected? 
print("Number of selected users with user_ids is: ", len(user_ids))
print("Number of selected users with user_ids_full is: ", len(user_ids_full))

# creating sparse matrices 
sparse_colMatrix = csc_matrix((ratings, (movie_ids, user_ids)))
sparse_colMatrix_full = csc_matrix((ratings_full, (movie_ids_full, user_ids_full)))
sparse_rowMatrix = csr_matrix((ratings, (movie_ids, user_ids)))
sparse_rowMatrix_full = csr_matrix((ratings_full, (movie_ids_full, user_ids_full)))

#Remove the first column of the sparse row and col matrix 
sparse_rowMatrix_full = sparse_rowMatrix_full[:,1:]
sparse_colMatrix_full = sparse_colMatrix_full[:,1:]

print(np.sum((sparse_rowMatrix_full != 0).sum(axis=0) == 0))
print(np.where((sparse_rowMatrix_full != 0).sum(axis=0) == 0)[1])
print(np.shape(sparse_rowMatrix_full))

print(np.sum((sparse_colMatrix_full != 0).sum(axis=0) == 0))
print(np.where((sparse_colMatrix_full != 0).sum(axis=0) == 0)[1])
print(np.shape(sparse_colMatrix_full))


1
Number of selected users with user_ids is:  1000000
Number of selected users with user_ids_full is:  65225506
0
[]
(17771, 103703)
0
[]
(17771, 103703)


In [9]:
rowMatrix = sparse_rowMatrix.toarray()
colMatrix = sparse_colMatrix.toarray()

print(sum(rowMatrix[:,0]==0))

17771


In [10]:
np.random.seed(1702)
"""
This block creates signature matrices for two examples: one from the book and one from the lecture slides
"""

#This is the example from the lecture slides
example_matrix1 = csc_matrix(np.transpose(np.array([[1, 1, 0, 0, 0, 1, 1], 
                          [0, 0, 1, 1, 1, 0, 0],
                          [1, 0, 0, 0, 0, 1, 1],
                          [0, 1, 1, 1, 1, 0, 0]])))

#This is the example from the book chapter 3.3.5
example_matrix2 = csc_matrix(np.transpose(np.array([[1, 0, 0, 1, 0], 
                          [0, 0, 1, 0, 0],
                          [0, 1, 0, 1, 1],
                          [1, 0, 1, 1, 0]])))


print(example_matrix1.toarray())



def random_projections(data_matrix, num_permutations):
    
    #print("Original (sparse) Matrix:\n", data_matrix, "with shape:", data_matrix.shape)
    
    #make an empty (zeros) matrix for the 100 random permutations
    random_perm_matrix = np.zeros((num_permutations, data_matrix.shape[0]))
    
    #make a random permutation with ~50% of the values 0's and ~50% 1's. These will then be randomly shuffled
    random_perm = np.ones(data_matrix.shape[0])
    random_perm[:int(data_matrix.shape[0]/2)] = -1
    for i in range(num_permutations):
        #Make 100 random permutations of the columns, and place them into the random permutation matrix
        np.random.shuffle(random_perm)
        random_perm_matrix[i, :] = random_perm
    print('The random matrix is\n',random_perm_matrix, "with shape:", random_perm_matrix.shape)
    

    signatureMatrix = np.zeros((random_perm_matrix.shape[0], data_matrix.shape[1]))
    #make the signature matrix, which is initially filled with only zeros
    print("\nSignature Matrix:\n", signatureMatrix, "with shape:", signatureMatrix.shape)
    

    for c in range(0, int(data_matrix.shape[1])): #column
        print("Column:", c)
        
        x_T = data_matrix[:, c].T

        for r in range(0, num_permutations):       

            v = np.array(random_perm_matrix[r, :])

            dot_product = x_T.dot(v)

            if dot_product > 0:
                signatureMatrix[r, c] = 1
            else:
                signatureMatrix[r, c] = -1             

    print("\nSignature Matrix:\n", signatureMatrix)
    
    return signatureMatrix

begin = timeit.default_timer()
S = random_projections(example_matrix1, 5) #S is the resulting signature matrix
end = timeit.default_timer() - begin 
print('The total time it took was ', end/60, ' minutes')

[[1 0 1 0]
 [1 0 0 1]
 [0 1 0 1]
 [0 1 0 1]
 [0 1 0 1]
 [1 0 1 0]
 [1 0 1 0]]
The random matrix is
 [[ 1. -1. -1.  1.  1.  1. -1.]
 [ 1.  1. -1.  1.  1. -1. -1.]
 [ 1.  1. -1. -1. -1.  1.  1.]
 [ 1.  1. -1.  1.  1. -1. -1.]
 [ 1.  1.  1. -1. -1. -1.  1.]] with shape: (5, 7)

Signature Matrix:
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]] with shape: (5, 4)
Column: 0
Column: 1
Column: 2
Column: 3

Signature Matrix:
 [[-1.  1.  1. -1.]
 [-1.  1. -1.  1.]
 [ 1. -1.  1. -1.]
 [-1.  1. -1.  1.]
 [ 1. -1.  1. -1.]]
The total time it took was  0.00010228500000266649  minutes


In [11]:
np.random.seed(1702)

def random_projections(data_matrix, num_permutations):
    
    #print("Original (sparse) Matrix:\n", data_matrix, "with shape:", data_matrix.shape)
    
    #make an empty (zeros) matrix for the 100 random permutations
    random_perm_matrix = np.zeros((num_permutations, data_matrix.shape[0]))
    
    #make a random permutation with ~50% of the values 0's and ~50% 1's. These will then be randomly shuffled
    random_perm = np.ones(data_matrix.shape[0])
    random_perm[:int(data_matrix.shape[0]/2)] = -1
    for i in range(num_permutations):
        #Make 100 random permutations of the columns, and place them into the random permutation matrix
        np.random.shuffle(random_perm)
        random_perm_matrix[i, :] = random_perm
    print('The random matrix is\n',random_perm_matrix, "with shape:", random_perm_matrix.shape)
    

    signatureMatrix = np.zeros((random_perm_matrix.shape[0], data_matrix.shape[1]))
    #make the signature matrix, which is initially filled with only zeros
    print("\nSignature Matrix:\n", signatureMatrix, "with shape:", signatureMatrix.shape)
    

    for c in range(0, int(data_matrix.shape[1])): #column
        print("Column:", c)
        x_T = data_matrix[:, c].T

        for r in range(0, num_permutations): 

            v = random_perm_matrix[r, :]

            dot_product = x_T.dot(v)

            if dot_product > 0:
                signatureMatrix[r, c] = 1
            else:
                signatureMatrix[r, c] = -1  
                
            

    print("\nSignature Matrix:\n", signatureMatrix)
    
    return signatureMatrix

begin = timeit.default_timer()
S = random_projections(sparse_colMatrix_full, 100) #S is the resulting signature matrix
end = timeit.default_timer() - begin 
print('The total time it took was ', end/60, ' minutes')

The random matrix is
 [[ 1. -1.  1. ...  1. -1. -1.]
 [ 1. -1. -1. ... -1.  1. -1.]
 [-1.  1.  1. ...  1. -1.  1.]
 ...
 [ 1.  1.  1. ...  1.  1. -1.]
 [ 1.  1. -1. ...  1.  1.  1.]
 [-1.  1. -1. ... -1.  1.  1.]] with shape: (100, 17771)

Signature Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] with shape: (100, 103703)
Column: 0
Column: 1
Column: 2
Column: 3
Column: 4
Column: 5
Column: 6
Column: 7
Column: 8
Column: 9
Column: 10
Column: 11
Column: 12
Column: 13
Column: 14
Column: 15
Column: 16
Column: 17
Column: 18
Column: 19
Column: 20
Column: 21
Column: 22
Column: 23
Column: 24
Column: 25
Column: 26
Column: 27
Column: 28
Column: 29
Column: 30
Column: 31
Column: 32
Column: 33
Column: 34
Column: 35
Column: 36
Column: 37
Column: 38
Column: 39
Column: 40
Column: 41
Column: 42
Column: 43
Column: 44
Column: 45
Column: 46
Column: 47
Column: 48
Column: 49
Column: 50
Column: 

In [60]:
def cosine_similarity(p1,p2):
    #Calculate the magnitudes of vectors p1 and p2 
    mag_p1 = np.linalg.norm(p1)
    mag_p2 = np.linalg.norm(p2)
    
    #calculate the cosine similarity between vectors p1 and p2, see lecture 6 slide 6
    cos_dist = np.arccos(np.dot(p1,p2)/(mag_p1*mag_p2))
    cos_sim = 1 - cos_dist/np.pi
        
    return cos_sim

In [72]:


cos_sim_list = []

begin = timeit.default_timer()

for i in range(0, 5000):
    print("i=", i)
    for j in range(i+1, 5000):
        cos_sim = cosine_similarity(S[:, i], S[:, j])
        if cos_sim>0.73:
            cos_sim_list.append((cos_sim, i, j))
    

end = timeit.default_timer() - begin 
print('The total time it took was ', end/60, ' minutes')

print("cosine similarity list:", cos_sim_list)

i= 0
i= 1
i= 2
i= 3
i= 4
i= 5
i= 6
i= 7
i= 8
i= 9
i= 10
i= 11
i= 12
i= 13
i= 14
i= 15
i= 16
i= 17
i= 18
i= 19
i= 20
i= 21
i= 22
i= 23
i= 24
i= 25
i= 26
i= 27
i= 28
i= 29
i= 30
i= 31
i= 32
i= 33
i= 34
i= 35
i= 36
i= 37
i= 38
i= 39
i= 40
i= 41
i= 42
i= 43
i= 44
i= 45
i= 46
i= 47
i= 48
i= 49
i= 50
i= 51
i= 52
i= 53
i= 54
i= 55
i= 56
i= 57
i= 58
i= 59
i= 60
i= 61
i= 62
i= 63
i= 64
i= 65
i= 66
i= 67
i= 68
i= 69
i= 70
i= 71
i= 72
i= 73
i= 74
i= 75
i= 76
i= 77
i= 78
i= 79
i= 80
i= 81
i= 82
i= 83
i= 84
i= 85
i= 86
i= 87
i= 88
i= 89
i= 90
i= 91
i= 92
i= 93
i= 94
i= 95
i= 96
i= 97
i= 98
i= 99
i= 100
i= 101
i= 102
i= 103
i= 104
i= 105
i= 106
i= 107
i= 108
i= 109
i= 110
i= 111
i= 112
i= 113
i= 114
i= 115
i= 116
i= 117
i= 118
i= 119
i= 120
i= 121
i= 122
i= 123
i= 124
i= 125
i= 126
i= 127
i= 128
i= 129
i= 130
i= 131
i= 132
i= 133
i= 134
i= 135
i= 136
i= 137
i= 138
i= 139
i= 140
i= 141
i= 142
i= 143
i= 144
i= 145
i= 146
i= 147
i= 148
i= 149
i= 150
i= 151
i= 152
i= 153
i= 154
i= 155
i= 156
i= 157
i= 1