In [6]:
import numpy as np
from sklearn.decomposition import NMF
def load_data(filename):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, dtype = np.str , delimiter='\t')

In [7]:
def get_err(U, V, Y, reg=0.0):
    """
    Takes as input a matrix Y of triples (i, j, Y_ij) where i is the index of a user,
    j is the index of a movie, and Y_ij is user i's rating of movie j and
    user/movie matrices U and V.

    Returns the mean regularized squared-error of predictions made by
    estimating Y_{ij} as the dot product of the ith row of U and the jth column of V^T.
    """
    summation = 0
    U_val = np.linalg.norm(U)
    V_val = np.linalg.norm(V)
    print(len(Y))
    for x in range (len(Y)): 
        j = Y[x][0] - 1
        i = Y[x][1] - 1
        yij = Y[x][2]
        dot = np.dot(U[i], V[j])
        summation += (yij - dot) ** 2
    return ((reg/2) * ((U_val)**2 + (V_val)**2) + (summation/2)/ len(Y))

In [8]:
def get_m(y_train): 
    avg = 0
    for i in y_train: 
        avg += i[2]
    return avg/len(y_train)

In [10]:
# import the data 
# calculate M and N
# set  k = 20

Y_train = load_data('data/train.txt')
Y_test = load_data('data/test.txt')
Y_train = np.array([list(map(int, x)) for x in Y_train])
Y_test = np.array([list(map(int, x)) for x in Y_test])
m = get_m(Y_train)
M = max(max(Y_train[:,0]), max(Y_test[:,0])) # users
N = max(max(Y_train[:,1]), max(Y_test[:,1])) # movies

k = 20

print(M)
print(N)



943
1682


In [11]:
# create and fill in Y- matrix
Ymat = np.zeros((N, M))
print(len(Ymat))
for rating in Y_train:
    uid = rating[0]
    mid = rating[1]
    rating = rating[2]
    Ymat[mid - 1][uid - 1] = rating
# Y is matrix of all ratings that users(col) have given to movies(row)
# fill in every empty 0 with the average of the average user id rating
# and the average movie id rating

# find average user id rating for given row
av_uid = []
av_mid = []

for n in range(N):
    nonzeros = 0
    total = 0
    for m in range(M):
        if Ymat[n][m] != 0:
            nonzeros += 1
            total += Ymat[n][m]
    if nonzeros == 0:
        av_mid.append(0)
    else:
        av_mid.append(total/nonzeros)

for m in range(M):
    nonzeros = 0
    total = 0
    for n in range(N):
        if Ymat[n][m] != 0:
            nonzeros += 1
            total += Ymat[n][m]
    if nonzeros == 0:
        av_uid.append(0)
    else:
        av_uid.append(total/nonzeros)
        

for m in range(M):
    for n in range(N):
        if Ymat[n][m] == 0:
            Ymat[n][m] = (av_uid[m] + av_mid[n])/2



1682


In [12]:
print(Ymat)

[[5.         4.         3.30565693 ... 5.         4.07351408 3.61186499]
 [3.         3.43040452 2.93698347 ... 3.58471074 3.70484061 5.        ]
 [4.         3.40006662 2.90664557 ... 3.55437284 3.67450271 3.21285362]
 ...
 [2.8313253  2.86842105 2.375      ... 3.02272727 3.14285714 2.68120805]
 [3.3313253  3.36842105 2.875      ... 3.52272727 3.64285714 3.18120805]
 [3.3313253  3.36842105 2.875      ... 3.52272727 3.64285714 3.18120805]]


In [14]:
# create the model
nmf = NMF(n_components = k, init = 'random', shuffle = True)

In [15]:
U = nmf.fit_transform(Ymat)
V = nmf.components_
print(np.shape(U))
print(np.shape(V))
print(V)

(1682, 20)
(20, 943)
[[0.20736621 0.44602622 0.22629595 ... 0.41524381 0.33187054 0.42492229]
 [0.         0.45695218 0.42134058 ... 0.30154788 0.38615475 0.46039445]
 [0.36587004 0.20560185 0.10393686 ... 0.34480225 0.42678462 0.40509103]
 ...
 [0.15863302 0.23350427 0.13622642 ... 0.43769825 0.45026616 0.01840167]
 [0.18255441 0.52843319 0.62903682 ... 0.19826113 0.36224348 0.36567793]
 [0.10205003 0.20739256 0.24136877 ... 0.2297373  0.17711808 0.        ]]


In [16]:
len(V)
len(U)
len(Y_train)

90000

In [54]:
# must find the error 
get_err(U, np.transpose(V), Y_train, 0)



90000


0.348679007231092

In [17]:
get_err(U, np.transpose(V), Y_test, 0)


10000


0.4412614459478028

In [18]:
print(U)


[[0.02193574 0.05746223 0.72099116 ... 0.74238921 0.         0.42483925]
 [0.46835071 0.06571074 0.50949554 ... 0.49663773 0.31009651 0.07113106]
 [0.06493029 0.40718394 0.48903147 ... 0.43469783 0.59294598 0.39356361]
 ...
 [0.37924459 0.34986336 0.40634546 ... 0.35209193 0.27985294 0.21963248]
 [0.35945947 0.42609922 0.28561292 ... 0.33093071 0.48915386 0.35668272]
 [0.2938432  0.41836639 0.20708005 ... 0.3106855  0.5670045  0.3589315 ]]


In [19]:
print(V)

[[0.20736621 0.44602622 0.22629595 ... 0.41524381 0.33187054 0.42492229]
 [0.         0.45695218 0.42134058 ... 0.30154788 0.38615475 0.46039445]
 [0.36587004 0.20560185 0.10393686 ... 0.34480225 0.42678462 0.40509103]
 ...
 [0.15863302 0.23350427 0.13622642 ... 0.43769825 0.45026616 0.01840167]
 [0.18255441 0.52843319 0.62903682 ... 0.19826113 0.36224348 0.36567793]
 [0.10205003 0.20739256 0.24136877 ... 0.2297373  0.17711808 0.        ]]


In [22]:
V = nmf.fit_transform(Ymat)
U = nmf.components_
A, S, B = np.linalg.svd(V, full_matrices = False)
print(np.shape(A))
A = np.transpose(A)
print(np.shape(A))
A_12 = A[:,0:2]
print(np.shape(A_12))
A_12 = np.transpose(A_12)
print(np.shape(A_12))
V_s = np.dot(A_12, np.transpose(V))
U_s = np.dot(A_12, U)

(1682, 20)
(20, 1682)
(20, 2)
(2, 20)
