In [1]:
import numpy as np
from sklearn.decomposition import NMF
def load_data(filename):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, dtype = np.str , delimiter='\t')

In [26]:
def get_err(U, V, Y, reg=0.0):
    """
    Takes as input a matrix Y of triples (i, j, Y_ij) where i is the index of a user,
    j is the index of a movie, and Y_ij is user i's rating of movie j and
    user/movie matrices U and V.

    Returns the mean regularized squared-error of predictions made by
    estimating Y_{ij} as the dot product of the ith row of U and the jth column of V^T.
    """
    summation = 0
    U_val = np.linalg.norm(U)
    V_val = np.linalg.norm(V)
    print(len(Y))
    for x in range (len(Y)): 
        j = Y[x][0] - 1
        i = Y[x][1] - 1
        yij = Y[x][2]
        dot = np.dot(U[i], V[j])
        summation += (yij - dot) ** 2
    return ((reg/2) * ((U_val)**2 + (V_val)**2) + (summation/2)/ len(Y))

In [3]:
def get_m(y_train): 
    avg = 0
    for i in y_train: 
        avg += i[2]
    return avg/len(y_train)

In [40]:
# import the data 
# calculate M and N
# set  k = 20

Y_train = load_data('/Users/josephcomo/desktop/data/train.txt')
Y_test = load_data('/Users/josephcomo/desktop/data/test.txt')
Y_train = np.array([list(map(int, x)) for x in Y_train])
Y_test = np.array([list(map(int, x)) for x in Y_test])
m = get_m(Y_train)
M = max(max(Y_train[:,0]), max(Y_test[:,0])) # users
N = max(max(Y_train[:,1]), max(Y_test[:,1])) # movies

k = 20

print(M)
print(N)



943
1682


In [47]:
# create and fill in Y- matrix
Ymat = np.zeros((N, M))
print(len(Ymat))
for rating in Y_train:
    uid = rating[0]
    mid = rating[1]
    rating = rating[2]
    Ymat[mid - 1][uid - 1] = rating
# Y is matrix of all ratings that users(col) have given to movies(row)
# fill in every empty 0 with the average of the average user id rating
# and the average movie id rating

# find average user id rating for given row
av_uid = []
av_mid = []

for n in range(N):
    nonzeros = 0
    total = 0
    for m in range(M):
        if Ymat[n][m] != 0:
            nonzeros += 1
            total += Ymat[n][m]
    if nonzeros == 0:
        av_mid.append(0)
    else:
        av_mid.append(total/nonzeros)

for m in range(M):
    nonzeros = 0
    total = 0
    for n in range(N):
        if Ymat[n][m] != 0:
            nonzeros += 1
            total += Ymat[n][m]
    if nonzeros == 0:
        av_uid.append(0)
    else:
        av_uid.append(total/nonzeros)
        

for m in range(M):
    for n in range(N):
        if Ymat[n][m] == 0:
            Ymat[n][m] = (av_uid[m] + av_mid[n])/2



1682


In [50]:
print(Ymat)

[[5.         4.         3.30565693 ... 5.         4.07351408 3.61186499]
 [3.         3.43040452 2.93698347 ... 3.58471074 3.70484061 5.        ]
 [4.         3.40006662 2.90664557 ... 3.55437284 3.67450271 3.21285362]
 ...
 [2.8313253  2.86842105 2.375      ... 3.02272727 3.14285714 2.68120805]
 [3.3313253  3.36842105 2.875      ... 3.52272727 3.64285714 3.18120805]
 [3.3313253  3.36842105 2.875      ... 3.52272727 3.64285714 3.18120805]]


In [51]:
# create the model
nmf = NMF(n_components = k, init = 'random', shuffle = True)

In [52]:
U = nmf.fit_transform(Ymat)
V = nmf.components_
print(np.shape(U))
print(np.shape(V))
print(V)

(1682, 20)
(20, 943)
[[0.37622928 0.38049334 0.36477101 ... 0.24417457 0.41548779 0.00907419]
 [1.1908228  0.43323128 0.32723359 ... 0.24715331 0.16331672 0.28613844]
 [0.14667202 0.37108    0.31271984 ... 0.28788284 0.36716587 0.27138349]
 ...
 [0.30773201 0.59711391 0.3130761  ... 0.57154587 0.50236282 0.20991555]
 [0.03200452 0.32467077 0.20051744 ... 0.56162983 0.74302809 0.55109477]
 [0.         0.38481595 0.28791045 ... 0.36729365 0.32137958 0.66122537]]


In [53]:
len(V)
len(U)
len(Y_train)

90000

In [54]:
# must find the error 
get_err(U, np.transpose(V), Y_train, 0)



90000


0.348679007231092

In [55]:
get_err(U, np.transpose(V), Y_test, 0)


10000


0.4416915984133395

In [57]:
print(U)


[[0.10857925 0.         0.         ... 0.21652421 0.68858534 0.        ]
 [0.4517871  0.15346512 0.56132522 ... 0.4182991  0.24775196 0.51927761]
 [0.63726648 0.42110184 0.56284404 ... 0.692915   0.18479861 0.50571327]
 ...
 [0.37767793 0.20431949 0.45602257 ... 0.39190568 0.3278826  0.37233183]
 [0.36897622 0.27340656 0.40601127 ... 0.31944217 0.2348388  0.39097003]
 [0.35678429 0.23241829 0.3441881  ... 0.2692276  0.14271895 0.33662331]]


In [56]:
print(V)

[[0.37622928 0.38049334 0.36477101 ... 0.24417457 0.41548779 0.00907419]
 [1.1908228  0.43323128 0.32723359 ... 0.24715331 0.16331672 0.28613844]
 [0.14667202 0.37108    0.31271984 ... 0.28788284 0.36716587 0.27138349]
 ...
 [0.30773201 0.59711391 0.3130761  ... 0.57154587 0.50236282 0.20991555]
 [0.03200452 0.32467077 0.20051744 ... 0.56162983 0.74302809 0.55109477]
 [0.         0.38481595 0.28791045 ... 0.36729365 0.32137958 0.66122537]]
