In [34]:
import numpy as np
from sklearn.decomposition import NMF
def load_data(filename):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, dtype = np.str , delimiter='\t')

In [74]:
def get_err(U, V, Y, reg=0.0):
    """
    Takes as input a matrix Y of triples (i, j, Y_ij) where i is the index of a user,
    j is the index of a movie, and Y_ij is user i's rating of movie j and
    user/movie matrices U and V.

    Returns the mean regularized squared-error of predictions made by
    estimating Y_{ij} as the dot product of the ith row of U and the jth column of V^T.
    """
    summation = 0
    U_val = np.linalg.norm(U)
    V_val = np.linalg.norm(V)
    print(len(Y))
    for x in range (len(Y)): 
        i = Y[x][0] - 1
        j = Y[x][1] - 1
        yij = Y[x][2] - 1
        dot = np.dot(U[i], V[j])
        summation += (yij - dot) ** 2
    return ((reg/2) * ((U_val)**2 + (V_val)**2) + (summation/2)/ len(Y))

In [35]:
def get_m(y_train): 
    avg = 0
    for i in y_train: 
        avg += i[2]
    return avg/len(y_train)

In [49]:
# import the data 
# calculate M and N
# set  k = 20

Y_train = load_data('/Users/josephcomo/desktop/data/train.txt')
Y_test = load_data('/Users/josephcomo/desktop/data/test.txt')
Y_train = np.array([list(map(int, x)) for x in Y_train])
Y_test = np.array([list(map(int, x)) for x in Y_test])
m = get_m(Y_train)
M = max(max(Y_train[:,0]), max(Y_test[:,0])) # users
N = max(max(Y_train[:,1]), max(Y_test[:,1])) # movies

k = 20



In [66]:
# create and fill in Y- matrix
Ymat = np.zeros((N, M))
for rating in Y_train:
    uid = rating[0]
    mid = rating[1]
    rating = rating[2]
    Y[uid - 1][mid - 1] = rating
# Y is matrix of all ratings that users(row) have given to movies(col)
# fill in every empty 0 with the average of the average user id rating
# and the average movie id rating

# find average user id rating for given row
av_uid = []
av_mid = []

for n in range(N):
    nonzeros = 0
    total = 0
    for m in range(M):
        if Y[m][n] != 0:
            nonzeros += 1
            total += Y[m][n]
    if nonzeros == 0:
        av_mid.append(0)
    else:
        av_mid.append(total/nonzeros)

for m in range(M):
    nonzeros = 0
    total = 0
    for n in range(N):
        if Y[m][n] != 0:
            nonzeros += 1
            total += Y[m][n]
    if nonzeros == 0:
        av_uid.append(0)
    else:
        av_uid.append(total/nonzeros)
        

for m in range(M):
    for n in range(N):
        if Y[m][n] == 0:
            Y[m][n] = (av_uid[m] + av_mid[n])/2
print(Y)

[[5.         3.         4.         ... 3.0813253  3.3313253  3.3313253 ]
 [4.         5.         3.         ... 3.09220532 3.34220532 3.34220532]
 [3.51667844 4.         3.11233729 ... 2.83602151 3.08602151 3.08602151]
 ...
 [5.         3.78669009 3.4906015  ... 3.21428571 3.46428571 3.46428571]
 [4.07351408 3.96526151 3.66917293 ... 3.39285714 3.64285714 3.64285714]
 [3.62144641 5.         3.21710526 ... 2.94078947 3.19078947 3.19078947]]


In [67]:
# create the model
nmf = NMF(n_components = k, init = 'random', shuffle = True)

In [68]:
U = nmf.fit_transform(Y)
V = nmf.components_
print(np.shape(U))
print(np.shape(V))
print(V)

(943, 20)
(20, 1682)
[[0.06164154 0.26596515 0.45794542 ... 0.24129932 0.31284554 0.11700927]
 [2.79223911 0.2413888  0.866924   ... 1.91116471 1.66279289 0.        ]
 [0.         0.29127409 0.48270922 ... 0.30707399 0.32903096 0.28458387]
 ...
 [0.39025042 0.27039557 0.25392669 ... 0.36907405 0.39860011 0.12793634]
 [0.00894618 0.25653243 0.27584471 ... 0.36895891 0.42445889 0.1471165 ]
 [0.57432386 1.17934415 0.33081935 ... 0.16431448 0.19724702 0.61164621]]


In [76]:
# must find the error 
get_err(U, np.transpose(V), Y_train, 0)



90000


0.9187308756292644

In [77]:
get_err(U, np.transpose(V), Y_test, 0)


10000


1.0069620860715593