In [12]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import NMF
import pandas as pd # install pandas via conda

In [15]:
digits = load_digits()

data = digits["data"]
images = digits["images"]
target = digits["target"]
target_names = digits["target_names"]


print(data.shape)

(1797, 64)


In [45]:
def non_negative(data, num_components):
    H = np.abs(np.random.randn(num_components,data.shape[1]))
    Z = np.abs(np.random.randn(data.shape[0], num_components))

    #print(np.matmul(np.matmul(Z.T, Z), H.T).shape)
    #print(np.matmul(H.T, data).shape)

    for i in range(0,1000):
        H = np.multiply(H, np.multiply(np.matmul(np.transpose(Z), data), 1/np.clip(np.matmul(np.matmul(Z.T, Z), H), 0.0001, None)))
        Z = np.multiply(Z, np.multiply(np.matmul(data, H.T), 1 / np.clip(np.matmul(np.matmul(Z, H), H.T), 0.0001, None)))
    
    return H, Z

        
model = NMF(n_components=10, max_iter=1000)
W = model.fit_transform(data)
H = model.components_

H_1, Z = non_negative(data, 10)


(10, 64)
(1797, 10)


<div style="color: green; font-weight: bold">You forgot to recalculate X for each iteration. You could have also calculated the loss and plotted it to see that it decreases with t. Also you should've plotted H and compared it to sklearn's implementation</div>

In [87]:
ratings_cols = ['user id','movie id','rating','timestamp']
movies_cols = ['movie id','movie title','release date','video release date','IMDb URL','unknown','Action',
'Adventure','Animation','Childrens','Comedy','Crime',
'Documentary','Drama','Fantasy','Film-Noir','Horror',
'Musical','Mystery','Romance','Sci-Fi','Thriller',
'War' ,'Western']
users_cols = ['user id ','age','gender','occupation','zip code']

users = pd.read_csv('ml-100k/u.user', sep ='|', names = users_cols , encoding ='latin-1')
movies = pd.read_csv('ml-100k/u.item', sep='|', names = movies_cols , encoding ='latin-1')
ratings = pd.read_csv('ml-100k/u.data', sep ='\t', names = ratings_cols , encoding ='latin-1')
# peek at the dataframes , if you like :)
users.head()
movies.head()
ratings.head()
print(type(movies))

fill_value = 0
rat_df = ratings.pivot(index = 'user id', columns ='movie id', values = 'rating').fillna(fill_value)
rat_df.head()

print("Zeros before", len(np.where(rat_df.to_numpy() == 0)[0]))

H, Z = non_negative(rat_df.to_numpy(), 5)

print("Zeros after", len(np.where(H == 0)[0]), len(np.where(Z == 0)[0]))

reconstruction = pd.DataFrame(Z @ H, columns = rat_df.columns)


# print(reconstruction)
# print(np.argmax(reconstruction[10]))
# print(np.max(reconstruction[10]))
# print(movies.loc[movies['movie id'] == np.argmax(reconstruction[10])])
# print(np.transpose(np.argwhere(reconstruction[10] > 1.5))[0])
#print(ratings.loc[ratings['user id'] == 10].loc[ratings["movie id"].isin(np.transpose(np.argwhere(reconstruction[10] > 1.5))[0])])

<class 'pandas.core.frame.DataFrame'>
Zeros before 1486126
(5, 1682)
(943, 5)
Zeros after 996 235


<div style="color: green; font-weight: bold">Seems like a fine solution. You could have also used the top N rated movies instead of the movies above a threshold. You could have also investigated the latent dimensions to see if they coincide with genres.</div>

In [86]:
def recommend_movies(reconstruction, user_id, movies, ratings):
    #we took all the movies that have a rating over 2 and removed the movies that have already been rated
    return movies.loc[movies['movie id'].isin(np.transpose(np.argwhere(reconstruction[10] > 2))[0]) & ~movies['movie id'].isin(ratings.loc[ratings['user id'] == 10].loc[ratings["movie id"].isin(np.transpose(np.argwhere(reconstruction[10] > 1.5))[0])])]
    
predictions = recommend_movies(reconstruction, 30, movies, ratings)
print(predictions)

     movie id                       movie title release date   
88         89               Blade Runner (1982)  01-Jan-1982  \
472       473  James and the Giant Peach (1996)  12-Apr-1996   

     video release date                                           IMDb URL   
88                  NaN  http://us.imdb.com/M/title-exact?Blade%20Runne...  \
472                 NaN  http://us.imdb.com/M/title-exact?James%20and%2...   

     unknown  Action  Adventure  Animation  Childrens  ...  Fantasy   
88         0       0          0          0          0  ...        0  \
472        0       0          0          1          1  ...        0   

     Film-Noir  Horror  Musical  Mystery  Romance  Sci-Fi  Thriller  War   
88           1       0        0        0        0       1         0    0  \
472          0       0        1        0        0       0         0    0   

     Western  
88         0  
472        0  

[2 rows x 24 columns]
