In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine 
from pytest import approx
from sklearn.decomposition import NMF
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import normalize

In [2]:
MV_users = pd.read_csv('data/users.csv')
MV_movies = pd.read_csv('data/movies.csv')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
from collections import namedtuple
Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(MV_users, MV_movies, train, test)

In [4]:
class RecSys():
    def __init__(self,data):
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        self.mid2idx = dict(zip(self.data.movies.mID,list(range(len(self.data.movies)))))
        self.uid2idx = dict(zip(self.data.users.uID,list(range(len(self.data.users)))))
        self.Mr=self.rating_matrix()
        self.Mm=None 
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))
        
    def rating_matrix(self):
        """
        Convert the rating matrix to numpy array of shape (#allusers,#allmovies)
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID] 
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)
        
        return np.array(coo_matrix((rating_train, (ind_user, ind_movie)), shape=(len(self.allusers), 
                                                                                 len(self.allmovies))).toarray())
    def predict_with_nfm(self):
        nmf =  NMF(n_components=5) #ratings from 1 to 5
        nmf_pred = nmf.fit_transform(self.data.train)
        predictions = nmf.transform(self.data.test)
        return np.argmax(predictions, axis=1)
    def predict_with_nfm(self):
        train = normalize(self.data.train)
        test = normalize(self.data.test)
        nmf =  NMF(n_components=5) #ratings from 1 to 5
        nmf_pred = nmf.fit_transform(train)
        predictions = nmf.transform(test)
        return np.argmax(predictions, axis=1)
        
        
    def rmse(self,yp):
        yp[np.isnan(yp)]=3 #In case there is nan values in prediction, it will impute to 3.
        yt=np.array(self.data.test.rating)
        return np.sqrt(((yt-yp)**2).mean())



In [5]:
rs = RecSys(data)
yp = rs.predict_with_nfm()
print("rmse",rs.rmse(yp))

rmse 2.925481623088295


<br> 
We applied skelarn non negative matrix factorization on our movie rating dataset and we get rmse value 2.925481623088295 which is even worse than the rmse value 1.2642784503423288 where we predicted everything to 3.
 
Sklearn Non-Negative factorization did not perform well compared to simple baseline or similarity-based methods because NMF fails to perform if the underlaying dataset in highly sparse matrix.  It fails to extract meaningful information and latent features if data is sparse. Since the rating matrix we constructed from movies and user ratings is a highly sparse matrix so NFM performed worst of all methods in this case. 
<br>
<br>
<br>
Performance in this case can be improved by multiple techniques.

<b>Missing value imputation:<b>

If we impute missing values in our dataset we can decrease the sparsity of data and NMF performance can be improved.

<b>Feature reduction:</b>

Feature reduction techniques can be applied to fix the performance issue of NMF for sparse data.Techniques like like Singular Value Decomposition or Principal Component Analysis can be used to reduce features and improve performance of NMF.




