In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine 
from pytest import approx
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import pairwise_distances

In [4]:
MV_users = pd.read_csv('data/users.csv')
MV_movies = pd.read_csv('data/movies.csv')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [5]:
from collections import namedtuple
Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(MV_users, MV_movies, train, test)

# Part 2 Movie Matrix using NMF

I chose to use the Jaccard Algorithims with data that was transformed by NMF.  This ended up being simpler to implement and a fairly common use of NMF to shrink this data.  The performance was slightly worse than the origional method.



In [6]:
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
class RecSys():
    def __init__(self,data):
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        self.mid2idx = dict(zip(self.data.movies.mID,list(range(len(self.data.movies)))))
        self.uid2idx = dict(zip(self.data.users.uID,list(range(len(self.data.users)))))
        self.Mr=self.rating_matrix()
        self.Mm=None 
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))
        
    def rating_matrix(self):
        """
        Convert the rating matrix to numpy array of shape (#allusers,#allmovies)
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID] 
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)
        
        return np.array(coo_matrix((rating_train, (ind_user, ind_movie)), shape=(len(self.allusers), len(self.allmovies))).toarray())


    def predict_everything_to_3(self):
        """
        Predict everything to 3 for the test data
        """
        # Generate an array with 3s against all entries in test dataset
        # your code here
        a = np.empty(1)
        a.fill(3)
        return a
    
    def predict_to_user_average(self):
        """
        Predict to average rating for the user.
        Returns numpy array of shape (#users,)
        """
        # Generate an array as follows:
        # 1. Calculate all avg user rating as sum of ratings of user across all movies/number of movies whose rating > 0
        # 2. Return the average rating of users in test data
        # your code here
        test_users = [self.uid2idx[x] for x in self.data.test.uID]
        usr_avg = []
        for uid in test_users:
            col = self.Mr[uid]
            col = col[col > 0]
            usr_avg.append(np.mean(col))
        return np.array(usr_avg)
    
    def predict_from_sim(self,uid,mid):
        """
        Predict a user rating on a movie given userID and movieID
        """
        # Predict user rating as follows:
        # 1. Get entry of user id in rating matrix
        # 2. Get entry of movie id in sim matrix
        # 3. Employ 1 and 2 to predict user rating of the movie
        # your code here
        
        user_idx = self.uid2idx[uid]
        user_ratings = self.Mr[user_idx]
        movie_idx = self.mid2idx[mid]
        similarity_row = self.Mr[movie_idx]
        #print(user_ratings)
        #print(similarity_row)
        weighted_ratings = user_ratings * similarity_row
        total_weighted_ratings = np.sum(weighted_ratings)
        rated_movies = user_ratings > 0
        sum_of_similarities = np.sum(similarity_row[rated_movies])
        if sum_of_similarities != 0:
            predicted_rating = total_weighted_ratings / sum_of_similarities
        else:
            predicted_rating = np.mean(user_ratings[user_ratings > 0])
        return predicted_rating
    
    def predict(self):
        """
        Predict ratings in the test data. Returns predicted rating in a numpy array of size (# of rows in testdata,)
        """
        # your code here
        predicted_ratings = []
        for _, row in self.data.test.iterrows():
            uid = row['uID']
            mid = row['mID']
            predicted_rating = self.predict_from_sim(uid, mid)
            predicted_ratings.append(predicted_rating)
        return np.array(predicted_ratings)
        
    
    def rmse(self,yp):
        yp[np.isnan(yp)]=3 #In case there is nan values in prediction, it will impute to 3.
        yt=np.array(self.data.test.rating)
        return np.sqrt(((yt-yp)**2).mean())

    
class ContentBased(RecSys):
    def __init__(self,data):
        super().__init__(data)
        self.data=data
        self.Mm = self.calc_movie_feature_matrix()  
        
    def calc_movie_feature_matrix(self):
        """
        Create movie feature matrix in a numpy array of shape (#allmovies, #genres) 
        """
        movie_features = np.zeros((len(self.allmovies), len(self.genres)))
        for i, movie_id in enumerate(self.allmovies):
            movie_row = self.mid2idx[movie_id]
            movie_genres = self.data.movies.iloc[movie_row][self.genres]
            movie_features[i] = np.array(movie_genres)
        return movie_features
        
        
    
    def calc_item_item_similarity(self):
        """
        Create item-item similarity using Jaccard similarity
        """
        # Update the sim matrix by calculating item-item similarity using Jaccard similarity
        # Jaccard Similarity: J(A, B) = |A∩B| / |A∪B| 
        # your code here
        for i in range(len(self.allmovies)):
            for j in range(i, len(self.allmovies)):
                if i == j:
                    self.sim[i, j] = 1.0
                else:
                    genres_i = set(np.where(self.Mm[i] == 1)[0])
                    genres_j = set(np.where(self.Mm[j] == 1)[0])
                    intersection = genres_i.intersection(genres_j)
                    union = genres_i.union(genres_j)
                    jaccard_sim = len(intersection) / len(union)
                    self.sim[i, j] = jaccard_sim
                    self.sim[j, i] = jaccard_sim
                
class Collaborative(RecSys):    
    def __init__(self,data):
        super().__init__(data)
        
    def calc_item_item_similarity(self, simfunction, *X):  
        """
        Create item-item similarity using similarity function. 
        X is an optional transformed matrix of Mr
        """    
        # General function that calculates item-item similarity based on the sim function and data inputed
        if len(X)==0:
            self.sim = simfunction()            
        else:
            self.sim = simfunction(X[0]) # *X passes in a tuple format of (X,), to X[0] will be the actual transformed matrix
            
            
    def cossim(self):    
        """
        Calculates item-item similarity for all pairs of items using cosine similarity (values from 0 to 1) 
        on utility matrix. 
        
        Returns a cosine similarity matrix of size (#all movies, #all movies)
        
        Both Boundary checks (Step 5, 7) are important.
        """
        
        # MV_array_adujsted_zeros
        movie_ratings_allUsers = self.Mr.sum(axis=1)/(self.Mr > 0).sum(axis=1)
        
        # create a sparse matrix for operating cosine on its values
        movie_ratings_array = np.repeat(np.expand_dims(movie_ratings_allUsers, axis=1), self.Mr.shape[1], axis=1)
        
        # take care of all zero ratings (missing value/itentionally we don't know)
        MR_arr = self.Mr + (self.Mr==0) * movie_ratings_array - movie_ratings_array
        # print(MR_arr)
        
        # average all the ratings: divide by its magnitude!
        MR_arr_averaged = MR_arr/np.sqrt((MR_arr**2).sum(axis=0))
        
        # boundary check # 1: since dividing by magnitude may produce inf, zeros, etc.
        MR_arr_averaged[np.isnan(MR_arr_averaged)] = 0
        
        # item-item cosine sim
        # boundary check # 2: covariance/correlation values for A,A in diagonal is always 1.
        # - set diagonals explicility to 1
        cos = np.dot(MR_arr_averaged.T, MR_arr_averaged)
        for i in range(len(self.allmovies)):
            cos[i, i] = 1
        
        # Final cosine formula:
        # normalize - between 0-1s:
        self.sim =  0.5 + 0.5 * cos
    
    
    """def jacsim(self, Xr):
        
        #Calculates item-item similarity for all pairs of items using Jaccard similarity (values from 0 to 1)
        #Xr is the transformed rating matrix.
        
        jac_matrix = np.zeros((len(Xr.T), len(Xr.T)))
        #print(jac_matrix.shape)
        for i, ii in zip(Xr.T, enumerate(range(0,len(Xr.T)))):
            for j, jj in zip(Xr.T, enumerate(range(0,len(Xr.T)))):
                jac_matrix[ii,jj] = jaccard_score(i,j)
                
        return jac_matrix"""
        
    def jacsim(self, Xr):
        """
        Calculates item-item similarity for all pairs of items using Jaccard similarity (values from 0 to 1)
        Xr is the transformed rating matrix.
        """
        jaccard_similarity_matrix = np.zeros((Xr.shape[0], Xr.shape[0]))
        for i in range(0,Xr.shape[0]):
            for j in range(i+1,Xr.shape[1]):
                set_i = set(Xr[i])
                set_j = set(Xr[j])
                intersection = len(set_i.intersection(set_j))
                union = len(set_i.union(set_j))
                jaccard_similarity = intersection / union
                jaccard_similarity_matrix[i, j] = jaccard_similarity
                jaccard_similarity_matrix[j, i] = jaccard_similarity
                self.sim = jaccard_similarity_matrix

    def create_model(self, Xr,c):
        #print(train_data)
        #train_data = csr_matrix(train_data)
        model = NMF(n_components=c, init='random', random_state=0, solver = 'mu')
        self.model = model.fit(Xr, c)
        #H = model.components_
        #err = model.reconstruction_err_
        return self.model.transform(Xr)

In [17]:
cf = Collaborative(data)
Xr = cf.Mr>=3
NMF_data = cf.create_model(Xr, 100)
cf.calc_item_item_similarity(cf.jacsim,NMF_data)
yp = cf.predict()
rmse = cf.rmse(yp)
print(rmse)
    



    



1.1686969357092107


The RMSE for this was 1.1686969357092107