In [1]:
import numpy as np 
from scipy.sparse import csr_matrix, dok_matrix, random
from scipy.sparse import random as sparse_random

import pandas as pd 
from datetime import datetime 
import random

pd.set_option('display.max_colwidth', None)

In [2]:
movie = pd.read_csv('movies_metadata.csv')

movie = movie[['id', 'title', 'genres']]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
p = 0.00001  # 0.1 of the lines
rating = pd.read_csv(
         'ratings.csv',
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p)

user_movie_rating  = pd.pivot_table(data = rating, columns = 'movieId', index= 'userId', values = 'rating', aggfunc=np.mean)

In [4]:
user_movie_rating.head()

movieId,25,32,34,45,48,50,62,110,112,144,...,81932,82169,84152,87430,99106,102125,103339,115149,134130,152081
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1136,,,,,,,,,,,...,,,,,,,,,,
2467,,,,,,,,,,,...,,,,,,,,,,
2656,,,,,,,,,,,...,,,,,,,,,,
2847,,,,,,,,,,,...,,,,,,,,,,
4095,,,,,,,,,,,...,,,,,3.0,,,,,


In [None]:
# --

In [46]:
def cross_validation(X, N, M, fold):
    """
    Given a matrix X with dimension (N,M),
    This function creates 4 sets of train & test matrices, stored in a dictionary.
    4-fold creates test matric covered with 75% zeros.
    
    input - X (Matrix), (N,M) Shape of X, f - Folder number (0,1,2,3)
    return - masked data according to f variable (folder number)
    """
    # Create a dict with the slicing indices
    rows = N
    cols = M
    mid_rows = int(rows/2)
    mid_cols = int(cols/2)
    
    idx_dict = {
                0: [[0,mid_rows],[0, mid_cols]],
                1: [[0,mid_rows],[mid_cols, cols]],
                2: [[mid_rows, rows], [0, mid_cols]],
                3: [[mid_rows, rows], [mid_cols, cols]]
    }
    
    idexes = idx_dict[fold]
    
    # Create masks
    train_mask = np.full((rows, cols), 1)
    train_mask[idexes[0][0]:idexes[0][1], idexes[1][0]:idexes[1][1]] = 0
    test_mask = 1 - train_mask
    
    # Create X_train
    X_train = X.copy()
    X_train[train_mask==0] = 0
    
    # Create X_test
    X_test = X.copy()
    X_test[train_mask==1] = 0
        
    return X_train, X_test, train_mask, test_mask
    

In [47]:
def matrix_factorization_model(X, Mask, lat_feat, epochs = 1000, learn_rate=0.005, beta=0.1):
    """
    this function factorizes a given mattix (X) such that X = P*Q.T when P, Q dimension are dependent on #latent_features
    
    input - 
    X - Rating Matrix,
    Mask - mask of non-null values in X 
    epochs - number of epochs for the model 
    lat_feat - number of latent features to factorize the matrix by
    learn_rate, beta - Error parameters
    
    output - 
    est_X - the approximated matrix 
    P, Q.T - the factorizing matrices 
    err - error rate 
    """
    
    N = len(X)
    M = len(X[0])
    P = sparse_random(N,lat_feat, density=0.2).toarray()
    Q = sparse_random(M,lat_feat, density=0.2).toarray()
    
    Q = Q.T
    sparse = csr_matrix(X)
    sparse_dict = sparse.todok()

    mask = Mask
    
    for epoch in range(epochs):
        
        for key in sparse_dict.keys():
            [i, j] = key
            value = sparse_dict[key]
            eij = value - np.dot(P[i,:], Q[:, j])
            
            # Gradient descent 
            for k in range(lat_feat):
                p_ik = P[i,k]
                
                P[i,k] = p_ik + learn_rate * (2 * eij * Q[k,j] - beta * P[i,k])
                Q[k,j] = Q[k,j] + learn_rate * (2 * eij * p_ik - beta * Q[k,j])

        
        est_X = np.dot(P,Q)
    
        err = np.sum(np.square(X - est_X * mask))  + 0.5 * beta * (1/lat_feat) *(1/len(sparse_dict.keys())) * (
                                                                    np.linalg.norm(P)**2 + np.linalg.norm(Q)**2)
        
        if epoch % 500 ==0:
            print ('iteration number: %i, err_rate: %e' % (epoch, err))
            
        if err < 0.01:
            break

    return est_X, P, Q.T, err

In [53]:
def model_crossV_4fold(X, Mask, lat_feat, epochs = 10000,  learn_rate=0.005, beta=0.1):
    """
    Matrix factorization model using cross-validation with 4 folds. 
    
    input - 
    X - Rating Matrix,
    Mask - mask of non-null values in X 
    epochs - number of epochs for the model 
    lat_feat - number of latent features to factorize the matrix by
    learn_rate, beta - Error parameters
    
    output - 
    train_err_mean, teat_err_mean - train & test mean error over 4 folders 
    mean_EST_X - the mean over all 4 approximated matrices given 4 folders 
    """
    mask = Mask
    N = len(X)
    M = len(X[0])
    
    # fold_tups is where we collect the attributes from each fold
    fold_tups = []
    EST_Rating = []
    
    # Cross validation by 4 folds
    for f in range(4):
        print ('fold number {}'.format(f))
        print ('- - - ')
        
        X_train, X_test, train_mask, test_mask = cross_validation(X, N, M, f)
        
        train_null_mask = mask * train_mask
        test_null_mask = mask * test_mask
        
        x_train_masked = train_null_mask * X_train
        x_test_masked = test_null_mask * X_test
               
        # Len of input matrix
        
        #  Initialzing Random Matrices for model
        P = sparse_random(N,lat_feat, density=0.2).toarray()
        Q = sparse_random(M,lat_feat, density=0.2).toarray()
        
       

        est_X ,nP, nQ, train_err = matrix_factorization_model(x_train_masked , mask, lat_feat, epochs,
                                                         learn_rate, beta)
        
        test_pred = est_X * test_null_mask
        X_test = X * test_null_mask
        
        test_err = (1/np.count_nonzero(X_test)) * np.sum(np.square(X_test - test_pred * mask))
        print ('fold {}, Test Error {}'.format(f, test_err))
        print ('- - -')

        fold_tup = (est_X, train_err, test_err, nP, nQ)
        fold_tups.append(fold_tup)   
        EST_Rating.append(est_X)

    train_err_mean = np.mean([x[1] for x in fold_tups])
    test_err_mean = np.mean([x[2] for x in fold_tups])
    
    mean_EST_X = np.mean([x[0] for x in fold_tups])
    
    print ('Mean Test error is {}'.format(test_err_mean))
    
    
    return train_err_mean, test_err_mean, mean_EST_X
                    

In [54]:
def user_top_ten(X, Null_Mask, movie_table, user_id):
    """
    input - 
    X - prediction Rating Data frame,
    Null_Mask - mask of null values in X 
    movie_table - Movie database (Pandas DF)
    
    output - 
    movie_id & movie_names - list of top 10 movies recommended for a user
    """
    
    Rating_table = X * Null_Mask
    user_rating = Rating_table.loc[user_id]
    top_ten = user_rating.nlargest(10)
    movie_id = list(np.array(top_ten.index))
    movie_id = [str(x) for x in movie_id]
    
    movie_names = movie_table[movie_table['id'].isin(movie_id)]
    movie_names = list(movie_names['title'].values)
    
    return movie_id , movie_names

In [55]:
def top_ten_user(Rating, movie_table, user, lat_feat = 15, epochs = 1000, learn_rate=0.005, beta=0.1):
    """
    input -
    Rating - Original rating DF
    movie_table - Original movie DF
    user_id - the user to give the 10 most recommended movies
    lat_feat, epochs, learn_rate, beta - arguments for the factorization model 

    
    output - 
    movie_id & movie_names - list of top 10 movies recommended for a user
    rate_pred - the prediction Dataframe from the model (model_crossV_4fold)
    """
    
    Rating_array= Rating.to_numpy()
    Nan_mask = ~np.isnan(Rating_array)
    Rating_fin= np.nan_to_num(Rating_array)
    
    train_err_mean, test_err_mean, mean_EST_X = model_crossV_4fold(Rating_fin, Nan_mask, lat_feat=15, 
                                                                   epochs = 1000, learn_rate=0.005, beta=0.1)
    
    
    rate_pred = pd.DataFrame(mean_EST_X, index = Rating.index, columns = Rating.columns)
    Null_place = pd.isnull(Rating)
    
    movie_id , movie_names = user_top_ten(rate_pred, Null_place, movie_table, user)
    
    print('Top movies suggested for user {}, are : {}'.format(user,movie_names))
    
    return movie_id , movie_names, rate_pred

In [56]:
# ---

In [57]:
np.random.seed(4)

id_movie, rate, pred_df = top_ten_user(user_movie_rating, movie, 4095, lat_feat = 15, epochs = 500, learn_rate=0.005, beta=0.1)

fold number 0
- - - 
iteration number: 0, err_rate: 2.023620e+03
iteration number: 500, err_rate: 1.200454e+01
fold 0, Test Error 14.537953180150529
- - -
fold number 1
- - - 
iteration number: 0, err_rate: 2.056809e+03
iteration number: 500, err_rate: 9.295555e+00
fold 1, Test Error 11.440008816325596
- - -
fold number 2
- - - 
iteration number: 0, err_rate: 2.012827e+03
iteration number: 500, err_rate: 7.803531e+00
fold 2, Test Error 12.330562867060516
- - -
fold number 3
- - - 
iteration number: 0, err_rate: 2.147719e+03
iteration number: 500, err_rate: 7.931269e+00
fold 3, Test Error 12.459191958093037
- - -
Mean Test error is 12.691929205407419
Top movies suggested for user 4095, are : ['Three Colors: Red', '2001: A Space Odyssey', 'Wings of Desire', 'Italian for Beginners', 'Jarhead']
