## Import Libraries

In [1]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

#cimport numpy as np # noqa
import numpy as np

from surprise import Reader, AlgoBase, PredictionImpossible
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.utils import get_rng
from surprise.model_selection import train_test_split
from surprise import accuracy

import pandas as pd
import os
import time
import math

from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, precision_score, recall_score
from math import sqrt

import matrices_generation as mg

#%reload_ext Cython
%load_ext Cython

## co-SVD

In [2]:
%%cython

cimport cython
cimport numpy as np
import pandas as pd
import numpy as np
import os
import math
import time
import sys
from surprise import AlgoBase, PredictionImpossible
from surprise.utils import get_rng
from functools import reduce
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
class co_SVD(AlgoBase):
    """Matrix-factorisation recommender trained jointly on three signals.

    Besides the usual user/item rating error (``err_r``), training also
    minimises a user/tag error (``err_p``, against ``p_ut.val``) and an
    item/tag error (``err_f``, against ``f_it.val``).  Users, items and tags
    each get a bias (``bu``, ``bi``, ``bt``) and a latent-factor row
    (``pu``, ``qi``, ``rt``).  Only the rating part is used by ``estimate``.

    Parameters
    ----------
    n_factors : int
        Number of latent factors (columns of ``pu``, ``qi`` and ``rt``).
    n_epochs : int
        Number of passes over the rating rows.
    biased : bool
        When true, biases are trained and added to the prediction.
    init_mean, init_std_dev : float
        Mean / standard deviation of the normal distribution used to
        initialise the factor matrices.
    lr_all : float
        Learning rate used for every ``lr_*`` argument left as ``None``.
    reg_all : float
        Accepted for signature compatibility; not read by this class.
    lr_bu, lr_bi, lr_bt : float or None
        Learning rates of the user, item and tag biases.
    lr_pu, lr_qi, lr_rt : float or None
        Learning rates of the user, item and tag factor matrices.
    reg_p, reg_f : float
        Weights of the user/tag and item/tag error terms in the updates.
    reg_r : float
        L2 shrinkage coefficient applied to biases and factors.
    random_state
        Forwarded to ``surprise.utils.get_rng``.
    verbose : bool
        When true, prints the current epoch number.
    p_ut : pandas.DataFrame
        Must provide the columns ``userId``, ``tid`` and ``val``.
    f_it : pandas.DataFrame
        Must provide the columns ``movieId``, ``tid`` and ``val``.
    tags
        Stored on the instance; not used by the training code below.
    ratings : pandas.DataFrame
        Must provide the columns ``userId``, ``movieId``, ``tid`` and
        ``rating``.
    """

    def __init__(
        self,
        n_factors=40,
        n_epochs=1,
        biased=True,
        init_mean=0,
        init_std_dev=.1,

        lr_all=.005,
        reg_all=.02,
        lr_bu=None,
        lr_bi=None,
        lr_bt=None,
        lr_pu=None,
        lr_qi=None,
        lr_rt=None,

        reg_p=.001,
        reg_r=.035,
        reg_f=1.5,

        random_state=None,
        verbose=False,
        p_ut=None,
        f_it=None,
        tags=None,
        ratings=None,
    ):
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.biased = biased
        self.init_mean = init_mean
        self.init_std_dev = init_std_dev

        # Any learning rate that was not given explicitly falls back to lr_all.
        self.lr_bu = lr_bu if lr_bu is not None else lr_all
        self.lr_bi = lr_bi if lr_bi is not None else lr_all
        self.lr_bt = lr_bt if lr_bt is not None else lr_all

        self.lr_pu = lr_pu if lr_pu is not None else lr_all
        self.lr_qi = lr_qi if lr_qi is not None else lr_all
        self.lr_rt = lr_rt if lr_rt is not None else lr_all

        self.reg_p = reg_p
        self.reg_f = reg_f
        self.reg_r = reg_r

        self.random_state = random_state
        self.verbose = verbose

        self.p_ut = p_ut
        self.f_it = f_it
        self.tags = tags
        self.ratings = ratings
        AlgoBase.__init__(self)

    def fit(self, trainset):
        """Register ``trainset`` with ``AlgoBase`` and train; returns ``self``."""
        AlgoBase.fit(self, trainset)
        self.sgd(trainset)
        return self

    def sgd(self, trainset):
        """Train biases and factor matrices on ``trainset``.

        Rows of ``self.ratings`` whose user and item both belong to the
        trainset are visited once per epoch.  Biases are updated after every
        row; the factor matrices are updated once per epoch from gradients
        accumulated over all rows.

        On return ``self.bu``, ``self.bi``, ``self.bt``, ``self.pu``,
        ``self.qi`` and ``self.rt`` hold the learned parameters.
        """
        cdef np.ndarray[np.double_t] bu
        cdef np.ndarray[np.double_t] bi
        cdef np.ndarray[np.double_t] bt

        cdef np.ndarray[np.double_t, ndim=2] pu
        cdef np.ndarray[np.double_t, ndim=2] qi
        cdef np.ndarray[np.double_t, ndim=2] rt

        cdef int u, i, t, f, raw_u, raw_i, raw_t
        cdef double r, p_put, p_fit, err_r, err_p, err_f, dot_r, dot_p, dot_f, puf, qif, rtf, global_mean_p, global_mean_f
        cdef double global_mean_r = self.trainset.global_mean

        cdef double lr_bu = self.lr_bu
        cdef double lr_bi = self.lr_bi
        cdef double lr_bt = self.lr_bt

        cdef double lr_pu = self.lr_pu
        cdef double lr_qi = self.lr_qi
        cdef double lr_rt = self.lr_rt

        cdef double reg_p = self.reg_p
        cdef double reg_f = self.reg_f
        cdef double reg_r = self.reg_r

        p_ut = self.p_ut
        f_it = self.f_it
        tags = self.tags
        ratings = self.ratings

        cdef int n_factors = self.n_factors

        # Raw ids of every user / item known to the trainset, indexed by
        # inner id.  The integer arrays assume raw ids are integers.
        raw_user = np.zeros(trainset.n_users, int)
        for i in trainset.all_users():
            raw_user[i] = trainset.to_raw_uid(i)

        raw_item = np.zeros(trainset.n_items, int)
        for i in trainset.all_items():
            raw_item[i] = trainset.to_raw_iid(i)

        # Keep only rating rows whose user and item are both in the trainset.
        final_raw = ratings[ratings.userId.isin(raw_user) & ratings.movieId.isin(raw_item)]
        raw_data = final_raw

        # Map each distinct non-NaN tag id to a dense index 0..n-1.
        uni_tid = raw_data.tid.unique()
        uni_tid = uni_tid[~np.isnan(uni_tid)]
        u_t = pd.DataFrame({'tid': uni_tid,
                            'tid_inner': range(len(uni_tid))})

        # The merge also gives raw_data a fresh 0..n-1 index, which the
        # positional .loc lookups in the training loop rely on.
        raw_data = pd.merge(raw_data, u_t, how='left', on=['tid'])

        # Restrict the tag matrices to the users / items / tags being trained
        # on, and replace the raw tag id by its dense index.
        final_p = p_ut[p_ut.userId.isin(raw_user) & p_ut.tid.isin(raw_data.tid)]
        final_f = f_it[f_it.movieId.isin(raw_item) & f_it.tid.isin(raw_data.tid)]

        final_p = pd.merge(final_p, u_t, how='left', on=['tid'])
        final_f = pd.merge(final_f, u_t, how='left', on=['tid'])

        p_ut = final_p.drop(['tid'], axis=1)
        f_it = final_f.drop(['tid'], axis=1)

        rng = get_rng(self.random_state)

        # NOTE(review): bt / rt are sized by the number of distinct tid_inner
        # values in the filtered p_ut, while `t` below comes from u_t.  The two
        # agree only if every tag in the training rows also occurs in the
        # filtered p_ut -- confirm against matrices_generation.
        n_tags = len(p_ut.tid_inner.unique())

        bu = np.zeros(trainset.n_users, np.double)
        bi = np.zeros(trainset.n_items, np.double)
        bt = np.zeros(n_tags, np.double)

        pu = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_users, n_factors))
        qi = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_items, n_factors))
        rt = rng.normal(self.init_mean, self.init_std_dev,
                        (n_tags, n_factors))

        global_mean_p = np.mean(p_ut.val)
        global_mean_f = np.mean(f_it.val)

        for current_epoch in range(self.n_epochs):
            if self.verbose:
                print("Processing epoch {}".format(current_epoch), end='\r')

            # Per-epoch gradient accumulators for the factor matrices.
            rv_sum = np.zeros((trainset.n_users, n_factors))
            pz_sum = np.zeros((trainset.n_users, n_factors))
            ru_sum = np.zeros((trainset.n_items, n_factors))
            fz_sum = np.zeros((trainset.n_items, n_factors))
            pu_sum = np.zeros((n_tags, n_factors))
            fv_sum = np.zeros((n_tags, n_factors))

            for rn in range(len(raw_data)):
                raw_u = raw_data.loc[rn, 'userId']
                raw_i = raw_data.loc[rn, 'movieId']

                u = trainset.to_inner_uid(raw_u)
                i = trainset.to_inner_iid(raw_i)

                r = raw_data.loc[rn, 'rating']

                # t == -1 marks a row without a tag.  Convert explicitly so a
                # float-valued tid_inner is never coerced to a C int
                # implicitly.
                tid_val = raw_data.loc[rn, 'tid_inner']
                if pd.isna(tid_val):
                    t = -1
                else:
                    t = int(tid_val)

                if t != -1:
                    # First matching value for this (user, tag) / (item, tag).
                    p_put = p_ut.val[(p_ut.userId == raw_u) & (p_ut.tid_inner == t)].values[0]
                    p_fit = f_it.val[(f_it.movieId == raw_i) & (f_it.tid_inner == t)].values[0]

                # NOTE(review): a missing rating is treated as 0; it is
                # excluded from the factor gradients below but err_r still
                # enters the bias updates -- confirm this is intended.
                if math.isnan(r):
                    r = 0

                dot_r = 0.0
                dot_p = 0.0
                dot_f = 0.0

                for f in prange(n_factors, nogil=True):
                    dot_r += qi[i, f] * pu[u, f]

                if t != -1:
                    for f in prange(n_factors, nogil=True):
                        dot_p += rt[t, f] * pu[u, f]
                        dot_f += rt[t, f] * qi[i, f]

                err_r = r - (global_mean_r + bu[u] + bi[i] + dot_r)

                if t != -1:
                    err_p = p_put - (global_mean_p + bu[u] + bt[t] + dot_p)
                    err_f = p_fit - (global_mean_f + bi[i] + bt[t] + dot_f)
                else:
                    err_p = 0.0
                    err_f = 0.0

                # Biases are updated immediately, row by row.
                if self.biased:
                    bu[u] -= lr_bu * (-1 * err_r - reg_p * err_p + reg_r * bu[u])
                    bi[i] -= lr_bi * (-1 * err_r - reg_f * err_f + reg_r * bi[i])
                    if t != -1:
                        bt[t] -= lr_bt * (-1 * reg_p * err_p - reg_f * err_f + reg_r * bt[t])

                # Factor gradients are only accumulated here and applied once
                # the whole epoch has been seen.
                if r != 0:
                    rv_sum[u] += err_r * qi[i]
                    ru_sum[i] += err_r * pu[u]

                if t != -1:
                    pz_sum[u] += err_p * rt[t]
                    fz_sum[i] += err_f * rt[t]
                    pu_sum[t] += err_p * pu[u]
                    fv_sum[t] += err_f * qi[i]

            # Each factor matrix is stepped with its own learning rate, so the
            # lr_qi / lr_rt constructor arguments take effect.
            pu -= lr_pu * (-1 * rv_sum - reg_p * pz_sum + reg_r * pu)
            qi -= lr_qi * (-1 * ru_sum - reg_f * fz_sum + reg_r * qi)
            rt -= lr_rt * (-1 * reg_p * pu_sum - reg_f * fv_sum + reg_r * rt)

        self.bu = bu
        self.bi = bi
        self.bt = bt
        self.pu = pu
        self.qi = qi
        self.rt = rt

    def estimate(self, u, i):
        """Predict the rating of inner user ``u`` for inner item ``i``.

        With ``biased`` set, the prediction starts from the trainset global
        mean and adds whichever of user bias, item bias and factor dot
        product are available.  Without it, the dot product alone is
        returned.

        Raises
        ------
        PredictionImpossible
            If ``biased`` is false and the user or the item is unknown.
        """
        known_user = self.trainset.knows_user(u)
        known_item = self.trainset.knows_item(i)

        if self.biased:
            est = self.trainset.global_mean

            if known_user:
                est += self.bu[u]

            if known_item:
                est += self.bi[i]

            if known_user and known_item:
                est += np.dot(self.qi[i], self.pu[u])
        else:
            if known_user and known_item:
                est = np.dot(self.qi[i], self.pu[u])
            else:
                raise PredictionImpossible('User and item are unknown')
        return est

## Import Dataset

In [3]:
# Which MovieLens release to evaluate on; switch by (un)commenting.
data_source = 'mlsmall'          # MovieLens dataset 2018
#data_source = 'ml-latest-small' # MovieLens dataset 2016

reader = Reader()

# Ratings and free-text tags both live in the release's own folder.
path = os.path.join('Dataset', data_source)
rate, raw_tags = (
    pd.read_csv(path + '/' + csv_name, encoding='utf-8')
    for csv_name in ('ratings.csv', 'tags.csv')
)

# Only the (user, item, rating) triple is handed to surprise.
data = Dataset.load_from_df(rate[['userId', 'movieId', 'rating']], reader)

# The Tag Genome files are obtained from the MovieLens latest 27M dataset and
# are stored in a separate folder per release.  For any other data_source
# value, `path` is left pointing at the ratings folder.
if data_source in ('mlsmall', 'ml-latest-small'):
    path = os.path.join(
        'Dataset',
        'ml-latest' if data_source == 'mlsmall' else 'ml-latest-2016',
    )

genome_tag, genome_score = (
    pd.read_csv(path + '/' + csv_name, encoding='utf-8')
    for csv_name in ('genome-tags.csv', 'genome-scores.csv')
)

## Tag Filtering

In [4]:
# Number of tagging records per tag text (rows are counted via userId).
gb_tags = raw_tags.groupby(['tag'], as_index=False)['userId'].count()

# Keep only the records whose tag was applied at least 3 times.
kept_tags = gb_tags.loc[gb_tags['userId'] >= 3, 'tag'].tolist()
raw_tags = raw_tags.loc[raw_tags['tag'].isin(kept_tags)].reset_index(drop=True)

print("Remaining Tag Records: {}".format(len(raw_tags)))

Remaining Tag Records: 2171


## Set Parameters

In [5]:
# Learning rate passed to co-SVD as lr_all.
lr = 0.006
# Number of training epochs per fitted model.
epoch = 20
# Number of train/test splits the evaluation is averaged over.
n_eval = 10
# Number of latent factors; 20, 30 or 40 are the values of interest.
factors = 40

## Generate Matrices

In [6]:
# Build the four inputs co_SVD expects from the ratings and the filtered tags.
# co_SVD.sgd reads userId/tid/val from p_ut, movieId/tid/val from f_it and
# userId/movieId/tid/rating from ratings; `tags` is only stored on the model.
p_ut, f_it, tags, ratings = mg.generateTagsOrigin(rate, raw_tags)

## Model Evaluation

In [7]:
# Ratings / estimates at or above this value form the positive class when
# computing precision and recall.
threshold = 3.5

# One row of metrics per split; the frame is built once after the loop
# because DataFrame.append no longer exists in pandas >= 2.0.
eval_rows = []

for j in range(n_eval):
    # The split index doubles as the seed of both the split and the model, so
    # every run of this cell reproduces the same numbers.
    trainset, testset = train_test_split(data, test_size=.3, random_state=j)
    algo = co_SVD(verbose=False, n_epochs=epoch, lr_all=lr, n_factors=factors,
                  p_ut=p_ut, f_it=f_it, tags=tags, ratings=ratings,
                  random_state=j)

    algo.fit(trainset)
    predictions = algo.test(testset)
    mae = accuracy.mae(predictions, verbose=False)
    rmse = accuracy.rmse(predictions, verbose=False)

    test_res = pd.DataFrame(predictions)
    test_res['actual_cat'] = (test_res['r_ui'] >= threshold).astype(int)
    test_res['pred_cat'] = (test_res['est'] >= threshold).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even when a split happens to contain
    # a single class, so the four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(
        test_res.actual_cat, test_res.pred_cat, labels=[0, 1]
    ).ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    eval_rows.append([j, precision, recall, rmse, mae])
    print("Eval: " + str(j+1), end="\r")

res = pd.DataFrame(eval_rows,
                   columns=['n_eval', 'precision', 'recall', 'rmse', 'mae'])

if data_source == "mlsmall":
    print("MovieLens Dataset 2018")
elif data_source == "ml-latest-small":
    print("MovieLens Dataset 2016")

# Report each metric averaged over all splits.
print("Reworked co-SVD - Result")
print(res.drop(columns=['n_eval']).mean())

MovieLens Dataset 2018
Reworked co-SVD - Result
precision    0.890244
recall       0.774372
rmse         0.637633
mae          0.489004
dtype: float64
