## Import Libraries

In [1]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

#cimport numpy as np # noqa
import numpy as np

from surprise import Reader, AlgoBase, PredictionImpossible
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.utils import get_rng
from surprise.model_selection import train_test_split
from surprise import accuracy

import pandas as pd
import os
import time
import math

from sklearn.metrics import confusion_matrix, mean_squared_error, mean_absolute_error, precision_score, recall_score
from math import sqrt

#%reload_ext Cython
%load_ext Cython

## co-SVD

In [2]:
%%cython

cimport numpy as np
import pandas as pd
import numpy as np
import os
import math
import time
import sys
from surprise import AlgoBase, PredictionImpossible
from surprise.utils import get_rng
from functools import reduce
from cython.parallel import prange

class co_SVD(AlgoBase):
    def __init__ (
        self
        , n_factors = 40
        , n_epochs = 1
        , biased = True
        , init_mean = 0
        , init_std_dev=.1

        , lr_all=.005
        , reg_all=.02
        , lr_bu=None
        , lr_bi=None
        , lr_bt=None
        , lr_pu=None
        , lr_qi=None
        , lr_rt=None

        , reg_p=.001
        , reg_r=.035
        , reg_f=1.5

        , random_state=None
        , verbose=False
        , p_ut=None
        , f_it=None
        , tags=None
        , ratings=None
        ):

        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.biased = biased
        self.init_mean = init_mean
        self.init_std_dev = init_std_dev
        self.lr_bu = lr_bu if lr_bu is not None else lr_all
        self.lr_bi = lr_bi if lr_bi is not None else lr_all
        self.lr_bt = lr_bt if lr_bt is not None else lr_all

        self.lr_pu = lr_pu if lr_pu is not None else lr_all
        self.lr_qi = lr_qi if lr_qi is not None else lr_all
        self.lr_rt = lr_rt if lr_rt is not None else lr_all

        self.reg_p = reg_p
        self.reg_f = reg_f
        self.reg_r = reg_r

        self.random_state = random_state
        self.verbose = verbose
        
        self.p_ut = p_ut
        self.f_it = f_it
        self.tags = tags
        self.ratings = ratings
        AlgoBase.__init__(self)
        
   
  
    def fit(self,trainset):
        AlgoBase.fit(self, trainset)       
        self.sgd(trainset)
        return self

    def sgd(self, trainset):
        cdef np.ndarray[np.double_t] bu
        cdef np.ndarray[np.double_t] bi
        cdef np.ndarray[np.double_t] bt

        cdef np.ndarray[np.double_t, ndim=2] pu
        cdef np.ndarray[np.double_t, ndim=2] qi
        cdef np.ndarray[np.double_t, ndim=2] rt

        cdef int u, i, t, f, raw_u, raw_i, raw_t
        cdef double r, p_put, p_fit, err_r, err_p, err_f, dot_r, dot_p, dot_f, puf, qif, rtf, global_mean_p, global_mean_f
        cdef double global_mean_r = self.trainset.global_mean

        cdef double lr_bu = self.lr_bu
        cdef double lr_bi = self.lr_bi
        cdef double lr_bt = self.lr_bt

        cdef double lr_pu = self.lr_pu
        cdef double lr_qi = self.lr_qi
        cdef double lr_rt = self.lr_rt

        cdef double reg_p = self.reg_p
        cdef double reg_f = self.reg_f
        cdef double reg_r = self.reg_r

        p_ut = self.p_ut
        f_it = self.f_it
        tags = self.tags
        ratings = self.ratings
        
        cdef int n_factors = self.n_factors
        raw_user = np.zeros(trainset.n_users, int)
        for i in trainset.all_users():
            raw_user[i] = trainset.to_raw_uid(i)

        raw_item = np.zeros(trainset.n_items, int)
        for i in trainset.all_items():
            raw_item[i] = trainset.to_raw_iid(i)
        
        final_raw = ratings[ratings.userId.isin(raw_user) & ratings.movieId.isin(raw_item)]
        raw_data = final_raw

        uni_tid = raw_data.tid.unique()
        uni_tid = uni_tid[~np.isnan(uni_tid)]
        u_t = pd.DataFrame({'tid': uni_tid,
                            'tid_inner':range(len(uni_tid))})
        
        raw_data = pd.merge(raw_data, u_t, how ='left', on=['tid'])
        
        final_p = p_ut[p_ut.userId.isin(raw_user) & p_ut.tid.isin(raw_data.tid)]
        final_f = f_it[f_it.movieId.isin(raw_item) & f_it.tid.isin(raw_data.tid)]
        
        final_p = pd.merge(final_p, u_t, how='left', on=['tid'])
        final_f = pd.merge(final_f, u_t, how='left', on=['tid'])

        p_ut = final_p.drop(['tid'], axis=1)
        f_it = final_f.drop(['tid'], axis=1)

        rng = get_rng(self.random_state)
        
        bu = np.zeros(trainset.n_users, np.double)
        bi = np.zeros(trainset.n_items, np.double)

        bt = np.zeros(len(p_ut.tid_inner.unique()), np.double)
        
        pu = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_users, self.n_factors))
        qi = rng.normal(self.init_mean, self.init_std_dev,
                        (trainset.n_items, self.n_factors))
        rt = rng.normal(self.init_mean, self.init_std_dev,
                        (len(p_ut.tid_inner.unique()), self.n_factors))
        
        global_mean_p = np.mean(p_ut.val)
        global_mean_f = np.mean(f_it.val)
        
        
        for current_epoch in range(self.n_epochs):
            if self.verbose:
                print("Processing epoch {}".format(current_epoch), end='\r')

            rv_sum = np.zeros((trainset.n_users, self.n_factors))
            pz_sum = np.zeros((trainset.n_users, self.n_factors))
            ru_sum = np.zeros((trainset.n_items, self.n_factors))
            fz_sum = np.zeros((trainset.n_items, self.n_factors))
            pu_sum = np.zeros((len(p_ut.tid_inner.unique()), self.n_factors))
            fv_sum = np.zeros((len(p_ut.tid_inner.unique()), self.n_factors))
            
            for rn in range(len(raw_data)):
                
                raw_u = raw_data.loc[rn, 'userId']
                raw_i = raw_data.loc[rn, 'movieId']
                
                u = trainset.to_inner_uid(raw_u)
                i = trainset.to_inner_iid(raw_i)
                
                r = raw_data.loc[rn, 'rating']
                if pd.isna(raw_data.loc[rn, 'tid_inner']) | np.isnan(raw_data.loc[rn, 'tid_inner']):
                    t = -1
                else:
                    t = raw_data.loc[rn, 'tid_inner']
                
                t = int(t)
                
                if t != -1:
                    p_put = list(p_ut.val[(p_ut.userId == raw_u) & (p_ut.tid_inner == t)])[0]
                    p_fit = list(f_it.val[(f_it.movieId == raw_i) & (f_it.tid_inner == t)])[0]

                if math.isnan(r):
                    r = 0

                dot_r = 0.0
                dot_p = 0.0
                dot_f = 0.0
                if r != 0:
                    for f in prange(n_factors, nogil=True):
                        dot_r += qi[i,f] * pu[u,f] 

                if t != -1:
                    for f in range(n_factors):
                        dot_p += rt[t,f] * pu[u,f]
                        dot_f += rt[t,f] * qi[i,f]
                
                if not(math.isnan(r)):
                    err_r = r - (global_mean_r + bu[u] + bi[i] + dot_r)
                else:
                    err_r = 0.0

                if t != -1:
                    err_p = p_put - (global_mean_p + bu[u] + bt[t] + dot_p)
                    err_f = p_fit - (global_mean_f + bi[i] + bt[t] + dot_f)
                else:
                    err_p = 0.0
                    err_f = 0.0

                if self.biased:
                    bu[u] -= lr_bu * (-1 * err_r - reg_p * err_p + reg_r * bu[u])
                    bi[i] -= lr_bi * (-1 * err_r - reg_f * err_f + reg_r * bi[i])
                    if t != -1:
                        bt[t] -= lr_bt * (-1 * reg_p * err_p - reg_f * err_f + reg_r * bt[t])
                
                for f in range(self.n_factors):
                    rv_sum[u,f] += err_r * qi[i,f]
                    ru_sum[i,f] += err_r * pu[u,f]
                    if t != -1:
                        pz_sum[u,f] += err_p * rt[t,f]
                        fz_sum[i,f] += err_f * rt[t,f]
                        pu_sum[t,f] += err_p * pu[u,f]
                        fv_sum[t,f] += err_f * qi[i,f]
            
            for f in range(self.n_factors):
                for u in trainset.all_users():              
                    puf = pu[u,f]
                    pu[u,f] -= lr_pu * (-1 * rv_sum[u,f] - reg_p * pz_sum[u,f] + reg_r * puf)
                    
                for i in trainset.all_items():
                    qif = qi[i,f]
                    qi[i,f] -= lr_pu * (-1 * ru_sum[i,f] - reg_f * fz_sum[i,f] + reg_r * qif)
                       
                for t in range(len(p_ut.tid_inner.unique())):
                    rtf = rt[t, f]
                    rt[t,f] -= lr_pu * (-1 * reg_p * pu_sum[t,f] - reg_f * fv_sum[t,f] + reg_r * rtf)
                    
        self.bu = bu
        self.bi = bi
        self.pu = pu
        self.qi = qi

    def estimate(self, u, i):
        known_user = self.trainset.knows_user(u)
        known_item = self.trainset.knows_item(i)

        if self.biased:
            est = self.trainset.global_mean

            if known_user:
                est += self.bu[u]

            if known_item:
                est += self.bi[i]
                
            if known_user and known_item:
                est += np.dot(self.qi[i], self.pu[u])
        else:
            if known_user and known_item:
                est = np.dot(self.qi[i], self.pu[u])
            else:
                raise PredictionImpossible('User and item are unknown')
        return est

## Generate User-Tag Matrix & Item-Tag Matrix

In [3]:
def generateTagsOrigin(rate, tags):
    temp_df = rate
    temp_df = temp_df.iloc[:,0:3]
    
    gb_tags = tags.groupby(['tag'], as_index=False)['userId'].count()
    uni_tags = gb_tags.tag[gb_tags.userId >= 5].reset_index(drop=True)
    tags = tags[tags.tag.isin(uni_tags)].reset_index(drop=True)
    if len(uni_tags) ==0 :
        print("Not tags available")
    uni_tags = {'tag':uni_tags, 'tid':range(len(uni_tags))}
    u_tag = pd.DataFrame(uni_tags)
    tags = pd.merge(tags, u_tag, how='left', on=['tag'])

    w = tags.groupby(['tid', 'movieId'], as_index=False)['userId'].count()
    w.columns = ['tid', 'movieId', 'cn']
    temp = w.groupby('movieId', as_index=False)['cn'].sum()
    temp = temp.set_index('movieId')
    iteration = range(len(w))
    w['val'] = np.array(pd.Series(iteration).map(lambda x: w.cn[x] / temp.loc[w.movieId[1], 'cn']))
    
    f = tags.groupby(['userId', 'tid'], as_index=False).agg({'movieId': 'count'})
    f.columns = ['userId', 'tid', 'cn']
    temp = f.groupby('userId', as_index=False)['cn'].sum()
    temp = temp.set_index('userId')
    iteration = range(len(f))
    f['val'] = np.array(pd.Series(iteration).map(lambda x: f.cn[x] / temp.loc[f.userId[x], 'cn']))
    
    nl_alpha = -0.006
    
    nl_ut = tags.groupby(['userId', 'tid'], as_index=False).agg({'timestamp': 'max'})
    nl_ut = nl_ut.sort_values(by=['userId', 'timestamp'], ascending=[True,False]).reset_index(drop=True)
    g = nl_ut.groupby(['userId'])
    nl_ut['times'] = g['timestamp'].rank(method='first', ascending=False)
    nl_ut['val'] = nl_alpha * nl_ut['times']
    nl_ut['val'] = nl_ut['val'].map(lambda x: math.exp(x)).tolist()
    
    nl_it = tags.groupby(['movieId', 'tid'], as_index=False).agg({'timestamp': 'max'})
    nl_it = nl_it.sort_values(by=['movieId', 'timestamp'], ascending=[True,False]).reset_index(drop=True)
    g = nl_it.groupby(['movieId'])
    nl_it['times'] = g['timestamp'].rank(method='first', ascending=False)
    nl_it['val'] = nl_alpha * nl_it['times']
    nl_it['val'] = nl_it['val'].map(lambda x: math.exp(x)).tolist()
    
    ru = temp_df.groupby(['userId'], as_index=False).agg({'rating': 'mean'})
    ru = ru.rename(index=str, columns={"rating": "ru"})
    p_ut = pd.DataFrame(f[['userId','tid']], columns=['userId','tid'])
    how='outer'
    overall = pd.merge(temp_df, tags, how=how, on=['userId', 'movieId'])
    overall = overall.drop(columns=['tag','timestamp'])
    overall.rating.fillna(0, inplace=True)
    temp_rt = overall[ (-pd.isna(overall.userId)) & (-pd.isna(overall.tid))]
    rt = overall.groupby(['userId', 'tid'], as_index=False).agg({'rating': 'mean'})
    rt = rt.rename(index=str, columns={"rating": "rt"})
    overall = pd.merge(overall, w, how=how, on=['movieId', 'tid'])
    overall = overall.drop(columns=['cn'])
    overall = overall.rename(index=str, columns={"val": "w_it"})
    overall.w_it.fillna(0, inplace=True)
    overall = pd.merge(overall, ru, how=how, on=['userId'])
    overall.ru.fillna(0, inplace=True)
    overall['r_bias'] = overall.rating - overall.ru
    overall['b_it'] = overall.r_bias * overall.w_it
    b_it = overall[-pd.isna(overall.tid)].groupby(['userId', 'tid'], as_index=False).agg({'w_it': 'sum', 'b_it':'sum'})
    b_it['val'] = b_it.b_it / b_it.w_it
    
    ru = ru.set_index('userId')
    rt = rt.set_index(['userId', 'tid'])
    f = f.set_index(['userId', 'tid'])
    b_it = b_it.set_index(['userId', 'tid'])
    nl_ut = nl_ut.set_index(['userId', 'tid'])

    p_ut['val'] = list(map(lambda x,y: ru.loc[x, 'ru'] + b_it.loc[(x, y), 'val'] 
              + 1.7 * f.loc[(x,y),'val'] * (rt.loc[(x,y), 'rt'] - ru.loc[x, 'ru']) 
              + 0.05 * nl_ut.loc[(x,y), 'val'] , p_ut.userId, p_ut.tid))
    
    f_it = pd.DataFrame(w[['movieId','tid']], columns=['movieId','tid'])
    w = w.set_index(['movieId', 'tid'])
    nl_it = nl_it.set_index(['movieId', 'tid'])
    f_it['val'] = list(map(lambda x,y: w.loc[(x,y), 'val'] + 0.05 * nl_it.loc[(x,y), 'val'], f_it.movieId, f_it.tid))
    
    ratings = overall.iloc[:,0:4]
    return p_ut, f_it, tags, ratings

## Generate Item-Tag Matrix with Tag Genome & User-Tag Matrix

In [4]:
def generateTagsWithGenomeScore(rate, tags, genome_tag, genome_score):
    temp_df = rate
    temp_df = temp_df.iloc[:,0:3]
    
    gb_tags = tags.groupby(['tag'], as_index=False)['userId'].count()
    tags = tags[tags.tag.isin(list(gb_tags.tag[gb_tags.userId >= 5]))].reset_index(drop=True)
    tags.tag = tags.tag.apply(lambda x: x.lower())

    tags = pd.merge(tags, genome_tag, how='left', on=['tag'])
    tags = tags[~np.isnan(tags.tagId)]
    tags = tags[tags.movieId.isin(genome_score.movieId) & tags.tagId.isin(genome_score.tagId)]
    
    uni_tags = tags.groupby(['tag'], as_index=False)['userId'].count().tag
    tags = tags[tags.tag.isin(uni_tags)].reset_index(drop=True)
    uni_tags = {'tag':uni_tags, 'tid':range(len(uni_tags))}
    u_tag = pd.DataFrame(uni_tags)
    tags = pd.merge(tags, u_tag, how='left', on=['tag'])
    
    f_it = genome_score[genome_score.movieId.isin(tags.movieId) & genome_score.tagId.isin(tags.tagId)]
    temp_tag = tags.iloc[:, -2:]
    temp_tag = temp_tag.drop_duplicates()
    f_it = pd.merge(f_it, temp_tag, how='left', on=['tagId'])
    f_it = f_it.rename(index=str, columns={"relevance": "val"})
    
    f_it = f_it.drop(['tagId'], axis=1)
    tags = tags.drop(['tagId'], axis=1)
    
    w = tags.groupby(['tid', 'movieId'], as_index=False)['userId'].count()
    w.columns = ['tid', 'movieId', 'cn']
    temp = w.groupby('movieId', as_index=False)['cn'].sum()
    temp = temp.set_index('movieId')
    iteration = range(len(w))
    w['val'] = pd.Series(iteration).map(lambda x: w.cn[x] / temp.loc[w.movieId[1], 'cn']).tolist()
    
    f = tags.groupby(['userId', 'tid'], as_index=False).agg({'movieId': 'count'})
    f.columns = ['userId', 'tid', 'cn']
    temp = f.groupby('userId', as_index=False)['cn'].sum()
    temp = temp.set_index('userId')
    iteration = range(len(f))
    f['val'] = pd.Series(iteration).map(lambda x: f.cn[x] / temp.loc[f.userId[x], 'cn']).tolist()
    
    nl_alpha = -0.006
    
    nl_ut = tags.groupby(['userId', 'tid'], as_index=False).agg({'timestamp': 'max'})
    nl_ut = nl_ut.sort_values(by=['userId', 'timestamp'], ascending=[True,False]).reset_index(drop=True)
    g = nl_ut.groupby(['userId'])
    nl_ut['times'] = g['timestamp'].rank(method='first', ascending=False)
    nl_ut['val'] = nl_alpha * nl_ut['times']
    nl_ut['val'] = nl_ut['val'].map(lambda x: math.exp(x)).tolist()
    
    ru = temp_df.groupby(['userId'], as_index=False).agg({'rating': 'mean'})
    ru = ru.rename(index=str, columns={"rating": "ru"})
    p_ut = pd.DataFrame(f[['userId','tid']], columns=['userId','tid'])
    how='outer'
    overall = pd.merge(temp_df, tags, how=how, on=['userId', 'movieId'])
    overall = overall.drop(columns=['tag','timestamp'])
    overall.rating.fillna(0, inplace=True)
    temp_rt = overall[ (-pd.isna(overall.userId)) & (-pd.isna(overall.tid))]
    rt = overall.groupby(['userId', 'tid'], as_index=False).agg({'rating': 'mean'})
    rt = rt.rename(index=str, columns={"rating": "rt"})
    overall = pd.merge(overall, w, how=how, on=['movieId', 'tid'])
    overall = overall.drop(columns=['cn'])
    overall = overall.rename(index=str, columns={"val": "w_it"})
    overall.w_it.fillna(0, inplace=True)
    overall = pd.merge(overall, ru, how=how, on=['userId'])
    overall.ru.fillna(0, inplace=True)
    overall['r_bias'] = overall.rating - overall.ru
    overall['b_it'] = overall.r_bias * overall.w_it
    b_it = overall[-pd.isna(overall.tid)].groupby(['userId', 'tid'], as_index=False).agg({'w_it': 'sum', 'b_it':'sum'})
    b_it['val'] = b_it.b_it / b_it.w_it
    
    ru = ru.set_index('userId')
    rt = rt.set_index(['userId', 'tid'])
    f = f.set_index(['userId', 'tid'])
    b_it = b_it.set_index(['userId', 'tid'])
    nl_ut = nl_ut.set_index(['userId', 'tid'])

    p_ut['val'] = list(map(lambda x,y: ru.loc[x, 'ru'] + b_it.loc[(x, y), 'val'] 
              + 1.7 * f.loc[(x,y),'val'] * (rt.loc[(x,y), 'rt'] - ru.loc[x, 'ru']) 
              + 0.05 * nl_ut.loc[(x,y), 'val'] , p_ut.userId, p_ut.tid))
    
    ratings = overall.iloc[:,0:4]
    return p_ut, f_it, tags, ratings

## Generate Item-Tag Matrix with Tag genome and time information & User-Tag Matrix

In [5]:
def generateTagsWeightageWithGenomeScore(rate, tags, genome_tag, genome_score):
    temp_df = rate
    temp_df = temp_df.iloc[:,0:3]
      
    gb_tags = tags.groupby(['tag'], as_index=False)['userId'].count()
    tags = tags[tags.tag.isin(list(gb_tags.tag[gb_tags.userId >= 5]))].reset_index(drop=True)
    tags.tag = tags.tag.apply(lambda x: x.lower())

    tags = pd.merge(tags, genome_tag, how='left', on=['tag'])
    tags = tags[~np.isnan(tags.tagId)]
    tags = tags[tags.movieId.isin(genome_score.movieId) & tags.tagId.isin(genome_score.tagId)]
    
    uni_tags = tags.groupby(['tag'], as_index=False)['userId'].count().tag
    tags = tags[tags.tag.isin(uni_tags)].reset_index(drop=True)
    uni_tags = {'tag':uni_tags, 'tid':range(len(uni_tags))}
    u_tag = pd.DataFrame(uni_tags)
    tags = pd.merge(tags, u_tag, how='left', on=['tag'])
    
    temp_tag = tags.iloc[:, -2:]
    temp_tag = temp_tag.drop_duplicates()
    tags = tags.drop(['tagId'], axis=1)
    #f_it = genome_score[genome_score.movieId.isin(tags.movieId) & genome_score.tagId.isin(tags.tagId)]
    
    w = tags.groupby(['tid', 'movieId'], as_index=False)['userId'].count()
    w.columns = ['tid', 'movieId', 'cn']
    temp = w.groupby('movieId', as_index=False)['cn'].sum()
    temp = temp.set_index('movieId')
    iteration = range(len(w))
    w['val'] = pd.Series(iteration).map(lambda x: w.cn[x] / temp.loc[w.movieId[1], 'cn']).tolist()
    
    f = tags.groupby(['userId', 'tid'], as_index=False).agg({'movieId': 'count'})
    f.columns = ['userId', 'tid', 'cn']
    temp = f.groupby('userId', as_index=False)['cn'].sum()
    temp = temp.set_index('userId')
    iteration = range(len(f))
    f['val'] = pd.Series(iteration).map(lambda x: f.cn[x] / temp.loc[f.userId[x], 'cn']).tolist()
    
    nl_alpha = -0.006
    
    nl_ut = tags.groupby(['userId', 'tid'], as_index=False).agg({'timestamp': 'max'})
    nl_ut = nl_ut.sort_values(by=['userId', 'timestamp'], ascending=[True,False]).reset_index(drop=True)
    g = nl_ut.groupby(['userId'])
    nl_ut['times'] = g['timestamp'].rank(method='first', ascending=False)
    nl_ut['val'] = nl_alpha * nl_ut['times']
    nl_ut['val'] = nl_ut['val'].map(lambda x: math.exp(x)).tolist()
    
    nl_it = tags.groupby(['movieId', 'tid'], as_index=False).agg({'timestamp': 'max'})
    nl_it = nl_it.sort_values(by=['movieId', 'timestamp'], ascending=[True,False]).reset_index(drop=True)
    g = nl_it.groupby(['movieId'])
    nl_it['times'] = g['timestamp'].rank(method='first', ascending=False)
    nl_it['val'] = nl_alpha * nl_it['times']
    nl_it['val'] = nl_it['val'].map(lambda x: math.exp(x)).tolist()
    
    ru = temp_df.groupby(['userId'], as_index=False).agg({'rating': 'mean'})
    ru = ru.rename(index=str, columns={"rating": "ru"})
    p_ut = pd.DataFrame(f[['userId','tid']], columns=['userId','tid'])
    how='outer'
    overall = pd.merge(temp_df, tags, how=how, on=['userId', 'movieId'])
    overall = overall.drop(columns=['tag','timestamp'])
    overall.rating.fillna(0, inplace=True)
    temp_rt = overall[ (-pd.isna(overall.userId)) & (-pd.isna(overall.tid))]
    rt = overall.groupby(['userId', 'tid'], as_index=False).agg({'rating': 'mean'})
    rt = rt.rename(index=str, columns={"rating": "rt"})
    overall = pd.merge(overall, w, how=how, on=['movieId', 'tid'])
    overall = overall.drop(columns=['cn'])
    overall = overall.rename(index=str, columns={"val": "w_it"})
    overall.w_it.fillna(0, inplace=True)
    overall = pd.merge(overall, ru, how=how, on=['userId'])
    overall.ru.fillna(0, inplace=True)
    overall['r_bias'] = overall.rating - overall.ru
    overall['b_it'] = overall.r_bias * overall.w_it
    b_it = overall[-pd.isna(overall.tid)].groupby(['userId', 'tid'], as_index=False).agg({'w_it': 'sum', 'b_it':'sum'})
    b_it['val'] = b_it.b_it / b_it.w_it
    
    ru = ru.set_index('userId')
    rt = rt.set_index(['userId', 'tid'])
    f = f.set_index(['userId', 'tid'])
    b_it = b_it.set_index(['userId', 'tid'])
    nl_ut = nl_ut.set_index(['userId', 'tid'])

    p_ut['val'] = list(map(lambda x,y: ru.loc[x, 'ru'] + b_it.loc[(x, y), 'val'] 
              + 1.7 * f.loc[(x,y),'val'] * (rt.loc[(x,y), 'rt'] - ru.loc[x, 'ru']) 
              + 0.05 * nl_ut.loc[(x,y), 'val'] , p_ut.userId, p_ut.tid))
    
    genome_score = genome_score[genome_score.movieId.isin(tags.movieId) & genome_score.tagId.isin(temp_tag.tagId)]
    genome_score = pd.merge(genome_score, temp_tag, how='left', on=['tagId'])
    genome_score = genome_score.rename(index=str, columns={"relevance": "val"})
    
    f_it = pd.DataFrame(w[['movieId','tid']], columns=['movieId','tid'])
    genome_score = genome_score.set_index(['movieId', 'tid'])
    nl_it = nl_it.set_index(['movieId', 'tid'])
    f_it['val'] = list(map(lambda x,y: genome_score.loc[(x,y), 'val'] + 0.05 * nl_it.loc[(x,y), 'val'], f_it.movieId, f_it.tid))
    
    ratings = overall.iloc[:,0:4]
    return p_ut, f_it, tags, ratings

## Generate Item-Tag Matrix with tag genome, tag frequency and time information & User-Tag Matrix

In [6]:
def generateTagsOriginWithGenomeWeightage(rate, tags, genome_tag, genome_score):
    temp_df = rate
    temp_df = temp_df.iloc[:,0:3]
    
    gb_tags = tags.groupby(['tag'], as_index=False)['userId'].count()
    tags = tags[tags.tag.isin(list(gb_tags.tag[gb_tags.userId >= 5]))].reset_index(drop=True)
    tags.tag = tags.tag.apply(lambda x: x.lower())

    tags = pd.merge(tags, genome_tag, how='left', on=['tag'])
    tags = tags[~np.isnan(tags.tagId)]
    tags = tags[tags.movieId.isin(genome_score.movieId) & tags.tagId.isin(genome_score.tagId)]
    
    uni_tags = tags.groupby(['tag'], as_index=False)['userId'].count().tag
    tags = tags[tags.tag.isin(uni_tags)].reset_index(drop=True)
    uni_tags = {'tag':uni_tags, 'tid':range(len(uni_tags))}
    u_tag = pd.DataFrame(uni_tags)
    tags = pd.merge(tags, u_tag, how='left', on=['tag'])
    
    temp_tag = tags.iloc[:, -2:]
    temp_tag = temp_tag.drop_duplicates()
    tags = tags.drop(['tagId'], axis=1)
    
    w = tags.groupby(['tid', 'movieId'], as_index=False)['userId'].count()
    w.columns = ['tid', 'movieId', 'cn']
    temp = w.groupby('movieId', as_index=False)['cn'].sum()
    temp = temp.set_index('movieId')
    iteration = range(len(w))
    w['val'] = pd.Series(iteration).map(lambda x: w.cn[x] / temp.loc[w.movieId[1], 'cn']).tolist()
    
    f = tags.groupby(['userId', 'tid'], as_index=False).agg({'movieId': 'count'})
    f.columns = ['userId', 'tid', 'cn']
    temp = f.groupby('userId', as_index=False)['cn'].sum()
    temp = temp.set_index('userId')
    iteration = range(len(f))
    f['val'] = pd.Series(iteration).map(lambda x: f.cn[x] / temp.loc[f.userId[x], 'cn']).tolist()
    
    nl_alpha = -0.006
    
    nl_ut = tags.groupby(['userId', 'tid'], as_index=False).agg({'timestamp': 'max'})
    nl_ut = nl_ut.sort_values(by=['userId', 'timestamp'], ascending=[True,False]).reset_index(drop=True)
    g = nl_ut.groupby(['userId'])
    nl_ut['times'] = g['timestamp'].rank(method='first', ascending=False)
    nl_ut['val'] = nl_alpha * nl_ut['times']
    nl_ut['val'] = nl_ut['val'].map(lambda x: math.exp(x)).tolist()
    
    nl_it = tags.groupby(['movieId', 'tid'], as_index=False).agg({'timestamp': 'max'})
    nl_it = nl_it.sort_values(by=['movieId', 'timestamp'], ascending=[True,False]).reset_index(drop=True)
    g = nl_it.groupby(['movieId'])
    nl_it['times'] = g['timestamp'].rank(method='first', ascending=False)
    nl_it['val'] = nl_alpha * nl_it['times']
    nl_it['val'] = nl_it['val'].map(lambda x: math.exp(x)).tolist()
    
    ru = temp_df.groupby(['userId'], as_index=False).agg({'rating': 'mean'})
    ru = ru.rename(index=str, columns={"rating": "ru"})
    p_ut = pd.DataFrame(f[['userId','tid']], columns=['userId','tid'])
    how='outer'
    overall = pd.merge(temp_df, tags, how=how, on=['userId', 'movieId'])
    overall = overall.drop(columns=['tag','timestamp'])
    overall.rating.fillna(0, inplace=True)
    temp_rt = overall[ (-pd.isna(overall.userId)) & (-pd.isna(overall.tid))]
    rt = overall.groupby(['userId', 'tid'], as_index=False).agg({'rating': 'mean'})
    rt = rt.rename(index=str, columns={"rating": "rt"})
    overall = pd.merge(overall, w, how=how, on=['movieId', 'tid'])
    overall = overall.drop(columns=['cn'])
    overall = overall.rename(index=str, columns={"val": "w_it"})
    overall.w_it.fillna(0, inplace=True)
    overall = pd.merge(overall, ru, how=how, on=['userId'])
    overall.ru.fillna(0, inplace=True)
    overall['r_bias'] = overall.rating - overall.ru
    overall['b_it'] = overall.r_bias * overall.w_it
    b_it = overall[-pd.isna(overall.tid)].groupby(['userId', 'tid'], as_index=False).agg({'w_it': 'sum', 'b_it':'sum'})
    b_it['val'] = b_it.b_it / b_it.w_it
    
    ru = ru.set_index('userId')
    rt = rt.set_index(['userId', 'tid'])
    f = f.set_index(['userId', 'tid'])
    b_it = b_it.set_index(['userId', 'tid'])
    nl_ut = nl_ut.set_index(['userId', 'tid'])

    p_ut['val'] = list(map(lambda x,y: ru.loc[x, 'ru'] + b_it.loc[(x, y), 'val'] 
              + 1.7 * f.loc[(x,y),'val'] * (rt.loc[(x,y), 'rt'] - ru.loc[x, 'ru']) 
              + 0.05 * nl_ut.loc[(x,y), 'val'] , p_ut.userId, p_ut.tid))
    
    genome_score = genome_score[genome_score.movieId.isin(tags.movieId) & genome_score.tagId.isin(temp_tag.tagId)]
    genome_score = pd.merge(genome_score, temp_tag, how='left', on=['tagId'])
    genome_score = genome_score.rename(index=str, columns={"relevance": "val"})
    
    f_it = pd.DataFrame(w[['movieId','tid']], columns=['movieId','tid'])
    w = w.set_index(['movieId', 'tid'])
    genome_score = genome_score.set_index(['movieId', 'tid'])
    nl_it = nl_it.set_index(['movieId', 'tid'])
    f_it['val'] = list(map(lambda x,y: genome_score.loc[(x,y), 'val'] + 0.05 * nl_it.loc[(x,y), 'val'] if (x,y) in genome_score.index else w.loc[(x,y), 'val'] + 0.05 * nl_it.loc[(x,y), 'val'], f_it.movieId, f_it.tid))
    
    ratings = overall.iloc[:,0:4]
    return p_ut, f_it, tags, ratings

## Import Dataset

In [7]:
data_source = 'mlsmall'          # MovieLens dataset 2018
#data_source = 'ml-latest-small' # MovieLens dataset 2016
reader = Reader()
path = os.path.join('Dataset',data_source)
rate = pd.read_csv(path+'/ratings.csv', encoding='utf-8')
raw_tags = pd.read_csv(path+'/tags.csv', encoding='utf-8')

data = Dataset.load_from_df(rate[['userId', 'movieId', 'rating']], reader)

## The Tag Genome dataset obtain from MovieLens latest 27M dataset
path = os.path.join('Dataset', 'ml-latest')
genome_tag = pd.read_csv(path+'/genome-tags.csv', encoding='utf-8')
genome_score = pd.read_csv(path+'/genome-scores.csv', encoding='utf-8')

## Set the value of settings

In [8]:
lr  = 0.0073 # Learning Rate of the co-SVD
epoch = 20 # The number of iteration for model training
n_eval = 10 # The number of iteration for model evaluation
rdm_state = 1 # Random Seed for train test data split

factors = 40 # the number of latent features, may change to 20, 30 or 40

### Evaluate co-SVD with Fixed Seed 

In [9]:
## Generate the User-Tag (p_ut) & Item-Tag (f_it) Matrices
p_ut, f_it, tags, ratings = generateTagsOrigin(rate, raw_tags)

## Dataset Train Test Split with 7:3 ratio
trainset, testset = train_test_split(data, test_size=.3, random_state = rdm_state)

## Initial the co-SVD Model
algo = co_SVD(verbose=False, n_epochs=epoch, lr_all=lr, n_factors=factors
              , p_ut=p_ut, f_it=f_it, tags=tags, ratings=ratings
             , random_state=123
             )

## Model Training
algo.fit(trainset)

## Model Testing
predictions = algo.test(testset)

## Model Evaluation
mae = accuracy.mae(predictions, verbose=False)
rmse = accuracy.rmse(predictions, verbose=False)

test_res = pd.DataFrame(predictions)

threshold = 3.5
test_res['actual_cat'] = list(map(lambda x: 1 if x >= threshold else 0, test_res['r_ui']))
test_res['pred_cat'] = list(map(lambda x: 1 if x >= threshold else 0, test_res['est']))

tn, fp, fn, tp = confusion_matrix(test_res.actual_cat, test_res.pred_cat).ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
if data_source == "ml_small":
    print("MovieLens Dataset 2018")
elif data_source == "ml-latest-small":
    print("MovieLens Dataset 2016")
    
print( "RMSE: "+ str(rmse) + "\n" + "MAE: " + str(mae) + "\n" + "Precision: " + str(precision) + "\n" + "Recall: " + str(recall))

RMSE: 0.5869150348669047
MAE: 0.4430839735008973
Precision: 0.907002457002457
Recall: 0.7964830897027887


### Evaluate different approach on constructing Item-Tag Matrix

In [10]:
## Generate Item-Tag Matrix & User-Tag Matrix
p_ut, f_it, tags, ratings = generateTagsOrigin(rate, raw_tags)

res = pd.DataFrame(columns = ['n_eval', 'precision' , 'recall', 'rmse', 'mae'])

for j in range(n_eval):
    trainset, testset = train_test_split(data, test_size=.3, random_state=j)
    algo = co_SVD(verbose=False, n_epochs=epoch, lr_all=lr, n_factors=factors
                  , p_ut=p_ut, f_it=f_it, tags=tags, ratings=ratings, random_state=123
                 )

    algo.fit(trainset)
    predictions = algo.test(testset)
    mae = accuracy.mae(predictions, verbose=False)
    rmse = accuracy.rmse(predictions, verbose=False)

    test_res = pd.DataFrame(predictions)

    threshold = 3.5
    test_res['actual_cat'] = list(map(lambda x: 1 if x >= threshold else 0, test_res['r_ui']))
    test_res['pred_cat'] = list(map(lambda x: 1 if x >= threshold else 0, test_res['est']))

    tn, fp, fn, tp = confusion_matrix(test_res.actual_cat, test_res.pred_cat).ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    res = res.append(pd.Series([j, precision, recall, rmse, mae], index=res.columns), ignore_index = True)
    
if data_source == "ml_small":
    print("MovieLens Dataset 2018")
elif data_source == "ml-latest-small":
    print("MovieLens Dataset 2016")

print("Original Method - Result")
print(res.drop(columns=['n_eval']).mean())

Original Method - Result
precision    0.907373
recall       0.796130
rmse         0.583199
mae          0.441311
dtype: float64


In [11]:
## Generate Item-Tag Matrix with Tag Genome & User-Tag Matrix
p_ut, f_it, tags, ratings = generateTagsWithGenomeScore(rate, raw_tags, genome_tag, genome_score)

res = pd.DataFrame(columns = ['n_eval', 'precision' , 'recall', 'rmse', 'mae'])

for j in range(n_eval):
    trainset, testset = train_test_split(data, test_size=.3, random_state=j)
    algo = co_SVD(verbose=False, n_epochs=epoch, lr_all=lr, n_factors=factors
                  , p_ut=p_ut, f_it=f_it, tags=tags, ratings=ratings, random_state=123
                 )

    algo.fit(trainset)
    predictions = algo.test(testset)
    mae = accuracy.mae(predictions, verbose=False)
    rmse = accuracy.rmse(predictions, verbose=False)

    test_res = pd.DataFrame(predictions)

    threshold = 3.5
    test_res['actual_cat'] = list(map(lambda x: 1 if x >= threshold else 0, test_res['r_ui']))
    test_res['pred_cat'] = list(map(lambda x: 1 if x >= threshold else 0, test_res['est']))

    tn, fp, fn, tp = confusion_matrix(test_res.actual_cat, test_res.pred_cat).ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    res = res.append(pd.Series([j, precision, recall, rmse, mae], index=res.columns), ignore_index = True)
    
if data_source == "ml_small":
    print("MovieLens Dataset 2018")
elif data_source == "ml-latest-small":
    print("MovieLens Dataset 2016")

print("Method I - Result")
print(res.drop(columns=['n_eval']).mean())

Method I - Result
precision    0.908524
recall       0.806932
rmse         0.569686
mae          0.429751
dtype: float64


In [12]:
## Generate Item-Tag Matrix with Tag genome and time information & User-Tag Matrix
p_ut, f_it, tags, ratings = generateTagsWeightageWithGenomeScore(rate, raw_tags, genome_tag, genome_score)

res = pd.DataFrame(columns = ['n_eval', 'precision' , 'recall', 'rmse', 'mae'])

for j in range(n_eval):
    start = time.time()
    trainset, testset = train_test_split(data, test_size=.3, random_state=j)
    algo = co_SVD(verbose=False, n_epochs=epoch, lr_all=lr, n_factors=factors
                  , p_ut=p_ut, f_it=f_it, tags=tags, ratings=ratings, random_state=123
                 )

    algo.fit(trainset)
    predictions = algo.test(testset)
    mae = accuracy.mae(predictions, verbose=False)
    rmse = accuracy.rmse(predictions, verbose=False)

    test_res = pd.DataFrame(predictions)

    threshold = 3.5
    test_res['actual_cat'] = list(map(lambda x: 1 if x >= threshold else 0, test_res['r_ui']))
    test_res['pred_cat'] = list(map(lambda x: 1 if x >= threshold else 0, test_res['est']))

    tn, fp, fn, tp = confusion_matrix(test_res.actual_cat, test_res.pred_cat).ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    res = res.append(pd.Series([j, precision, recall, rmse, mae], index=res.columns), ignore_index = True)
    duration = time.time() - start
    
if data_source == "ml_small":
    print("MovieLens Dataset 2018")
elif data_source == "ml-latest-small":
    print("MovieLens Dataset 2016")

print("Method II - Result")
print(res.drop(columns=['n_eval']).mean())

Method II - Result
precision    0.908345
recall       0.806921
rmse         0.569663
mae          0.429775
dtype: float64


In [13]:
## Generate Item-Tag Matrix with tag genome, tag frequency and time information & User-Tag Matrix
p_ut, f_it, tags, ratings = generateTagsOriginWithGenomeWeightage(rate, raw_tags, genome_tag, genome_score)

res = pd.DataFrame(columns = ['n_eval', 'precision' , 'recall', 'rmse', 'mae'])

for j in range(n_eval):
    start = time.time()
    trainset, testset = train_test_split(data, test_size=.3, random_state=j)
    algo = co_SVD(verbose=False, n_epochs=epoch, lr_all=lr, n_factors=factors
                  , p_ut=p_ut, f_it=f_it, tags=tags, ratings=ratings, random_state=123
                 )

    algo.fit(trainset)
    predictions = algo.test(testset)
    mae = accuracy.mae(predictions, verbose=False)
    rmse = accuracy.rmse(predictions, verbose=False)

    test_res = pd.DataFrame(predictions)

    threshold = 3.5
    test_res['actual_cat'] = list(map(lambda x: 1 if x >= threshold else 0, test_res['r_ui']))
    test_res['pred_cat'] = list(map(lambda x: 1 if x >= threshold else 0, test_res['est']))

    tn, fp, fn, tp = confusion_matrix(test_res.actual_cat, test_res.pred_cat).ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    res = res.append(pd.Series([j, precision, recall, rmse, mae], index=res.columns), ignore_index = True)
    duration = time.time() - start
    
if data_source == "ml_small":
    print("MovieLens Dataset 2018")
elif data_source == "ml-latest-small":
    print("MovieLens Dataset 2016")

print("Method III - Result")
print(res.drop(columns=['n_eval']).mean())

Method III - Result
precision    0.908345
recall       0.806921
rmse         0.569663
mae          0.429775
dtype: float64
