# Getting user rating by hybrid filtering

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
import dask.dataframe as dd
%cd ~/data

C:\Users\user\data


In [2]:
"""similarity measure"""
def sim(x, y, metric='cos'):
    if metric == 'cos':
        return 1.-cosine(x,y)
    else:
        return pearsonr(x,y)[0]

In [3]:
ufeat_matrix = pd.read_csv('added_sparse_matrix.csv')
ufeat_matrix.fillna(0, inplace = True)
ufeat_matrix.drop('Unnamed: 0', axis = 1, inplace = True)

# Making similarity matrix

In [36]:
class Hybrid_cbf_cf(object):
    def __init__(self, ufeat_matrix):
        #initial value
        self.ufeat_matrix = ufeat_matrix
        self.nusers = len(ufeat_matrix)
        self.nani = len(ufeat_matrix.iloc[0])
        self.nfeat = 43

    def CalcRatings(self,u_vec_feats,K):
        data_sim = np.zeros((self.nusers, self.nani + 1))  # Array for ufeat_matrix and similarity
        data_sim[:,:-1] = self.ufeat_matrix
        u_rec = np.zeros(len(ufeat_matrix.iloc[0,:-self.nfeat]))  #Array on rating animation for each user 'u'
        for u in range(self.nusers):
            if np.array_equal(data_sim[u,:-1], u_vec_feats) == False:
                data_sim[u, self.nani] = sim(data_sim[u,:-1], u_vec_feats)
            else:
                data_sim[u, self.nani] = 0
                
        data_sim = data_sim[data_sim[:,self.nani].argsort()][::-1] # Ordered by similarity

        def FindKNeighbours(r,data_sim,K):
            neighs = []
            cnt=0
            for u in range(len(data_sim)):
                if data_sim[u,r]>0 and cnt<K: # Finding K neighbors for item 'r'
                   neighs.append(data_sim[u])   
                   cnt += 1 
                elif cnt==K:
                   break
            return neighs
        
        def CalcRating(u_vec_feats,r,neighs):
            rating = 0.
            den = 0.
            for j in range(len(neighs)):
                 # Multipling similarity by centered value of user 'j'th item 'r' and summing the values
                rating += neighs[j][-1]*float(neighs[j][r]-neighs[j][neighs[j]>0][:-1].mean())
                den += abs(neighs[j][-1]) # Absolute value of similarity
            if den>0:
                rating = u_vec_feats[u_vec_feats > 0].mean() + (rating/float(den))
            else:
                rating = max(u_vec_feats[u_vec_feats > 0].mean(), 0)
            if rating>10:
                return 10. # The maximum value
            elif rating<0:
                return 0. # The minimum value
            return rating
        
        for r in range(self.nani - self.nfeat):
            if u_vec_feats[r]==0:
               neighs = FindKNeighbours(r,data_sim,K)
               #calc the predicted rating
               u_rec[r] = CalcRating(u_vec_feats,r,neighs)
        return u_rec

In [52]:
%%time

number4 = Hybrid_cbf_cf(ufeat_matrix).CalcRatings(u_vec_feats = ufeat_matrix.iloc[4],K = 10)



Wall time: 7min 12s


In [61]:
number4, number4.shape

(array([ 5.6451496 ,  5.24309935,  0.        , ...,  3.86981439,
         4.22887487,  4.22887487]), (11200L,))

- 보다시피 1 사람의 rating을 10명의 가장 유사한 사람을 통해 rating vector를 예상하는데도 7분이 넘는 시간이 걸렸다.
- 따라서 당연히 성능 검증 또한 할 수 없다.
- 이후에 SVD나 CUR 분해를 통해 진행해 보겠다.