## Singular Value Decomposition

Singular value decomposition is a very popular linear algebra technique to break down a matrix into the product of a few smaller matrices. we can use SVD to discover relationship between items. A recommender system can be build easily from this.

$M = U \cdot S \cdot V_{}^{T}$

where U is a m x m matrix, S is a diagonal matrix of m x r and V is a r x r matrix.
Singular value decomposition gets its name from the diagonal entries on , which are called the singular values of matrix. Based on the percentage of singular values we need to retain, we can do a dimensionality reduction and take top K diagonal elements of S. This is called the reduced SVD. 

</br>
In essence, we are removing several rows on U that the corresponding singular values in S are small, before we use it to compute the similarity. This would likely make the prediction more accurate as those less important features of a item are removed from consideration.
</br> </br>
Once we have the item-item similarity using the reduced SVD, the method to predict a rating is similar to collaborative filtering where we give importance to top K similar items

In [1]:
import pandas as pd
import os
from pathlib import Path
import scipy
import pickle
import scipy.stats
from numpy import *
import numpy as np
from numpy import linalg as la
from sklearn.metrics import mean_squared_error
root = Path(".")
from datetime import datetime

In [2]:
class SVD_recommender:
    def __init__(self, retained_energy) -> None:
        self.read_dataset()
        self.train_test_split()
        self.generate_user_item_matrix()

        self.U, self.S, self.V_T = la.svd(self.matrix)
        self.SVD_transform(retained_energy)
        self.calculate_item_similarity()
        
        self.sim_top_k = [None for _ in range(self.item_cnt + 1)]
        for i in range(1, self.item_cnt): 
            self.sim_top_k[i] = self.get_sim_items(i)

    def read_dataset(self):
        self.rating_data = pd.read_csv('ml-1m/ratings.dat', header=None, sep='::')
        self.rating_data.columns = ['UserID', 'ItemID', 'Rating', 'Timestamp']
        self.rating_data.drop(columns=['Timestamp'], axis=1, inplace=True)
        
        self.item_cnt = max(self.rating_data['ItemID']) + 1
        self.users_cnt = max(self.rating_data['UserID']) + 1

    def train_test_split(self):
        self.rating_train = self.rating_data.sample(frac=0.8, random_state=200)
        self.rating_test = self.rating_data.drop(self.rating_train.index)

    def generate_user_item_matrix(self):
        self.matrix = np.zeros(shape=(self.users_cnt, self.item_cnt))
        for row in self.rating_train.itertuples():
            self.matrix[row.UserID][row.ItemID] = row.Rating

    def get_dim(self, S, retained_energy): 
        sum_sq = sum(S ** 2)
        cur_sum, cur_d = 0, 0
        for val in S:
            if cur_sum >= sum_sq * retained_energy: 
                return cur_d 
            cur_sum += val ** 2
            cur_d += 1
        return cur_d

    def similarity(self, A, B):
        return (1.0 / (1.0 + la.norm(A - B)))

    def SVD_transform(self, retained_energy): 
        red_dim = self.get_dim(self.S, retained_energy)

        transformed_S = np.diag(self.S[:red_dim])
        transformed_U = self.U[:, :red_dim]
        transformed_V_T = self.V_T[:red_dim, :]

        self.SVD_matrix = np.dot(transformed_U, transformed_S)
        self.SVD_matrix = np.dot(self.SVD_matrix, transformed_V_T)
    
    def calculate_item_similarity(self):
        self.item_similarity = np.zeros(shape=(self.item_cnt, self.item_cnt))
        transformed_item_matrix = list()
        for items in range(1, self.item_cnt): 
            transformed_item_matrix.append(self.SVD_matrix[items, :].T) 

        for item1 in range(1, self.item_cnt): 
            for item2 in range(item1 + 1, self.item_cnt): 
                sim = self.similarity(transformed_item_matrix[item1 - 1], transformed_item_matrix[item2 - 1])
                
                self.item_similarity[item1][item2] = self.item_similarity[item2][item1] = sim
    
    def get_sim_items(self, itemID): 
        sim_userID = list()
        for item in range(1, self.item_similarity.shape[0]): 
            if item == itemID: 
                continue
            sim_userID.append((self.item_similarity[itemID][item], item))
        sim_userID.sort(key=lambda y: -y[0])
        return sim_userID

    def predict_SVD(self, userID, itemID, top_K):
        similarity_sum = 0 
        rating_sum, cnt = 0, 0
        
        for (item_s, items) in self.sim_top_k[itemID]:
            if cnt == top_K: 
                break
            if self.matrix[userID][items] == 0: 
                continue
            similarity_sum += item_s
            rating_sum += item_s * self.matrix[userID][items]
            cnt += 1
        if similarity_sum == 0:
            return 0
            
        return (rating_sum / similarity_sum)
    
    def get_top_k(self, k=3):
        start_prediction_time = datetime.now()
        for users in range(1, self.users_cnt): 
            unrated_items = np.where(self.matrix[users] == 0)[0]
            for items in unrated_items:
                if items == 0:
                    continue
                assert(self.matrix[users][items] == 0)
                self.matrix[users][items] = self.predict_SVD(users, items, k)
        end_prediction_time = datetime.now()
        print("Time for prediction" , (end_prediction_time - start_prediction_time).total_seconds())


    def get_rmse(self):
        y_actual = list(self.rating_test.Rating)
        y_pred = list()
        for id, row in self.rating_test.iterrows(): 
            uid, itid = row['UserID'], row['ItemID']
            y_pred.append(self.matrix[uid][itid])
        return mean_squared_error(y_actual, y_pred) ** .5

In [3]:
s_recom = SVD_recommender(0.9)
s_recom.get_top_k()
s_recom.get_rmse()

  self.rating_data = pd.read_csv('ml-1m/ratings.dat', header=None, sep='::')


Time for prediction 247.880419


1.1767304502722231

In [4]:
my_path = root / "Pickled_files" / "SVD90"
dbfile = open(my_path, 'wb')     
pickle.dump(s_recom.matrix,dbfile)
dbfile.close()

In [5]:
s_recom = SVD_recommender(1.0)
s_recom.get_top_k()
s_recom.get_rmse()

  self.rating_data = pd.read_csv('ml-1m/ratings.dat', header=None, sep='::')


Time for prediction 233.878857


1.188154215448053

In [6]:
my_path = root / "Pickled_files" / "SVD"
dbfile = open(my_path, 'wb')     
pickle.dump(s_recom.matrix,dbfile)
dbfile.close()