In [1]:
%load_ext Cython

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math

from surprise import Dataset

In [17]:
# Dimensions of the dense user x item arrays built in this notebook
# (presumably the number of distinct users / items in ml-100k - TODO confirm).
USERS_COUNT = 943
ITEMS_COUNT = 1682
# Minimum item-item similarity considered meaningful; same value as the
# cut-off applied when the artificial ratings are built.
THRESHOLD = 0.05

In [4]:
def load_data():
    """Load the built-in "ml-100k" ratings into a three-column DataFrame.

    Columns 0 and 1 of the raw ratings are converted to numbers and shifted
    down by one, column 2 is kept untouched and column 3 is dropped
    (presumably user id, item id, rating and timestamp - TODO confirm).

    Returns:
        pandas.DataFrame with integer-labelled columns 0, 1 and 2.
    """
    ratings = pd.DataFrame(Dataset.load_builtin("ml-100k").raw_ratings)
    # Both id columns get the same treatment: parse, then shift by one.
    for id_column in (0, 1):
        ratings[id_column] = pd.to_numeric(ratings[id_column]) - 1
    return ratings.drop(3, axis=1)

In [5]:
def load_similarities():
    """Read "artificial_ratings.csv" into a dense item-item matrix.

    The CSV is expected to have columns named '0', '1' and '2': two item ids
    and the value stored for that pair. Both ids are shifted down by one
    before being used as array indices (presumably the file is 1-based -
    TODO confirm).

    Returns:
        numpy.ndarray of shape (ITEMS_COUNT, ITEMS_COUNT). Cell [a, b] holds
        the value of the pair (a, b); the diagonal and every pair missing
        from the file stay 0. If a pair occurs more than once, the last
        occurrence in the file wins.
    """
    pairs = pd.read_csv("artificial_ratings.csv")

    entries = pd.DataFrame({
        'row': (pairs['0'] - 1).astype(int),
        'col': (pairs['1'] - 1).astype(int),
        'sim': pairs['2'],
    })
    # Self-similarities are never stored.
    entries = entries[entries['row'] != entries['col']]
    # NumPy does not define which value survives when an index repeats in a
    # fancy assignment, so make "last row wins" explicit beforehand.
    entries = entries.drop_duplicates(subset=['row', 'col'], keep='last')

    similarities_arr = np.zeros((ITEMS_COUNT, ITEMS_COUNT))
    # One vectorized assignment instead of a Python-level iterrows() loop.
    similarities_arr[entries['row'].values, entries['col'].values] = \
        entries['sim'].values

    return similarities_arr

In [31]:
# Dense (ITEMS_COUNT, ITEMS_COUNT) item-item matrix read from the CSV.
similarities = load_similarities()

In [36]:
# Build "artificial" ratings: for every (user, item) pair, the average of the
# user's observed training ratings weighted by the similarity between `item`
# and each rated item. Similarities below THRESHOLD are ignored; pairs with
# no usable similarity keep the sentinel -1.
#
# NOTE(review): this cell needs `train_set` (created in the train/test split
# cell further down) and `similarities` to exist already.
artificial = np.full((USERS_COUNT, ITEMS_COUNT), -1.0)
for user in range(USERS_COUNT):
    # Filter the training rows once per user rather than once per
    # (item, user) pair.
    user_rows = train_set[train_set[:, 0] == user]
    if len(user_rows) == 0:
        continue  # nothing rated -> the whole row stays -1

    rated_items = user_rows[:, 1].astype(int)
    user_ratings = user_rows[:, 2]

    # Row t holds the similarities between target item t and the rated items.
    sims = similarities[:, rated_items]
    weights = np.where(sims < THRESHOLD, 0.0, sims)
    weight_sums = weights.sum(axis=1)
    weighted_ratings = weights.dot(user_ratings)

    # Divide only where at least one similarity passed the cut-off; the
    # remaining cells keep -1. Results match the scalar loop up to
    # floating-point rounding.
    np.divide(weighted_ratings, weight_sums,
              out=artificial[user], where=weight_sums != 0)

# Not used by the visible cells; kept so that the name stays defined.
dataframe = pd.DataFrame(columns=[0, 1, 2])

# Flatten to [user, item, rating] rows in item-major order, skipping the
# sentinel.
artificial_data = [
    [user, item, artificial[user, item]]
    for item in range(ITEMS_COUNT)
    for user in range(USERS_COUNT)
    if artificial[user, item] != -1
]

In [6]:
# Ratings frame with columns 0, 1, 2 as produced by load_data().
data = load_data()

In [7]:
# Plain NumPy view of the ratings; later cells unpack each row as
# (user, item, rating).
actual_data = data.values

In [8]:
# Fixed seed so the sample, the split and every number derived from them are
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Keep only 5% of the ratings to simulate a very sparse rating matrix.
sampled_set, _ = train_test_split(actual_data, test_size=0.95,
                                  random_state=SPLIT_SEED)
# Share of the user x item grid that has no rating in the 5% sample.
sparcity = 1 - (len(sampled_set) / (USERS_COUNT * ITEMS_COUNT))
print("Sparcitiy: %.4f" % sparcity)
# 80/20 train/test split of the sparse sample.
train_set, test_set = train_test_split(sampled_set, test_size=.20,
                                       random_state=SPLIT_SEED)

Sparcitiy: 0.9968


In [52]:
# Train the factorisation model on the similarity-derived artificial ratings.
# NOTE(review): needs the SVD class from the %%cython cell and
# `artificial_data` to have been defined first.
algo = SVD(verbose=True)
algo.fit(np.array(artificial_data))

TEst
Test
TEST
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


In [27]:
# Train the factorisation model on the observed training ratings only.
# NOTE(review): rebinds `algo`; the evaluation cells use whichever model was
# fitted last.
algo = SVD(verbose=True)
algo.fit(train_set)

TEST
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


In [53]:
# (true rating, estimated rating) pair for every row of the held-out test set.
predictions = [(r_ui_trans, algo.estimate(uid, iid)) for (uid, iid, r_ui_trans) in test_set]

In [54]:
# Two-column array: column 0 = true rating, column 1 = estimate.
predictions = np.array(predictions)

In [55]:
# Root-mean-squared error between true ratings and estimates.
math.sqrt(mean_squared_error(predictions[:, 0], predictions[:, 1]))

1.1575781752248677

In [None]:
# (true rating, estimate) pairs over the full ratings array, which also
# contains the rows the training sample was drawn from.
predictions = [(r_ui_trans, algo.estimate(uid, iid)) for (uid, iid, r_ui_trans) in actual_data]

In [46]:
%%cython
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
# Sizes of the bias / factor arrays allocated in SVD.sgd. Same values as the
# notebook-level constants (presumably repeated because this cell is compiled
# as its own module - TODO confirm).
USERS_COUNT = 943
ITEMS_COUNT = 1682
cimport numpy as np  # noqa
import numpy as np
from six.moves import range

from surprise.prediction_algorithms.predictions import PredictionImpossible
from surprise.utils import get_rng


class SVD():
    """Matrix-factorisation rating predictor trained with SGD.

    A rating for user ``u`` and item ``i`` is estimated as
    ``global_mean + bu[u] + bi[i] + dot(qi[i], pu[u])`` when ``biased`` is
    True, and as ``dot(qi[i], pu[u])`` otherwise. Estimates are clipped to
    the range [1, 5].

    Training data is a 2-D NumPy array whose rows are
    ``(user index, item index, rating)``. Indices address arrays of length
    ``USERS_COUNT`` / ``ITEMS_COUNT`` (module-level constants).

    Args:
        n_factors: Number of latent factors per user and per item.
        n_epochs: Number of passes over the training rows.
        biased: Whether to use the global mean and the user/item biases.
        init_mean: Mean of the normal distribution used to initialise the
            factors.
        init_std_dev: Standard deviation of that distribution.
        lr_all: Learning rate used for every parameter group that has no
            specific ``lr_*`` value.
        reg_all: Regularisation term used for every parameter group that has
            no specific ``reg_*`` value.
        lr_bu, lr_bi, lr_pu, lr_qi: Per-group learning rates (user biases,
            item biases, user factors, item factors).
        reg_bu, reg_bi, reg_pu, reg_qi: Per-group regularisation terms.
        random_state: Passed to ``get_rng`` to build the generator used for
            factor initialisation.
        verbose: If True, print a line at the start of every epoch.
    """

    def __init__(self, n_factors=100, n_epochs=20, biased=True, init_mean=0,
                 init_std_dev=.1, lr_all=.005,
                 reg_all=.02, lr_bu=None, lr_bi=None, lr_pu=None, lr_qi=None,
                 reg_bu=None, reg_bi=None, reg_pu=None, reg_qi=None,
                 random_state=None, verbose=False):

        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.biased = biased
        self.init_mean = init_mean
        self.init_std_dev = init_std_dev
        # Group-specific values fall back to the shared lr_all / reg_all.
        self.lr_bu = lr_bu if lr_bu is not None else lr_all
        self.lr_bi = lr_bi if lr_bi is not None else lr_all
        self.lr_pu = lr_pu if lr_pu is not None else lr_all
        self.lr_qi = lr_qi if lr_qi is not None else lr_all
        self.reg_bu = reg_bu if reg_bu is not None else reg_all
        self.reg_bi = reg_bi if reg_bi is not None else reg_all
        self.reg_pu = reg_pu if reg_pu is not None else reg_all
        self.reg_qi = reg_qi if reg_qi is not None else reg_all
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, trainset):
        """Train the model on ``trainset``.

        Args:
            trainset: 2-D NumPy array with rows
                ``(user index, item index, rating)``.
        """
        self.trainset = trainset

        # Cache the ids seen during training so estimate() can answer
        # "known user / known item?" with a set lookup instead of scanning
        # the whole training array on every call.
        self._known_users = set(trainset[:, 0].tolist())
        self._known_items = set(trainset[:, 1].tolist())

        self.sgd(trainset)

    def sgd(self, trainset):
        """Run ``n_epochs`` passes of stochastic gradient descent.

        Stores the learned parameters on the instance as ``bu``, ``bi``,
        ``pu`` and ``qi``, and the mean training rating as ``global_mean``.

        Args:
            trainset: 2-D NumPy array with rows
                ``(user index, item index, rating)``.
        """

        # user biases
        cdef np.ndarray[np.double_t] bu
        # item biases
        cdef np.ndarray[np.double_t] bi
        # user factors
        cdef np.ndarray[np.double_t, ndim=2] pu
        # item factors
        cdef np.ndarray[np.double_t, ndim=2] qi

        # Typed copies of the three training columns. The explicit integer
        # cast avoids relying on implicit float -> C int conversion when the
        # training array has a floating-point dtype.
        cdef np.ndarray[np.int64_t] user_ids = trainset[:, 0].astype(np.int64)
        cdef np.ndarray[np.int64_t] item_ids = trainset[:, 1].astype(np.int64)
        cdef np.ndarray[np.double_t] ratings = trainset[:, 2].astype(np.double)
        cdef Py_ssize_t n_ratings = ratings.shape[0]
        cdef Py_ssize_t k

        cdef int u, i, f
        # Read once so the inner loops do not hit a Python attribute.
        cdef int n_factors = self.n_factors
        cdef double r, err, dot, puf, qif
        cdef double global_mean

        cdef double lr_bu = self.lr_bu
        cdef double lr_bi = self.lr_bi
        cdef double lr_pu = self.lr_pu
        cdef double lr_qi = self.lr_qi

        cdef double reg_bu = self.reg_bu
        cdef double reg_bi = self.reg_bi
        cdef double reg_pu = self.reg_pu
        cdef double reg_qi = self.reg_qi

        # The stored mean is always the training mean; the local copy used
        # for the error term is zeroed in unbiased mode.
        self.global_mean = trainset[:, 2].mean()
        global_mean = self.global_mean if self.biased else 0.0

        rng = get_rng(self.random_state)

        bu = np.zeros(USERS_COUNT, np.double)
        bi = np.zeros(ITEMS_COUNT, np.double)
        pu = rng.normal(self.init_mean, self.init_std_dev,
                        (USERS_COUNT, n_factors))
        qi = rng.normal(self.init_mean, self.init_std_dev,
                        (ITEMS_COUNT, n_factors))

        for current_epoch in range(self.n_epochs):
            if self.verbose:
                print("Processing epoch {}".format(current_epoch))
            for k in range(n_ratings):
                u = user_ids[k]
                i = item_ids[k]
                r = ratings[k]

                # compute current error
                dot = 0  # <q_i, p_u>
                for f in range(n_factors):
                    dot += qi[i, f] * pu[u, f]
                err = r - (global_mean + bu[u] + bi[i] + dot)

                # update biases
                if self.biased:
                    bu[u] += lr_bu * (err - reg_bu * bu[u])
                    bi[i] += lr_bi * (err - reg_bi * bi[i])

                # update factors; both updates use the pre-update values
                for f in range(n_factors):
                    puf = pu[u, f]
                    qif = qi[i, f]
                    pu[u, f] += lr_pu * (err * qif - reg_pu * puf)
                    qi[i, f] += lr_qi * (err * puf - reg_qi * qif)

        self.bu = bu
        self.bi = bi
        self.pu = pu
        self.qi = qi

    def estimate(self, u, i):
        """Estimate the rating of user ``u`` for item ``i``.

        Args:
            u: User index (any numeric type; converted with ``int``).
            i: Item index (any numeric type; converted with ``int``).

        Returns:
            The estimate clipped to [1.0, 5.0]. In biased mode, unknown
            users or items simply contribute nothing beyond the global mean.

        Raises:
            PredictionImpossible: In unbiased mode, when the user or the
                item was not present in the training data.
        """
        # Membership is checked on the raw values, before int() truncation,
        # against the id sets cached by fit().
        known_user = u in self._known_users
        known_item = i in self._known_items

        u = int(u)
        i = int(i)

        if self.biased:
            est = self.global_mean

            if known_user:
                est += self.bu[u]

            if known_item:
                est += self.bi[i]

            if known_user and known_item:
                est += np.dot(self.qi[i], self.pu[u])

        else:
            if known_user and known_item:
                est = np.dot(self.qi[i], self.pu[u])
            else:
                raise PredictionImpossible('User and item are unkown.')

        # Clip to the 1-5 rating scale.
        est = min(5.0, est)
        est = max(1.0, est)
        return est