# Non-negative Matrix Factorization

### Ограничения Matrix Factorization

Значения в $P$ и $Q$ в MF трудно интерпретировать, т.к. компоненты могут принимать любые значения, в том числе отрицательные.

### Non-negative Matrix Factorization
### Функция сходства

Евклидово расстояние используется в NMF и определяется как
\begin{equation}
J = \frac{1}{2}\sum_{(u,i) \in \kappa}||R_{u,i} - P_uQ_i^{\top}||^2 + \lambda_P||P_u||^2 + \lambda_Q||Q_i||^2
\end{equation}

Цель: минимизировать $J$, оптимизируя параметры $P$ и $Q$; $\lambda_P$ и $\lambda_Q$ — параметры регуляризации.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import os

In [None]:
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
def ttsplit(examples, labels, test_size=0.1, verbose=0):
    """Split (user, item) examples and their labels into train and test sets.

    The split is shuffled with a fixed ``random_state=42`` so it is
    reproducible between runs.

    Args:
        examples: 2-D array whose first two columns are the user id and
            the item id of each rating.
        labels: array of ratings aligned with ``examples``.
        test_size: fraction of the samples placed in the test set; forwarded
            to ``sklearn.model_selection.train_test_split``.
        verbose: when truthy, print the split proportions and the shapes of
            the resulting arrays.

    Returns:
        ``((x_train, x_test), (y_train, y_test))`` where the ``x_*`` arrays
        hold (user, item) pairs and the ``y_*`` arrays the matching labels.
    """
    from sklearn.model_selection import train_test_split

    if verbose:
        print("Train/Test split ")
        print(100-test_size*100, "% of training data")
        print(test_size*100, "% of testing data")

    # Honour the caller's test_size (it used to be hard-coded to 0.1 here,
    # so the argument only affected the printed message).
    train_examples, test_examples, train_labels, test_labels = train_test_split(
        examples,
        labels,
        test_size=test_size,
        random_state=42,
        shuffle=True,
    )

    # Keep only the (user, item) columns; copy so the results do not alias
    # the arrays returned by train_test_split.
    x_train = train_examples[:, :2].copy()
    x_test = test_examples[:, :2].copy()

    y_train = train_labels
    y_test = test_labels

    if verbose:
        print()
        print('number of training examples : ', x_train.shape)
        print('number of training labels : ', y_train.shape)
        print('number of test examples : ', x_test.shape)
        print('number of test labels : ', y_test.shape)

    return (x_train, x_test), (y_train, y_test)


def mean_ratings(dataframe):
    """Return the mean rating of every user.

    The result is a DataFrame with one row per ``userId`` and the columns
    ``userId`` and ``rating`` (the per-user mean).
    """
    per_user = dataframe.groupby(by='userId', as_index=False)
    return per_user['rating'].mean()


def normalized_ratings(dataframe, norm_column="norm_rating"):
    """Centre every rating on the mean rating of the user who gave it.

    The per-user means are merged back on ``userId`` (as ``rating_mean``)
    and the difference ``rating - rating_mean`` is stored in ``norm_column``.

    Returns:
        The merged DataFrame: the input columns plus ``rating_mean`` and
        ``norm_column``.
    """
    user_means = mean_ratings(dataframe=dataframe)
    merged = dataframe.merge(user_means, on='userId', suffixes=('', '_mean'))
    merged[f'{norm_column}'] = merged['rating'].sub(merged['rating_mean'])
    return merged


def rating_matrix(dataframe, column):
    """Build a sparse user x item matrix from the values in ``column``.

    Rows follow ``userId`` and columns follow ``movieId``; values sharing a
    (user, item) pair are summed and pairs without a value become 0.
    """
    table = pd.crosstab(
        index=dataframe.userId,
        columns=dataframe.movieId,
        values=dataframe[f'{column}'],
        aggfunc=sum,
    )
    dense = table.fillna(0).values
    return csr_matrix(dense)


def scale_ratings(dataframe, scaled_column="scaled_rating"):
    """Add a column holding ``rating / 5.0`` and return the same DataFrame.

    The input frame is modified in place; the new column is named
    ``scaled_column``.
    """
    target = f"{scaled_column}"
    dataframe[target] = dataframe.rating.div(5.0)
    return dataframe


def get_examples(dataframe, labels_column="rating"):
    """Return ``(examples, labels)`` as NumPy arrays.

    ``examples`` holds the ``userId`` / ``movieId`` pairs, one row per
    rating; ``labels`` holds the values of ``labels_column``.
    """
    pair_columns = ['userId', 'movieId']
    return dataframe[pair_columns].values, dataframe[f'{labels_column}'].values

In [None]:
def ids_encoder(ratings):
    """Replace the user and movie ids with label-encoded integer ids.

    One ``LabelEncoder`` is fitted on the sorted unique ``userId`` values and
    another on the sorted unique ``movieId`` values. Both columns of
    ``ratings`` are then overwritten in place with the encoded ids.

    Returns:
        ``(ratings, uencoder, iencoder)``: the modified DataFrame and the
        fitted user / item encoders.
    """
    # fit one encoder per id space
    uencoder = LabelEncoder().fit(sorted(ratings['userId'].unique()))
    iencoder = LabelEncoder().fit(sorted(ratings['movieId'].unique()))

    # overwrite the original ids with their encoded values
    ratings.userId = uencoder.transform(ratings.userId.tolist())
    ratings.movieId = iencoder.transform(ratings.movieId.tolist())

    return ratings, uencoder, iencoder

### Данные

In [None]:
# Load the raw ratings and movie metadata from CSV files in the working directory.
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

m = ratings.userId.nunique()   # total number of users
n = ratings.movieId.nunique()   # total number of items

# Overwrite userId / movieId with label-encoded ids and keep the encoders.
ratings, uencoder, iencoder = ids_encoder(ratings)

# extract (userId, movieId) pairs and their ratings as arrays
raw_examples, raw_labels = get_examples(ratings)

# train test split
(x_train, x_test), (y_train, y_test) = ttsplit(examples=raw_examples, labels=raw_labels)

### Non-negative Matrix Factorization

In [None]:
class NMF:
    """Non-negative matrix factorization recommender.

    A rating is approximated by ``dot(P[u], Q[i])`` where ``P`` (users x K)
    and ``Q`` (items x K) are initialised with uniform random values in
    [0, 1) and refined with multiplicative update rules.

    NOTE: the updates read every row of the ``ratings`` frame given to the
    constructor. If that frame also contains the rows used as validation
    data, those rows influence training.
    """

    def __init__(self, ratings, m, n, uencoder, iencoder, K=10, lambda_P=0.01, lambda_Q=0.01):
        """Initialise the factor matrices and hyper-parameters.

        Args:
            ratings: DataFrame whose first three columns are the encoded
                user id, the encoded item id and the rating (the update
                rule accesses them by position).
            m: number of users (rows of ``P``).
            n: number of items (rows of ``Q``).
            uencoder: fitted encoder mapping raw user ids to row indices.
            iencoder: fitted encoder mapping raw item ids to row indices.
            K: number of latent factors.
            lambda_P: regularization strength for ``P``.
            lambda_Q: regularization strength for ``Q``.
        """
        # Seeds NumPy's global generator so that P and Q are reproducible.
        np.random.seed(32)

        # initialise the P / Q matrices with the requested dimensions
        self.ratings = ratings
        self.np_ratings = ratings.to_numpy()
        self.K = K
        self.P = np.random.rand(m, K)
        self.Q = np.random.rand(n, K)

        # hyper-parameters
        self.lambda_P = lambda_P
        self.lambda_Q = lambda_Q

        # encoders
        self.uencoder = uencoder
        self.iencoder = iencoder

        # per-epoch training history
        self.history = {
            "epochs": [],
            "loss": [],
            "val_loss": [],
        }

    def print_training_parameters(self):
        """Print a short banner with the number of latent factors."""
        print('Training NMF ...')
        print(f'k={self.K}')

    def mae(self, x_train, y_train):
        """Return the mean absolute error of the predictions.

        Args:
            x_train: iterable of (user index, item index) pairs.
            y_train: ratings aligned with ``x_train``.
        """
        # number of samples
        m = x_train.shape[0]
        error = 0
        for pair, r in zip(x_train, y_train):
            u, i = pair
            error += abs(r - np.dot(self.P[u], self.Q[i]))
        return error / m

    def update_rule(self, u, i, error):
        """Apply one multiplicative update to ``P[u]`` and then to ``Q[i]``.

        This is the main difference from plain MF: each factor vector is
        multiplied by a ratio of non-negative terms instead of being moved
        along a gradient.

        Args:
            u: encoded user index.
            i: encoded item index.
            error: accepted for interface compatibility; not used by the
                multiplicative update.
        """
        all_ratings = self.np_ratings

        # Rows rated by user u. The ids are cast to int because the array is
        # float whenever the rating column is float, and float arrays
        # cannot be used as indices.
        by_user = all_ratings[all_ratings[:, 0] == u]
        items = by_user[:, 1].astype(int)
        Q_u = self.Q[items]

        num = self.P[u] * np.dot(Q_u.T, by_user[:, 2])
        dem = np.dot(Q_u.T, np.dot(Q_u, self.P[u])) + self.lambda_P * len(items) * self.P[u]
        self.P[u] = num / dem

        # Rows that rated item i (uses the freshly updated P[u]).
        by_item = all_ratings[all_ratings[:, 1] == i]
        users = by_item[:, 0].astype(int)
        P_i = self.P[users]

        num = self.Q[i] * np.dot(P_i.T, by_item[:, 2])
        dem = np.dot(P_i.T, np.dot(P_i, self.Q[i])) + self.lambda_Q * len(users) * self.Q[i]
        self.Q[i] = num / dem

    @staticmethod
    def print_training_progress(epoch, epochs, error, val_error, steps=5):
        """Print the losses on the first epoch and then every ``steps`` epochs."""
        if epoch == 1 or epoch % steps == 0:
            print(f"epoch {epoch}/{epochs} - loss : {round(error, 3)} - val_loss : {round(val_error, 3)}")

    def fit(self, x_train, y_train, validation_data, epochs=10):
        """Train the model and return the training history.

        Args:
            x_train: array of (user index, item index) pairs.
            y_train: ratings aligned with ``x_train``.
            validation_data: ``(x_test, y_test)`` used for ``val_loss``.
            epochs: number of passes over ``x_train``.

        Returns:
            The ``history`` dict with the keys ``epochs``, ``loss`` and
            ``val_loss``.
        """
        self.print_training_parameters()
        x_test, y_test = validation_data
        for epoch in range(1, epochs+1):
            for pair, r in zip(x_train, y_train):
                u, i = pair
                r_hat = np.dot(self.P[u], self.Q[i])
                e = abs(r - r_hat)
                self.update_rule(u, i, e)
            # training and validation error after this epoch
            error = self.mae(x_train, y_train)
            val_error = self.mae(x_test, y_test)
            self.update_history(epoch, error, val_error)
            self.print_training_progress(epoch, epochs, error, val_error, steps=1)

        return self.history

    def update_history(self, epoch, error, val_error):
        """Append the results of one epoch to ``self.history``."""
        self.history['epochs'].append(epoch)
        self.history['loss'].append(error)
        self.history['val_loss'].append(val_error)

    def evaluate(self, x_test, y_test):
        """Print and return the MAE on ``(x_test, y_test)``."""
        error = self.mae(x_test, y_test)
        print(f"validation error : {round(error,3)}")
        print('MAE : ', error)
        return error

    def predict(self, userid, itemid):
        """Return the predicted rating for a raw ``userid`` / ``itemid`` pair."""
        u = self.uencoder.transform([userid])[0]
        i = self.iencoder.transform([itemid])[0]
        r = np.dot(self.P[u], self.Q[i])
        return r

    def recommend(self, userid, N=10):
        """Return the ``N`` items with the highest predicted rating.

        Args:
            userid: raw (un-encoded) user id.
            N: number of items to return.

        Returns:
            ``(top_items, preds)``: the raw item ids and their predicted
            ratings, ordered from highest to lowest prediction.
        """
        # Use the encoder stored on the instance, not a module-level global.
        u = self.uencoder.transform([userid])[0]

        # predicted rating of every item for this user
        predictions = np.dot(self.P[u], self.Q.T)

        # indices of the top N predictions; they must be computed before
        # being decoded back to raw item ids
        top_idx = np.flip(np.argsort(predictions))[:N]
        top_items = self.iencoder.inverse_transform(top_idx)
        preds = predictions[top_idx]

        return top_items, preds


### Обучение NMF

Параметры :

- $k = 10$ кол-во факторов
- $\lambda_P = 0.6$
- $\lambda_Q = 0.6$
- epochs = 10

In [None]:
# Number of distinct users / items; these set the row counts of P and Q.
m = ratings['userId'].nunique()
n = ratings['movieId'].nunique()

# train the model
nmf = NMF(ratings, m, n, uencoder, iencoder, K=10, lambda_P=0.6, lambda_Q=0.6)
history = nmf.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test))

In [None]:
# Print and return the MAE on the test split.
nmf.evaluate(x_test, y_test)

validation error : 1.41
MAE :  1.4096612193965772


1.4096612193965772

## NMF with Scikit-surprise

In [None]:
from surprise import NMF
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

# build a surprise Dataset from the ratings frame (rating scale declared as 1..5)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# NMF
# NOTE: `NMF` here is the surprise implementation imported in this cell; it
# shadows the NMF class defined earlier in the notebook.
nmf = NMF(n_factors=10, n_epochs=10)

# 5-fold cross-validation reporting MAE
history = cross_validate(nmf, data, measures=['MAE'], cv=5, verbose=True)

Evaluating MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.9035  0.9458  0.9374  0.9207  0.9546  0.9324  0.0183  
Fit time          0.31    0.44    0.36    0.30    0.39    0.36    0.05    
Test time         0.09    0.09    0.08    0.14    0.12    0.10    0.02    


Средний результат **mae = 0.93**