In [17]:
import pandas as pd
import numpy as np
import scipy.sparse as scp

from matplotlib import pyplot as plt
%matplotlib inline

from tqdm import tqdm

In [18]:
# Labels assigned, in positional order, to the columns of the ratings CSV.
column_names = [
    'user_id',
    'film_rating',
    'film_id',
]

In [19]:
# Load at most `row_number` ratings. The file is read without a header row
# (header=None), so the column labels are taken from `column_names`.
row_number = 20352
data = pd.read_csv(
    'netflix_prize.csv',
    names=column_names,
    header=None,
    nrows=row_number,
)

In [20]:
# Preview the first five loaded ratings.
data.head(n=5)

Unnamed: 0,user_id,film_rating,film_id
0,1488844,3,1
1,822109,5,1
2,885013,4,1
3,30878,4,1
4,823519,3,1


## Encoding

In [None]:
from scipy.sparse import hstack
from scipy.sparse import diags

from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle

In [None]:
# One encoder per categorical column, so each fitted vocabulary (`categories_`)
# stays available afterwards. Refitting a single encoder on the second column
# would discard the categories learned from the first one.
film_encoder = OneHotEncoder(categories="auto")
user_encoder = OneHotEncoder(categories="auto")

# Each id column is reshaped to (n_rows, 1) because the encoder expects 2-D input.
film_matrix = film_encoder.fit_transform(np.asarray(data['film_id']).reshape(-1,1))
user_matrix = user_encoder.fit_transform(np.asarray(data['user_id']).reshape(-1,1))

# Backward compatibility: `encoder` used to end up fitted on the user ids.
encoder = user_encoder

In [None]:
# Design matrix: a constant bias column followed by the one-hot user and film blocks.
# The bias column is sized from the encoded data rather than from `row_number`,
# so the blocks still line up when the CSV holds fewer rows than requested.
ones = np.ones(shape=(user_matrix.shape[0], 1))
X = hstack([ones, user_matrix, film_matrix]).tocsr()
y = np.asarray(data['film_rating']).reshape(-1,1)

# Shuffle rows of X and y together; the fixed seed keeps the later fold
# composition (and therefore every reported metric) reproducible across runs.
X, y = shuffle(X, y, random_state=0)

## Factorization Machine

In [None]:
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2

def RMSE(y_test, y_pred):
    """Root mean squared error: the square root of ``MSE(y_test, y_pred)``."""
    mean_squared = MSE(y_test, y_pred)
    return np.sqrt(mean_squared)

In [None]:
# Show the dimensions of the design matrix and of the target as a quick sanity check.
(X.shape, y.shape)

In [None]:
class FactorizationMachine:
    """Second-order factorization machine regressor fitted by batch gradient descent.

    For a row ``x`` the prediction is::

        x . w  +  0.5 * sum_f ( (x . V[:, f])**2  -  (x**2) . (V[:, f]**2) )

    which is the linear term plus every pairwise interaction ``x_i * x_j``
    weighted by the inner product of rows ``i`` and ``j`` of ``V``. The model has
    no separate intercept; supply a constant column in ``X`` if one is needed.

    Parameters
    ----------
    learning_rate : float
        Gradient-descent step size.
    epochs : int or float
        Maximum number of full-batch updates (truncated with ``int``).
    min_weight_dist : float
        Training stops as soon as the Euclidean norm of one combined update of
        ``weight`` and ``factor`` falls below this value. Use ``0`` to always
        run all ``epochs``.
    factors_num : int
        Number of latent factors per feature (columns of ``factor``).
    weight, factor, error :
        Optional initial values for the attributes of the same name. ``fit``
        overwrites all three.
    init_scale : float
        Standard deviation of the normal distribution used to initialise
        ``factor``. All-zero factors are a stationary point of the loss (their
        gradient is exactly zero), so ``0`` disables the interaction term.
    random_state : None, int or numpy Generator
        Seed passed to ``numpy.random.default_rng`` for the factor initialisation.

    Attributes
    ----------
    weight : ndarray of shape (n_features, 1)
    factor : ndarray of shape (n_features, factors_num)
    error : list of float
        Training MSE measured at the start of each epoch, before that epoch's update.
    """

    def __init__(self,
                 learning_rate=1e-4, epochs=1e4, min_weight_dist=1e-4, factors_num=2,
                 weight=None, factor=None, error=None,
                 init_scale=1e-2, random_state=None):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.min_weight_dist = min_weight_dist
        self.factors_num = factors_num
        self.weight = weight
        self.factor = factor
        self.error = error
        self.init_scale = init_scale
        self.random_state = random_state

    @staticmethod
    def _as_sparse(X):
        """Return ``X`` as a CSR matrix, accepting dense array-likes as well."""
        if scp.issparse(X):
            return X.tocsr()
        return scp.csr_matrix(np.atleast_2d(np.asarray(X, dtype=float)))

    @staticmethod
    def _forward(X, X_squared, w, factor):
        """Return ``(predictions, X @ factor)`` for CSR ``X`` and its element-wise square."""
        projections = np.asarray(X.dot(factor))
        squared_terms = np.asarray(X_squared.dot(np.square(factor)))
        # Pairwise-interaction identity: 0.5 * ((sum_i v_i x_i)^2 - sum_i v_i^2 x_i^2).
        interactions = (np.square(projections).sum(axis=1) - squared_terms.sum(axis=1)) / 2
        linear = np.asarray(X.dot(w)).reshape(-1, 1)
        return linear + interactions.reshape(-1, 1), projections

    def predict(self, X, w=None, factor=None):
        """Predict targets for the rows of ``X``.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
        w : array-like of shape (n_features, 1), optional
            Linear weights; defaults to ``self.weight``.
        factor : array-like of shape (n_features, factors_num), optional
            Latent factors; defaults to ``self.factor``.

        Returns
        -------
        ndarray of shape (n_samples, 1)

        Raises
        ------
        ValueError
            If no parameters are available (neither fitted nor passed in).
        """
        if w is None:
            w = self.weight
        if factor is None:
            factor = self.factor
        if w is None or factor is None:
            raise ValueError("model is not fitted: call fit() or pass w and factor explicitly")

        X = self._as_sparse(X)
        prediction, _ = self._forward(X, X.power(2),
                                      np.asarray(w, dtype=float),
                                      np.asarray(factor, dtype=float))
        return prediction

    def fit(self, X, y):
        """Fit ``weight`` and ``factor`` by minimising the mean squared error.

        Every epoch computes the residual of the full model once, records the
        MSE in ``self.error``, and then moves ``weight`` and ``factor``
        together along the negative gradient.

        Parameters
        ----------
        X : sparse matrix or array-like of shape (n_samples, n_features)
        y : array-like with n_samples values (any shape that flattens to that length)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` has no rows or ``y`` does not have one value per row of ``X``.
        """
        X = self._as_sparse(X)
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty design matrix")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} rows but y has {y.shape[0]} values")

        rng = np.random.default_rng(self.random_state)
        self.weight = np.zeros((n_features, 1))
        # Small random start: with all-zero factors both gradient terms vanish
        # and the interaction part could never be learned.
        self.factor = rng.normal(scale=self.init_scale,
                                 size=(n_features, self.factors_num))
        self.error = []

        # Loop-invariant pieces of the gradient.
        X_squared = X.power(2)
        X_t = X.T.tocsr()
        X_squared_t = X_squared.T.tocsr()
        step = 2 * self.learning_rate / n_samples

        for _ in range(int(self.epochs)):
            y_pred, projections = self._forward(X, X_squared, self.weight, self.factor)
            residual = y - y_pred
            self.error.append(float(np.mean(np.square(residual))))

            D_weight = step * X_t.dot(residual)
            # d y_hat / d V[i, f] = x_i * (x . V[:, f]) - V[i, f] * x_i**2
            D_factor = step * (X_t.dot(projections * residual)
                               - self.factor * X_squared_t.dot(residual))

            # Rebind instead of updating in place, so arrays handed out earlier
            # (e.g. stored per fold by the caller) are never mutated.
            self.weight = self.weight + D_weight
            self.factor = self.factor + D_factor

            update_norm = np.sqrt(np.sum(np.square(D_weight)) + np.sum(np.square(D_factor)))
            if update_norm < self.min_weight_dist:
                break

        return self

In [None]:
# Hyper-parameters of the cross-validated run.
learning_rate = 5e-2
min_weight_dist = 1e-4
epochs = 1000
number_of_splits = 5  # number of CV folds; also used to label the result table below

model_factorization = FactorizationMachine(epochs=epochs, learning_rate=learning_rate, min_weight_dist=min_weight_dist)

# Per-fold fitted parameters and training-error histories.
model_weight = []
model_factor = []
model_error = []

# Per-fold metrics on the training and held-out parts.
MSE_train = []
MSE_test  = []
RMSE_train = []
RMSE_test  = []
R2_train  = []
R2_test   = []

kf = KFold(n_splits=number_of_splits)
# kf.split() is a generator without a length, so tqdm needs the total explicitly.
for train_index, test_index in tqdm(kf.split(X), total=number_of_splits):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    model_factorization.fit(X_train, y_train)

    # The same model object is refitted on every fold, so store copies.
    model_weight.append(np.array(model_factorization.weight))
    model_factor.append(np.array(model_factorization.factor))
    # Store this fold's error history (previously the list was appended to itself).
    model_error.append(list(model_factorization.error))

    y_train_pred = model_factorization.predict(X_train)
    y_test_pred = model_factorization.predict(X_test)

    MSE_train.append(MSE(y_train, y_train_pred))
    MSE_test.append(MSE(y_test, y_test_pred))
    RMSE_train.append(RMSE(y_train, y_train_pred))
    RMSE_test.append(RMSE(y_test, y_test_pred))
    R2_train.append(R2(y_train, y_train_pred))
    R2_test.append(R2(y_test, y_test_pred))

In [None]:
# Training-error curve of every fold on one set of axes. Only `plt` is imported
# in this notebook, so the plotting calls go through it explicitly.
fig, ax = plt.subplots()
for fold, error in enumerate(model_error, start=1):
    ax.plot(range(len(error)), error, label=f'Fold {fold}')
ax.set_xlabel('Iteration')
ax.set_ylabel('MSE')
ax.set_title('Training MSE per fold')
if model_error:
    ax.legend();

In [None]:
# Number of folds actually evaluated, taken from the collected per-fold weights.
number_of_splits = len(model_weight)

# Column layout of the result table: one column per fold plus summary statistics.
columns = ["Names", *[f'T{i + 1}' for i in range(number_of_splits)], "Mean", "Std"]

# Row layout: the six metrics first, then one row per learned linear weight.
rows =  ['MSE Train', 'MSE Test', 'RMSE Train', 'RMSE Test', 'R2 Train', 'R2 Test']
rows += [f"Feature {i + 1}" for i in range(len(model_weight[0]))]

In [None]:
# `columns` is ["Names", <one label per fold>, "Mean", "Std"]: the fold columns
# are everything between the index label and the two trailing summary columns.
fold_columns = columns[1:-2]

result_dataframe = pd.DataFrame(index=pd.Index(rows, name="Names"), columns=columns[1:], dtype=float)

for i, fold_column in enumerate(fold_columns):
    # Named `fold_values` so the ratings DataFrame `data` is not overwritten.
    fold_values = np.concatenate((
        [MSE_train[i], MSE_test[i], RMSE_train[i], RMSE_test[i], R2_train[i], R2_test[i]],
        model_weight[i].reshape((-1,)),
    ))
    result_dataframe[fold_column] = fold_values

# Summaries are computed over the fold columns only, so the mean column does
# not leak into the standard deviation.
result_dataframe["Mean"] = result_dataframe[fold_columns].mean(axis=1)
result_dataframe["Std"] = result_dataframe[fold_columns].std(axis=1)

In [None]:
# Display the per-fold metrics and weights together with their mean and std.
result_dataframe