In [2]:
import numpy as np
from scipy.sparse import *
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold 


Read dataset

In [3]:
# Load the ratings table and discard the two columns that are not used as features.
dataset = pd.read_csv('netflixSorted.csv', sep=',').drop(columns=['Previous Movie', 'Date'])

# Target vector: the 'Rate' column as a NumPy array.
YY = dataset['Rate'].values
# Feature frame: every remaining column except the target.
XX = dataset.drop(columns=['Rate'])


OneHotEncoder

In [4]:
# One-hot encode every feature column. The encoder returns a SciPy sparse matrix
# by default, so no keyword is needed; the former `sparse=True` argument was
# renamed to `sparse_output` in scikit-learn 1.2 and later removed, which makes
# the explicit keyword fail on current releases.
XX = OneHotEncoder().fit_transform(XX.values)

In [5]:
def sumOfDiff(A, B):
    """Return the sum of squared element-wise differences between A and B.

    A and B are array-likes. Only the first ``len(A)`` entries of B take part
    in the sum; extra trailing entries of B are ignored.

    Raises:
        IndexError: if B has fewer entries than A.
    """
    a = np.asarray(A)
    b = np.asarray(B)
    if len(b) < len(a):
        # Explicit check: without it a length-1 B would silently broadcast.
        raise IndexError("B has fewer elements than A")
    # Reduce along the first axis only, so 1-D inputs give a scalar and
    # row-wise inputs give one total per column.
    return np.sum((a - b[:len(a)]) ** 2, axis=0)

def getErr(Y, y_pred, N):
    """Return the root-mean-square error between Y and y_pred.

    The squared differences are totalled by ``sumOfDiff`` and divided by N,
    the sample count supplied by the caller, before the square root is taken.
    Only the RMSE is returned.
    """
    squared_error_total = sumOfDiff(Y, y_pred)
    mean_squared_error = squared_error_total / N
    return np.sqrt(mean_squared_error)



Факторизационная машина 2-го порядка с квадратичной функцией потерь (аналогично линейной регрессии)


In [10]:
class factorization_machine:
    """Second-order factorization machine trained by mini-batch gradient steps
    on the squared error.

    The object keeps no parameters of its own: the linear weights ``W`` and the
    factor matrix ``V`` are passed into every method and handed back by ``fit``.
    No bias term is modelled.
    """

    def get_batches(self, dataset, batch_size):
        """Yield ``(X_batch, Y_batch)`` pairs covering ``dataset`` once, in random order.

        dataset: tuple ``(X, Y)``; both must accept an integer index array and
            ``X`` must expose ``shape[0]``.
        batch_size: maximum number of rows per batch (the last one may be smaller).
        """
        X, Y = dataset
        n_samples = X.shape[0]

        # Shuffle at the start of epoch
        indices = np.arange(n_samples)
        np.random.shuffle(indices)

        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_idx = indices[start:end]

            yield X[batch_idx], Y[batch_idx]

    def prediction(self, X, W, V, X2, X_V, n):
        """Return the model output for every row of ``X``.

        X: feature matrix (rows are samples).
        W: linear weights; reshaped to ``(n, 1)`` here.
        V: factor matrix with one row per feature.
        X2: element-wise square of ``X`` (callers pass ``X.power(2)``).
        X_V: the product ``X.dot(V)``, precomputed by the caller.
        n: number of features.

        The result is ``X.W + 0.5 * sum_f((X V)^2 - X^2 V^2)``, where the sum
        runs over the factor columns and is kept as a column (``keepdims``).
        """
        tmp = np.sum(X_V**2 - (X2).dot(V**2), axis=1, keepdims=True)
        return X.dot(W.reshape(n, 1)) + np.multiply(1/2, tmp)

    def cost(self, curr, pred):
        """Return the residual ``pred - curr`` with ``curr`` reshaped to a column."""
        return np.subtract(pred, curr.reshape(len(curr), 1))

    def step(self, cost, x, n, W, learning_rate):
        """Return the linear weights after one gradient step.

        Computes ``W - learning_rate * x.T.dot(cost)`` with ``W`` reshaped to
        ``(n, 1)``; ``cost`` is the residual column produced by ``cost()``.
        """
        return W.reshape(n, 1) - np.multiply(learning_rate, (x.T).dot(cost))

    def fit(self, X, Y, W, V, learning_rate):
        """Apply one gradient step on the batch ``(X, Y)`` and return ``(W, V)``.

        X: sparse feature matrix of the batch (must provide ``power`` and ``dot``).
        Y: target values of the batch, one per row of ``X``.
        W: current linear weights, any shape holding ``X.shape[1]`` values.
        V: current factor matrix, one row per feature; it is updated in place
           and the same array is returned.
        learning_rate: step size applied to both ``W`` and ``V``.

        The returned ``W`` is a new ``(n_features, 1)`` array.
        """
        # Sizes come from the arguments so the method does not depend on
        # notebook-level globals.
        n_features = X.shape[1]
        X2 = X.power(2)
        X_V = X.dot(V)

        y_pred = self.prediction(X, W, V, X2, X_V, n_features)
        loss = self.cost(Y, y_pred)

        W = self.step(loss, X, n_features, W, learning_rate)
        loss = np.multiply(learning_rate, loss)

        # Gradient step for all factor columns at once. For column j the update is
        #   sum_s loss_s * (X[s, i] * X_V[s, j] - X2[s, i] * V[i, j]),
        # which equals (X.T @ (loss * X_V))[i, j] - V[i, j] * (X2.T @ loss)[i].
        # Both terms use the pre-update V, as X_V was computed before the step.
        V -= X.T.dot(np.multiply(loss, X_V)) - np.multiply(V, X2.T.dot(loss))

        return W, V

Кросс-валидация

In [15]:
def cross_validation(kk):
    """Run 3-fold cross-validation of the factorization machine and print the RMSE table.

    kk: learning-rate scale. For epoch ``i`` and a batch of ``m`` rows the step
        size is ``kk / (sqrt(i + 1) * m)``.

    Reads the notebook-level names ``XX`` (feature matrix), ``YY`` (targets),
    ``k`` (number of factors), ``epochs`` and ``batch_size``. For every fold the
    weights are drawn afresh from ``np.random.random``, trained for ``epochs``
    passes, and scored on the train and test parts. Progress markers and both
    RMSE values are printed per fold. At the end a table with one column per
    fold plus their mean (``E``) and standard deviation (``STD``) is printed.
    Nothing is returned.
    """
    kf = KFold(n_splits=3)
    n = XX.shape[1]
    fold_scores = {}

    for count, (train_i, test_i) in enumerate(kf.split(XX)):
        print(count)
        print('begin')

        factorizationMachine = factorization_machine()

        X_train, X_test = XX[train_i], XX[test_i]
        Y_train, Y_test = YY[train_i], YY[test_i]

        # Fresh random parameters for every fold.
        W = np.random.random(n)
        V = np.random.random((n, k))

        for i in range(epochs):
            print(i)
            for x_batch, y_batch in factorizationMachine.get_batches((X_train, Y_train), batch_size):
                # Step size decays with the epoch number and is averaged over the batch rows.
                learning_rate = kk / ((np.sqrt(i + 1)) * x_batch.shape[0])
                W, V = factorizationMachine.fit(x_batch, y_batch, W, V, learning_rate)

        rmse_train = _fold_rmse(factorizationMachine, X_train, Y_train, W, V, n)
        print(rmse_train)

        rmse_test = _fold_rmse(factorizationMachine, X_test, Y_test, W, V, n)
        print(rmse_test)

        # Row order matches the index labels assigned below: test first, then train.
        fold_scores["T" + str(count + 1)] = [rmse_test, rmse_train]
        print('end')

    # Build the summary once, after all folds are known.
    data = pd.DataFrame(fold_scores, index=["RMSE_test", "RMSE_train"])
    fold_columns = list(fold_scores)
    data["E"] = data[fold_columns].mean(axis=1)
    data["STD"] = data[fold_columns].std(axis=1)
    print(data)


def _fold_rmse(model, X, Y, W, V, n):
    """Return the RMSE of ``model.prediction`` on ``(X, Y)`` for the weights ``W`` and ``V``."""
    X2 = X.power(2)
    X_V = X.dot(V)
    y_pred = model.prediction(X, W, V, X2, X_V, n).reshape(-1)
    return getErr(Y, y_pred, len(Y))


In [16]:
# Hyper-parameters read as globals by cross_validation().
# k: number of factor columns in V (V is created with shape (n, k)).
k = 3
# epochs: number of passes over each training fold.
epochs = 5
# batch_size: maximum number of rows handed to one fit() call.
batch_size = 1000

# 1.6 is the learning-rate scale `kk`; the step size is kk / (sqrt(epoch + 1) * batch rows).
cross_validation(1.6)


0
begin
0
1
2
3
4
0.9654084885451145
1.0384698451695498
end
1
begin
0
1
2
3
4
0.9781451498958771
0.9989821254404299
end
2
begin
0
1
2
3
4
0.9525943769699476
1.1229277419311903
end
                  T1        T2        T3         E       STD
RMSE_test   1.038470  0.998982  1.122928  1.053460  0.063318
RMSE_train  0.965408  0.978145  0.952594  0.965383  0.012775


<module 'scipy.sparse.data' from 'C:\\Users\\ivm\\Anaconda3\\lib\\site-packages\\scipy\\sparse\\data.py'>
