In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.base import BaseEstimator
from sklearn.preprocessing import MinMaxScaler

from sparsesvd import sparsesvd
from scipy.sparse import csc_matrix

from tqdm import trange, tqdm

In [2]:
# Training ratings: tab-separated file, column names are supplied here.
df_train = pd.read_csv(
    'data/train.txt',
    sep='\t',
    names=['UserId', 'FilmId', 'Mark'],
)

# (user, film) pairs to predict: same layout, but without the mark column.
df_test = pd.read_csv(
    'data/test.txt',
    sep='\t',
    names=['UserId', 'FilmId'],
)

In [6]:
class ImplicitAlsBiased(BaseEstimator):
    """Biased ALS matrix factorisation with confidence weights on a dense rating matrix.

    Ratings are placed into a dense ``(n_users, n_items)`` matrix indexed by
    ``id - 1``.  Unobserved cells are filled with a baseline estimate
    (global mean + user bias + item bias, shrunk by ``decrease``), and user /
    item factors together with per-user and per-item biases are then fitted by
    alternating weighted least squares.

    Parameters
    ----------
    features : int
        Number of latent factors requested from the SVD initialisation.
    iterations : int
        Number of ALS epochs.
    alpha : float
        Scale of the confidence weights (see ``c_function``).
    eps : float
        Denominator inside the logarithm of the ``'log'`` confidence function.
    lr : float
        Despite its name this is used as the L2 regularisation strength: it
        multiplies an identity matrix that is scaled by the number of ratings
        of the user / item being solved for.
    decrease : float
        Shrinkage applied to the baseline fill of unobserved cells.
    c_function : {'linear', 'log'}
        Confidence function: ``1 + alpha * R`` or
        ``1 + alpha * log(1 + R / eps)``.
    use_test : bool
        When true, cells that appear in the test set get the baseline fill
        without the ``decrease`` shrinkage.
    init_mean, init_std, random_state, substract_mean
        Stored on the estimator but not read anywhere else in this class.
    """

    def __init__(self, features=4, iterations=20, alpha=5, eps=0.01, init_mean=0,
                 init_std=0.1, lr=0.07, random_state=None, decrease=0.85,
                 c_function='linear', use_test=True, substract_mean=True):

        self.features = int(features)
        self.iterations = int(iterations)
        self.init_mean = init_mean
        self.init_std = init_std
        self.lr = lr

        self.random_state = random_state
        self.alpha = alpha
        self.eps = eps
        self.use_test = use_test

        self.substract_mean = substract_mean
        self.c_function = c_function
        self.decrease = decrease

        self.confidence_func = self.__resolve_confidence()

    def __resolve_confidence(self):
        """Return the confidence function named by ``self.c_function``.

        Strings are compared by value (``==``); identity comparison only works
        by accident for interned literals.

        Raises
        ------
        ValueError
            If ``c_function`` is neither ``'linear'`` nor ``'log'``.
        """
        if self.c_function == 'linear':
            return self.__linear_conf
        if self.c_function == 'log':
            return self.__log_conf
        raise ValueError(
            "c_function must be 'linear' or 'log', got %r" % (self.c_function,))

    def __linear_conf(self, R):
        """Linear confidence: ``1 + alpha * R``."""
        C = 1 + self.alpha * R
        return C

    def __log_conf(self, R):
        """Logarithmic confidence: ``1 + alpha * log(1 + R / eps)``."""
        C = 1 + self.alpha * np.log(1 + R / self.eps)
        return C

    def __init_decomposition(self, R_train):
        """Run a truncated sparse SVD of the training matrix and keep both factor sets."""
        u, _, v = sparsesvd(csc_matrix(R_train), self.features)
        self.u = u
        self.v = v

    def __init_X(self, R_train, initialize=False, init_x=None):
        """Build the user matrix: a leading column of ones followed by the factors.

        With ``initialize`` the factors come from ``init_x``; otherwise from the
        transposed SVD result stored by ``__init_decomposition``.
        """
        if initialize:
            X = np.ones((self.n_users, 1 + self.features))
            X[:, 1:] = init_x
            return X
        return np.hstack([np.ones((R_train.shape[0], 1)), self.u.T])

    def __init_Y(self, R_train, initialize=False, init_y=None):
        """Build the item matrix: a leading column of ones followed by the factors.

        With ``initialize`` the factors come from ``init_y``; otherwise from the
        transposed SVD result stored by ``__init_decomposition``.
        """
        if initialize:
            Y = np.ones((self.n_items, 1 + self.features))
            Y[:, 1:] = init_y
            return Y
        return np.hstack([np.ones((R_train.shape[1], 1)), self.v.T])

    def __parse_df(self, df, train=True):
        """Scatter a ratings frame into a dense matrix of shape ``self.shape``.

        The first two columns are read positionally as 1-based user and item
        ids.  With ``train`` the third column supplies the cell value,
        otherwise the cell is set to 1.  ``None`` or an empty frame yields an
        all-zero matrix.
        """
        R = np.zeros(self.shape)
        if df is None or len(df) == 0:
            return R

        # Keep the last occurrence of each (user, item) pair so the result
        # matches a row-by-row overwrite; fancy assignment gives no ordering
        # guarantee for repeated indices.
        pairs = df.drop_duplicates(subset=list(df.columns[:2]), keep='last')
        rows = pairs.iloc[:, 0].to_numpy().astype(int) - 1
        cols = pairs.iloc[:, 1].to_numpy().astype(int) - 1
        if train:
            R[rows, cols] = pairs.iloc[:, 2].to_numpy()
        else:
            R[rows, cols] = 1
        return R

    def __bias_addition(self, R, R_test):
        """Fill unobserved cells of ``R`` with a shrunken baseline estimate.

        The baseline is global mean + user bias + item bias, where each bias is
        the mean residual over the observed (``> 0``) ratings of that user or
        item.  Cells equal to zero receive ``baseline * decrease``; when
        ``use_test`` is set, zero cells that are marked in ``R_test`` receive
        the baseline without shrinkage.
        """
        positive_mask = (R > 0)
        zero_mask = R == 0

        global_mean = R[positive_mask].mean()
        R_globalbias = R * positive_mask - positive_mask * global_mean

        # Users without any rating get a zero bias instead of 0/0 = NaN,
        # which would otherwise poison every later solve.
        user_pos_count = positive_mask.sum(1)
        user_pos_count[user_pos_count == 0] = 1
        user_bias = (R_globalbias.sum(1) / user_pos_count).reshape(-1, 1)
        R_userbias = R_globalbias * positive_mask - positive_mask * user_bias

        film_pos_count = positive_mask.sum(0)
        film_pos_count[film_pos_count == 0] = 1
        item_bias = (R_userbias.sum(0) / film_pos_count).reshape(1, -1)

        baseline = user_bias + item_bias + global_mean
        if self.use_test:
            test_mask = (R_test > 0)
            P = R + zero_mask * (~test_mask) * baseline * self.decrease + \
                zero_mask * test_mask * baseline
        else:
            P = R + zero_mask * baseline * self.decrease
        return P

    def fit(self, df_train, df_test=None, valid=False, retrain=False,
            initialize=False, init_x=None, init_y=None):
        """Fit biases and latent factors by alternating least squares.

        Parameters
        ----------
        df_train : DataFrame
            Must have ``UserId`` and ``FilmId`` columns; its first three
            columns are read positionally as user id, item id and mark.
        df_test : DataFrame or None
            Pairs to be predicted (first two columns).  May be omitted.
        valid : bool
            When true, RMSE on ``df_test`` and ``df_train`` is appended to
            ``test_scores`` / ``train_scores`` after every epoch; this needs
            marks in the third column of ``df_test``.
        retrain : bool
            When true, the third column of ``df_test`` is read as marks
            instead of marking test cells with 1.
        initialize, init_x, init_y
            When ``initialize`` is true the factor matrices start from
            ``init_x`` / ``init_y`` instead of the SVD result.

        Returns
        -------
        self
        """
        # Re-resolve here so a c_function changed through set_params is honoured.
        self.confidence_func = self.__resolve_confidence()

        if valid and df_test is None:
            raise ValueError("valid=True requires df_test with marks in its third column")

        n_users = int(df_train['UserId'].max())
        n_items = int(df_train['FilmId'].max())
        if df_test is not None and len(df_test) > 0:
            # Test ids beyond the training range must still fit in the matrix,
            # both while parsing and later in predict().
            n_users = max(n_users, int(df_test.iloc[:, 0].max()))
            n_items = max(n_items, int(df_test.iloc[:, 1].max()))
        self.shape = (n_users, n_items)
        self.n_users = n_users
        self.n_items = n_items

        R_train = self.__parse_df(df_train, train=True)
        R_test = self.__parse_df(df_test, train=retrain)

        rated = (R_train > 0)
        global_mean = R_train[rated].mean()
        global_std = R_train[rated].std()

        # Confidence weights are always derived from the training matrix; the
        # original code computed a use_test-dependent value first and then
        # unconditionally replaced it with this one ("added" by the author).
        # use_test therefore only influences the baseline fill below.
        C = self.confidence_func(R_train)

        self.__init_decomposition(R_train)
        X = self.__init_X(R_train, initialize, init_x)
        Y = self.__init_Y(R_train, initialize, init_y)

        user_bias = np.repeat(0.0, n_users)
        item_bias = np.repeat(0.0, n_items)

        numfilms_user = rated.sum(1)
        numusers_film = rated.sum(0)

        P = self.__bias_addition(R_train, R_test)

        # Sized from X so it still matches should the SVD initialisation
        # yield fewer than `features` factors.
        reg_eye = self.lr * np.eye(X.shape[1])
        self.train_scores = []
        self.test_scores = []

        for num_epoch in trange(self.iterations):
            # --- Users: solve for [user_bias, user_factors] against P - item_bias.
            Pgamma = P - item_bias[None, :]
            # Column 0 of Y holds the item biases after the item step; reset it
            # to ones so that column 0 of the solution is the user bias.
            Y[:, 0] = np.ones(n_items)

            Yt = Y.T
            YtY = np.matmul(Yt, Y)
            for u in range(n_users):
                lhs = YtY + np.matmul(Yt * (C[u, :] - 1), Y) + reg_eye * (numfilms_user[u])
                rhs = np.matmul(Yt * C[u, :], Pgamma[u, :])
                # solve() avoids forming the explicit inverse.
                X[u, :] = np.linalg.solve(lhs, rhs)
            user_bias = np.copy(X[:, 0])

            # --- Items: solve for [item_bias, item_factors] against P - user_bias.
            Pbeta = P - user_bias[:, None]
            # Column 0 of X now holds the user biases; reset it to ones.
            X[:, 0] = np.ones(n_users)

            Xt = X.T
            XtX = np.matmul(Xt, X)
            for i in range(n_items):
                lhs = XtX + np.matmul(Xt * (C[:, i] - 1), X) + reg_eye * (numusers_film[i])
                rhs = np.matmul(Xt * C[:, i], Pbeta[:, i])
                Y[i, :] = np.linalg.solve(lhs, rhs)
            item_bias = np.copy(Y[:, 0])

            # Published every epoch so that score() below sees the current state.
            self.user_bias = user_bias
            self.item_bias = item_bias
            self.X = X[:, 1:]
            self.Y = Y[:, 1:]
            self.global_mean = global_mean
            self.global_std = global_std

            if valid:
                test_score = self.score(df_test.iloc[:, :2].values,
                                        df_test.iloc[:, 2].values)
                train_score = self.score(df_train.iloc[:, :2].values,
                                         df_train.iloc[:, 2].values)
                self.train_scores.append(train_score)
                self.test_scores.append(test_score)

        return self

    def predict(self, test_data):
        """Predict marks for an array whose first two columns are 1-based user and item ids.

        The result is user bias + item bias + dot product of the factor rows.
        No global mean is added back: the model is fitted against raw
        (non-centred) marks.
        """
        test_data = np.asarray(test_data)
        users = test_data[:, 0].astype(int) - 1
        items = test_data[:, 1].astype(int) - 1

        score = self.user_bias[users] + self.item_bias[items]
        score = score + (self.X[users] * self.Y[items]).sum(1)
        return score

    def score(self, test_data, y_test):
        """Return the root-mean-squared error of ``predict(test_data)`` against ``y_test``."""
        y_pred = self.predict(test_data)
        return np.sqrt(np.mean((y_test - y_pred) ** 2))

In [7]:
class MeanModel:
    """Ensemble that averages the predictions of the wrapped models."""

    def __init__(self, *models):
        # Kept as the tuple of models passed positionally.
        self.models = models

    def predict(self, test_data):
        """Return the element-wise mean of every model's ``predict(test_data)``."""
        n_rows = test_data.shape[0]
        total = np.zeros(n_rows)
        for estimator in self.models:
            # Accumulate in place into the float buffer.
            np.add(total, estimator.predict(test_data), out=total)

        n_models = float(len(self.models))
        return total / n_models

In [9]:
# Hyper-parameter variants that make up the averaged ensemble.
ials1 = ImplicitAlsBiased(features=6, iterations=35, alpha=35, eps=0.1, lr=7.5)
ials2 = ImplicitAlsBiased(features=15, iterations=35, alpha=35, eps=0.1, lr=9.5)
ials3 = ImplicitAlsBiased(features=4, iterations=40, alpha=25, eps=0.1, lr=2)
ials5 = ImplicitAlsBiased(features=3, iterations=50, alpha=35, eps=0.1, lr=5)
ials6 = ImplicitAlsBiased(features=15, iterations=70, alpha=35, eps=0.1, lr=9)
ials10 = ImplicitAlsBiased(features=15, iterations=80, alpha=40, eps=0.1, lr=7)
ials13 = ImplicitAlsBiased(features=4, iterations=50, alpha=25, eps=0.1, lr=10)
ials14 = ImplicitAlsBiased(features=12, iterations=100, alpha=30, eps=0.1, lr=9)

ials_ensemble = [ials1, ials2, ials3, ials5, ials6, ials10, ials13, ials14]

# Fit every variant on the same data, in declaration order.
for ials_model in ials_ensemble:
    ials_model.fit(df_train, df_test)

clf_ials_mean = MeanModel(*ials_ensemble)

In [None]:
# Sample submission file (comma-separated); used as the template for the output.
df_sample = pd.read_csv(
    'data/sample.txt',
    sep=',',
)

def make_submission(clf, name='submission_v0', test_df=None, sample_df=None,
                    out_dir='results'):
    """Predict marks for the test pairs and write them as a submission CSV.

    Parameters
    ----------
    clf : object
        Anything with ``predict(array) -> array`` taking the test frame's values.
    name : str
        File name without extension; the file is written to ``<out_dir>/<name>.csv``.
    test_df : DataFrame, optional
        Pairs to predict.  Defaults to the notebook-level ``df_test``.
    sample_df : DataFrame, optional
        Submission template whose second column is replaced by the
        predictions.  Defaults to the notebook-level ``df_sample``.
    out_dir : str
        Output directory; created if it does not exist.

    Raises
    ------
    ValueError
        If the number of predictions does not match the template length
        (raised by pandas on assignment).
    """
    import os

    if test_df is None:
        test_df = df_test
    if sample_df is None:
        sample_df = df_sample

    test_data = test_df.values

    # Marks are bounded to the valid rating range [1, 5].
    marks = np.clip(np.asarray(clf.predict(test_data), dtype=float), 1, 5)

    sample = sample_df.copy()
    # Replace the whole column by label rather than writing through .iloc:
    # an in-place positional write of floats into an integer column is a
    # deprecated upcast in recent pandas.
    sample[sample.columns[1]] = marks

    path = os.path.join(out_dir, name + '.csv')
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    sample.to_csv(path, index=False)

In [None]:
# Write the averaged-ensemble predictions to results/subm_ials_mean_v8.csv.
make_submission(clf_ials_mean, name='subm_ials_mean_v8')