In [1]:
import pandas as pd
import numpy as np

In [2]:
import gc

class BaselinePredictor:
    def __init__(self, item_lambda: int or float=25, user_lambda: int or float=10):
        self.item_lambda = item_lambda
        self.user_lambda = user_lambda

    def fit(self, train_df: 'pd.DataFrame'):
        self.mu = train_df.rating.mean()

        group_by_item = train_df.groupby('movieId')
        self.item_bias = {
            movie_id: (sum(group.rating) - group.size * self.mu) / (self.item_lambda + group.size)
            for movie_id, group in group_by_item
        }

        group_by_user = train_df.groupby('userId')
        self.user_bias = {
            user_id: (sum(group.rating) - group.size * self.mu - self._calc_sum_item_bias(group.movieId)) / (self.user_lambda + group.size)
            for user_id, group in group_by_user
        }
    
    def _calc_sum_item_bias(self, ids: list):
        return sum([
            self.item_bias[movie_id]
            for movie_id in ids
        ])

    def predict(self, test_df: 'pd.DataFrame'):
        return test_df.apply(lambda x: self._predict(x.userId, x.movieId), axis=1).values

    def _predict(self, u: int, i: int) -> float:
        return self.mu + self.user_bias.get(u, 0) + self.item_bias.get(i, 0)

    def clear(self):
        del self.item_bias, self.user_bias
        gc.collect()

In [3]:
train_df = pd.read_csv('data/train_ratings.csv')

In [4]:
param_grid = {
    'item_lambda': [2 ** i for i in range(10)],
    'user_lambda': [2 ** i for i in range(10)]
}

In [None]:
from sklearn.model_selection import KFold
kf = KFold()

In [None]:
for item_lambda in param_grid['item_lambda']:
    for user_lambda in param_grid['user_lambda']:
        bp = BaselinePredictor(item_lambda, user_lambda)
        rmse = []
        for index, (train_index, test_index) in enumerate(kf.split(train_df)):
            print(f'START #{index}')
            bp.fit(train_df.iloc[train_index, :])
            y_pred = bp.predict(train_df.iloc[test_index, :])
            bp.clear()
            rmse.append(np.sqrt(((train_df.iloc[test_index, :].rating.values - y_pred) ** 2).mean()))
            del y_pred
            print(f'END #{index}')
            
        print(f'item_lambda: {item_lambda}, user_lambda: {user_lambda}, rmse: {sum(rmse) / len(rmse)}')

START #0
END #0
START #1
END #1
START #2
END #2
item_lambda: 1, user_lambda: 1, rmse: 4.720782413395134
START #0
