In [5]:
import pandas as pd


# Загрузка данных
data = pd.read_csv('u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
from sklearn.model_selection import train_test_split

data = data[['user_id', 'item_id', 'rating']]

# Сопоставление ID с индексами
users = data['user_id'].unique()
items = data['item_id'].unique()

user_to_index = {u: idx for idx, u in enumerate(users)}
item_to_index = {i: idx for idx, i in enumerate(items)}

data['user_idx'] = data['user_id'].map(user_to_index)
data['item_idx'] = data['item_id'].map(item_to_index)

# Разделение на train/test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data.head()

Unnamed: 0,user_id,item_id,rating,user_idx,item_idx
75220,807,1411,1,804,901
48955,474,659,5,467,488
44966,463,268,4,465,139
13568,139,286,4,321,289
92727,621,751,4,618,261


In [7]:
test_data.head()

Unnamed: 0,user_id,item_id,rating,user_idx,item_idx
75721,877,381,4,873,377
80184,815,602,3,808,601
19864,94,431,4,90,354
76699,416,875,2,409,570
92991,500,182,2,496,356


In [3]:
import time

from lfm import LatentFactorModel


n_users = len(users)
n_items = len(items)
model = LatentFactorModel(n_users=n_users, n_items=n_items, n_factors=20, lr=0.01, reg=0.02, n_epochs=30)

start = time.time()
model.fit(train_data)
end = time.time()

rmse, mae = model.evaluate(test_data)
print(f"Custom RMSE: {rmse:.4f}")
print(f"Custom MAE: {mae:.4f}")
print(f"Custom train time: {end - start:.4f}s")

Custom RMSE: 0.9739
Custom MAE: 0.7597
Custom train time: 44.4593s


In [4]:
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split


reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_builtin('ml-100k', prompt=False)
trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=42)

model_surprise = SVD(n_factors=20, lr_all=0.01, reg_all=0.02, n_epochs=30)
start_time = time.time()
model_surprise.fit(trainset)
end_time = time.time()

predictions = model_surprise.test(testset)
rmse_surprise = np.sqrt(mean_squared_error([p.r_ui for p in predictions], [p.est for p in predictions]))
mae_surprise = mean_absolute_error([p.r_ui for p in predictions], [p.est for p in predictions])

print(f"Surprise RMSE: {rmse_surprise:.4f}")
print(f"Surprise MAE: {mae_surprise:.4f}")
print(f"Surprise train time: {end_time - start_time:.4f}s")

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /Users/wignorbo/.surprise_data/ml-100k
Surprise RMSE: 0.9619
Surprise MAE: 0.7514
Surprise train time: 0.2001s
