In [42]:
import time
import numpy as np
import pandas as pd
from typing import Tuple, List
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_split
from surprise import accuracy
from LMF import LatentFactorModel


import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [43]:
df = pd.read_csv(
    "./ml-100k/u.data",
    sep="\t",
    names=["user", "item", "rating", "timestamp"],
)

print(f"Number of users: {df['user'].nunique()}")
print(f"Number of items: {df['item'].nunique()}")
print(f"Number of ratings: {len(df)}")

Number of users: 943
Number of items: 1682
Number of ratings: 100000


In [44]:
user_map = {id_: idx for idx, id_ in enumerate(df['user'].unique())}
item_map = {id_: idx for idx, id_ in enumerate(df['item'].unique())}

df['user_idx'] = df['user'].map(user_map)
df['item_idx'] = df['item'].map(item_map)

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_ratings = list(zip(train_df['user_idx'], train_df['item_idx'], train_df['rating']))
test_ratings = list(zip(test_df['user_idx'], test_df['item_idx'], test_df['rating']))

In [45]:
import time

n_users = df['user_idx'].nunique()
n_items = df['item_idx'].nunique()


lfm = LatentFactorModel(n_users, n_items, n_epochs=15, learning_rate=0.01, reg=0.05)

start = time.time()
lfm.fit(train_ratings)
lfm_time = time.time() - start

lfm_preds = [lfm.predict(u, i) for u, i, _ in test_ratings]
lfm_true = [r for _, _, r in test_ratings]

lfm_rmse = mean_squared_error(lfm_true, lfm_preds)
lfm_mae = mean_absolute_error(lfm_true, lfm_preds)

Epoch 1/15 | Train RMSE: 0.9941
Epoch 2/15 | Train RMSE: 0.9311
Epoch 3/15 | Train RMSE: 0.9115
Epoch 4/15 | Train RMSE: 0.8997
Epoch 5/15 | Train RMSE: 0.8908
Epoch 6/15 | Train RMSE: 0.8829
Epoch 7/15 | Train RMSE: 0.8750
Epoch 8/15 | Train RMSE: 0.8664
Epoch 9/15 | Train RMSE: 0.8567
Epoch 10/15 | Train RMSE: 0.8459
Epoch 11/15 | Train RMSE: 0.8332
Epoch 12/15 | Train RMSE: 0.8193
Epoch 13/15 | Train RMSE: 0.8041
Epoch 14/15 | Train RMSE: 0.7885
Epoch 15/15 | Train RMSE: 0.7724


In [46]:
reader = Reader(line_format="user item rating timestamp", sep="\t")
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
trainset, testset = surprise_split(data, test_size=0.2, random_state=42)

svd = SVD(n_factors=40)
start = time.time()
svd.fit(trainset)
svd_time = time.time() - start
predictions = svd.test(testset)

svd_rmse = accuracy.rmse(predictions, verbose=False)
svd_mae = accuracy.mae(predictions, verbose=False)

In [47]:
import pandas as pd
comparison_df = pd.DataFrame({
    "Model": ["Custom LFM", "Surprise SVD"],
    "RMSE": [lfm_rmse, svd_rmse],
    "MAE": [lfm_mae, svd_mae],
    "Train Time (s)": [lfm_time, svd_time]
})
print(comparison_df)

          Model      RMSE       MAE  Train Time (s)
0    Custom LFM  0.856297  0.729812       10.904222
1  Surprise SVD  0.930935  0.734509        0.454024


## Рекомендации для пользователя, который любит мультфильмы (Кастомная модель)

In [48]:
df_movies = pd.read_csv("./ml-100k/u.item", sep="|", encoding="latin-1", header=None, usecols=[0, 1], names=["item", "title"])

mults_ids = df_movies[df_movies["title"].str.contains("Toy Story|Lion King|Hercules", case=False)]['item'].values

new_user_id = lfm.user_factors.shape[0]
n_factors = lfm.user_factors.shape[1]

lfm.user_factors = np.vstack([lfm.user_factors, np.random.normal(0, 0.1, n_factors)])
lfm.user_bias = np.append(lfm.user_bias, 0.0)

mults_ratings = [(new_user_id, item_idx, 5.0) for item_idx in mults_ids]

## Обучаем вектор нового пользователя
for _ in range(15):
    for user_idx, item_idx, rating in mults_ratings:
        lfm._update_factors(user_idx, item_idx, rating, learning_rate=0.01, reg=0.05)

print(mults_ids)

[  1  71 993]


In [49]:
all_items = np.arange(lfm.item_factors.shape[0])
items_to_recommend = [i for i in all_items if i not in mults_ids]

predictions = [(item, lfm.predict(new_user_id, item)) for item in items_to_recommend]
top_recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)[:10]


top_titles = df_movies.set_index('item').loc[[item for item, _ in top_recommendations]]['title']
print(top_titles)

item
277                Restoration (1995)
819                      Eddie (1996)
541              Mortal Kombat (1995)
180             Apocalypse Now (1979)
174    Raiders of the Lost Ark (1981)
36                    Mad Love (1995)
239                   Sneakers (1992)
130                Kansas City (1996)
297                Ulee's Gold (1997)
612               Lost Horizon (1937)
Name: title, dtype: object


## Рекомендации для пользователя, который любит мультфильмы (библиотечная модель)

In [50]:
n_factors = svd.n_factors
new_user_vector = np.random.normal(0, 0.1, n_factors)
new_user_bias = 0.0
global_bias = svd.trainset.global_mean

# Достаём item-вектора и смещения из обученной модели
item_factors = svd.qi
item_biases = svd.bi

# Получаем внутренние ID
iid_map = {raw_iid: svd.trainset.to_inner_iid(raw_iid) for raw_iid in mults_ids if raw_iid in svd.trainset._raw2inner_id_items}

# Обучаем вектор нового пользователя на основе любимых фильмов
learning_rate = 0.01
reg = 0.05

for _ in range(15):
    for raw_iid, inner_iid in iid_map.items():
        pred = global_bias + new_user_bias + item_biases[inner_iid] + np.dot(new_user_vector, item_factors[inner_iid])
        err = 5.0 - pred

        new_user_bias += learning_rate * (err - reg * new_user_bias)
        new_user_vector += learning_rate * (err * item_factors[inner_iid] - reg * new_user_vector)

In [51]:
all_inner_iids = svd.trainset.all_items()

predictions = []
for inner_iid in all_inner_iids:
    if inner_iid in iid_map.values():
        continue

    raw_iid = svd.trainset.to_raw_iid(inner_iid)
    score = global_bias + new_user_bias + svd.bi[inner_iid] + np.dot(new_user_vector, svd.qi[inner_iid])
    predictions.append((raw_iid, score))

# Топ-10 фильмов
top_items = sorted(predictions, key=lambda x: x[1], reverse=True)[:10]
top_titles = df_movies.set_index("item").loc[[iid for iid, _ in top_items]]["title"]
print(top_titles)

item
178                 12 Angry Men (1957)
603                  Rear Window (1954)
513               Third Man, The (1949)
318             Schindler's List (1993)
408               Close Shave, A (1995)
50                     Star Wars (1977)
64     Shawshank Redemption, The (1994)
483                   Casablanca (1942)
488                 Sunset Blvd. (1950)
98     Silence of the Lambs, The (1991)
Name: title, dtype: object
