# LightFM comparison in Colab

# Import

In [None]:
# Mount Google Drive so that files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive/')
# Switch the working directory to "My Drive": the relative "processed/..." paths
# read later in the notebook are resolved against it.
%cd /content/drive/My Drive/

import pandas as pd
import numpy as np

! pip install lightfm
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

In [37]:
# Held-out interactions, used only in the evaluation section.
df_test = pd.read_csv("processed/interactions_test.csv")

# Training split: the interaction log plus the user and item tables.
df_train = pd.read_csv("processed/interactions_train.csv")
df_users_train = pd.read_csv("processed/users_train.csv")
df_items_train = pd.read_csv("processed/items_train.csv")

# Pre-encoded feature matrices for users and movies. The arrays are pulled out
# while the archive is still open.
# NOTE(review): row i is presumably aligned with row i of df_users_train /
# df_items_train (later cells index them that way) - confirm against the
# preprocessing step.
with np.load("processed/encoded_data_train.npz") as encoded:
    X_users_train = encoded['X_users_train']
    X_movies_train = encoded['X_movies_train']

# Build the LightFM dataset and train the model

In [38]:
def _active_feature_rows(entity_ids, encoded):
    """Pair every id with the names of its non-zero encoded feature columns.

    Args:
        entity_ids: iterable of raw user or item ids; position ``i`` is matched
            with ``encoded[i]``.
        encoded: indexable collection of feature rows; each row must support
            ``.nonzero()`` returning the non-zero column indices first.

    Returns:
        A list of ``(entity_id, ['feat<col>', ...])`` tuples in input order.
    """
    return [
        (entity_id, [f'feat{col}' for col in encoded[row].nonzero()[0]])
        for row, entity_id in enumerate(entity_ids)
    ]


# Raw ids known at training time; they define the id -> internal index mappings.
users_ids = df_users_train['user_id'].values
items_ids = df_items_train['item_id'].values

# One symbolic feature name per encoded column: feat0, feat1, ...
n_user_columns = X_users_train.shape[1]
n_movie_columns = X_movies_train.shape[1]
users_unique_features = [f'feat{i}' for i in range(n_user_columns)]
movies_unique_features = [f'feat{i}' for i in range(n_movie_columns)]

# Identity features are switched off, so users and items are described only by
# the encoded feature columns registered below.
dataset = Dataset(user_identity_features=False, item_identity_features=False)
dataset.fit(
    users=users_ids,
    items=items_ids,
    user_features=users_unique_features,
    item_features=movies_unique_features,
)

# Training interactions; only the first element returned by build_interactions
# is kept.
train_pairs = tuple(zip(df_train['user_id'], df_train['item_id']))
inters = dataset.build_interactions(train_pairs)[0]

# Only the *names* of the non-zero columns are passed on, so the values stored
# in the encoded matrices are not forwarded - just which features are present.
users_features = dataset.build_user_features(
    _active_feature_rows(df_users_train['user_id'], X_users_train),
    normalize=False,
)

movies_features = dataset.build_item_features(
    _active_feature_rows(df_items_train['item_id'], X_movies_train),
    normalize=False,
)

In [39]:
# Model hyperparameters, one per line so they are easy to tweak and diff.
model = LightFM(
    no_components=64,     # embedding dimensionality
    loss='bpr',
    learning_rate=0.001,
    random_state=42,      # fixed seed
)

In [40]:
# Training-loop settings kept apart from the data arguments.
# The recorded run below took about 12 minutes for the 50 epochs.
fit_options = dict(epochs=50, num_threads=4, verbose=True)

# Left as the last expression on purpose: the cell displays the returned model.
model.fit(
    interactions=inters,
    user_features=users_features,
    item_features=movies_features,
    **fit_options,
)

Epoch: 100%|██████████| 50/50 [12:01<00:00, 14.43s/it]


<lightfm.lightfm.LightFM at 0x7850c0605a80>

# Test

In [41]:
# Keep only test rows whose user AND item are both present in the id lists the
# dataset was fitted on; other ids have no internal index.
seen_user = df_test['user_id'].isin(users_ids)
seen_item = df_test['item_id'].isin(items_ids)
df_test = df_test[seen_user & seen_item]

In [42]:
# (user_id, item_id) pairs of the filtered test set, converted with the same
# fitted dataset that produced the training interactions.
test_pairs = tuple(zip(df_test['user_id'], df_test['item_id']))
test_inters = dataset.build_interactions(test_pairs)[0]

In [43]:
# Both metrics are computed with exactly the same inputs, so the arguments are
# defined once and shared.
# NOTE(review): train_interactions is not passed, so items a user already
# interacted with in training are presumably not excluded from the ranking -
# confirm this is intended.
eval_kwargs = dict(
    test_interactions=test_inters,
    user_features=users_features,
    item_features=movies_features,
    k=20,
    num_threads=4,
)

precision = precision_at_k(model, **eval_kwargs)
recall = recall_at_k(model, **eval_kwargs)

In [113]:
# Average the per-user scores into a single (precision@20, recall@20) pair.
mean_precision = precision.mean()
mean_recall = recall.mean()
mean_precision, mean_recall

(0.0006062308, 0.006458138132033284)

In [47]:
# Example user whose watch history and recommendations are inspected below.
user_id = 148609

# Show the user's training interactions enriched with item metadata.
# The join key is spelled out: without `on=`, merge joins on every column name
# the two frames happen to share, which silently changes the result if either
# table gains an overlapping column.
user_history = df_train[df_train['user_id'] == user_id]
user_history.merge(df_items_train, on='item_id')

Unnamed: 0,user_id,item_id,last_watch_dt,watched_pct,content_type,title,genres,age_rating,keywords,views
0,148609,12395,2021-03-23,100.0,film,"Беги, мальчик, беги","боевики, драмы, биография, военные",16,"вторая мировая война, варшавское гетто, дети, ...",100
1,148609,11885,2021-04-01,100.0,film,Хан Соло: Звёздные войны. Истории,"боевики, фантастика, приключения",12,"космический корабль, приквел, спин-офф, контра...",523
2,148609,13713,2021-04-01,0.0,film,Звёздные войны: Скрытая угроза,"боевики, фантастика, приключения",0,"пророчество, сенат, королева, надзиратель, гал...",362
3,148609,13980,2021-04-11,100.0,film,Изгой-один: Звёздные войны. Истории.,"боевики, фантастика, приключения",16,"бунтарь, космический корабль, космическое сраж...",362


In [51]:
# The first element of dataset.mapping() translates raw user ids into the
# internal indices used by the model.
user_id_to_idx = dataset.mapping()[0]
user_idx = user_id_to_idx[user_id]

In [101]:
# Score every known item for the selected user.
#
# The scalar internal user index is passed together with the full user-feature
# matrix. This scores the same (user, item) pairs as the previous version, which
# built an n_items-row copy of one feature row and used float64 zeros both as
# user ids and as a sparse row index (working only through implicit casts).
n_items = len(items_ids)
recommendations = model.predict(
    user_ids=int(user_idx),
    item_ids=np.arange(n_items, dtype=np.int32),
    user_features=users_features,
    item_features=movies_features,
)

# Internal indices of the 20 highest-scoring items, best first.
top_items = np.argsort(-recommendations)[:20]

In [112]:
# Third element of dataset.mapping(): raw item id -> internal item index.
item_id_to_internal = dataset.mapping()[2]

# Invert it so internal indices returned by the model map back to raw item ids.
movies_idx_to_id = {
    internal_idx: raw_id for raw_id, internal_idx in item_id_to_internal.items()
}

# Raw item id -> index label of its row in df_items_train.
# NOTE(review): the labels are later used positionally with .iloc, which
# assumes df_items_train still has its default 0..n-1 index - confirm.
movies_id_to_df_idx = {
    raw_id: row_label for row_label, raw_id in df_items_train['item_id'].items()
}

internal_to_raw = np.vectorize(movies_idx_to_id.get)
raw_to_row = np.vectorize(movies_id_to_df_idx.get)

top_items_id = internal_to_raw(top_items)
rec_df_idx = raw_to_row(top_items_id)

# Metadata of the recommended items in ranking order; preview the first rows.
recommended_items = df_items_train.iloc[rec_df_idx].reset_index(drop=True)
recommended_items.head()

Unnamed: 0,item_id,content_type,title,genres,age_rating,keywords,views
0,5136,film,Хозяин морей: На краю Земли,"историческое, приключения, драмы, военные, бое...",12,"по роману или книге, хирург, военно-морской фл...",145
1,10219,film,Железный рыцарь 2,"историческое, приключения, драмы, военные, бое...",16,"замок, продолжение, бой на мечах, средневековь...",767
2,12995,film,Восемь сотен,"боевики, драмы, историческое, военные",18,"осада, китайско-японская война, 2020, китай, в...",5036
3,2581,film,Не брать живым,"боевики, драмы, триллеры, военные",16,"война, солдат, иракская война, оружие массовог...",4
4,12546,film,Патриот,"боевики, драмы, историческое, военные",16,"Американская революция, Битва, Ветеран, Герой,...",88
