In [1]:
# !pip install implicit

import numpy as np
import pandas as pd

from utils.data_transformation import get_train_csr_matrix, get_test_warm_data
from utils.evaluation import compute_metrics

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import precision_at_k, mean_average_precision_at_k

In [2]:
# Parameters handed to get_train_csr_matrix; train_ratio is reused later
# to compute the size of the training slice of the interactions frame.
train_ratio = 0.8
watched_pct_threshold = 50
user_movies, users_rev_mapping, movies_rev_mapping = get_train_csr_matrix(
    train_ratio, watched_pct_threshold
)

# Fixed random_state so that repeated runs fit the same factorisation.
als_model = AlternatingLeastSquares(
    factors=50,
    regularization=0.05,
    iterations=30,
    random_state=42,
)
als_model.fit(user_movies)

  0%|          | 0/30 [00:00<?, ?it/s]

In [3]:
df_inter = pd.read_csv('processed_data/interactions.csv')
df_items = pd.read_csv('processed_data/items.csv')

# Number of leading interaction rows that belong to the training split.
train_size = int(train_ratio * len(df_inter))

# Invert the index -> id mappings returned with the training matrix,
# giving id -> matrix index lookups.
users_mapping = {raw_user: row for row, raw_user in users_rev_mapping.items()}
movies_mapping = {raw_movie: col for col, raw_movie in movies_rev_mapping.items()}

# item_id -> index label of that item's row in df_items.
id_df_idx_mapping = dict(zip(df_items['item_id'], df_items.index))

### Example: generating recommendations with AlternatingLeastSquares

In [4]:
# Pick the user behind the training-slice interaction labelled 12321 and
# show everything that user interacted with, joined with item metadata.
user_id = df_inter.iloc[:train_size].at[12321, 'user_id']
df_inter.loc[df_inter['user_id'] == user_id].merge(df_items)

Unnamed: 0,user_id,item_id,last_watch_dt,watched_pct,content_type,title,genres,age_rating,keywords
0,11420,9342,2021-03-13,85.0,film,Дэдпул,"боевики, фантастика, приключения, комедии",18,"антигерой, наемник, супергерой, основанный на ..."
1,11420,12849,2021-03-13,38.0,film,Первый мститель,"боевики, фантастика, приключения",12,"нью-йорк, вторая мировая война, нацисты, супер..."
2,11420,3095,2021-03-13,0.0,film,Тор: Царство тьмы,"боевики, фэнтези, приключения",12,"супергерой, основанный на комическом, скандина..."
3,11420,2280,2021-05-05,25.0,series,На безымянной высоте,"боевики, историческое, военные",16,
4,11420,3734,2021-07-03,0.0,film,Прабабушка легкого поведения,комедии,16,", 2021, россия, прабабушка, легкого, поведения"


In [5]:
test_user_idx = users_mapping[user_id]
user_history_row = user_movies[test_user_idx]
rec_movies_idx = als_model.recommend(test_user_idx, user_history_row, N=20)[0]

# Translate each recommended matrix column index to its item_id and then to
# the matching row of df_items, in a single pass.
rec_movies_id = np.array([
    id_df_idx_mapping.get(movies_rev_mapping.get(col))
    for col in np.asarray(rec_movies_idx).tolist()
])
df_items.iloc[rec_movies_id].head().reset_index(drop=True)

Unnamed: 0,item_id,content_type,title,genres,age_rating,keywords
6037,14317,film,Веном,"популярное, фантастика, триллеры, боевики, ужасы",16,"Сан-Франциско, Калифорния, космический корабль..."
8762,11754,film,Kingsman: Секретная служба,"боевики, криминал, приключения, комедии",18,"шпион, великобритания, секретная организация, ..."
1547,10942,film,Мстители,"боевики, фантастика, фэнтези, приключения",12,"нью-йорк, щит, супергерой, основанный на комик..."
2638,12841,film,Стражи Галактики,"боевики, фантастика, приключения, комедии",12,"космический корабль, основанный на комиксе, ко..."
11782,7210,film,Тор: Рагнарёк,"приключения, фантастика, боевики, фэнтези, ком...",16,"продолжение, супергерой, основанный на комиксе..."


### Metrics

In [6]:
test_data = get_test_warm_data(train_ratio, watch_threshold=5)

# Ground truth per user: matrix row index -> set of matrix column indices of
# the items that user interacted with in the test data.
# The loop variables are deliberately NOT called `user_id`/`item_id`, so the
# example `user_id` selected earlier in the notebook is not overwritten.
test_interactions = {}
for test_user_id, test_item_id in test_data[['user_id', 'item_id']].values:
    test_user_idx = users_mapping[test_user_id]
    test_interactions.setdefault(test_user_idx, set()).add(movies_mapping[test_item_id])

# Top-20 recommendations for every test user in one batched call; element [0]
# of the result holds the recommended item indices.
users_idx = np.array(list(test_interactions.keys()))
recommendations = als_model.recommend(users_idx, user_movies[users_idx], N=20)[0]

# `mean_ap` rather than `map`, so the builtin map() stays usable afterwards.
precision, recall, mean_ap = compute_metrics(users_idx, test_interactions, recommendations)
precision, recall, mean_ap

(0.01634743367094225, 0.07352864974885133, 0.004866656530971885)

In [35]:
# NOTE(review): the saved execution counts suggest this cell was run in a
# different session than the metric cells around it — confirm whether the
# reported metrics are meant to be computed on this filtered test set.

# Training interactions the user watched more than 90% of count as "liked".
train_data = df_inter[:train_size].copy()
train_data['liked'] = train_data['watched_pct'] > 90
train_data = train_data[train_data['liked']]

# Liked-interaction count per user, and the set of items seen in training.
train_count = train_data['user_id'].value_counts().to_dict()
movies = set(df_inter[:train_size]['item_id'].unique())

# Keep a test interaction only if its user has at least 5 liked training
# interactions and its item occurs in the training slice. Built with
# vectorised Series operations instead of a row-wise apply: same mask, but no
# per-row Python call, and still a boolean Series when test_data is empty.
has_enough_likes = test_data['user_id'].map(train_count).fillna(0).ge(5)
item_seen_in_train = test_data['item_id'].isin(movies)
test_mask = has_enough_likes & item_seen_in_train

test_data = test_data[test_mask]

In [36]:
# Number of distinct users left in the test data.
test_data['user_id'].nunique()

18331

In [39]:
# Dimensions of the training matrix; rows are indexed by user index and
# columns by movie index elsewhere in this notebook.
user_movies.shape

(634181, 14548)

In [7]:
# Coordinates of every test interaction in the model's index space.
users = test_data['user_id'].map(users_mapping)
movies = test_data['item_id'].map(movies_mapping)

# One unit entry per test interaction, in a matrix with one row per mapped
# user and one column per mapped movie.
interaction_values = np.ones(len(test_data))
matrix_shape = (len(users_mapping), len(movies_mapping))
test_user_movies = csr_matrix((interaction_values, (users, movies)), shape=matrix_shape)

In [8]:
# Precision@20 and MAP@20 from implicit's own evaluators, which take the
# fitted model plus the train and test user-item matrices.
# The MAP value is stored as `mean_ap` so the builtin map() is not shadowed.
precision = precision_at_k(als_model, user_movies, test_user_movies, K=20)
mean_ap = mean_average_precision_at_k(als_model, user_movies, test_user_movies, K=20)

precision, mean_ap

  0%|          | 0/78887 [00:00<?, ?it/s]

  0%|          | 0/78887 [00:00<?, ?it/s]

(0.07610706687400964, 0.01964516917255615)