# Initialization

In [1]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

# Загрузка данных

In [3]:
items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

# Разбиение с учётом хронологии

Рекомендательные системы на практике работают с учётом хронологии. Поэтому поток событий для тренировки и валидации полезно делить на то, что уже случилось, и что ещё случится. Это позволяет проводить валидацию на тех же пользователях, на которых тренировались, но на их событиях в будущем.

In [4]:
# разобьем на тест и трейн по дате 2017-08-01

# test_after_date = pd.to_datetime("2017-08-01").date()
# events = events.query("started_at < @test_after_date and read_at < @test_after_date").copy()

# зададим точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date
# events_train = events.query("started_at < @train_test_global_time_split_date").copy() # ваш код здесь #
events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]

# количество пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test =  events_test["user_id"].drop_duplicates() # ваш код здесь #
# количество пользователей, которые есть и в train, и в test
common_users = set(users_train).intersection(set(users_test)) # ваш код здесь #

print(len(users_train), len(users_test), len(common_users))

428220 123223 120858


# === Знакомство: "холодный" старт

# === Знакомство: первые персональные рекомендации

# === Базовые подходы: коллаборативная фильтрация

In [5]:
import scipy
import sklearn.preprocessing

# перекодируем идентификаторы пользователей: 
# из имеющихся в последовательность 0, 1, 2, ...
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])

# перекодируем идентификаторы объектов: 
# из имеющихся в последовательность 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["item_id"])
items["item_id_enc"] = item_encoder.transform(items["item_id"])
events_train["item_id_enc"] = item_encoder.transform(events_train["item_id"]) # ваш код здесь #
events_test["item_id_enc"] = item_encoder.transform(events_test["item_id"])# ваш код здесь #

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["user_id_enc"] = user_encoder.transform(events_train["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["item_id_enc"] = item_encoder.transfor

In [6]:
events_train

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_new,user_id_enc,item_id_enc
6679625,00000377eea48021d3002730d56aca9a,11012,2015-12-05,2015-12-11,True,4,False,2015-12-01,1000000,0,1303
6679617,00000377eea48021d3002730d56aca9a,4671,2014-06-05,2014-06-30,True,5,False,2014-06-01,1000000,0,493
6679618,00000377eea48021d3002730d56aca9a,5,2012-10-02,2012-10-24,True,5,False,2012-10-01,1000000,0,3
6679620,00000377eea48021d3002730d56aca9a,2,2009-07-12,2009-07-29,True,5,False,2009-07-01,1000000,0,1
6679621,00000377eea48021d3002730d56aca9a,14497,2016-05-09,2016-06-02,True,5,False,2016-05-01,1000000,0,1808
...,...,...,...,...,...,...,...,...,...,...,...
5625381,fffff8a718843c0e11dfd93fb41c1297,29056083,2016-08-01,2016-08-03,True,3,True,2016-08-01,1430584,430584,41809
5625379,fffff8a718843c0e11dfd93fb41c1297,25111004,2016-12-08,2016-12-08,True,5,False,2016-12-01,1430584,430584,40432
5625378,fffff8a718843c0e11dfd93fb41c1297,6606855,2017-03-01,2017-03-01,True,3,False,2017-03-01,1430584,430584,24391
5625377,fffff8a718843c0e11dfd93fb41c1297,18812405,2017-05-05,2017-05-31,True,3,True,2017-05-01,1430584,430584,37138


In [7]:
events_train['item_id_enc'].max()

43304

Вычислите размер матрицы user_item_matrix_train, как если бы она хранила все свои элементы, включая пропуски, и для каждого элемента использовался бы один байт. Ответ приведите в виде целого числа гигабайтов, отбросив дробную часть.
Подсказка: 
Умножьте количество строк на количество столбцов, а затем результат разделите на 1024^3.

In [8]:
len(events_train.user_id.unique())

428220

In [9]:
len(events_train.item_id.unique())

41474

In [10]:
( len(events_train.item_id.unique()) * len(events_train.user_id.unique()) ) / (1024**3)

16.54028546065092

In [11]:
1024^3

1027

In [12]:
len(events_train.item_id.unique()) * len(events_train.user_id.unique())

17759996280

In [13]:
# создаём sparse-матрицу формата CSR 
user_item_matrix_train = scipy.sparse.csr_matrix((
    events_train["rating"],
    (events_train['user_id_enc'], events_train['item_id_enc'])),
    dtype=np.int8)

In [14]:
import sys

sum([sys.getsizeof(i) for i in user_item_matrix_train.data])/1024**3

0.26370687410235405

Выполняем код для тренировки ALS модели

In [15]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train)

  from .autonotebook import tqdm as notebook_tqdm
  check_blas_config()
100%|██████████| 50/50 [02:56<00:00,  3.54s/it]


In [16]:
def get_recommendations_als(user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=True, n=5):
    """
    Возвращает отранжированные рекомендации для заданного пользователя
    """
    user_id_enc = user_encoder.transform([user_id])[0]
    recommendations = model.recommend(
         user_id_enc, 
         user_item_matrix[user_id_enc], 
         filter_already_liked_items=not include_seen,
         N=n)
    recommendations = pd.DataFrame({"item_id_enc": recommendations[0], "score": recommendations[1]})
    recommendations["item_id"] = item_encoder.inverse_transform(recommendations["item_id_enc"])
    
    return recommendations

In [17]:
# получаем список всех возможных user_id (перекодированных)
user_ids_encoded = range(len(user_encoder.classes_))

# получаем рекомендации для всех пользователей
als_recommendations = als_model.recommend(
    user_ids_encoded, 
    user_item_matrix_train[user_ids_encoded], 
    filter_already_liked_items=False, N=100)

In [18]:
als_recommendations

(array([[    2,  1942,     3, ..., 28836, 30688, 10393],
        [31432, 29792, 36956, ...,   533, 32060, 34554],
        [35810, 33276, 37255, ..., 31562, 41459,  1043],
        ...,
        [20997, 20386, 23004, ...,  2293, 28200, 29560],
        [22844, 28025, 37138, ..., 37914,   422,  4112],
        [41809, 34434, 35669, ..., 33675, 28263, 22072]], dtype=int32),
 array([[0.99094146, 0.89661723, 0.8644041 , ..., 0.2261226 , 0.22548363,
         0.22546645],
        [0.674292  , 0.6229848 , 0.49019852, ..., 0.02235501, 0.02226192,
         0.02225844],
        [0.24119437, 0.22116913, 0.18066649, ..., 0.04201685, 0.04178948,
         0.04172034],
        ...,
        [0.23566297, 0.23407641, 0.22276123, ..., 0.02843785, 0.02830932,
         0.02820013],
        [0.05539129, 0.03866215, 0.03835723, ..., 0.01568658, 0.01557466,
         0.01546565],
        [0.47294533, 0.46393558, 0.4604288 , ..., 0.09494869, 0.09492695,
         0.09303415]], dtype=float32))

In [19]:
# преобразуем полученные рекомендации в табличный формат
item_ids_enc = als_recommendations[0]
als_scores = als_recommendations[1]

als_recommendations = pd.DataFrame({
    "user_id_enc": user_ids_encoded,
    "item_id_enc": item_ids_enc.tolist(), 
    "score": als_scores.tolist()})
als_recommendations = als_recommendations.explode(["item_id_enc", "score"], ignore_index=True)

# приводим типы данных
als_recommendations["item_id_enc"] = als_recommendations["item_id_enc"].astype("int")
als_recommendations["score"] = als_recommendations["score"].astype("float")

# получаем изначальные идентификаторы
als_recommendations["user_id"] = user_encoder.inverse_transform(als_recommendations["user_id_enc"])
als_recommendations["item_id"] = item_encoder.inverse_transform(als_recommendations["item_id_enc"])
als_recommendations = als_recommendations.drop(columns=["user_id_enc", "item_id_enc"])

Сохраним полученные рекомендации в файл, они ещё нам пригодятся.

In [20]:
als_recommendations = als_recommendations[["user_id", "item_id", "score"]]
# als_recommendations.to_parquet("als_recommendations.parquet")

In [21]:
als_recommendations

Unnamed: 0,user_id,item_id,score
0,00000377eea48021d3002730d56aca9a,3,0.990941
1,00000377eea48021d3002730d56aca9a,15881,0.896617
2,00000377eea48021d3002730d56aca9a,5,0.864404
3,00000377eea48021d3002730d56aca9a,6,0.822254
4,00000377eea48021d3002730d56aca9a,2,0.774095
...,...,...,...
43058495,fffff8a718843c0e11dfd93fb41c1297,13206900,0.096082
43058496,fffff8a718843c0e11dfd93fb41c1297,5060378,0.096065
43058497,fffff8a718843c0e11dfd93fb41c1297,16071764,0.094949
43058498,fffff8a718843c0e11dfd93fb41c1297,9969571,0.094927


### Метрики

Score от ALS не лежат на той же шкале, что и пользовательские оценки. Сравнивать исходные и новые оценки напрямую — некорректно. Поэтому посчитать метрики MAE, RMSE проблематично. Вместо них можно использовать метрики ранжирования. Они сравнивают не абсолютные значения рейтингов и их оценок, а соответствие порядков. Метрики ранжирования покажут, насколько порядок рекомендаций по убыванию score соответствует порядку объектов по убыванию пользовательских оценок. 
На практике часто используют метрику NDCG, она принимает значение от 0 (предлагаемый порядок никак не соответствует истинному) до 1 (предлагаемый порядок в точности соответствует истинному). 

Для удобства оценки добавим в датафрейм с рекомендациями истинные оценки из тестовой выборки:

In [22]:
als_recommendations = (
    als_recommendations
    .merge(events_test[["user_id", "item_id", "rating"]]
               .rename(columns={"rating": "rating_test"}), 
           on=["user_id", "item_id"], how="left")
)

In [23]:
als_recommendations

Unnamed: 0,user_id,item_id,score,rating_test
0,00000377eea48021d3002730d56aca9a,3,0.990941,
1,00000377eea48021d3002730d56aca9a,15881,0.896617,
2,00000377eea48021d3002730d56aca9a,5,0.864404,
3,00000377eea48021d3002730d56aca9a,6,0.822254,
4,00000377eea48021d3002730d56aca9a,2,0.774095,
...,...,...,...,...
43058495,fffff8a718843c0e11dfd93fb41c1297,13206900,0.096082,
43058496,fffff8a718843c0e11dfd93fb41c1297,5060378,0.096065,
43058497,fffff8a718843c0e11dfd93fb41c1297,16071764,0.094949,
43058498,fffff8a718843c0e11dfd93fb41c1297,9969571,0.094927,


Подсчитать метрику NDCG для одного пользователя поможет готовая реализация из scikit-learn:

In [24]:
import sklearn.metrics

def compute_ndcg(rating: pd.Series, score: pd.Series, k):

    """ подсчёт ndcg
    rating: истинные оценки
    score: оценки модели
    k: количество айтемов (по убыванию score) для оценки, остальные - отбрасываются
    """
    
    # если кол-во объектов меньше 2, то NDCG - не определена
    if len(rating) < 2:
        return np.nan

    ndcg = sklearn.metrics.ndcg_score(np.asarray([rating.to_numpy()]), np.asarray([score.to_numpy()]), k=k)

    return ndcg

In [25]:
rating_test_idx = ~als_recommendations["rating_test"].isnull()
ndcg_at_5_scores = als_recommendations[rating_test_idx].groupby("user_id").apply(lambda x: compute_ndcg(x["rating_test"], x["score"], k=5))

In [26]:
ndcg_at_5_scores

user_id
00014c578111090720e20f5705eba051    1.0
000157a6f8331e9c9a21252e1fee91d1    NaN
0003f216823de684cea464170efe1d42    NaN
00048fe3297cbb92a6e9fb78a6dce421    NaN
0004ce6c7cde7ce6f6e3d1c982d6d706    1.0
                                   ... 
fffc9cfe8fd818f574c8c219b93274c0    NaN
fffdbe24990b7e9e78653f97fc8cecd1    1.0
fffe5352bfcdc38a3fb70f41e2ba7e5b    NaN
ffff601c0ffa34bd5ffbbf2caee30644    1.0
fffff8a718843c0e11dfd93fb41c1297    NaN
Length: 48135, dtype: float64

In [27]:
# print(ndcg_at_5_scores.mean()) 

In [28]:
round(ndcg_at_5_scores.mean(), 2)

0.98

# === Базовые подходы: контентные рекомендации

In [29]:
items["genre_and_votes"] = items["genre_and_votes"].apply(eval)

In [30]:
items["genre_and_votes"]

3          {'Womens Fiction-Chick Lit': 739, 'Fiction': 442}
6                                {'Politics': 1, 'Humor': 1}
15         {'Christian': 395, 'Nonfiction': 392, 'Religio...
16         {'Christian': 225, 'Religion-Theology': 154, '...
17         {'Historical-Historical Fiction': 284, 'Childr...
                                 ...                        
2360257    {'Plays': 294, 'Historical-Historical Fiction'...
2360258    {'Sequential Art-Comics': 683, 'Sequential Art...
2360322    {'Romance-Paranormal Romance': 703, 'Fantasy-P...
2360395    {'Fiction': 4, 'Romance': 3, 'Business-Amazon'...
2360448    {'Fantasy': 33, 'Young Adult': 16, 'Horror': 1...
Name: genre_and_votes, Length: 43312, dtype: object

In [31]:
# items

Теперь составьте список жанров с долями голосов по ним в genres.  Дополните и выполните следующий код:

In [32]:
def get_genres(items):

    """ 
    извлекает список жанров по всем книгам, 
    подсчитывает долю голосов по каждому их них
    """
    
    genres_counter = {}
    
    for k, v, in items.iterrows():
        genre_and_votes = v['genre_and_votes']# ваш код здесь #
        if genre_and_votes is None or not isinstance(genre_and_votes, dict):
            continue
        for genre, votes in genre_and_votes.items():
            # увеличиваем счётчик жанров
            try:
                genres_counter[genre] += votes # ваш код здесь #
            except KeyError:
                genres_counter[genre] = 0

    genres = pd.Series(genres_counter, name="votes")
    genres = genres.to_frame()
    genres = genres.reset_index().rename(columns={"index": "name"})
    genres.index.name = "genre_id"
    
    return genres
   
genres = get_genres(items)

In [33]:
# def get_genres(items):

#     """
#     извлекает список жанров по всем книгам,
#     подсчитывает долю голосов по каждому их них
#     """

#     genres_counter = {}
#     for k, v, in items.iterrows():
#         print(f'k is {k}')
#         # print(f'v is {v}')
#         genre_and_votes = v['genre_and_votes'] # ваш код здесь #
#         print(genre_and_votes)
#         if genre_and_votes is None or not isinstance(genre_and_votes, dict):
#             print('genre_and_votes is None or not isinstance(genre_and_votes, dict)')
#             # continue
#         for genre, votes in genre_and_votes.items():
#             # увеличиваем счётчик жанров
#             try:
#                 genres_counter[genre] += votes # ваш код здесь #
#                 print('votes are plused')
#             except KeyError:
#                 genres_counter[genre] = 0
#                 print('genres_counter[genre] = 0')
  
#     genres = pd.Series(genres_counter, name="votes")
#     genres = genres.to_frame()
#     genres = genres.reset_index().rename(columns={"index": "name"})
#     genres.index.name = "genre_id"

#     return genres

# genres = get_genres(items)

In [34]:
genres

Unnamed: 0_level_0,name,votes
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Womens Fiction-Chick Lit,254558
1,Fiction,6406256
2,Politics,103296
3,Humor,304302
4,Christian,105273
...,...,...
810,German History-Nazi Party,0
811,Favorites,0
812,History-Latin American History,0
813,Cryptids-Bigfoot,0


In [35]:
# for k, v, in items.head(2).iterrows():
#     print('k is', k)
#     print('---')
#     print('v is', v)
#     print('-------')

In [36]:
items.head(2)

Unnamed: 0,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,text_reviews_count,publisher,publication_year,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,item_id_enc
3,6066819,Jennifer Weiner,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,"{'Womens Fiction-Chick Lit': 739, 'Fiction': 442}",368.0,3.49,51184,3282,Atria Books,2009,US,eng,Hardcover,False,743294297,9780743294294,"{'Academic': None, 'Academic-Academia': None, ...","Womens Fiction-Chick Lit 739, Fiction 442",23133
6,378460,Michael Halberstam,The Wanting of Levine,,"{'Politics': 1, 'Humor': 1}",,4.38,12,4,Berkley Publishing Group,1979,US,,Paperback,False,425040887,9780425040881,"{'Academic': None, 'Academic-Academia': None, ...","Politics 1user, Humor 1user",12687


In [37]:
items.loc[3, 'genre_and_votes_dict']

{'Academic': None,
 'Academic-Academia': None,
 'Academic-College': None,
 'Academic-Grad School': None,
 'Academic-Read For School': None,
 'Academic-School': None,
 'Academic-Students': None,
 'Academic-Teachers': None,
 'Action': None,
 'Adolescence': None,
 'Adult': None,
 'Adult Fiction': None,
 'Adult Fiction-Erotica': None,
 'Adventure': None,
 'Adventure-Maritime': None,
 'Adventure-Pirates': None,
 'Adventure-Survival': None,
 'Aeroplanes': None,
 'Africa-Eastern Africa': None,
 'Africa-Western Africa': None,
 'African Literature-Egyptian Literature': None,
 'Alcohol-Beer': None,
 'Alcohol-Booze': None,
 'Alcohol-Cocktails': None,
 'Alcohol-Wine': None,
 'American History-American Civil War': None,
 'American Revolution-American Revolutionary War': None,
 'American-African American Literature': None,
 'American-American Classics': None,
 'American-American Fiction': None,
 'American-Americana': None,
 'American-Southern': None,
 'Amish': None,
 'Anarchism': None,
 'Animals': N

In [38]:
# genres

Результат выполнения кода — список жанров с долями голосов по ним в genres. Посмотрим на самые популярные жанры:

In [39]:
genres["score"] = genres["votes"] / genres["votes"].sum()
genres.sort_values(by="score", ascending=False).head(10)

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25,Fantasy,6850060,0.149651
1,Fiction,6406256,0.139955
38,Classics,3414934,0.074605
18,Young Adult,3296951,0.072027
34,Romance,2422614,0.052926
5,Nonfiction,1737406,0.037957
16,Historical-Historical Fiction,1531205,0.033452
20,Mystery,1371196,0.029956
24,Science Fiction,1218917,0.026629
33,Fantasy-Paranormal,857012,0.018723


Матрицы

In [40]:
def get_item2genre_matrix(genres, items):

    genre_names_to_id = genres.reset_index().set_index("name")["genre_id"].to_dict()
    
    # list to build CSR matrix
    genres_csr_data = []
    genres_csr_row_idx = []
    genres_csr_col_idx = []
    
    for item_idx, (k, v) in enumerate(items.iterrows()):
        if v["genre_and_votes"] is None:
            continue
        for genre_name, votes in v["genre_and_votes"].items():
            genre_idx = genre_names_to_id[genre_name]
            genres_csr_data.append(int(votes))
            genres_csr_row_idx.append(item_idx)
            genres_csr_col_idx.append(genre_idx)

    genres_csr = scipy.sparse.csr_matrix((genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)), shape=(len(items), len(genres)))
    # нормализуем, чтобы сумма оценок принадлежности к жанру была равна 1
    genres_csr = sklearn.preprocessing.normalize(genres_csr, norm='l1', axis=1)
    
    return genres_csr

In [41]:
items = items.sort_values(by="item_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items)

In [42]:
user_id = 1000010
user_events = events_train.query("user_id_new == @user_id")[["item_id", "rating"]]
user_items = items[items["item_id"].isin(user_events["item_id"])]

user_items_genres_csr = get_item2genre_matrix(genres, user_items) # ваш код здесь #)
user_items_genres_csr

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 149 stored elements and shape (22, 815)>

In [43]:
# вычислим склонность пользователя к жанрам как среднее взвешенное значение популяции на его оценки книг.

# преобразуем пользовательские оценки из списка в вектор-столбец
user_ratings = user_events["rating"].to_numpy() / 5
user_ratings = np.expand_dims(user_ratings, axis=1)

user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0))

In [44]:
# выведем список жанров, которые предпочитает пользователь

user_genres = genres.copy()
user_genres["score"] = np.ravel(user_genres_scores)
user_genres = user_genres[user_genres["score"] > 0].sort_values(by=["score"], ascending=False)

user_genres.head(5)

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Fiction,6406256,0.195253
38,Classics,3414934,0.096687
25,Fantasy,6850060,0.074261
24,Science Fiction,1218917,0.045902
5,Nonfiction,1737406,0.044359


### Задание 4

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

# вычисляем сходство между вектором пользователя и векторами по книгам
similarity_scores = cosine_similarity(all_items_genres_csr, user_genres_scores)

# преобразуем в одномерный массив
similarity_scores = similarity_scores.flatten()



In [46]:
similarity_scores

array([0.51328292, 0.50496693, 0.53061606, ..., 0.67123638, 0.03087739,
       0.30265991])

In [47]:
similarity_scores_sorted = np.argsort(similarity_scores)
similarity_scores_sorted

array([18326, 38646, 18386, ..., 14087,  1120,  4471])

In [48]:
similarity_scores_sorted[-5:]

array([ 1988,  4460, 14087,  1120,  4471])

In [49]:
similarity_scores[similarity_scores_sorted[:5]]

array([0., 0., 0., 0., 0.])

In [50]:
similarity_scores[similarity_scores_sorted]

array([0.        , 0.        , 0.        , ..., 0.92066604, 0.92702526,
       0.94333945])

In [51]:
# получаем индексы top-k (по убыванию значений), по сути, индексы книг (encoded)

k = 5
top_k_indices = similarity_scores[similarity_scores_sorted[:k]] # ваш код здесь #

In [52]:
selected_items = items[items["item_id_enc"].isin(top_k_indices)]

with pd.option_context("max_colwidth", 100):
   display(selected_items[["author", "title", "genre_and_votes"]])

Unnamed: 0,author,title,genre_and_votes
861044,J.K. Rowling,"Harry Potter and the Half-Blood Prince (Harry Potter, #6)","{'Fantasy': 46400, 'Young Adult': 15083, 'Fiction': 13083, 'Fantasy-Magic': 3815, 'Childrens': 2..."


# === Базовые подходы: валидация

In [53]:
def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None):

    """
    размечает пары <user_id, item_id> для общего множества пользователей признаками
    - gt (ground truth)
    - pr (prediction)
    top_k: расчёт ведётся только для top k-рекомендаций
    """

    events_test["gt"] = True
    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    print(f"Common users: {len(common_users)}")
    
    events_for_common_users = events_test[events_test["user_id"].isin(common_users)].copy()
    recs_for_common_users = recs[recs["user_id"].isin(common_users)].copy()

    recs_for_common_users = recs_for_common_users.sort_values(["user_id", "score"], ascending=[True, False])

    # оставляет только те item_id, которые были в events_train, 
    # т. к. модель не имела никакой возможности давать рекомендации для новых айтемов
    events_for_common_users = events_for_common_users[events_for_common_users["item_id"].isin(events_train["item_id"].unique())]

    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)
    
    events_recs_common = events_for_common_users[["user_id", "item_id", "gt"]].merge(
        recs_for_common_users[["user_id", "item_id", "score"]], 
        on=["user_id", "item_id"], how="outer")    

    events_recs_common["gt"] = events_recs_common["gt"].fillna(False)
    events_recs_common["pr"] = ~events_recs_common["score"].isnull()
    
    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common

In [54]:
events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
  events_train,
    events_test, 
    als_recommendations, 
    top_k=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


Common users: 123223


KeyboardInterrupt: 

In [None]:
def compute_cls_metrics(events_recs_for_binary_metric):
    
    groupper = events_recs_for_binary_metric.groupby("user_id")

    # precision = tp / (tp + fp)
    precision = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fp"].sum())
    precision = precision.fillna(0).mean()
    
    # recall = tp / (tp + fn)
    recall = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fn"].sum()) # ваш код здесь #
    recall = recall.fillna(0).mean()

    return precision, recall

In [None]:
precision_5, recall_5 = compute_cls_metrics(events_recs_for_binary_metrics)

In [None]:
precision_5

0.007581376853347184

In [None]:
recall_5

0.014121568795222568

In [None]:
round(recall_5,3)

0.014

In [None]:
events_recs_for_binary_metrics_10 = process_events_recs_for_binary_metrics(
  events_train,
    events_test, 
    als_recommendations, 
    top_k=10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


Common users: 123223


In [None]:
precision_10, recall_10 = compute_cls_metrics(events_recs_for_binary_metrics_10)

In [None]:
precision_10

0.008732947582837622

In [None]:
recall_10

0.03130238527136974

# === Двухстадийный подход: метрики

Для рекомендаций, сохранённых в переменной als_recommendations, посчитайте покрытие по объектам согласно формуле выше. При этом используйте весь топ-100 рекомендаций.

In [None]:
# als_recommendations = pd.read_parquet("als_recommendations.parquet")

In [None]:
len(als_recommendations.item_id.unique()) / len(items.item_id.unique())

0.09362301440709273

In [None]:
# расчёт покрытия по объектам
cov_items = len(als_recommendations.item_id.unique()) / len(items.item_id.unique()) # ваш код здесь #
print(f"{cov_items:.2f}")

0.09


In [None]:
als_recommendations_top100 = als_recommendations.head(100)

In [None]:
len(als_recommendations_top100.item_id.unique()) / len(items.item_id.unique())

0.0023088289619504986

In [None]:
# recs_dict = {}
# for item in als_recommendations_top100.item_id.unique():
#     recs_dict

In [None]:
# set(als_recommendations_top100.item_id.unique())

In [None]:
# als_recommendations

Unnamed: 0,user_id,item_id,score
0,00000377eea48021d3002730d56aca9a,3,0.990941
1,00000377eea48021d3002730d56aca9a,15881,0.896617
2,00000377eea48021d3002730d56aca9a,5,0.864404
3,00000377eea48021d3002730d56aca9a,6,0.822254
4,00000377eea48021d3002730d56aca9a,2,0.774095
...,...,...,...
43058495,fffff8a718843c0e11dfd93fb41c1297,13206900,0.096082
43058496,fffff8a718843c0e11dfd93fb41c1297,5060378,0.096065
43058497,fffff8a718843c0e11dfd93fb41c1297,16071764,0.094949
43058498,fffff8a718843c0e11dfd93fb41c1297,9969571,0.094927


In [None]:
# events.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11751086 entries, 6679625 to 5625392
Data columns (total 9 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_id           object
 1   item_id           int64 
 2   started_at        object
 3   read_at           object
 4   is_read           bool  
 5   rating            int64 
 6   is_reviewed       bool  
 7   started_at_month  object
 8   user_id_new       int64 
dtypes: bool(2), int64(3), object(4)
memory usage: 739.6+ MB


In [None]:
# als_recommendations

Unnamed: 0,user_id,item_id,score
0,00000377eea48021d3002730d56aca9a,3,0.990941
1,00000377eea48021d3002730d56aca9a,15881,0.896617
2,00000377eea48021d3002730d56aca9a,5,0.864404
3,00000377eea48021d3002730d56aca9a,6,0.822254
4,00000377eea48021d3002730d56aca9a,2,0.774095
...,...,...,...
43058495,fffff8a718843c0e11dfd93fb41c1297,13206900,0.096082
43058496,fffff8a718843c0e11dfd93fb41c1297,5060378,0.096065
43058497,fffff8a718843c0e11dfd93fb41c1297,16071764,0.094949
43058498,fffff8a718843c0e11dfd93fb41c1297,9969571,0.094927


### Задание 2 

In [None]:
# разметим каждую рекомендацию признаком read
events_train["read"] = True
als_recommendations = als_recommendations.merge(events.loc[:,['user_id','item_id', 'is_read']], on=["user_id", "item_id"], how="left")
als_recommendations["read"] = als_recommendations["is_read"].fillna(False).astype("bool")

# # проставим ранги
als_recommendations = als_recommendations.sort_values(by='score', ascending=False) # ваш код здесь #)
als_recommendations["rank"] = als_recommendations.groupby("user_id").cumcount() + 1

# # посчитаем novelty по пользователям
novelty_5 = (1-als_recommendations.query("rank <= 5").groupby("user_id")["read"].mean())

# # посчитаем средний novelty
# # ваш код здесь #
round(novelty_5.mean(), 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["read"] = True


0.61

In [None]:
round(novelty_5.mean(), 2)

0.61

In [None]:
novelty_5

user_id
00000377eea48021d3002730d56aca9a    0.2
00004584d524ec468619e81b176cc991    0.8
000079c580bbe45e1500acabe551b276    0.8
00009e46d18f223a82b22da38586b605    0.0
0000c3d51aa099745e93a4e99c4856c8    0.0
                                   ... 
ffff7cf38c717c8172ff5ba656cec6df    0.8
ffffbb062a8b208c9c1031b529c08f7a    0.8
ffffd81a724c0fa70ac37cd347c1c0f1    0.8
ffffe38c3a89ad5122e17e4cb9997fe4    1.0
fffff8a718843c0e11dfd93fb41c1297    0.8
Name: read, Length: 430585, dtype: float64

# === Двухстадийный подход: модель

In [None]:
# als_recommendations = pd.read_parquet("als_recommendations.parquet")

In [None]:
# задаём точку разбиения
split_date_for_labels = pd.to_datetime("2017-09-15").date()

split_date_for_labels_idx = events_test["started_at"] < split_date_for_labels


In [None]:
# len(split_date_for_labels_idx[-45:])

45

In [None]:
# split_date_for_labels_idx[-45:]

10066344     True
10066341     True
10066340     True
12249500     True
12249494     True
12293771    False
4148872     False
2044948      True
2044912     False
7899180     False
7899181     False
7899182     False
7899183      True
7899184      True
7899185      True
7899199     False
7899215      True
7354332      True
7354334      True
7354349     False
1085920     False
1085832     False
1085831     False
4869752      True
3342055      True
3342053      True
3342052      True
3342050      True
11318273    False
11318274     True
11318286     True
3731867      True
3731866      True
3731868      True
1285200      True
1285201      True
1285202      True
1285203      True
1285388     False
1285377     False
1941753      True
12578630    False
12578629    False
5625380      True
5625376      True
Name: started_at, dtype: bool

In [None]:
len(events_test[split_date_for_labels_idx][-45:])

45

In [None]:
events_labels = events_test[split_date_for_labels_idx].copy()
events_test_2 = events_test[~split_date_for_labels_idx].copy()

In [None]:
len(events_labels.user_id.unique())

99849

In [None]:
len(events_test[split_date_for_labels_idx]['user_id'].unique())

99849

In [None]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations = pd.read_parquet("candidates/training/als_recommendations.parquet")
content_recommendations = pd.read_parquet("candidates/training/content_recommendations.parquet")



In [None]:
# als_recommendations

Unnamed: 0,user_id,item_id,score
0,1000000,3,0.972557
1,1000000,15881,0.890201
2,1000000,5,0.865850
3,1000000,6,0.834282
4,1000000,2,0.792929
...,...,...,...
43058495,1430584,26114135,0.093726
43058496,1430584,15451058,0.092196
43058497,1430584,19288043,0.092150
43058498,1430584,23453112,0.091675


In [None]:
# content_recommendations

Unnamed: 0,user_id,item_id,score
0,1000000,1,0.933434
1,1000000,2,0.925806
2,1000000,3,0.920225
3,1000000,5,0.918026
4,1000000,6,0.916345
...,...,...,...
42821995,1430584,31327371,0.786363
42821996,1430584,32841355,0.784905
42821997,1430584,33828743,0.784706
42821998,1430584,34037113,0.784556


In [None]:
candidates = pd.merge(
    als_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=['user_id', 'item_id'],
    how="outer")

In [None]:
# candidates

Unnamed: 0,user_id,item_id,als_score,cnt_score
0,1000000,3,0.972557,0.920225
1,1000000,15881,0.890201,0.905740
2,1000000,5,0.865850,0.918026
3,1000000,6,0.834282,0.916345
4,1000000,2,0.792929,0.925806
...,...,...,...,...
82993089,1430584,31327371,,0.786363
82993090,1430584,32841355,,0.784905
82993091,1430584,33828743,,0.784706
82993092,1430584,34037113,,0.784556


In [None]:
# candidates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82993094 entries, 0 to 82993093
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   user_id    int64  
 1   item_id    int64  
 2   als_score  float64
 3   cnt_score  float64
dtypes: float64(2), int64(2)
memory usage: 2.5 GB


In [None]:
# events_labels.info()

<class 'pandas.core.frame.DataFrame'>
Index: 253765 entries, 2478098 to 5625376
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           253765 non-null  object
 1   item_id           253765 non-null  int64 
 2   started_at        253765 non-null  object
 3   read_at           253765 non-null  object
 4   is_read           253765 non-null  bool  
 5   rating            253765 non-null  int64 
 6   is_reviewed       253765 non-null  bool  
 7   started_at_month  253765 non-null  object
 8   user_id_new       253765 non-null  int64 
 9   user_id_enc       253765 non-null  int64 
 10  item_id_enc       253765 non-null  int64 
 11  gt                253765 non-null  bool  
dtypes: bool(3), int64(5), object(4)
memory usage: 20.1+ MB


In [None]:
# добавляем таргет к кандидатам со значением:
# — 1 для тех item_id, которые пользователь прочитал
# — 0, для всех остальных 

events_labels["target"] = 1
candidates = candidates.merge(events_labels[["user_id_new", "item_id", "target"]].rename(columns={"user_id_new": "user_id"}), 
                              on = ['user_id', 'item_id'], how = 'left')
candidates["target"] = candidates["target"].fillna(0).astype("int")

# в кандидатах оставляем только тех пользователей, у которых есть хотя бы один положительный таргет
candidates_to_sample = candidates.groupby("user_id").filter(lambda x: x["target"].sum() > 0)



In [None]:
candidates

Unnamed: 0,user_id,item_id,als_score,cnt_score,target
0,1000000,3,0.972557,0.920225,0
1,1000000,15881,0.890201,0.905740,0
2,1000000,5,0.865850,0.918026,0
3,1000000,6,0.834282,0.916345,0
4,1000000,2,0.792929,0.925806,0
...,...,...,...,...,...
82993089,1430584,31327371,,0.786363,0
82993090,1430584,32841355,,0.784905,0
82993091,1430584,33828743,,0.784706,0
82993092,1430584,34037113,,0.784556,0


In [None]:
# candidates_to_sample

Unnamed: 0,user_id,item_id,als_score,cnt_score,target
600,1000006,3,0.929247,,0
601,1000006,15881,0.904462,,0
602,1000006,5,0.887194,,0
603,1000006,6,0.872281,,0
604,1000006,2,0.842997,,0
...,...,...,...,...,...
82993089,1430584,31327371,,0.786363,0
82993090,1430584,32841355,,0.784905,0
82993091,1430584,33828743,,0.784706,0
82993092,1430584,34037113,,0.784556,0


In [None]:
# candidates_to_sample.query("target == 1")

Unnamed: 0,user_id,item_id,als_score,cnt_score,target
615,1000006,29868610,0.286715,,1
632,1000006,7445,0.230529,,1
649,1000006,18812405,0.178382,,1
1998,1000019,37415,0.043595,,1
2302,1000023,7260188,0.598791,,1
...,...,...,...,...,...
82983061,1430476,104378,,0.953409,1
82983062,1430476,104379,,0.951988,1
82983080,1430476,816953,,0.940140,1
82984419,1430490,16301111,,0.955308,1


In [None]:
# для каждого пользователя оставляем только 4 негативных примера
negatives_per_user = 4
candidates_for_train = pd.concat([
    candidates_to_sample.query("target == 1"),
    candidates_to_sample.query("target == 0") \
        .groupby("user_id") \
        .apply(lambda x: x.sample(negatives_per_user, random_state=0))
    ])

In [None]:
candidates_for_train

Unnamed: 0,user_id,item_id,als_score,cnt_score,target
615,1000006,29868610,0.286715,,1
632,1000006,7445,0.230529,,1
649,1000006,18812405,0.178382,,1
1998,1000019,37415,0.043595,,1
2302,1000023,7260188,0.598791,,1
...,...,...,...,...,...
"(1430579, 82992597)",1430579,15698462,,0.900922,0
"(1430584, 43058418)",1430584,18774964,0.222126,,0
"(1430584, 82993064)",1430584,8393104,,0.795215,0
"(1430584, 82993001)",1430584,24929,,0.847833,0


### обучение модели

In [None]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score']
target = 'target'

# Create the Pool object
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0
)

# тренируем модель
cb_model.fit(train_data)

0:	learn: 0.6526057	total: 96.7ms	remaining: 1m 36s
100:	learn: 0.5118959	total: 1.93s	remaining: 17.2s
200:	learn: 0.5111710	total: 3.83s	remaining: 15.2s
300:	learn: 0.5105208	total: 5.75s	remaining: 13.3s
400:	learn: 0.5100174	total: 7.65s	remaining: 11.4s
500:	learn: 0.5095747	total: 9.55s	remaining: 9.52s
600:	learn: 0.5091600	total: 11.5s	remaining: 7.62s
700:	learn: 0.5087803	total: 13.4s	remaining: 5.72s
800:	learn: 0.5084220	total: 15.3s	remaining: 3.8s
900:	learn: 0.5080930	total: 17.2s	remaining: 1.89s
999:	learn: 0.5078081	total: 19.1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fd6bcc91a50>

### Задание 4

In [None]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations_2 = pd.read_parquet("candidates/inference/als_recommendations.parquet")
content_recommendations_2 = pd.read_parquet("candidates/inference/content_recommendations.parquet")





: 

In [None]:
candidates_to_rank = pd.merge(
    als_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=['user_id', 'item_id'],
    how="outer")

In [None]:
# оставляем только тех пользователей, что есть в тестовой выборке, для экономии ресурсов
candidates_to_rank = candidates_to_rank[candidates_to_rank["user_id"].isin(events_test_2["user_id_new"].drop_duplicates())]
print(len(candidates_to_rank))

14517152


### Задание 5

In [None]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)



In [None]:
candidates_to_rank[features]

Unnamed: 0,als_score,cnt_score
300,1.129979,
301,1.123475,
302,1.112699,
303,1.060634,
304,0.903286,
...,...,...
83152908,,0.902576
83152909,,0.902539
83152910,,0.901478
83152911,,0.901235


In [None]:
predictions

array([[0.49099514, 0.50900486],
       [0.49099514, 0.50900486],
       [0.49099514, 0.50900486],
       ...,
       [0.93805367, 0.06194633],
       [0.93805367, 0.06194633],
       [0.9380585 , 0.0619415 ]])

In [None]:
predictions[:, 1]

array([0.50900486, 0.50900486, 0.50900486, ..., 0.06194633, 0.06194633,
       0.0619415 ])

In [None]:
candidates_to_rank["cb_score"] = predictions[:, 1]

In [None]:
# для каждого пользователя проставляем rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])


In [None]:
candidates_to_rank_2 = candidates_to_rank.copy(deep=True)
candidates_to_rank_2["rank"] = 1


In [None]:
candidates_to_rank_2

Unnamed: 0,user_id,item_id,als_score,cnt_score,cb_score,rank
347,1000003,49628,0.446143,0.906649,0.583617,1
300,1000003,7260188,1.129979,,0.509005,1
301,1000003,6148028,1.123475,,0.509005,1
302,1000003,2767052,1.112699,,0.509005,1
320,1000003,43641,0.617602,,0.477032,1
...,...,...,...,...,...,...
82993814,1430580,183087,,0.981247,0.066960,1
82993807,1430580,33906,,0.988908,0.064737,1
82993808,1430580,82436,,0.987934,0.064737,1
82993809,1430580,82780,,0.987217,0.061955,1


In [None]:
candidates_to_rank_2["rank"] = candidates_to_rank_2.groupby('user_id')['cb_score'].rank(method='first', ascending=False)

In [None]:
candidates_to_rank_2

Unnamed: 0,user_id,item_id,als_score,cnt_score,cb_score,rank
347,1000003,49628,0.446143,0.906649,0.583617,1.0
300,1000003,7260188,1.129979,,0.509005,2.0
301,1000003,6148028,1.123475,,0.509005,3.0
302,1000003,2767052,1.112699,,0.509005,4.0
320,1000003,43641,0.617602,,0.477032,5.0
...,...,...,...,...,...,...
82993814,1430580,183087,,0.981247,0.066960,187.0
82993807,1430580,33906,,0.988908,0.064737,188.0
82993808,1430580,82436,,0.987934,0.064737,189.0
82993809,1430580,82780,,0.987217,0.061955,190.0


In [None]:
candidates_to_rank_2[candidates_to_rank_2['user_id']==1430580]

Unnamed: 0,user_id,item_id,als_score,cnt_score,cb_score,rank
43058000,1430580,99561,0.316549,0.985569,0.616248,1.0
43058022,1430580,12700353,0.049827,0.955355,0.555732,2.0
43058021,1430580,248704,0.051227,0.980603,0.507811,3.0
43058001,1430580,6442769,0.307937,0.961369,0.500981,4.0
43058006,1430580,49750,0.153669,0.988261,0.414065,5.0
...,...,...,...,...,...,...
82993814,1430580,183087,,0.981247,0.066960,187.0
82993807,1430580,33906,,0.988908,0.064737,188.0
82993808,1430580,82436,,0.987934,0.064737,189.0
82993809,1430580,82780,,0.987217,0.061955,190.0


In [None]:
candidates_to_rank_2.groupby('user_id').head(100)

Unnamed: 0,user_id,item_id,als_score,cnt_score,cb_score,rank
347,1000003,49628,0.446143,0.906649,0.583617,1.0
300,1000003,7260188,1.129979,,0.509005,2.0
301,1000003,6148028,1.123475,,0.509005,3.0
302,1000003,2767052,1.112699,,0.509005,4.0
320,1000003,43641,0.617602,,0.477032,5.0
...,...,...,...,...,...,...
43058095,1430580,23705512,0.016477,,0.232760,96.0
43058096,1430580,6314763,0.016404,,0.232760,97.0
43058097,1430580,11710373,0.016035,,0.221228,98.0
43058098,1430580,7445,0.015793,,0.221228,99.0


In [None]:
candidates_to_rank["rank"] = candidates_to_rank.groupby('user_id')['cb_score'].rank(method='first', ascending=False)

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.groupby('user_id').head(100)

### Задание 6

In [None]:
events_inference = pd.concat([events_train, events_labels])

In [None]:
# events_inference

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_new,target
6679625,00000377eea48021d3002730d56aca9a,11012,2015-12-05,2015-12-11,True,4,False,2015-12-01,1000000,
6679617,00000377eea48021d3002730d56aca9a,4671,2014-06-05,2014-06-30,True,5,False,2014-06-01,1000000,
6679618,00000377eea48021d3002730d56aca9a,5,2012-10-02,2012-10-24,True,5,False,2012-10-01,1000000,
6679620,00000377eea48021d3002730d56aca9a,2,2009-07-12,2009-07-29,True,5,False,2009-07-01,1000000,
6679621,00000377eea48021d3002730d56aca9a,14497,2016-05-09,2016-06-02,True,5,False,2016-05-01,1000000,
...,...,...,...,...,...,...,...,...,...,...
1285202,ffff601c0ffa34bd5ffbbf2caee30644,216378,2017-08-27,2017-08-28,True,4,False,2017-08-01,1430578,1.0
1285203,ffff601c0ffa34bd5ffbbf2caee30644,6723348,2017-08-02,2017-08-03,True,4,False,2017-08-01,1430578,1.0
1941753,ffff7cafdaf5196383cb2efca08fb6fe,27272506,2017-09-03,2017-10-07,True,3,True,2017-09-01,1430579,1.0
5625380,fffff8a718843c0e11dfd93fb41c1297,23395680,2017-08-09,2017-08-15,True,4,True,2017-08-01,1430584,1.0


In [None]:
events_inference.drop('user_id', axis=1).rename(columns={"user_id_new": "user_id"})

Unnamed: 0,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id,target
6679625,11012,2015-12-05,2015-12-11,True,4,False,2015-12-01,1000000,
6679617,4671,2014-06-05,2014-06-30,True,5,False,2014-06-01,1000000,
6679618,5,2012-10-02,2012-10-24,True,5,False,2012-10-01,1000000,
6679620,2,2009-07-12,2009-07-29,True,5,False,2009-07-01,1000000,
6679621,14497,2016-05-09,2016-06-02,True,5,False,2016-05-01,1000000,
...,...,...,...,...,...,...,...,...,...
1285202,216378,2017-08-27,2017-08-28,True,4,False,2017-08-01,1430578,1.0
1285203,6723348,2017-08-02,2017-08-03,True,4,False,2017-08-01,1430578,1.0
1941753,27272506,2017-09-03,2017-10-07,True,3,True,2017-09-01,1430579,1.0
5625380,23395680,2017-08-09,2017-08-15,True,4,True,2017-08-01,1430584,1.0


In [None]:
cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference.drop('user_id', axis=1).rename(columns={"user_id_new": "user_id"}),
    events_test_2.drop('user_id', axis=1).rename(columns={"user_id_new": "user_id"}),
    final_recommendations.rename(columns={"cb_score": "score"}), 
    top_k=5)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}")

Common users: 75194
precision: 0.006, recall: 0.015


# === Двухстадийный подход: построение признаков

### Задание 1

In [None]:
items["age"] = 2018-items["publication_year"]
invalid_age_idx = items["age"] < 0
items.loc[invalid_age_idx, "age"] = np.nan
items["age"] = items["age"].astype("float")



In [None]:
items.head(3)

Unnamed: 0,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,text_reviews_count,publisher,publication_year,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,age
3,6066819,Jennifer Weiner,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,"{'Womens Fiction-Chick Lit': 739, 'Fiction': 442}",368.0,3.49,51184,3282,Atria Books,2009.0,US,eng,Hardcover,False,743294297,9780743294294,"{'Academic': None, 'Academic-Academia': None, ...","Womens Fiction-Chick Lit 739, Fiction 442",9.0
6,378460,Michael Halberstam,The Wanting of Levine,,"{'Politics': 1, 'Humor': 1}",,4.38,12,4,Berkley Publishing Group,1979.0,US,,Paperback,False,425040887,9780425040881,"{'Academic': None, 'Academic-Academia': None, ...","Politics 1user, Humor 1user",39.0
15,89375,"Don Piper, Cecil Murphey",90 Minutes in Heaven: A True Story of Death an...,As he is driving home from a minister's confer...,"{'Christian': 395, 'Nonfiction': 392, 'Religio...",,3.91,68157,2885,,,US,,,False,800759494,9780800759490,"{'Academic': None, 'Academic-Academia': None, ...","Christian 395, Nonfiction 392, Religion 142, S...",


In [None]:
# candidates_for_train = pd.concat([
#     candidates_to_sample.query("target == 1"),
#     candidates_to_sample.query("target == 0") \
#         .groupby("user_id") \
#         .apply(lambda x: x.sample(negatives_per_user, random_state=0))
#     ])

In [None]:
candidates_for_train = candidates_for_train.merge(items[['item_id', 'average_rating', 'age']], 
                                                  on = 'item_id', how='left')



In [None]:
candidates_to_rank = candidates_to_rank.merge(items[['item_id', 'average_rating', 'age']], on = 'item_id', how='left')

In [None]:
candidates_to_rank.age.median()

7.0

### Задание 2

In [None]:
events

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_new
6679625,00000377eea48021d3002730d56aca9a,11012,2015-12-05,2015-12-11,True,4,False,2015-12-01,1000000
6679617,00000377eea48021d3002730d56aca9a,4671,2014-06-05,2014-06-30,True,5,False,2014-06-01,1000000
6679618,00000377eea48021d3002730d56aca9a,5,2012-10-02,2012-10-24,True,5,False,2012-10-01,1000000
6679620,00000377eea48021d3002730d56aca9a,2,2009-07-12,2009-07-29,True,5,False,2009-07-01,1000000
6679621,00000377eea48021d3002730d56aca9a,14497,2016-05-09,2016-06-02,True,5,False,2016-05-01,1000000
...,...,...,...,...,...,...,...,...,...
5625379,fffff8a718843c0e11dfd93fb41c1297,25111004,2016-12-08,2016-12-08,True,5,False,2016-12-01,1430584
5625378,fffff8a718843c0e11dfd93fb41c1297,6606855,2017-03-01,2017-03-01,True,3,False,2017-03-01,1430584
5625377,fffff8a718843c0e11dfd93fb41c1297,18812405,2017-05-05,2017-05-31,True,3,True,2017-05-01,1430584
5625376,fffff8a718843c0e11dfd93fb41c1297,18692431,2017-08-02,2017-08-09,True,3,True,2017-08-01,1430584


In [None]:
events_test

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_new
11439209,00009e46d18f223a82b22da38586b605,25893709,2017-10-05,2017-10-17,True,4,False,2017-10-01,1000003
1264898,0001085188e302fc6b2568de45a5f56b,34076952,2017-10-09,2017-10-24,True,5,False,2017-10-01,1000005
2478095,00014c578111090720e20f5705eba051,18774964,2017-10-04,2017-10-22,True,4,False,2017-10-01,1000006
2478098,00014c578111090720e20f5705eba051,29868610,2017-08-30,2017-09-16,True,4,False,2017-08-01,1000006
2478104,00014c578111090720e20f5705eba051,7445,2017-08-26,2017-08-30,True,4,False,2017-08-01,1000006
...,...,...,...,...,...,...,...,...,...
1941753,ffff7cafdaf5196383cb2efca08fb6fe,27272506,2017-09-03,2017-10-07,True,3,True,2017-09-01,1430579
12578630,ffff7cf38c717c8172ff5ba656cec6df,22021611,2017-10-05,2017-10-05,True,4,False,2017-10-01,1430580
12578629,ffff7cf38c717c8172ff5ba656cec6df,15749186,2017-10-05,2017-10-18,True,4,False,2017-10-01,1430580
5625380,fffff8a718843c0e11dfd93fb41c1297,23395680,2017-08-09,2017-08-15,True,4,True,2017-08-01,1430584


In [None]:
pd.concat([events_train, events_labels])

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_new,target
6679625,00000377eea48021d3002730d56aca9a,11012,2015-12-05,2015-12-11,True,4,False,2015-12-01,1000000,
6679617,00000377eea48021d3002730d56aca9a,4671,2014-06-05,2014-06-30,True,5,False,2014-06-01,1000000,
6679618,00000377eea48021d3002730d56aca9a,5,2012-10-02,2012-10-24,True,5,False,2012-10-01,1000000,
6679620,00000377eea48021d3002730d56aca9a,2,2009-07-12,2009-07-29,True,5,False,2009-07-01,1000000,
6679621,00000377eea48021d3002730d56aca9a,14497,2016-05-09,2016-06-02,True,5,False,2016-05-01,1000000,
...,...,...,...,...,...,...,...,...,...,...
1285202,ffff601c0ffa34bd5ffbbf2caee30644,216378,2017-08-27,2017-08-28,True,4,False,2017-08-01,1430578,1.0
1285203,ffff601c0ffa34bd5ffbbf2caee30644,6723348,2017-08-02,2017-08-03,True,4,False,2017-08-01,1430578,1.0
1941753,ffff7cafdaf5196383cb2efca08fb6fe,27272506,2017-09-03,2017-10-07,True,3,True,2017-09-01,1430579,1.0
5625380,fffff8a718843c0e11dfd93fb41c1297,23395680,2017-08-09,2017-08-15,True,4,True,2017-08-01,1430584,1.0


In [None]:
def get_user_features(events):
    """ считает пользовательские признаки """
    
    user_features = events.groupby("user_id_new").agg(
        reading_years=("started_at", lambda x: (x.max()-x.min()).days/365.25),
        books_read=('is_read', "sum"),
        rating_avg=("rating", "mean"),
        rating_std=("rating", "std"))
    
    user_features["books_per_year"] = user_features["books_read"] / user_features["reading_years"]
    
    return user_features
    
user_features_for_train = get_user_features(events_train)




In [None]:
user_features_for_train

Unnamed: 0_level_0,reading_years,books_read,rating_avg,rating_std,books_per_year
user_id_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000000,8.643395,29,4.482759,0.687682,3.355163
1000001,0.104038,2,4.500000,0.707107,19.223684
1000002,2.554415,4,3.750000,0.500000,1.565916
1000003,7.414100,94,3.287234,0.712746,12.678545
1000004,4.388775,133,4.007519,0.883315,30.304585
...,...,...,...,...,...
1430580,0.000000,1,3.000000,,inf
1430581,1.930185,8,3.500000,1.195229,4.144681
1430582,0.355921,3,4.000000,0.000000,8.428846
1430583,0.213552,2,4.000000,0.000000,9.365385


In [None]:
user_features_for_train

Unnamed: 0_level_0,reading_years,books_read,rating_avg,rating_std,books_per_year
user_id_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000000,8.643395,29,4.482759,0.687682,3.355163
1000001,0.104038,2,4.500000,0.707107,19.223684
1000002,2.554415,4,3.750000,0.500000,1.565916
1000003,7.414100,94,3.287234,0.712746,12.678545
1000004,4.388775,133,4.007519,0.883315,30.304585
...,...,...,...,...,...
1430580,0.000000,1,3.000000,,inf
1430581,1.930185,8,3.500000,1.195229,4.144681
1430582,0.355921,3,4.000000,0.000000,8.428846
1430583,0.213552,2,4.000000,0.000000,9.365385


In [None]:
candidates_for_train

Unnamed: 0,user_id,item_id,als_score,cnt_score,target,average_rating,age
0,1000006,29868610,0.286715,,1,3.90,
1,1000006,7445,0.230529,,1,4.24,12.0
2,1000006,18812405,0.178382,,1,3.81,4.0
3,1000019,37415,0.043595,,1,3.87,12.0
4,1000023,7260188,0.598791,,1,4.03,8.0
...,...,...,...,...,...,...,...
213703,1430579,15698462,,0.900922,0,3.60,6.0
213704,1430584,18774964,0.222126,,0,4.35,4.0
213705,1430584,8393104,,0.795215,0,3.64,8.0
213706,1430584,24929,,0.847833,0,2.81,16.0


In [None]:
candidates_for_train.rename(columns={"user_id": "user_id_new"})

Unnamed: 0,user_id_new,item_id,als_score,cnt_score,target,average_rating,age
0,1000006,29868610,0.286715,,1,3.90,
1,1000006,7445,0.230529,,1,4.24,12.0
2,1000006,18812405,0.178382,,1,3.81,4.0
3,1000019,37415,0.043595,,1,3.87,12.0
4,1000023,7260188,0.598791,,1,4.03,8.0
...,...,...,...,...,...,...,...
213703,1430579,15698462,,0.900922,0,3.60,6.0
213704,1430584,18774964,0.222126,,0,4.35,4.0
213705,1430584,8393104,,0.795215,0,3.64,8.0
213706,1430584,24929,,0.847833,0,2.81,16.0


In [None]:
candidates_for_train = candidates_for_train.rename(columns={"user_id": "user_id_new"}).merge(user_features_for_train, 
                                                                                 on="user_id_new", how="left")
  


In [None]:
# оставим только тех пользователей, что есть в тесте, для экономии ресурсов
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id_new"].isin(events_test["user_id_new"].drop_duplicates())]

In [None]:
events_inference

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_new,target
11439279,00009e46d18f223a82b22da38586b605,754771,2017-04-13,2017-05-25,True,4,True,2017-04-01,1000003,
11439270,00009e46d18f223a82b22da38586b605,7260188,2012-07-30,2012-08-11,True,4,True,2012-07-01,1000003,
11439277,00009e46d18f223a82b22da38586b605,7945049,2012-01-28,2012-01-30,True,3,False,2012-01-01,1000003,
11439276,00009e46d18f223a82b22da38586b605,7790906,2012-04-25,2012-04-27,True,3,False,2012-04-01,1000003,
11439275,00009e46d18f223a82b22da38586b605,9361589,2014-10-28,2014-11-12,True,4,False,2014-10-01,1000003,
...,...,...,...,...,...,...,...,...,...,...
1285202,ffff601c0ffa34bd5ffbbf2caee30644,216378,2017-08-27,2017-08-28,True,4,False,2017-08-01,1430578,1.0
1285203,ffff601c0ffa34bd5ffbbf2caee30644,6723348,2017-08-02,2017-08-03,True,4,False,2017-08-01,1430578,1.0
1941753,ffff7cafdaf5196383cb2efca08fb6fe,27272506,2017-09-03,2017-10-07,True,3,True,2017-09-01,1430579,1.0
5625380,fffff8a718843c0e11dfd93fb41c1297,23395680,2017-08-09,2017-08-15,True,4,True,2017-08-01,1430584,1.0


In [None]:
user_features_for_ranking = get_user_features(events_inference)

In [None]:
user_features_for_ranking

Unnamed: 0_level_0,reading_years,books_read,rating_avg,rating_std,books_per_year
user_id_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000003,7.414100,94,3.287234,0.712746,12.678545
1000005,1.180014,16,4.375000,0.957427,13.559165
1000006,1.998631,20,4.200000,0.695852,10.006849
1000007,5.399042,59,3.983051,0.955772,10.927865
1000009,4.862423,92,4.086957,0.909681,18.920608
...,...,...,...,...,...
1430573,3.728953,20,4.750000,0.550120,5.363436
1430578,7.570157,188,4.175532,0.771163,24.834358
1430579,6.611910,265,3.924528,1.142317,40.079193
1430580,0.000000,1,3.000000,,inf


In [None]:
candidates_to_rank = candidates_to_rank.rename(columns={"user_id": "user_id_new"}).merge(user_features_for_ranking, 
                                                                                 on="user_id_new", how="left")

In [None]:
candidates_for_train.books_read.median()

32.0

### Задание 3

In [None]:
# определяем индексы топ-10 жанров и всех остальных
genres_top_k = 3
genres_top_idx = genres.sort_values("votes", ascending=False).head(genres_top_k).index
genres_others_idx = list(set(genres.index) - set(genres_top_idx))



In [None]:
genres

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Womens Fiction-Chick Lit,254558,0.005561
1,Fiction,6406256,0.139955
2,Politics,103296,0.002257
3,Humor,304302,0.006648
4,Christian,105273,0.002300
...,...,...,...
810,German History-Nazi Party,0,0.000000
811,Favorites,0,0.000000
812,History-Latin American History,0,0.000000
813,Cryptids-Bigfoot,0,0.000000


In [None]:
genres_top_columns = [f"genre_{id}" for id in genres_top_idx]
genres_others_column = "genre_others"
genre_columns = genres_top_columns + [genres_others_column] # ваш код здесь #

In [None]:
genre_columns

['genre_25', 'genre_1', 'genre_38', 'genre_others']

In [None]:
genres_top_columns

['genre_25', 'genre_1', 'genre_38']

In [None]:
all_items_genres_csr

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 210895 stored elements and shape (43312, 815)>

In [None]:
pd.DataFrame.sparse.from_spmatrix(all_items_genres_csr[:, genres_top_idx], columns=genres_top_columns)

Unnamed: 0,genre_25,genre_1,genre_38
0,0.524988,0.148026,0.014064
1,0.621931,0.174786,0.000000
2,0.514586,0.154140,0.022797
3,0.518708,0.148702,0.015347
4,0.623564,0.176369,0.000000
...,...,...,...
43307,0.000000,0.000000,0.000000
43308,0.000000,0.000000,0.000000
43309,0.000000,0.444444,0.000000
43310,0.000000,0.000000,0.000000


In [None]:
# составляем таблицу принадлежности книг к жанрам
item_genres = (
    pd.concat([
        # топ жанров
        pd.DataFrame.sparse.from_spmatrix(all_items_genres_csr[:, genres_top_idx], columns=genres_top_columns),
        # все остальные жанры
        pd.DataFrame(all_items_genres_csr[:, genres_others_idx].sum(axis=1), columns=[genres_others_column])
        ],
        axis=1)
    .reset_index()
    .rename(columns={"index": "item_id_enc"})
)



In [None]:
item_genres

Unnamed: 0,item_id_enc,genre_25,genre_1,genre_38,genre_others
0,0,0.524988,0.148026,0.014064,0.312922
1,1,0.621931,0.174786,0.000000,0.203283
2,2,0.514586,0.154140,0.022797,0.308478
3,3,0.518708,0.148702,0.015347,0.317243
4,4,0.623564,0.176369,0.000000,0.200067
...,...,...,...,...,...
43307,43307,0.000000,0.000000,0.000000,1.000000
43308,43308,0.000000,0.000000,0.000000,1.000000
43309,43309,0.000000,0.444444,0.000000,0.555556
43310,43310,0.000000,0.000000,0.000000,1.000000


In [None]:
# объединяем информацию принадлежности книг к жанрам с основной информацией о книгах
items = items.merge(item_genres, on="item_id_enc", how="left")

def get_user_genres(events, items, item_genre_columns):
    user_genres = (
        events
        .merge(items[["item_id"] + item_genre_columns], on="item_id", how="left")
        .groupby("user_id")[item_genre_columns].mean()
    )
    return user_genres
    


MergeError: Passing 'suffixes' which cause duplicate columns {'genre_38_x', 'genre_others_x', 'genre_1_x', 'genre_25_x'} is not allowed.

In [None]:
user_genres_for_train = get_user_genres(events_train, items, genre_columns)

KeyboardInterrupt: 

In [None]:
candidates_for_train = candidates_for_train.merge(user_genres_for_train, on="user_id", how="left")

In [None]:
user_genres_for_ranking = get_user_genres(events_inference, items, genre_columns)

KeyboardInterrupt: 

In [None]:
candidates_to_rank = candidates_to_rank.merge(user_genres_for_ranking, on="user_id", how="left")