In [519]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from functools import partial
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import TransformerMixin

from typing import List

### load & split

In [437]:
purchases = pd.read_csv('retail_train.csv')
purchases.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0


In [438]:
# train/test split
test_size_weeks = 3

train = purchases[purchases['week_no'] < purchases['week_no'].max() - test_size_weeks].copy()
test = purchases[purchases['week_no'] >= purchases['week_no'].max() - test_size_weeks].copy()

In [439]:
# products = pd.read_csv('product.csv')
# products.head(3)

In [440]:
# prepare result DataFrame
result = test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(3)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


### Used functions

In [548]:
# Топ популярных товаров
def popularity_measure(source, fields: List[str], k=5000, beta: List[float] = None, add_target=None, scaler=None):
    """ Расчет оценки важности товара в покупке и отбор топ K наиболее популярных товаров
    :param source - исходные данные
    :param fields - признаки, по которым измеряется мера важности товара
    :param k - количество товаров, отбираемых в топ
    :param beta - множители значимости для каждого признака в оценке
    :param add_target - название финального признака. Признак не добавляется, если target = None
    :param scaler - класс масштабирования данных
    """
    b = [1.] * len(fields) if beta is None else np.array(beta)
    assert len(fields) == len(b), '`fields` and `beta` dimensions must equal'
    assert issubclass(StandardScaler, TransformerMixin) or scaler is None, 'scaler must be a subclass of TransformerMixin'
    _df = source[['item_id']].copy()
    prepared = scaler().fit_transform(source[fields]) * b if scaler else source[fields] * b
    values = np.linalg.norm(prepared, ord=2, axis=1)
    _df['popularity'] = values
    if add_target:
        source.loc[:, add_target] = values
    popularity = _df.groupby('item_id')['popularity'].sum()
    return popularity.sort_values(ascending=False).head(k).index.tolist()

In [442]:
# предсказатель-интерпретатор
def recommender(user_id, mdl, params):
    uid = userid_to_id.get(user_id, None)
    if uid is None:
        return list()
    rec_score = mdl.recommend(userid_to_id[user_id], **params)
    return [id_to_itemid[rec[0]] for rec in rec_score]


In [586]:
# метрики
def precision_at_k(recommended_list, bought_list, k=5):
    flags = np.isin(bought_list, recommended_list[:k])
    return flags.sum() / k

def ap_k(recommended_list, bought_list, k=5):
    flags = np.isin(recommended_list, bought_list)
    if sum(flags) == 0:
        return 0

    func = partial(precision_at_k, recommended_list, bought_list)
    rel_items = np.arange(1, k + 1)[flags[:k]]
    return np.sum(list(map(func, rel_items))) / flags.sum()

In [444]:
# сериализация расчета метрики
def calc_metric(metric_func, source: pd.DataFrame):
    """ Подсчет метрики
    :param metric_func - функция измерения метрики. Первый аргумент - рекомендации, второй - актуальные значения
    :param source - данные для подсчета метрики
    """
    def metric_wrapper(pred, act):
        return metric_func(pred, act) if len(pred) != 0 else 0

    metric = pd.DataFrame()
    for col in source.columns:
        if col == 'user_id':
            metric[col] = source[col]
        elif col == 'actual':
            continue
        else:
            metric[col] = source[[col, 'actual']].apply(lambda row: metric_wrapper(*row.values), axis=1)
    return metric

### Baseline

In [476]:
# %%time
# # стандартный топ5000 по кол-ву проданных единиц
# top5k = popularity_measure(train, ['quantity'], k=5000, add_target='popularity')
# top5k[:7]

CPU times: user 121 ms, sys: 73 µs, total: 121 ms
Wall time: 120 ms


[6534178, 6533889, 6534166, 6544236, 1404121, 397896, 1426702]

In [563]:
%%time
# берем топ5000 по оценке популярности
top5k = popularity_measure(train, ['quantity', 'sales_value'], beta=[1., 1.], k=5000, add_target='popularity', scaler=StandardScaler)
top5k[:7]

CPU times: user 157 ms, sys: 8.08 ms, total: 165 ms
Wall time: 163 ms


[6534178, 6533889, 1082185, 6534166, 6533765, 995242, 981760]

In [564]:
# обработка товаров не из топа
top_train = train.copy()
top_train.loc[~top_train['item_id'].isin(top5k), 'item_id'] = -1     # товары не из топ5000 превращаем в один товар
# top_train = top_train.loc[top_train['item_id'].isin(top_5000)]          # или оставляем только топ5000; еще один способ описан ниже
top_train.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,popularity
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,0.415439
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,0.549036
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0,0.509067
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0,0.457485
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0,0.389835


In [572]:
# подготовка обучающих данных: составление таблицы user-item
user_item_matrix = pd.pivot_table(top_train,
                                  index='user_id',
                                  columns='item_id',
                                  values='popularity',
                                  aggfunc='sum',
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float)

# переведем в формат sparse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [573]:
user_item_matrix.head(3)

item_id,-1,201704,259120,397896,420647,480014,731106,818980,819063,819255,...,15927403,15927661,15971546,15972074,15972298,16053266,16100266,16769635,16809471,17242362
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,223.148156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,89.109578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,64.136018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127754,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [574]:
# remap dictionaries
id_to_itemid = dict(enumerate(user_item_matrix.columns.values))
id_to_userid = dict(enumerate(user_item_matrix.index.values))

itemid_to_id = {v: k for k, v in id_to_itemid.items()}
userid_to_id = {v: k for k, v in id_to_userid.items()}

In [575]:
# обучение модели
model = AlternatingLeastSquares(factors=44,
                                regularization=0.001,
                                iterations=15,
                                calculate_training_loss=True,
                                use_gpu=False,
                                random_state=23)        # don't forget to fix RS!!!

model.fit(sparse_user_item.T, show_progress=True)
# Зачем в оригинальном ноутбуке в предыдущем шаге делаем sparse_user_item, если его нигде не используем?

  0%|          | 0/15 [00:00<?, ?it/s]

In [576]:
recommender_params = {
    'user_items': sparse_user_item,
    'N': 5, # кол-во рекомендаций
    'filter_already_liked_items': False,
    'filter_items': [itemid_to_id[-1]],
    'recalculate_user': True
}

In [590]:
%%time
result['als_baseline'] = result['user_id'].apply(partial(recommender, mdl=model, params=recommender_params))
result.head(3)

CPU times: user 1min 31s, sys: 1min 21s, total: 2min 52s
Wall time: 23.6 s


Unnamed: 0,user_id,actual,als_baseline
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[995242, 1082185, 1005186, 5978656, 6534178]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[5569230, 6534178, 1133018, 1082185, 1053690]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[866211, 1127831, 878996, 834484, 854852]"


In [591]:
pr_at_k = calc_metric(partial(precision_at_k, k=5), result)
ap_at_k = calc_metric(lambda pred, act: ap_k(pred, act, k=min(5, len(pred))), result)

In [592]:
summary = pd.DataFrame([pr_at_k.mean(), ap_at_k.mean()], index=['precision@k', 'map@k'])
summary.drop(columns='user_id')

Unnamed: 0,als_baseline
precision@k,0.207444
map@k,0.345969


### Матрицы весов

In [596]:
tfidf_weight(user_item_matrix.T).T.toarray()

array([[48.31278863,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [28.13112119,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [21.36721667,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [33.72772894,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [26.1882237 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [40.98629335,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])