In [1]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight

from functools import partial
from itertools import zip_longest
from sklearn.preprocessing import StandardScaler
from sklearn.base import TransformerMixin

from typing import List

## load & split

In [381]:
# Raw purchase transactions; 'retail_train.csv' is read relative to the current working directory.
purchases = pd.read_csv('retail_train.csv')
purchases.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0


In [382]:
# train/test split: the most recent weeks are held out for evaluation
test_size_weeks = 3

# NOTE(review): the boundary week itself goes to `test` (`>=`), so the hold-out covers week numbers
# max - test_size_weeks .. max, i.e. up to test_size_weeks + 1 distinct weeks — confirm this is intended.
split_week = purchases['week_no'].max() - test_size_weeks
train = purchases.loc[purchases['week_no'] < split_week].copy()
test = purchases.loc[purchases['week_no'] >= split_week].copy()

In [383]:
# Product catalogue; column names are lower-cased right after loading.
products = pd.read_csv('product.csv')
products.columns = products.columns.str.lower()
products.head(3)

Unnamed: 0,product_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,


In [384]:
# ground truth: the unique items every user bought during the test period
true_values = (
    test.groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)
true_values.head(3)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."


## Used functions

In [385]:
class BColor:
    """ANSI escape sequences used to colour / style printed text (see `cprint`)."""
    # colour codes
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # reset sequence; `cprint` appends it after every message
    ENDC = '\033[0m'
    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def cprint(msg, color: BColor):
    """ Print `msg` prefixed with the given escape sequence and followed by the reset code.
    :param msg: message to print
    :param color: one of the `BColor` escape sequences
    """
    print(color, msg, BColor.ENDC, sep='')

In [386]:
# Top popular items
def popularity_measure(source, fields: List[str], k=5000, beta: List[float] = None, add_target=None, scaler=None):
    """ Score every purchase row and return the ids of the top K most popular items.

    A row's score is the L2 norm of its (optionally scaled) `fields` values, each multiplied by its
    `beta` weight; an item's popularity is the sum of the scores of all its rows.

    :param source - purchases DataFrame; must contain `item_id` and every column listed in `fields`
    :param fields - columns the row score is built from
    :param k - number of items to return
    :param beta - per-field weights; every field weighs 1 when omitted
    :param add_target - if given, the per-row score is also written to `source[add_target]` (in place).
                        Nothing is added when it is None
    :param scaler - scaler class (not an instance), instantiated without arguments; None disables scaling
    :return list of item ids ordered by descending popularity
    """
    weights = np.ones(len(fields)) if beta is None else np.asarray(beta, dtype=float)
    assert len(fields) == len(weights), '`fields` and `beta` dimensions must equal'
    # validate the scaler that was actually passed in; `None` short-circuits the class check
    assert scaler is None or (isinstance(scaler, type) and issubclass(scaler, TransformerMixin)), \
        'scaler must be a subclass of TransformerMixin'

    features = source[fields]
    prepared = scaler().fit_transform(features) if scaler else features.to_numpy(dtype=float)
    values = np.linalg.norm(np.asarray(prepared) * weights, ord=2, axis=1)
    if add_target:
        source.loc[:, add_target] = values

    popularity = source[['item_id']].assign(popularity=values).groupby('item_id')['popularity'].sum()
    return popularity.sort_values(ascending=False).head(k).index.tolist()

In [387]:
def check_model(uim, mdl_params, rec_params, res, ttl='als'):
    """ Fit an ALS model and store its recommendations for every user listed in `res`.

    :param uim: user-item matrix; its transpose is what gets passed to `fit`
    :param mdl_params: model init parameters (keyword arguments of AlternatingLeastSquares)
    :param rec_params: recommendation parameters, forwarded to the model through `recommender`
    :param res: true values, including user_id; a new column named `ttl` is added to it in place
    :param ttl: model title, used as the name of the new column
    :return: the fitted model
    """
    fitted = AlternatingLeastSquares(**mdl_params)
    fitted.fit(uim.T, show_progress=False)

    def recommend_for(user_id):
        return recommender(user_id, mdl=fitted, params=rec_params)

    res[ttl] = res['user_id'].apply(recommend_for)
    return fitted

In [388]:
# prediction wrapper: real user id in, list of real item ids out
def recommender(user_id, mdl, params):
    """ Recommend items for one user.
    :param user_id: real user id
    :param mdl: fitted model exposing `recommend`
    :param params: keyword arguments forwarded to `mdl.recommend`
    :return: list of real item ids; empty list when the user is not in `userid_to_id`
    """
    if user_id not in userid_to_id:
        return list()
    scored = mdl.recommend(userid_to_id[user_id], **params)
    # every result entry starts with the inner item index, which is mapped back to the real id
    return [id_to_itemid[entry[0]] for entry in scored]

In [389]:
# metrics
def precision_at_k(recommended_list, bought_list, k=5):
    """ Precision@k: number of bought items found among the first `k` recommendations, divided by `k`.
    :param recommended_list: recommended item ids, best first
    :param bought_list: item ids the user actually bought
    :param k: cutoff
    """
    flags = np.isin(bought_list, recommended_list[:k])
    return flags.sum() / k


def ap_k(recommended_list, bought_list, k=5):
    """ Average precision@k: sum of precision@i over the ranks i <= k holding a bought item,
    divided by the number of bought items in the whole recommendation list.

    Lists shorter than `k` are accepted (only the available ranks are inspected).
    :param recommended_list: recommended item ids, best first
    :param bought_list: item ids the user actually bought
    :param k: cutoff
    :return: 0 when nothing recommended was bought
    """
    flags = np.isin(recommended_list, bought_list)
    if not flags.any():
        return 0

    # 1-based ranks of the hits inside the top-k; derived from the sliced flags so that
    # a list shorter than k no longer breaks on a mask/array length mismatch
    hit_ranks = np.flatnonzero(flags[:k]) + 1
    precisions = [precision_at_k(recommended_list, bought_list, rank) for rank in hit_ranks]
    return np.sum(precisions) / flags.sum()

In [390]:
# metric evaluation over a whole predictions table
def calc_metric(metric_func, source: pd.DataFrame):
    """ Evaluate a metric for every prediction column of `source`.
    :param metric_func - metric function; first argument is the recommendations, second the actual values
    :param source - data to evaluate: `user_id`, `actual` and one column per model
    :return DataFrame with `user_id` (copied as is) and the per-user metric value for every model column
    """
    def score_row(row):
        pred, act = row.values
        # users without any prediction score 0
        if len(pred) == 0:
            return 0
        return metric_func(pred, act)

    result = pd.DataFrame()
    for name in source.columns:
        if name == 'actual':
            continue
        if name == 'user_id':
            result[name] = source[name]
        else:
            result[name] = source[[name, 'actual']].apply(score_row, axis=1)
    return result

In [391]:
def compare_metrics(res, saveto=None, k=5):
    """ Summarise mean precision@k and mean average precision@k for every prediction column.

    :param res: DataFrame with `user_id`, `actual` and one column of recommendations per model
    :param saveto: optional path; when given, the transposed summary is written there as CSV
    :param k: cutoff used by both metrics (for AP it is capped by the length of each prediction list)
    :return: DataFrame with rows `precision@k` / `map@k` and one column per model
    """
    pr_at_k = calc_metric(partial(precision_at_k, k=k), res)
    ap_at_k = calc_metric(lambda pred, act: ap_k(pred, act, k=min(k, len(pred))), res)
    # `user_id` is carried along by calc_metric; its mean is meaningless, so it is dropped
    smr = pd.DataFrame([pr_at_k.mean(), ap_at_k.mean()], index=['precision@k', 'map@k']).drop(columns='user_id')
    if saveto:
        smr.T.to_csv(saveto)
    return smr

In [392]:
# def plot_weight_curve(data: pd.Series, p1=True):
#     """ Построение графиков весов """
#     _val = data.sort_values(ascending=False).values
#     fig, ax = plt.subplots(1, 2, figsize=(12, 3))
#     fig.suptitle(f'Weights curve for {data.name}')
#     ax[0].set_title('clean')
#     ax[0].plot(_val)
#     ax[1].set_title('log-scaled')
#     ax[1].plot(np.log1p(_val) if p1 else np.log(_val))
#     plt.show()

In [393]:
# def apply_weights(uim, wl, axis, top):
#     assert len(wl) == len(axis), 'weights and axis lists must be same-dimensional'
#     res_mat = []
#     for ax, w in zip(axis, wl):
#         if ax in [1, 'u', 'user']:
#             mat = csr_matrix((uim.T * w).T).tocsr()
#         elif ax in [0, 'i', 'item']:
#             w = w[w.index.isin(top)]
#             w[-1] = 1
#             mat = csr_matrix(uim * w).tocsr()
#         else:
#             mat = csr_matrix(uim).tocsr()
#         res_mat.append(mat)
#     return res_mat

## Baseline

In [394]:
# %%time
# # стандартный топ5000 по кол-ву проданных единиц
# top5k = popularity_measure(train, ['quantity'], k=5000, add_target='popularity')
# top5k[:7]

In [395]:
%%time
# take the top 5000 items by popularity score (quantity and sales_value, standard-scaled, equal weights);
# add_target='popularity' also writes the per-row score into `train` as a new column
top5k = popularity_measure(train, ['quantity', 'sales_value'], beta=[1., 1.], k=5000, add_target='popularity', scaler=StandardScaler)
top5k[:7]

CPU times: user 212 ms, sys: 3.79 ms, total: 216 ms
Wall time: 216 ms


[6534178, 6533889, 1082185, 6534166, 6533765, 995242, 981760]

In [396]:
# items outside the top list are collapsed into a single placeholder item with id -1
top_train = train.copy()
top_train['item_id'] = top_train['item_id'].where(top_train['item_id'].isin(top5k), -1)
top_train.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,popularity
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,0.415439
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,0.549036
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0,0.509067


In [397]:
# training data: user x item table whose cells hold the summed popularity score
# of the user's purchases of that item (0 where the user never bought it)
user_item_matrix = top_train.pivot_table(
    index='user_id',
    columns='item_id',
    values='popularity',
    aggfunc='sum',
    fill_value=0,
).astype(float)

# the same table in sparse CSR format
sparse_user_item = csr_matrix(user_item_matrix)

In [398]:
user_item_matrix.head(3)

item_id,-1,201704,259120,397896,420647,480014,731106,818980,819063,819255,...,15927403,15927661,15971546,15972074,15972298,16053266,16100266,16769635,16809471,17242362
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,223.148156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,89.109578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,64.136018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127754,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [399]:
# lookup tables between matrix positions (rows = users, columns = items) and the real ids
id_to_itemid = {pos: item for pos, item in enumerate(user_item_matrix.columns.values)}
id_to_userid = {pos: user for pos, user in enumerate(user_item_matrix.index.values)}

itemid_to_id = dict(zip(id_to_itemid.values(), id_to_itemid.keys()))
userid_to_id = dict(zip(id_to_userid.values(), id_to_userid.keys()))

In [400]:
# model training parameters and recommender (prediction) parameters
model_params = {'factors': 44,
                'regularization': 0.001,
                'iterations': 15,
                'calculate_training_loss': True,
                'use_gpu': False,
                'random_state': 23}

# passed as keyword arguments to the model's `recommend` (see `recommender`)
recommender_params = {
    'user_items': sparse_user_item,
    'N': 5, # number of recommendations
    'filter_already_liked_items': False,
    # never recommend the -1 placeholder that stands for all non-top items
    'filter_items': [itemid_to_id[-1]],
    'recalculate_user': True
}

# sparse_user_item = csr_matrix(bm25_weight(user_item_matrix.T).T).tocsr()

In [401]:
%%time
# training: check_model fits the ALS model, adds the 'als_baseline' recommendations column
# to `baseline` in place and returns the fitted model
baseline = true_values.copy()
model = check_model(sparse_user_item, model_params, recommender_params, baseline, 'als_baseline')

CPU times: user 2min 2s, sys: 1min 33s, total: 3min 35s
Wall time: 27.8 s


In [402]:
compare_metrics(baseline)

Unnamed: 0,als_baseline
precision@k,0.207444
map@k,0.345969


## Recommender commons

In [403]:
def get_nearest(mdl, elem_id, k, mode):
    """ Get top K the nearest users/items to the given one.

    :param mdl: ALS fitted model
    :param elem_id: real user/item id
    :param k: number of neighbours to find
    :param mode: 'user' (or 0) to search users, 'item' (or 1) to search items
    :return: list of at most K real ids of similar users/items, the queried element excluded;
             empty list for an unsupported mode or an id unknown to the remap dictionaries
    """
    if mode in ('user', 0):
        inner_ids, real_ids, lookup, id_arg = userid_to_id, id_to_userid, mdl.similar_users, 'userid'
    elif mode in ('item', 1):
        inner_ids, real_ids, lookup, id_arg = itemid_to_id, id_to_itemid, mdl.similar_items, 'itemid'
    else:
        return []

    # ids the model never saw get no neighbours (same convention as `recommender`)
    inner_id = inner_ids.get(elem_id)
    if inner_id is None:
        return []

    # ask for one extra neighbour because the element itself is normally part of the answer;
    # assumes the model yields (inner id, score) pairs, as the unpacking below requires
    found = lookup(**{id_arg: inner_id, 'N': k + 1})
    # drop the element itself wherever it is ranked instead of blindly skipping the first entry
    return [real_ids[idx] for idx, _ in found if idx != inner_id][:k]

In [404]:
def filter_top_for_users(items, users, measure='popularity', k=5):
    """ Get the top purchases of the given users.
    :param items: data grouped by users and items (one row per user/item pair)
    :param users: user ids array
    :param measure: column to rank by (higher is better)
    :param k: number of items to keep per user
    :return: ungrouped dataframe, ordered by user and descending measure; the -1 placeholder item is excluded
    """
    wanted = items['user_id'].isin(users) & items['item_id'].ne(-1)
    ranked = items.loc[wanted].sort_values(by=['user_id', measure], ascending=[True, False])
    return ranked.groupby('user_id').head(k)

In [405]:
def basic_filter(items, k, placeholder=()):
    """ Keep the first K items that are not the -1 placeholder item; return `placeholder` when none are left """
    real_items = [item for item in items if item != -1]
    if not real_items:
        return placeholder
    return real_items[:k]

def private_label_filter(items, k, placeholder=()):
    """ Keep the first K items, putting private-label products (brand == 'Private') first.

    The original order is preserved inside both groups; items missing from `products` count as non-private.
    :param items: item ids, best first; the -1 placeholder item is ignored
    :param k: number of items to keep
    :param placeholder: value returned when no real item is left
    :return: array of at most K item ids, or `placeholder`
    """
    # drop placeholder items
    goods = pd.Series([idx for idx in items if idx != -1], dtype=np.int64)
    if goods.empty:
        return placeholder
    # look up brands only for the requested items instead of converting the whole catalogue on every call
    known = products.loc[products['product_id'].isin(goods), ['product_id', 'brand']]
    brand_map = known.set_index('product_id')['brand'].to_dict()
    # private brand goes on top without disturbing the original ordering
    is_private = goods.map(brand_map) == 'Private'
    return pd.concat([goods[is_private], goods[~is_private]]).head(k).values

In [406]:
def check_items_count(items, k):
    """ Check number of predictions for each user and report users with too few of them.
    :param items: Series with users predictions. User ids must be in index
    :param k: number of required predictions
    :return: the predictions, unchanged (no correction is applied yet)
    """
    # if similar users bought little, there may not be enough recommendations
    sizes = items.apply(len)

    # test whether the selection is empty, not the truthiness of the ids in it
    # (`Index.any()` would overlook a selection consisting of user id 0)
    low_pred = items.index[sizes < k]
    if len(low_pred) > 0:
        cprint(f"Some users have less than {k} predictions!", BColor.WARNING)
        print(low_pred.tolist())
        # TODO: handle such cases

    nan_pred = items.index[sizes == 0]
    if len(nan_pred) > 0:
        cprint("Some users have no predictions at all!", BColor.FAIL)
        print(nan_pred.tolist())
        # TODO: handle such cases
    return items


## Similar-Item recommender

In [407]:
def agg_func(src):
    """ Merge several ranked lists of similar items into one list without duplicates.

    The lists are walked rank by rank (all first candidates, then all second ones, ...). A candidate that
    is already selected is replaced by the next not-yet-selected candidate further down the same list.
    :param src: iterable of ranked item-id lists
    :return: integer array of the selected item ids, in selection order
    """
    # rows = ranks, columns = source lists; shorter lists are padded with NaN
    grid = np.array(list(zip_longest(*src)), dtype='float')
    picked = []
    for rank in range(grid.shape[0]):
        for col, candidate in enumerate(grid[rank]):
            if np.isnan(candidate):
                continue
            if candidate not in picked:
                picked.append(candidate)
                continue
            # duplicate: take the next unused candidate of the same list, if there is one
            substitute = next(
                (lower for lower in grid[rank + 1:, col] if not np.isnan(lower) and lower not in picked),
                None,
            )
            if substitute is not None:
                picked.append(substitute)
    return np.array(picked, dtype='int')

In [408]:
def similar_item_recommend(mdl, users, data, measure='popularity', k=5,
                           filter_func=basic_filter, placeholder=(), title='similar_items'):
    """ Recommend similar items based on top K purchases
    :param mdl: ALS fitted model
    :param users: user ids to recommend for
    :param data: source dataset
    :param measure: target field in the dataset
    :param k: number of items to recommend
    :param filter_func: additional filter like func(items, k, placeholder) -> list;
                        when it is None / not callable the first K items are kept
    :param placeholder: value to use if no predictions available
    :param title: name of target column
    :return: Series named `title`, indexed by `users`, holding the predictions of every user
    """
    # top purchases of every requested user
    group_items = data.groupby(['user_id', 'item_id'])[measure].sum().reset_index()
    user_item_top = filter_top_for_users(group_items, users, measure, k).copy()

    # nearest K items for every item of the users' tops; the neighbours depend on the item only,
    # so the model is queried once per distinct item rather than once per (user, item) row
    nearest = {item: get_nearest(mdl, item, k, 'item') for item in user_item_top['item_id'].unique()}
    user_item_top[title] = user_item_top['item_id'].apply(lambda item: nearest[item])
    # per user: walk the neighbour lists rank by rank, skipping items already selected
    preds = user_item_top.groupby('user_id')[title].agg(agg_func)

    # optional extra filtering of the merged lists; without a filter simply keep the first K items
    if filter_func and callable(filter_func):
        preds = preds.apply(lambda val: filter_func(val, k, placeholder))
    else:
        preds = preds.apply(lambda val: val[:k])

    # users without any prediction keep the placeholder
    items = pd.Series([np.array(placeholder)] * len(users), index=users, name=title)
    items.update(preds)
    # report users with too few predictions
    items = check_items_count(items, k)
    return items

In [409]:
# init predictions: start from the baseline table (user_id, actual, als_baseline);
# the cells below merge one column per recommender into it
predictions = baseline.copy()

In [410]:
%%time
# basic similar-item recommendations (default filter: first K non-placeholder items)
basic_similar_items = similar_item_recommend(model, baseline['user_id'], top_train,
                                             k=5, title='basic_similar_items')
predictions = predictions.merge(basic_similar_items, on='user_id', how='left')

[93mSome users have less than 5 predictions![0m
[650, 954, 1987]
[91mSome users have no predictions at all![0m
[650, 954, 1987]
CPU times: user 7.43 s, sys: 5.66 s, total: 13.1 s
Wall time: 2.05 s


In [411]:
%%time
# similar-item recommendations, private brand preferred
brand_similar_items = similar_item_recommend(model, baseline['user_id'], top_train,
                                             k=5, title='brand_similar_items', filter_func=private_label_filter)
predictions = predictions.merge(brand_similar_items, on='user_id', how='left')

[93mSome users have less than 5 predictions![0m
[650, 954, 1987]
[91mSome users have no predictions at all![0m
[650, 954, 1987]
CPU times: user 2min 33s, sys: 5.71 s, total: 2min 39s
Wall time: 2min 28s


In [412]:
predictions.head(3)

Unnamed: 0,user_id,actual,als_baseline,basic_similar_items,brand_similar_items
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[995242, 1082185, 1005186, 5978656, 6534178]","[880310, 1029743, 10150194, 1022053, 6534178]","[1029743, 6534178, 1098066, 981760, 826249]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[5569230, 6534178, 1133018, 1082185, 1053690]","[12810391, 1075979, 12263692, 847573, 1082185]","[896974, 1125904, 1138953, 1062425, 1026586]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[866211, 1127831, 878996, 834484, 854852]","[6534178, 948650, 825541, 1135476, 1082185]","[6534178, 981760, 1062425, 1000970, 1211227]"


In [413]:
# metrics calculation
compare_metrics(predictions)

Unnamed: 0,als_baseline,basic_similar_items,brand_similar_items
precision@k,0.207444,0.121156,0.085015
map@k,0.345969,0.188039,0.185877


## Similar-User recommender

In [415]:
def similar_user_recommend(mdl, users, data, measure='popularity', k=5,
                           filter_func=basic_filter, placeholder=(), title='similar_users'):
    """ Recommend items based on similar user purchases
    :param mdl: ALS fitted model
    :param users: user ids to recommend for
    :param data: source dataset
    :param measure: target field in the dataset
    :param k: number of items to recommend
    :param filter_func: additional filter like func(items, k, placeholder) -> list;
                        when it is None / not callable the first K items are kept
    :param placeholder: value to use if no predictions available
    :param title: name of target column
    :return: Series named `title`, indexed by `users`, holding the predictions of every user
    """
    # K nearest users for every requested user
    sim = pd.Series(users).apply(lambda uid: get_nearest(mdl, uid, k, 'user'))
    # for every requested user: the combined top-K purchases of each of the nearest users.
    # NOTE(review): filter_top_for_users orders rows by user_id, so the combined list follows
    # the neighbours' ids rather than their similarity rank — confirm this is intended.
    all_items = data.groupby(['user_id', 'item_id'])[measure].sum().reset_index()
    items = sim.apply(lambda x: filter_top_for_users(all_items, x, measure, k)['item_id'].drop_duplicates().values)
    # optional extra filtering of the combined lists; without a filter simply keep the first K items
    if filter_func and callable(filter_func):
        items = items.apply(lambda val: filter_func(val, k, placeholder))
    else:
        items = items.apply(lambda val: val[:k])
    # index by user ids
    items.name = title
    items.index = users
    # report users with too few predictions
    items = check_items_count(items, k)
    return items

In [416]:
# init predictions
# predictions = baseline.copy()

In [417]:
%%time
# basic similar-user recommendations (default filter: first K non-placeholder items)
basic_similar_users = similar_user_recommend(model, baseline['user_id'], top_train, measure='popularity',
                                             k=5, title='basic_similar_users')
predictions = predictions.merge(basic_similar_users, on='user_id', how='left')

[93mSome users have less than 5 predictions![0m
[650, 954, 1413, 1583, 1987]
CPU times: user 13 s, sys: 1.23 s, total: 14.3 s
Wall time: 12 s


In [418]:
%%time
# similar-user recommendations, private brand preferred
brand_similar_users = similar_user_recommend(model, baseline['user_id'], top_train, measure='popularity',
                                             k=5, title='brand_similar_users', filter_func=private_label_filter)
predictions = predictions.merge(brand_similar_users, on='user_id', how='left')

[93mSome users have less than 5 predictions![0m
[650, 954, 1413, 1583, 1987]
CPU times: user 2min 33s, sys: 1.14 s, total: 2min 34s
Wall time: 2min 32s


In [419]:
predictions.head(3)

Unnamed: 0,user_id,actual,als_baseline,basic_similar_items,brand_similar_items,basic_similar_users,brand_similar_users
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[995242, 1082185, 1005186, 5978656, 6534178]","[880310, 1029743, 10150194, 1022053, 6534178]","[1029743, 6534178, 1098066, 981760, 826249]","[995242, 1082185, 908531, 904240, 1024306]","[995242, 908531, 904240, 862349, 994928]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[5569230, 6534178, 1133018, 1082185, 1053690]","[12810391, 1075979, 12263692, 847573, 1082185]","[896974, 1125904, 1138953, 1062425, 1026586]","[5569230, 949965, 897298, 1021324, 844179]","[907143, 1102139, 6534178, 1070015, 851254]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[866211, 1127831, 878996, 834484, 854852]","[6534178, 948650, 825541, 1135476, 1082185]","[6534178, 981760, 1062425, 1000970, 1211227]","[973802, 839656, 1049788, 948650, 8358621]","[839656, 1049788, 5996007, 980353, 1092363]"


In [420]:
# metrics calculation
compare_metrics(predictions)

Unnamed: 0,als_baseline,basic_similar_items,brand_similar_items,basic_similar_users,brand_similar_users
precision@k,0.207444,0.121156,0.085015,0.118805,0.11381
map@k,0.345969,0.188039,0.185877,0.288515,0.285108


## Metrics comparison

In [None]:
# NOTE(review): 'metrics.csv' is not written by any cell shown here — presumably assembled from
# separate runs of this notebook (one row per mode/metric); confirm where it comes from.
metrics = pd.read_csv('metrics.csv')

In [2]:
metrics[metrics['metric'] == 'precision@k']

Unnamed: 0,mode,metric,als_baseline,basic_similar_items,brand_similar_items,basic_similar_users,brand_similar_users
0,default,precision@k,0.167091,0.11048,0.101567,0.116748,0.126543
2,default (bm25),precision@k,0.201469,0.135945,0.131342,0.08668,0.099804
4,mixed top5k,precision@k,0.207444,0.121156,0.085015,0.118805,0.11381
6,mixed top5k (bm25),precision@k,0.222625,0.097551,0.097356,0.090597,0.09569


In [3]:
metrics[metrics['metric'] == 'map@k']

Unnamed: 0,mode,metric,als_baseline,basic_similar_items,brand_similar_items,basic_similar_users,brand_similar_users
1,default,map@k,0.290147,0.185881,0.171422,0.280848,0.289
3,default (bm25),map@k,0.357718,0.29487,0.227105,0.199244,0.224113
5,mixed top5k,map@k,0.345969,0.188039,0.185877,0.288515,0.285108
7,mixed top5k (bm25),map@k,0.426566,0.183015,0.189551,0.200237,0.218181


In [422]:
#