In [2]:
import json
import swifter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender

# Метрики
from implicit.evaluation import train_test_split
from implicit.evaluation import mean_average_precision_at_k, AUC_at_k, ndcg_at_k

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k, money_precision_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [472]:
def prefilter_items(data, take_n_popular=3000, item_features=None, prices=None, fake_item_id=None):
    """Reduce a transactions frame to the best-selling items.

    Steps, in order:
      1. drop items bought by fewer than 0.1% of the unique users in ``data``;
      2. drop items whose average price is below 1 dollar;
      3. keep the ``take_n_popular`` items with the highest total
         ``sales_value`` and relabel every other remaining item with a
         single pseudo item id.

    Parameters
    ----------
    data : pandas.DataFrame
        Transactions with ``user_id``, ``item_id`` and ``sales_value`` columns.
    take_n_popular : int
        Number of top-revenue items that keep their own id.
    item_features : unused, kept for interface compatibility.
    prices : pandas.DataFrame, optional
        Frame indexed by item id with an ``item_price`` column. Defaults to
        the notebook-level ``item_prices``.
    fake_item_id : optional
        Id given to all items outside the top. Defaults to the notebook-level
        ``pseudo_item_id``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.
    """
    if prices is None:
        prices = item_prices
    if fake_item_id is None:
        fake_item_id = pseudo_item_id

    # Share of unique users that bought each item. Computed from the frame that
    # was passed in (previously the global data_train was used regardless of
    # the argument).
    popularity = data.groupby('item_id')['user_id'].nunique().reset_index()
    popularity['share_unique_users'] = popularity['user_id'] / data['user_id'].nunique()

    # Drop the least popular items (nobody is going to buy them anyway)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.001].item_id.tolist()
    print(data.shape[0])
    data = data[~data['item_id'].isin(top_notpopular)]
    print(data.shape[0])

    # Drop items that are too cheap (less than 1 dollar)
    items_low_price_lst = prices.loc[prices['item_price'] < 1].index.to_list()
    # Explicit copy: the assignment below must not go through a slice of the
    # caller's frame (chained-assignment / SettingWithCopy hazard).
    data = data[~data['item_id'].isin(items_low_price_lst)].copy()

    # Top items by total revenue
    revenue = data.groupby('item_id')['sales_value'].sum().reset_index()
    top = revenue.sort_values('sales_value', ascending=False).head(take_n_popular).item_id.tolist()

    # Every purchase of an item that is NOT in the top is relabelled as a
    # purchase of the pseudo item.
    data.loc[~data['item_id'].isin(top), 'item_id'] = fake_item_id

    return data

In [473]:
def get_recommendations(user, model, N=30, user_items=None):
    """Return up to ``N`` recommended item ids for ``user``.

    Parameters
    ----------
    user : raw user id (as found in the transactions).
    model : fitted recommender exposing ``recommend(...)`` whose results are
        sequences with the internal item index in position 0.
    N : int
        Number of recommendations requested.
    user_items : sparse user-item matrix, optional
        Defaults to the notebook-level ``sparse_user_item``. Previously the
        matrix was rebuilt from the dense ``user_item_matrix`` frame on every
        single call, which is pure repeated work when this function is applied
        to each user.

    Returns
    -------
    list
        Item ids. Users that are absent from ``userid_to_id`` get the first
        ``N`` entries of ``overall_top_purchases`` instead.
    """
    # Unknown user: fall back to the global best sellers
    if user not in userid_to_id:
        return overall_top_purchases[:N]

    if user_items is None:
        user_items = sparse_user_item

    recs = model.recommend(userid=userid_to_id[user],
                           user_items=user_items,   # user-item matrix
                           N=N,
                           filter_already_liked_items=False,
                           filter_items=[itemid_to_id[pseudo_item_id]],  # never recommend the pseudo item
                           recalculate_user=False)
    # Map internal matrix indices back to real item ids
    return [id_to_itemid[rec[0]] for rec in recs]

In [474]:
def get_recommend_lst_func(u):
    """Recommend items for user ``u`` with the notebook-level ``model``.

    Single-argument adapter around ``get_recommendations`` so that it can be
    handed directly to ``Series.apply``.
    """
    return get_recommendations(u, model=model)

In [475]:
def get_own_recommend_lst_func(u):
    """Recommend items for user ``u`` with the notebook-level ``model_own``.

    NOTE(review): ``model_own`` is not created in the visible cells of this
    notebook; it has to exist in the kernel before this function is called.
    """
    return get_recommendations(u, model=model_own)

In [476]:
data = pd.read_csv('./raw_data/retail_train.csv')

# Lower-case the headers and switch to the user_id / item_id naming used below.
data.columns = data.columns.str.lower()
data = data.rename(columns={'household_key': 'user_id', 'product_id': 'item_id'})

# Hold-out horizon, counted back from the last week present in the data.
test_size_weeks = 3

# Train: weeks after 85 and strictly before the hold-out boundary.
# Variant without the lower bound:
# data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_train = data.loc[
    (data['week_no'] > 85)
    & (data['week_no'] < data['week_no'].max() - test_size_weeks)
]
# Test: everything from the boundary week onwards.
data_test = data.loc[data['week_no'] >= data['week_no'].max() - test_size_weeks]

# Warm start (disabled): keep only test users that also occur in train.
# train_users = data_train['user_id'].unique()
# data_test = data_test[data_test['user_id'].isin(train_users)]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


In [477]:
# Sanity check of the train window: first week, last week, number of rows.
print('Min неделя в трейне', data_train.week_no.min())
print('Max неделя в трейне', data_train.week_no.max())
print('Кол-во записей в трейне', len(data_train))

Min неделя в трейне 86
Max неделя в трейне 91
Кол-во записей в трейне 169711


In [478]:
#словарь с ценами для подсчета money_precision
# item_prices = data.groupby('item_id', as_index=False).agg({'sales_value': 'sum', 'quantity':sum})
# item_prices['item_price'] = item_prices.sales_value / item_prices.quantity
# item_prices.drop(['sales_value', 'quantity'], axis=1, inplace=True)
# item_prices = item_prices.set_index('item_id')
# item_prices_dict = item_prices.to_dict()['item_price']

In [479]:
# Average unit price per item (total sales_value / total quantity) over the
# train and test rows; used later for money_precision.
# NOTE(review): an item whose summed quantity is 0 would get an inf/NaN price
# here - confirm whether such items exist in the data.
new_data = pd.concat([data_train, data_test])
item_prices = (
    new_data.groupby('item_id')[['sales_value', 'quantity']]
    .sum()
    .assign(item_price=lambda totals: totals['sales_value'] / totals['quantity'])
    [['item_price']]
)
item_prices_dict = item_prices['item_price'].to_dict()

In [480]:
def add_price_column(result, item_prices_dict, column_name):
    """Attach a ``<column_name>_price`` column holding per-item price lists.

    Every cell of ``result[column_name]`` is an iterable of item ids; the new
    column stores, for each row, the list of ``item_prices_dict`` values for
    those ids in the same order. ``result`` is modified in place and also
    returned. An id missing from ``item_prices_dict`` raises ``KeyError``.
    """
    def _prices_for(items):
        return [item_prices_dict[item] for item in items]

    price_column = column_name + '_price'
    result[price_column] = result[column_name].apply(_prices_for)
    return result

In [481]:
# Ground truth: one row per test user with the unique items they bought.
result = (
    data_test.groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
# Add the matching list of prices for the purchased items.
result = add_price_column(result, item_prices_dict, 'actual')
result.head(2)

Unnamed: 0,user_id,actual,actual_price
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[0.6791304347826085, 0.5213412816691507, 2.734..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1.9899999999999998, 0.3262365591397851, 0.968..."


In [482]:
# Item prefiltering. Items outside the kept top are collapsed into one
# pseudo item id by prefilter_items.
# NOTE: data_train is overwritten here, so re-running this cell filters the
# already filtered frame.
pseudo_item_id = 999999
print('Товаров в трейне до фильтрации', data_train['item_id'].nunique())
data_train = prefilter_items(data_train)
print('Товаров в трейне после фильтрации', data_train['item_id'].nunique())

Товаров в трейне до фильтрации 27649
169711
147266
Товаров в трейне после фильтрации 3001


In [483]:
# Global best sellers: item ids ordered by the number of training rows,
# with the pseudo item removed. Used as the fallback for unknown users.
top_col = 'quantity'
overall_top_purchases = (
    data_train.groupby('item_id')[top_col]
    .count()
    .reset_index()
    .sort_values(top_col, ascending=False)
    .loc[lambda counts: counts['item_id'] != pseudo_item_id, 'item_id']
    .tolist()
)

In [484]:
%%time

# Interaction matrix: rows are users, columns are items, each cell is the
# number of training rows for that pair (other values/aggfuncs can be tried).
user_item_matrix = data_train.pivot_table(
    index='user_id',
    columns='item_id',
    values='quantity',
    aggfunc='count',
    fill_value=0,
).astype(float)  # float is the matrix dtype implicit requires

# Same data in sparse (CSR) format
sparse_user_item = csr_matrix(user_item_matrix)

user_item_matrix.head(3)

Wall time: 1.14 s


item_id,819255,819304,819308,819518,819765,819840,819845,819927,819978,820122,...,17178955,17179084,17179627,17179662,17179907,17208470,17238070,17240213,17249614,17284423
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [485]:
%%time
# Raw ids in matrix order (rows = users, columns = items)
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

# Positional indices used inside the matrix
matrix_userids = np.arange(userids.size)
matrix_itemids = np.arange(itemids.size)

# Lookups in both directions: matrix position <-> raw id
id_to_itemid = {pos: item for pos, item in zip(matrix_itemids, itemids)}
id_to_userid = {pos: user for pos, user in zip(matrix_userids, userids)}

itemid_to_id = {item: pos for item, pos in zip(itemids, matrix_itemids)}
userid_to_id = {user: pos for user, pos in zip(userids, matrix_userids)}

Wall time: 4 ms


### 2. TF-IDF взвешивание

In [303]:
%%time
# TF-IDF weighting of the interaction counts. The frame is transposed before
# weighting and the result is transposed back.
user_item_matrix_tfidf = tfidf_weight(user_item_matrix.T).T  # applied to the item-user matrix!

Wall time: 68.8 ms


In [286]:
%%time

# ALS model trained on the TF-IDF weighted matrix.
# NOTE(review): the name `model` is assigned again by the unweighted ALS cell
# further down, and get_recommend_lst_func reads whichever was run last.
model = AlternatingLeastSquares(factors=10, 
                                regularization=0.001,
                                iterations=15,
                                calculate_training_loss=True,
#                                 filter_items=pseudo_item_id,
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix_tfidf).T.tocsr(),  # input: item-user matrix
          show_progress=True)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Wall time: 3.37 s


In [212]:
%%time
# Scores noted by the author for different prefilter sizes (top-N by count):
# TOP-500 by count 0.212
# TOP-1000 by count 0.205
# TOP-1500 by count 0.197
# TOP-3000 by count 0.19
# TOP-5000 by count 0.187
# Recommendations for every test user from the current `model`
result['als_tfidf'] = result['user_id'].swifter.apply(get_recommend_lst_func)

# Mean precision_at_k over all test users
result.apply(lambda row: precision_at_k(row['als_tfidf'], row['actual']), axis=1).mean()

HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=16.0, style=ProgressStyle(description_wi…


Wall time: 1min 50s


0.19098922624877324

In [213]:
# add the list of prices of the recommended items for every user
result = add_price_column(result, item_prices_dict, 'als_tfidf')
# and compute the mean money_precision_at_k over all test users
result.apply(lambda row: money_precision_at_k(row['als_tfidf'], 
                                              row['actual'], 
                                              row['als_tfidf_price'], 
                                              row['actual_price']), axis=1).mean()

0.18136473564209685

### Без взвешивания

In [486]:
%%time

# ALS model trained on the raw (unweighted) interaction counts.
# NOTE(review): this rebinds `model`, replacing the TF-IDF model from the
# previous section.
model = AlternatingLeastSquares(factors=10, 
                                regularization=0.001,
                                iterations=15,
                                calculate_training_loss=True,
#                                 filter_items=pseudo_item_id,
                                num_threads=4)

model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # input: item-user matrix
          show_progress=True)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Wall time: 2.82 s


In [487]:
%%time
# Recommendations for every test user from the current `model`
result['als'] = result['user_id'].swifter.apply(get_recommend_lst_func)

# Mean precision_at_k over all test users
result.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean()

HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=16.0, style=ProgressStyle(description_wi…


Wall time: 1min 41s


0.25181194906953747

In [488]:
# Scores noted by the author for different training windows:
# TOP-3000 by revenue, starting from week 80: 0.215
# TOP-3000 by revenue, starting from week 85: 0.233
# add the list of prices of the recommended items for every user
result = add_price_column(result, item_prices_dict, 'als')
# and compute the mean money_precision_at_k over all test users
result.apply(lambda row: money_precision_at_k(row['als'], 
                                              row['actual'], 
                                              row['als_price'], 
                                              row['actual_price']), axis=1).mean()

0.2310115012479895

In [406]:
# Prices of the recommended items for one row of `result` (index label 0)
result['als_price'][0]

[1.3054055525313024,
 1.0118170307059164,
 2.547097345132744,
 1.035513307984792,
 2.968911917098447,
 3.2458596973865377,
 2.6802876106194753,
 1.2408316430020299,
 1.768274044795792,
 4.019344262295083,
 2.4376064610866397,
 2.0080118694362015,
 1.3280831826401507,
 4.486156156156157,
 4.043352517985609,
 1.4808823529411728,
 2.818333333333338,
 1.6891862567811975,
 1.3421184919210079,
 2.863959999999994,
 2.0231881188118708,
 1.0181767515923565,
 3.731694630872486,
 4.112590909090915,
 2.6093703703703706,
 1.2904697986577212,
 1.7559322033898355,
 4.2376056338028185,
 2.0516279069767442,
 1.4260775862068988]

### Фильтруем по категориям:

In [489]:
# Product catalogue with the category columns
item_features = pd.read_csv('./raw_data/product.csv')
# Lower-case headers; product_id becomes item_id to match the transactions
item_features.columns = item_features.columns.str.lower()
item_features = item_features.rename(columns={'product_id': 'item_id'})
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [490]:
# Give every sub-commodity a numeric code (its position in unique()) and
# build an item id -> category code lookup.
# NOTE: set_index makes this cell fail if it is run a second time without
# reloading item_features.
sub_commodity_desc_dict = {
    category: code
    for code, category in enumerate(item_features.sub_commodity_desc.unique())
}
item_features['sub_commodity_desc_num'] = item_features.sub_commodity_desc.map(sub_commodity_desc_dict)
item_features = item_features.set_index('item_id')
item_cats_dict = item_features['sub_commodity_desc_num'].to_dict()

In [491]:
def postfilter_cats_items(items_lst, cats_dict=None):
    """Keep only the first item of every category, preserving order.

    Parameters
    ----------
    items_lst : iterable of item ids, best recommendation first.
    cats_dict : mapping item id -> category, optional
        Defaults to the notebook-level ``item_cats_dict``. Category values
        must be hashable.

    Returns
    -------
    list
        Items of ``items_lst`` whose category has not appeared earlier in the
        list. An item id missing from the mapping raises ``KeyError``.
    """
    if cats_dict is None:
        cats_dict = item_cats_dict

    seen_cats = set()  # set membership instead of scanning a list each time
    result_items = []
    for item in items_lst:
        cat = cats_dict[item]
        if cat not in seen_cats:
            seen_cats.add(cat)
            result_items.append(item)
    return result_items

In [492]:
# One item per category for every user, plus the length of the filtered list
result['als_catsfiltered'] = result['als'].apply(postfilter_cats_items)
result['als_catsfiltered_count'] = result['als_catsfiltered'].apply(len)
result.head(2)

Unnamed: 0,user_id,actual,actual_price,als,als_price,als_catsfiltered,als_catsfiltered_count
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[0.6791304347826085, 0.5213412816691507, 2.734...","[1082185, 995242, 1029743, 1126899, 840361, 96...","[1.0118170307059164, 1.3054055525313024, 2.474...","[1082185, 995242, 840361, 961554, 866211, 9797...",27
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1.9899999999999998, 0.3262365591397851, 0.968...","[1082185, 1029743, 995242, 1106523, 981760, 84...","[1.0118170307059164, 2.47451335877862, 1.30540...","[1082185, 1029743, 981760, 840361, 962568, 914...",19


In [493]:
# Check that the category filter left at least 5 recommendations per user
print('Кол-во юзеров с меньше 5 рекомендациями:', (result['als_catsfiltered_count'] < 5).sum())
print('Всего юзеров', len(result))

Кол-во юзеров с меньше 5 рекомендациями: 0
Всего юзеров 2042


In [494]:
# add the list of prices of the category-filtered recommendations for every user
result = add_price_column(result, item_prices_dict, 'als_catsfiltered')
# compute the mean money_precision_at_k over all test users
result.apply(lambda row: money_precision_at_k(row['als_catsfiltered'], 
                                              row['actual'], 
                                              row['als_catsfiltered_price'], 
                                              row['actual_price']), axis=1).mean()

0.2038749334242597

In [513]:
# Preview of the results frame after the category filter and its price column
result.head(2)

Unnamed: 0,user_id,actual,actual_price,als,als_price,als_catsfiltered,als_catsfiltered_count,als_catsfiltered_price
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[0.6791304347826085, 0.5213412816691507, 2.734...","[1082185, 995242, 1029743, 1126899, 840361, 96...","[1.0118170307059164, 1.3054055525313024, 2.474...","[1082185, 995242, 840361, 961554, 866211, 9797...",27,"[1.0118170307059164, 1.3054055525313024, 1.035..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1.9899999999999998, 0.3262365591397851, 0.968...","[1082185, 1029743, 995242, 1106523, 981760, 84...","[1.0118170307059164, 2.47451335877862, 1.30540...","[1082185, 1029743, 981760, 840361, 962568, 914...",19,"[1.0118170307059164, 2.47451335877862, 1.09134..."


### Добавляем дорогой товар

In [520]:
def set_cats(item_lst):
    """Return the category of every item in ``item_lst``, in the same order.

    Categories are looked up in the notebook-level ``item_cats_dict``; an
    unknown item id raises ``KeyError``.
    """
    return [item_cats_dict[item] for item in item_lst]

In [521]:
# Categories of the category-filtered recommendations, aligned item by item
result['als_cats'] = result['als_catsfiltered'].apply(set_cats)

In [507]:
# Number of users whose top 5 already contains an item priced above 7 dollars,
# next to the total number of users
(result['als_catsfiltered_price'].apply(lambda prices: int(any(p > 7 for p in prices[:5]))).sum(),
 len(result))

(11, 2042)

In [512]:
# Expensive candidates taken from the training rows: everything that is
# neither priced below 7 nor the pseudo item, largest row quantity first.
# The list is built per transaction row, so an item id can occur repeatedly.
items_under7_price_lst = item_prices.loc[item_prices['item_price'] < 7].index.to_list() + [pseudo_item_id]
hi_price_df = (
    data_train.loc[~data_train['item_id'].isin(items_under7_price_lst)]
    .sort_values(by=['quantity'], ascending=False)
)
hi_price_lst = hi_price_df['item_id'].to_list()
len(hi_price_lst)

3659

In [522]:
# Flags (0/1): is there an item priced above 7 in the top 5 / anywhere in the list
result['als_seven_in_k'] = result['als_catsfiltered_price'].apply(
    lambda prices: int(any(p > 7 for p in prices[:5]))
)
result['als_seven_in_rec'] = result['als_catsfiltered_price'].apply(
    lambda prices: int(any(p > 7 for p in prices))
)
result.head(2)

Unnamed: 0,user_id,actual,actual_price,als,als_price,als_catsfiltered,als_catsfiltered_count,als_catsfiltered_price,als_seven_in_k,als_seven_in_rec,als_cats
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[0.6791304347826085, 0.5213412816691507, 2.734...","[1082185, 995242, 1029743, 1126899, 840361, 96...","[1.0118170307059164, 1.3054055525313024, 2.474...","[1082185, 995242, 840361, 961554, 866211, 9797...",27,"[1.0118170307059164, 1.3054055525313024, 1.035...",0,0,"[713, 48, 260, 330, 318, 509, 96, 59, 971, 772..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1.9899999999999998, 0.3262365591397851, 0.968...","[1082185, 1029743, 995242, 1106523, 981760, 84...","[1.0118170307059164, 2.47451335877862, 1.30540...","[1082185, 1029743, 981760, 840361, 962568, 914...",19,"[1.0118170307059164, 2.47451335877862, 1.09134...",0,0,"[713, 48, 59, 260, 757, 232, 741, 64, 18, 47, ..."


In [562]:
def add_seven(row, k=5, min_price=7, hi_price_items=None, cats_dict=None):
    """Make sure the top ``k`` recommendations contain an expensive item.

    Parameters
    ----------
    row : mapping / DataFrame row with the keys
        ``als_catsfiltered`` (item ids), ``als_catsfiltered_price`` (prices
        aligned with the ids) and ``als_cats`` (categories aligned with the
        ids).
    k : int, >= 1
        Size of the final recommendation list; the expensive item is placed
        in slot ``k - 1`` (this used to be hard-coded to index 4).
    min_price : number
        An item counts as expensive when its price is strictly above this.
    hi_price_items : iterable of item ids, optional
        Candidates to inject, best first. Defaults to the notebook-level
        ``hi_price_lst``.
    cats_dict : mapping item id -> category, optional
        Defaults to the notebook-level ``item_cats_dict``.

    Returns
    -------
    list
        A new list (the row is never modified). A list is always returned,
        also when no suitable candidate exists - in that case it is an
        unchanged copy (previously ``None`` was returned implicitly).

    The presence of an expensive item is derived from the prices themselves;
    the ``als_seven_in_k`` / ``als_seven_in_rec`` columns are not consulted
    because they are computed for a fixed top 5.
    """
    recs = list(row['als_catsfiltered'])
    prices = list(row['als_catsfiltered_price'])

    # 1. Already an expensive item in the top k: nothing to do
    if any(p > min_price for p in prices[:k]):
        return recs

    slot = k - 1

    # 2. An expensive item further down the list: swap it into the last top-k
    #    slot. Swapping (rather than overwriting) keeps every item exactly once.
    for pos in range(k, min(len(recs), len(prices))):
        if prices[pos] > min_price:
            recs[slot], recs[pos] = recs[pos], recs[slot]
            return recs

    # 3. No expensive item at all: inject the first candidate whose category
    #    is not among the categories of the current top k.
    if hi_price_items is None:
        hi_price_items = hi_price_lst
    if cats_dict is None:
        cats_dict = item_cats_dict

    top_cats = set(row['als_cats'][:k])
    for candidate in hi_price_items:
        if cats_dict[candidate] not in top_cats:
            if len(recs) >= k:
                recs[slot] = candidate
            else:
                # fewer than k recommendations: extend instead of indexing
                recs.append(candidate)
            return recs

    return recs

In [563]:
# Recommendation lists with an expensive item forced into the top
result['als_catsfiltered_added7'] = result.apply(add_seven, axis=1)

In [564]:
# Preview of the results frame after adding the expensive item
result.head(2)

Unnamed: 0,user_id,actual,actual_price,als,als_price,als_catsfiltered,als_catsfiltered_count,als_catsfiltered_price,als_seven_in_k,als_seven_in_rec,als_cats,als_catsfiltered_added7,als_catsfiltered_added7_count,als_catsfiltered_added7_price
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[0.6791304347826085, 0.5213412816691507, 2.734...","[1082185, 995242, 1029743, 1126899, 840361, 96...","[1.0118170307059164, 1.3054055525313024, 2.474...","[1082185, 995242, 840361, 961554, 866211, 9797...",27,"[1.0118170307059164, 1.3054055525313024, 1.035...",0,0,"[713, 48, 260, 330, 318, 509, 96, 59, 971, 772...","[1082185, 995242, 840361, 961554, 13416117, 97...",27,"[7.085238095238096, 1.3054055525313024, 1.0355..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1.9899999999999998, 0.3262365591397851, 0.968...","[1082185, 1029743, 995242, 1106523, 981760, 84...","[1.0118170307059164, 2.47451335877862, 1.30540...","[1082185, 1029743, 981760, 840361, 962568, 914...",19,"[1.0118170307059164, 2.47451335877862, 1.09134...",0,0,"[713, 48, 59, 260, 757, 232, 741, 64, 18, 47, ...","[1082185, 1029743, 981760, 840361, 13416117, 9...",19,"[7.085238095238096, 2.47451335877862, 1.091340..."


In [565]:
# Number of users whose top 5 now contains an item priced above 7 dollars,
# next to the total number of users.
# The column holds item ids, so the prices are looked up in item_prices_dict;
# comparing the ids themselves with 7 is true for every id and always
# reported all users.
(result['als_catsfiltered_added7'].apply(
     lambda items: int(any(item_prices_dict[item] > 7 for item in items[:5]))).sum(),
 result.shape[0])

(2042, 2042)

In [566]:
# Length of every final recommendation list
result['als_catsfiltered_added7_count'] = result['als_catsfiltered_added7'].apply(len)

In [567]:
# Check that every user still has at least 5 recommendations
print('Кол-во юзеров с меньше 5 рекомендациями:', (result['als_catsfiltered_added7_count'] < 5).sum())
print('Всего юзеров', len(result))

Кол-во юзеров с меньше 5 рекомендациями: 0
Всего юзеров 2042


In [568]:
# add the list of prices of the final recommendations for every user
result = add_price_column(result, item_prices_dict, 'als_catsfiltered_added7')
# compute the mean money_precision_at_k over all test users
result.apply(lambda row: money_precision_at_k(row['als_catsfiltered_added7'], 
                                              row['actual'], 
                                              row['als_catsfiltered_added7_price'], 
                                              row['actual_price']), axis=1).mean()

0.1145584389251117

In [569]:
# Final answer: the first 5 items of every user's list
result['final_rec'] = result['als_catsfiltered_added7'].apply(lambda recs: recs[:5])

In [570]:
# Save the final recommendations. The bare name `index` was undefined
# (NameError) and, being the second positional argument, would have been taken
# as the separator; index=False writes the file without the row index.
result[['user_id', 'final_rec']].to_csv('project_final_rec.csv', index=False)