<a href="https://colab.research.google.com/github/igorarkon/Rek_sis/blob/main/course_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Course project


## **Основное**
- Дедлайн - 4 апреля 23:59
- Целевая метрика precision@5
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. В решении должны быть отчетливо видна метрика на новом тестовом сете из файла retail_test1.csv, то есть вам нужно для всех юзеров из этого файла выдать выши рекомендации, и посчитать на actual покупках precision@5. 

**!! Мы не рассматриваем холодный старт для пользователя, все наши пользователя одинаковы во всех сетах, поэтому нужно позаботиться об их исключении из теста.**


**Hints:** 

Сначала просто попробуйте разные параметры MainRecommender:  
- N в топ-N товарах при формировании user-item матирцы (сейчас топ-5000)  
- Различные веса в user-item матрице (0/1, кол-во покупок, log(кол-во покупок + 1), сумма покупки, ...)  
- Разные взвешивания матрицы (TF-IDF, BM25 - у него есть параметры)  
- Разные смешивания рекомендаций (обратите внимание на бейзлайн - прошлые покупки юзера)  

Сделайте MVP - минимально рабочий продукт - (пусть даже top-popular), а потом его улучшайте

Если вы делаете двухуровневую модель - следите за валидацией 

In [None]:
!pip install implicit==0.4.4



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import libs

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции

#from metrics import precision_at_k, recall_at_k
#from utils import prefilter_items
#from recommenders import MainRecommender

In [None]:
import sys    
path_to_module = "../content/drive/MyDrive/Colab Notebooks/RegSis"
sys.path.append(path_to_module)
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

## Read data

In [None]:
#PATH_DATA = "../../data"
PATH_DATA = "../content/drive/MyDrive/Colab Notebooks/RegSis/data"

In [None]:
data = pd.read_csv(os.path.join(PATH_DATA,'retail_train.csv'))
item_features = pd.read_csv(os.path.join(PATH_DATA,'product.csv'))
user_features = pd.read_csv(os.path.join(PATH_DATA,'hh_demographic.csv'))

# Set global const

In [None]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

# Process features dataset

In [None]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [None]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)


VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [None]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [None]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [None]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [None]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 12) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 12) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 12) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [None]:
# выше видим разброс по пользователям и товарам и дальше мы перейдем к warm-start (только известные пользователи)

In [None]:
data_val_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


# Prefilter items

In [None]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=4000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 83685 to 4001


# Make cold-start to warm-start

In [None]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (861404, 13) Users: 2495 Items: 4001
val_matcher
Shape: (169615, 12) Users: 2151 Items: 27644
train_ranker
Shape: (169615, 12) Users: 2151 Items: 27644
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


# Init/train recommender

In [None]:
recommender = MainRecommender(data_train_matcher)

GPU training requires factor size to be a multiple of 32. Increasing factors from 20 to 32.


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

### Варианты, как получить кандидатов

Можно потом все эти варианты соединить в один

(!) Если модель рекомендует < N товаров, то рекомендации дополняются топ-популярными товарами до N

In [None]:
# Берем тестового юзера 2375

In [None]:
recommender.get_als_recommendations(2375, N=5)

[1039464, 1063739, 965530, 1085637, 960075]

In [None]:
recommender.get_own_recommendations(2375, N=5)

[918046, 9802981, 907099, 847962, 873980]

In [None]:
recommender.get_similar_items_recommendation(2375, N=5)

[9483753, 1083111, 956399, 1112592, 9575582]

In [None]:
recommender.get_similar_users_recommendation(2375, N=5)

[1057168, 1124971, 901976, 894360, 1097635]

# Eval recall of matching

### Измеряем recall@k

Это будет в ДЗ: 

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [None]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [None]:
%%time
# для понятности расписано все в строчку, без функций, ваша задача уметь оборачивать все это в функции
result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

CPU times: user 1min, sys: 35 s, total: 1min 35s
Wall time: 1min


In [None]:
%%time
# result_eval_matcher['sim_user_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_users_recommendation(x, N=50))

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 7.39 µs


### Пример оборачивания

In [None]:
# # сырой и простой пример как можно обернуть в функцию
def evalRecall(df_result, target_col_name, recommend_model):
    result_col_name = 'result'
    df_result[result_col_name] = df_result[target_col_name].apply(lambda x: recommend_model(x, N=25))
    return df_result.apply(lambda row: recall_at_k(row[result_col_name], row[ACTUAL_COL], k=N_PREDICT), axis=1).mean()

In [None]:
# evalRecall(result_eval_matcher, USER_COL, recommender.get_own_recommendations)

In [None]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [None]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

### Recall@50 of matching

In [None]:
TOPK_RECALL = 50

In [None]:
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('own_rec', 0.06761423318230154),
 ('sim_item_rec', 0.012452509369936323),
 ('als_rec', 0.011823575635104588)]

### Precision@5 of matching

In [None]:
TOPK_PRECISION = 5

In [None]:
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('own_rec', 0.18512319851231782),
 ('als_rec', 0.02166434216643427),
 ('sim_item_rec', 0.005485820548582051)]

# Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_ranking
- Обучаем *только* на выбранных кандидатах
- Я *для примера* сгенерирую топ-50 кадидиатов через get_own_recommendations
- (!) Если юзер купил < 50 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

In [None]:
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

## Подготовка данных для трейна

In [None]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [None]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [None]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 879194, 1097350, 944588, 1092937, 91..."
1,2021,"[950935, 1119454, 835578, 863762, 1019142, 102..."


In [None]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [None]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [None]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2070,1105426
0,2070,879194
0,2070,1097350
0,2070,944588


### Check warm start

In [None]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (107550, 2) Users: 2151 Items: 3794


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [None]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1


In [None]:
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

In [None]:
df_ranker_train.target.value_counts()

0.0    98480
1.0     8205
Name: target, dtype: int64

In [None]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,879194,0.0


(!) На каждого юзера 50 item_id-кандидатов

In [None]:
df_ranker_train['target'].mean()

0.0769086563246942

![hard_choice.png](attachment:hard_choice.png)

1) Pointwise
2) Pairwise
3) ListWise

Слайд из [презентации](https://github.com/aprotopopov/retailhero_recommender/blob/master/slides/retailhero_recommender.pdf) решения 2-ого места X5 Retail Hero

- Пока для простоты обучения выберем LightGBM c loss = binary. Это классическая бинарная классификация
- Это пример *без* генерации фич

## Подготавливаем фичи для обучения модели

### Описательные фичи

In [None]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [None]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [None]:
df_ranker_train = df_ranker_train.merge(item_features, on=ITEM_COL, how='left')
df_ranker_train = df_ranker_train.merge(user_features, on=USER_COL, how='left')
df_ranker_train = df_ranker_train[[USER_COL, ITEM_COL, "target"]]

#df_ranker_train.head(2)

**Фичи user_id:**
    - Средний чек
    - Средняя сумма покупки 1 товара в каждой категории
    - Кол-во покупок в каждой категории
    - Частотность покупок раз/месяц
    - Долю покупок в выходные
    - Долю покупок утром/днем/вечером

**Фичи item_id**:
    - Кол-во покупок в неделю
    - Среднее ол-во покупок 1 товара в категории в неделю
    - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю)
    - Цена (Можно посчитать из retil_train.csv)
    - Цена / Средняя цена товара в категории
    
**Фичи пары user_id - item_id**
    - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
    - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
    - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

### Поведенческие фичи

##### Чтобы считать поведенческие фичи, нужно учесть все данные что были до data_val_ranker

In [None]:
df_join_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


## !!! Пока выполните нотбук без этих строк, потом вернитесь и запустите их, обучите ранкер и посмотрите на метрики с ранжированием

In [None]:
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


In [None]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,total_item_sales_value,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket
0,2070,1105426,0.0,442.9,113,99,1996,5754.86,1.241758,1218.32967,0.000461,0.452137,0.000404,0.00814
1,2070,879194,0.0,390.81,54,46,1996,5754.86,0.593407,1218.32967,0.00022,0.452137,0.000188,0.00814
2,2070,1097350,0.0,619.46,54,51,1996,5754.86,0.593407,1218.32967,0.00022,0.452137,0.000208,0.00814
3,2070,944588,0.0,263.01,75,67,1996,5754.86,0.824176,1218.32967,0.000306,0.452137,0.000273,0.00814
4,2070,1092937,1.0,1386.25,577,445,1996,5754.86,6.340659,1218.32967,0.002353,0.452137,0.001815,0.00814


In [None]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [None]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

## Обучение модели ранжирования

In [None]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=10,
                     n_estimators=100,
                     learning_rate=0.1,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(categorical_column=['total_item_sales_value',
                                   'total_quantity_value', 'item_freq',
                                   'user_freq', 'total_user_sales_value',
                                   'item_quantity_per_week',
                                   'user_quantity_per_week',
                                   'item_quantity_per_basket',
                                   'user_quantity_per_baskter',
                                   'item_freq_per_basket',
                                   'user_freq_per_basket'],
               max_depth=10, objective='binary')

In [None]:
train_preds = lgb.predict_proba(X_train)

In [None]:
df_ranker_predict = df_ranker_train.copy()

In [None]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

## Подведем итоги

    Мы обучили модель ранжирования на покупках из сета data_train_ranker и на кандитатах от own_recommendations, что является тренировочным сетом, и теперь наша задача предсказать и оценить именно на тестовом сете.

# Evaluation on test dataset

In [None]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


## Eval matching on test dataset

In [None]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

CPU times: user 11.4 s, sys: 135 ms, total: 11.5 s
Wall time: 11.4 s


In [None]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.14941176470588105)]

## Eval re-ranked matched result on test dataset
    Вспомним df_match_candidates сет, который был получен own_recommendations на юзерах, набор пользователей мы фиксировали и он одинаков, значи и прогноз одинаков, поэтому мы можем использовать этот датафрейм для переранжирования.
    

In [None]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [None]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

## Проверьте данные метрики с фичами и без (PS: должен быть прирост)

In [None]:
# смотрим на метрики выше и сравниваем что с ранжированием и без, добавляем фичи и то же смотрим
# в первом приближении метрики должны расти с использованием второго этапа

print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

  return flags.sum() / len(recommended_list)


('reranked_own_rec', 0.16449086161879753)
('own_rec', 0.14941176470588105)


**Проект**

In [None]:
!pip install catboost



In [None]:
from catboost import CatBoostClassifier
from catboost import Pool
from catboost import CatBoost

**добавим фичей**

In [None]:
df_ranker_train = []

In [None]:
data_item = data.copy()

data_item = data_item.merge(item_features[[ITEM_COL,'department']], on='item_id', how='left')


In [None]:
# Средний чек
add_user_features = pd.DataFrame(data_item.groupby('user_id')['sales_value'].median()).reset_index()
add_user_features.rename(columns={'sales_value': 'avg_sales_value'}, inplace=True)

In [None]:
# Средняя сумма покупки 1 товара в каждой категории
departments = data_item['department'].unique().tolist()
departments.remove(' ')
for department in departments:
    add_user_features[f'avg_buy_{department}'] = 0

user_department_price = pd.DataFrame(data_item.groupby(['user_id', 'department'])['sales_value'].median()).reset_index()
user_department_price = user_department_price[user_department_price['department'] != ' ']

for user_id, department, avg_buy in user_department_price.values:
    add_user_features.loc[add_user_features['user_id'] == user_id, 
                         f"avg_buy_{department}"] = avg_buy

add_user_features.head()

Unnamed: 0,user_id,avg_sales_value,frq_pur_month,avg_buy_PRODUCE,avg_buy_GROCERY,avg_buy_DRUG GM,avg_buy_MEAT,avg_buy_MEAT-PCKGD,avg_buy_DELI,avg_buy_SEAFOOD-PCKGD,...,avg_buy_CHARITABLE CONT,avg_buy_RX,avg_buy_TOYS,avg_buy_PHOTO,avg_buy_DELI/SNACK BAR,avg_buy_GRO BAKERY,avg_buy_PHARMACY SUPPLY,avg_buy_ELECT &PLUMBING,avg_buy_MEAT-WHSE,avg_buy_VIDEO
0,1,2.29,33.05,1.535,2.18,2.29,4.11,2.99,3.385,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
1,2,2.39,18.411765,1.76,2.0,2.99,5.31,2.99,3.35,8.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
2,3,2.0,15.9375,1.29,2.0,1.49,6.93,3.29,6.17,4.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
3,4,2.59,10.058824,1.69,2.5,3.995,4.16,3.995,7.175,4.69,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
4,5,2.58,7.941176,1.095,2.49,2.99,7.12,4.495,3.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0


In [None]:
df_join_train_matcher['month'] = df_join_train_matcher['day'].apply(lambda x: (x+30)//30)

user_life_cycle = df_join_train_matcher.groupby(['user_id']).agg({ 'month': ['min', 'max'] }).reset_index()

user_life_cycle['life_cycle_month'] = user_life_cycle['month','max'] - user_life_cycle['month','min']
user_life_cycle.drop([('month','min'), ('month','max')], axis=1, inplace=True)
user_life_cycle.set_index('user_id',inplace = True)

frq_pur_month = data_train_matcher.groupby(['user_id'])['basket_id'].count() / user_life_cycle['life_cycle_month']
frq_pur_month.name = 'frq_pur_month'
#frq_pur_month[:3]
add_user_features = add_user_features.merge(frq_pur_month, on=USER_COL, how='left')

In [None]:
df_ranker_train = df_ranker_train.merge(add_user_features, on=USER_COL, how='left')


In [None]:
# Среднее кол-во покупок в неделю
item_week_quantity = pd.DataFrame(data_item.groupby(['item_id', 'week_no'])['quantity'].sum()).reset_index()
add_item_features = pd.DataFrame(item_week_quantity.groupby('item_id')['quantity'].mean()).reset_index()
add_item_features.rename(columns={'quantity': 'my_quantity_per_week'}, inplace=True)
#Средняя по покупкам
avg_purch_week = df_join_train_matcher.groupby(['item_id'])['quantity'].sum() / df_join_train_matcher['week_no'].max() 
avg_purch_week.name = 'avg_purch_week'
add_item_features = add_item_features.merge(avg_purch_week, on='item_id', how='left')

# Цена товара
data_item['price'] = data_item['sales_value'] / data_item['quantity']

# Усредним полученные значения цены по каждому товару
item_prices = pd.DataFrame(data_item.groupby('item_id')['price'].mean()).reset_index()
add_item_features = add_item_features.merge(item_prices, on='item_id', how='left')

add_item_features.head()

Unnamed: 0,item_id,my_quantity_per_week,avg_purch_week,price
0,25671,2.0,0.065934,3.49
1,26081,1.0,0.010989,0.99
2,26093,1.0,0.010989,1.59
3,26190,1.0,0.010989,1.54
4,26355,2.0,0.021978,0.99


In [None]:
#Среднее кол-во покупок 1 товара в конкретной категории в неделю
merge_df = pd.merge(df_join_train_matcher, item_features, how='inner', on='item_id')

pusrch_depart = merge_df.groupby(['department'])['quantity'].sum()
pusrch_item_depart = merge_df.groupby(['department','item_id'])['quantity'].sum().reset_index()

purchases_dep = pd.merge(pusrch_item_depart, pusrch_depart, on='department')
purchases_dep['avg_sales_group_week'] = purchases_dep['quantity_x'] / purchases_dep['quantity_y'] / df_join_train_matcher['week_no'].max()
purchases_dep.set_index('item_id', inplace=True)
avg_sales_group_week = purchases_dep['avg_sales_group_week']
met = avg_purch_week / avg_sales_group_week
met.name = 'metrik1'
add_item_features = add_item_features.merge(avg_sales_group_week, on='item_id', how='left')
add_item_features = add_item_features.merge(met, on='item_id', how='left')

In [None]:
df_ranker_train = df_ranker_train.merge(add_item_features, on=ITEM_COL, how='left')
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('store_id').nunique().rename('n_unique_store_id'), how='left',on=ITEM_COL)

In [None]:
X_train = df_ranker_train
X_train

Unnamed: 0,user_id,item_id,target,total_item_sales_value,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,avg_sales_value,frq_pur_month,my_quantity_per_week,avg_purch_week,price,n_unique_store_id
0,2070,1105426,0.0,442.90,113,99,1996,5754.86,1.241758,1218.329670,0.000461,0.452137,0.000404,0.008140,1.99,32.888889,2.000000,1.241758,3.910300,29
1,2070,879194,0.0,390.81,54,46,1996,5754.86,0.593407,1218.329670,0.000220,0.452137,0.000188,0.008140,1.99,32.888889,1.588235,0.593407,7.266000,9
2,2070,1097350,0.0,619.46,54,51,1996,5754.86,0.593407,1218.329670,0.000220,0.452137,0.000208,0.008140,1.99,32.888889,1.350000,0.593407,11.460588,6
3,2070,944588,0.0,263.01,75,67,1996,5754.86,0.824176,1218.329670,0.000306,0.452137,0.000273,0.008140,1.99,32.888889,1.680000,0.824176,3.599726,37
4,2070,1092937,1.0,1386.25,577,445,1996,5754.86,6.340659,1218.329670,0.002353,0.452137,0.001815,0.008140,1.99,32.888889,6.848837,6.340659,2.565991,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106680,1745,870547,0.0,2260.57,1134,1114,897,2397.25,12.461538,13.208791,0.004625,0.004902,0.004543,0.003658,2.09,16.523810,12.726316,12.461538,1.995511,110
106681,1745,828490,0.0,159.73,80,64,897,2397.25,0.879121,13.208791,0.000326,0.004902,0.000261,0.003658,2.09,16.523810,1.860465,0.879121,2.019661,34
106682,1745,969977,0.0,451.67,198,130,897,2397.25,2.175824,13.208791,0.000807,0.004902,0.000530,0.003658,2.09,16.523810,3.766667,2.175824,2.283014,71
106683,1745,944249,0.0,494.15,172,162,897,2397.25,1.890110,13.208791,0.000701,0.004902,0.000661,0.003658,2.09,16.523810,2.407895,1.890110,2.873642,70


In [None]:
%%time

params = {
    'cat_features': cat_feats, 
    'silent': False,
    'random_state': 15,
    'iterations': 1500,
    'max_depth': 10,
    'l2_leaf_reg': 2,
    "task_type": "GPU",
    "eta": 0.1
}


model = CatBoostClassifier(**params)

# Обучение на train_test для получения метрик
model.fit(X_train, y_train) 

0:	learn: 0.3034584	total: 9.02ms	remaining: 13.5s
1:	learn: 0.1291809	total: 17.4ms	remaining: 13s
2:	learn: 0.0588574	total: 26.6ms	remaining: 13.3s
3:	learn: 0.0291116	total: 41.7ms	remaining: 15.6s
4:	learn: 0.0142747	total: 50.6ms	remaining: 15.1s
5:	learn: 0.0076273	total: 61.8ms	remaining: 15.4s
6:	learn: 0.0042743	total: 71.9ms	remaining: 15.3s
7:	learn: 0.0024776	total: 81.2ms	remaining: 15.1s
8:	learn: 0.0017307	total: 113ms	remaining: 18.8s
9:	learn: 0.0010918	total: 122ms	remaining: 18.2s
10:	learn: 0.0007190	total: 132ms	remaining: 17.8s
11:	learn: 0.0005272	total: 143ms	remaining: 17.8s
12:	learn: 0.0004210	total: 177ms	remaining: 20.2s
13:	learn: 0.0003500	total: 212ms	remaining: 22.5s
14:	learn: 0.0002964	total: 237ms	remaining: 23.4s
15:	learn: 0.0002554	total: 274ms	remaining: 25.4s
16:	learn: 0.0002208	total: 297ms	remaining: 25.9s
17:	learn: 0.0001943	total: 320ms	remaining: 26.4s
18:	learn: 0.0001732	total: 352ms	remaining: 27.4s
19:	learn: 0.0001560	total: 383ms	r

In [None]:
train_preds = model.predict_proba(X_train)
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [None]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [None]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [None]:
# смотрим на метрики выше и сравниваем что с ранжированием и без, добавляем фичи и то же смотрим
# в первом приближении метрики должны расти с использованием второго этапа

print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.1966579634464736)
('own_rec', 0.14941176470588105)


  return flags.sum() / len(recommended_list)


In [None]:
#for col in ['department', 'brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc', 'kid_category_desc']:
 #   X_train[col] = X_train[col].astype('category')

In [None]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=50,
                     n_estimators=500,
                     learning_rate=0.1,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(categorical_column=[], max_depth=50, n_estimators=500,
               objective='binary')

In [None]:
train_preds_lgb = lgb.predict_proba(X_train)
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds_lgb[:,1]

In [None]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [None]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [None]:
# смотрим на метрики выше и сравниваем что с ранжированием и без, добавляем фичи и то же смотрим
# в первом приближении метрики должны расти с использованием второго этапа

print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.21211488250652555)
('own_rec', 0.14941176470588105)


  return flags.sum() / len(recommended_list)


# Оценка на тесте для выполнения курсового проекта

In [None]:
# df_transactions = pd.read_csv('../data/transaction_data.csv')

In [None]:
df_test = pd.read_csv("../content/drive/MyDrive/Colab Notebooks/RegSis/data/retail_test1.csv")

In [None]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [None]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [None]:
result_test['reranked_own_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(user_id))

In [None]:
print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.17646396396396258)


  return flags.sum() / len(recommended_list)
