### Постановка задачи
Необходимо:

- Сделать бейзлайны
- Сделать модель, подобрать оптимальные параметры
- Для каждого юзера оставить по 5 рекомендаций
- Целевая метрика precision@5
- Минимальный скор на Тесте = 0.18 на retail_test1


### Загрузка библиотек и функций

In [1]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares as als
from implicit.bpr import BayesianPersonalizedRanking as bpr

# Модель второго уровня
from catboost import CatBoostClassifier

#import os, sys
#sys.path.insert(1, os.getcwd() + '/src/')

from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items, postfilter_items, make_unique_recommendations
from src.recommenders import MainRecommender

In [2]:
def print_info(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape}, Unique users: {df_data['user_id'].nunique()}, Unique items: {df_data['item_id'].nunique()}")
    
def get_result(df_result, recommend_model, N=50):
    return df_result['user_id'].apply(lambda x: recommend_model(x, N=N))

# def calc_recall_at_k(df_data, top_k, ACTUAL_COL='actual'):
#     for col_name in df_data.columns[2:]:
#         yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()
        
def calc_precision_at_k(df_data, k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row['actual'], k=k), axis=1).mean()
        
def rerank(user_id, df, USER_COL='user_id', proba_col_name='proba_item_purchase', N=5):
    return df[df[USER_COL]==user_id].sort_values(proba_col_name, ascending=False).head(N).item_id.tolist()

def get_scores(df_result, recommend_model, N_PREDICT=50, USER_COL='user_id'):
    return df_result[USER_COL].apply(lambda x: recommend_model(x, N=N_PREDICT))

### Загрузка данных

In [3]:
DATASET_PATH = 'data/retail_train.csv'
ITEM_FEATURES_PATH = 'data/product.csv'
USER_FEATURES_PATH = 'data/hh_demographic.csv'

In [4]:
data = pd.read_csv(DATASET_PATH)
item_features = pd.read_csv(ITEM_FEATURES_PATH)
user_features = pd.read_csv(USER_FEATURES_PATH)

### Подготовка данных

In [5]:
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id' }, inplace=True)

In [6]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [7]:
user_features.head(3)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8


###  Train test split
Делим датасет на 3 части:  
1) обучающий для модели 1 уровня   
2) валидационный для модели 1 уровня = обучающий для модели 2 уровня   
3) валидационный для модели 2 уровня  
   
Модель 1 уровня - MATCHER (сопоставление, нахождение первичных рекомендаций)  
Модель 2 уровня - RANKER (модель для ранжирования, классификационная модель)

In [8]:
val_lvl_1_size_weeks = 5
val_lvl_2_size_weeks = 3

# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]


In [9]:
print_info(data_train_matcher,'train_matcher')
print_info(data_val_matcher,'val_matcher')
print_info(data_train_ranker,'train_ranker')
print_info(data_val_ranker,'val_ranker')

train_matcher
Shape: (2136728, 12), Unique users: 2498, Unique items: 84180
val_matcher
Shape: (141762, 12), Unique users: 2097, Unique items: 25770
train_ranker
Shape: (141762, 12), Unique users: 2097, Unique items: 25770
val_ranker
Shape: (118314, 12), Unique users: 2042, Unique items: 24329


In [10]:
#Проведем префильтрацию данных

n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=10000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 84180 to 10001


In [11]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [12]:
# Добавим параметр категории к исходному обучающему датасету для удобства создания новых фичей
df_join_train_matcher = df_join_train_matcher.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
df_join_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price,department
0,1285,30407595228,239,852015,1,9.79,334,-2.2,915,35,0.0,0.0,9.79,DRUG GM
1,2076,30768591161,251,852015,1,9.79,388,-2.2,1244,37,-1.0,0.0,9.79,DRUG GM


Оставим только пользователей, которые встречаются в тренировочном датасете, чтобы избежать проблемы холодного старта.

In [13]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_info(data_train_matcher,'train_matcher')
print_info(data_val_matcher,'val_matcher')
print_info(data_train_ranker,'train_ranker')
print_info(data_val_ranker,'val_ranker')

train_matcher
Shape: (546741, 13), Unique users: 1870, Unique items: 9997
val_matcher
Shape: (136228, 12), Unique users: 1870, Unique items: 25273
train_ranker
Shape: (136228, 12), Unique users: 1870, Unique items: 25273
val_ranker
Shape: (114844, 12), Unique users: 1870, Unique items: 23952


### Построение baseline рекоммендаций

In [14]:
recommender = MainRecommender(data_train_matcher, weighting = True)



  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/9997 [00:00<?, ?it/s]

  0%|          | 0/9997 [00:00<?, ?it/s]

  0%|          | 0/9997 [00:00<?, ?it/s]

  0%|          | 0/9997 [00:00<?, ?it/s]

In [15]:
result = data_val_matcher.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067..."
1,6,"[873654, 994928, 1098844, 1122879, 8357613, 98..."


In [16]:
N = 30

result['random_recommendations'] = (result['user_id']
                                    .apply(lambda x: recommender.random_recommendation(N=N)))
result['top_popular_recs'] = (result['user_id']
                              .apply(lambda x: recommender.popularity_recommendation(N=N)))
result['weighted_random_recs'] = (result['user_id']
                                  .apply(lambda x: recommender.weighted_random_recommendation(N=N)))

result.head(2)

Unnamed: 0,user_id,actual,random_recommendations,top_popular_recs,weighted_random_recs
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067...","[8019255, 13038925, 9935616, 6396174, 12781921...","[999999, 1029743, 916122, 1106523, 5569230, 84...","[885582, 932675, 992986, 9677850, 1022969, 875..."
1,6,"[873654, 994928, 1098844, 1122879, 8357613, 98...","[821134, 8359154, 902362, 9446246, 985893, 556...","[999999, 1029743, 916122, 1106523, 5569230, 84...","[931153, 1088414, 828437, 933835, 898847, 8308..."


In [17]:
k = 5
sorted(calc_precision_at_k(result, k), key=lambda x: x[1],reverse=True)

[('top_popular_recs', 0.10802139037433155),
 ('weighted_random_recs', 0.0032085561497326204),
 ('random_recommendations', 0.0031016042780748665)]

Получили значения baseline метрик . Лучшей из baseline моделей по presision является top_popular_recs. Далее построем двухуровневую модель

### Построение модели первого уровня Matcher

Для модели 1ого уровня возьмем 30 кандидатов

In [18]:
N = 30

result['own_rec'] = get_result(result, recommender.get_own_recommendations, N=N)
result['als_rec'] = get_result(result, recommender.get_als_recommendations, N=N)
result['bpr_rec'] = get_result(result, recommender.get_bpr_recommendations, N=N)
result['bm25_rec'] = get_result(result, recommender.get_bm25_recommendations, N=N)
result['tfidf_rec'] = get_result(result, recommender.get_tfidf_recommendations, N=N)
result['cosine_rec'] = get_result(result, recommender.get_cosine_recommendations, N=N)
result['similar_users'] = get_result(result, recommender.get_similar_users_recommendation, N=N)
result['similar_items'] = get_result(result, recommender.get_similar_items_recommendation, N=N)

# #similar_users_recommendation, similar_items_recommendation показывают очень низкий результат
# не будем их учитывать в сравнении

In [19]:
result.head(2)

Unnamed: 0,user_id,actual,random_recommendations,top_popular_recs,weighted_random_recs,own_rec,als_rec,bpr_rec,bm25_rec,tfidf_rec,cosine_rec,similar_users,similar_items
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067...","[8019255, 13038925, 9935616, 6396174, 12781921...","[999999, 1029743, 916122, 1106523, 5569230, 84...","[885582, 932675, 992986, 9677850, 1022969, 875...","[940947, 1004906, 856942, 865456, 5582712, 101...","[965766, 1062002, 1132771, 8090521, 940947, 85...","[916122, 1029743, 866211, 844179, 1127831, 100...","[940947, 1004906, 1013167, 1127831, 844179, 10...","[900875, 877373, 6391068, 877391, 1109465, 108...","[877373, 900875, 1087895, 9297615, 877391, 856...","[1016800, 8090532, 1070820, 832678, 907631, 10...","[828647, 5582712, 9297615, 896666, 1080354, 92..."
1,6,"[873654, 994928, 1098844, 1122879, 8357613, 98...","[821134, 8359154, 902362, 9446246, 985893, 556...","[999999, 1029743, 916122, 1106523, 5569230, 84...","[931153, 1088414, 828437, 933835, 898847, 8308...","[5569230, 965267, 863447, 993638, 1024306, 951...","[1127831, 878996, 1044078, 866871, 1004906, 99...","[916122, 1029743, 866211, 844179, 1127831, 100...","[878996, 965267, 930118, 1105488, 863447, 1044...","[1004906, 866211, 878996, 1127831, 844179, 916...","[878996, 863447, 13003092, 1098844, 896613, 82...","[1026118, 5569471, 5569230, 1026118, 866211, 1...","[5569845, 13002975, 42346, 948650, 999999, 104..."


In [20]:
sorted(calc_precision_at_k(result, k=5), key=lambda x: x[1], reverse=True)

[('own_rec', 0.2851336898395722),
 ('cosine_rec', 0.24727272727272728),
 ('bm25_rec', 0.1918716577540107),
 ('tfidf_rec', 0.17871657754010697),
 ('als_rec', 0.13133689839572193),
 ('top_popular_recs', 0.10802139037433155),
 ('bpr_rec', 0.09604278074866311),
 ('similar_users', 0.0948663101604278),
 ('similar_items', 0.03967914438502674),
 ('weighted_random_recs', 0.0032085561497326204),
 ('random_recommendations', 0.0031016042780748665)]

Очевидно, лучшие результаты показывает модель на основе **предыдущих покупок пользователя** 

### Подготовка датасета и генерация признаков для модели 2 уровня (модели ранжирования)

In [21]:
# берем пользователей из трейна для модели ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker['user_id'].unique())
df_match_candidates.columns = ['user_id']

# собираем для каждого юзера кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=N))

In [22]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [23]:
# соберем итоговый датафрейм с комбинациями user_id - item_id
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [24]:
df_match_candidates.head(3)

Unnamed: 0,user_id,item_id
0,1827,907631
0,1827,940947
0,1827,1029743


In [25]:
print_info(df_match_candidates, 'match_candidates')

match_candidates
Shape: (56100, 2), Unique users: 1870, Unique items: 4324


In [26]:
df_ranker_train = data_train_ranker[['user_id', 'item_id']].copy()
df_ranker_train['target'] = 1  # здесь только фактические покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=['user_id', 'item_id'], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=['user_id', 'item_id'])

# все, что не фактические покупки - заполняем нулями
df_ranker_train['target'].fillna(0, inplace= True)

df_ranker_train.head()

Unnamed: 0,user_id,item_id,target
0,1827,907631,0.0
1,1827,940947,0.0
2,1827,1029743,1.0
4,1827,5568378,0.0
5,1827,854405,0.0


In [27]:
# Соотношение классов:

df_ranker_train.target.value_counts()

0.0    48109
1.0     7839
Name: target, dtype: int64

Очевидно, что объектов 0 класса гораздо больше, чем 1
Вероятно это связано с количеством кандидатов

In [28]:
# Присоединим к новому тренировочному датасету фичи юзеров и товаров - создадим общий тренировочный датасет

df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1827,907631,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,,,,,,,
1,1827,940947,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,,,,,,,


### Генерация новых признаков

In [29]:
# Добавим параметр категории к исходному обучающему датасету для удобства создания новых фичей
data_department = data_train_ranker.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
data_department.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,1827,40702967646,601,891141,2,2.73,33923,0.0,7,87,0.0,0.0,PRODUCE
1,496,40739402373,603,891141,1,1.83,445,0.0,2226,87,0.0,0.0,PRODUCE


In [30]:
# Добавим признаки

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='item_id').agg('sales_value').sum().\
                                        rename('total_item_sales_value'), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='item_id').agg('quantity').sum().\
                                        rename('total_quantity_value'), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='item_id').agg('user_id').count().\
                                        rename('item_freq'), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='user_id').agg('user_id').count().\
                                        rename('user_freq'), how='left',on='user_id')

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='user_id').agg('sales_value').sum().\
                                        rename('total_user_sales_value'), how='left',on='user_id')


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='item_id').agg('quantity').sum().\
                rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='user_id').agg('quantity').sum().\
                rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on='user_id')


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='item_id').agg('quantity').sum().\
            rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='user_id').agg('quantity').sum().\
            rename('user_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on='user_id')


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='item_id').agg('user_id').count().\
                rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on='item_id')

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by='user_id').agg('user_id').count().\
                rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on='user_id')




**Добавление дополнительных признаков**

In [31]:
''' ДЛЯ ЮЗЕРОВ '''

# Средний чек
users_sales = data_train_ranker.groupby('user_id')['sales_value'].mean().reset_index()
users_sales.rename(columns={'sales_value': 'avg_cheque'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_cheque']], on='user_id', how='left')

# Количество уникальных категорий покупателя
users_departments = data_department.groupby('user_id')['department'].nunique().reset_index()
users_departments.rename(columns = {'department':'users_unique_departments'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_departments, on='user_id', how='left')

# Среднее время покупки
bought_time = data_train_ranker.groupby('user_id')['trans_time'].mean().reset_index()
bought_time.rename(columns = {'trans_time':'mean_trans_time_by_user'}, inplace=True)
df_ranker_train = df_ranker_train.merge(bought_time, on='user_id', how='left')

# Средний чек корзины 
baskets_sales_value = data_train_ranker.groupby(['user_id','basket_id'])['sales_value'].mean().reset_index()
mean_basket_sales_value = baskets_sales_value.groupby('user_id')['sales_value'].mean().reset_index()
mean_basket_sales_value.rename(columns = {'sales_value':'mean_sales_value_per_basket'}, inplace=True)
df_ranker_train = df_ranker_train.merge(mean_basket_sales_value, on='user_id', how='left')

# Количество купленных уникальных товаров 
unique_bought_items = data_train_ranker.groupby('user_id')['item_id'].nunique().reset_index()
unique_bought_items.rename(columns = {'item_id':'unique_bought_items'}, inplace=True)
df_ranker_train = df_ranker_train.merge(unique_bought_items, on='user_id', how='left')



# Среднее количество уникальных категорий в корзине
users_baskets = data_department.groupby(['user_id', 'basket_id'])['department'].nunique().reset_index()
users_baskets = users_baskets.groupby('user_id')['department'].mean().reset_index()
users_baskets.rename(columns={'department': 'avg_basket_department'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_baskets[['user_id', 'avg_basket_department']], on='user_id', how='left')

# Средняя сумма покупки в категории
department_sales = data_department.groupby('department')['sales_value'].mean().reset_index()
department_sales.rename(columns={'sales_value': 'mean_sales_value_category'}, inplace=True)
df_ranker_train = df_ranker_train.merge(department_sales, on='department', how='left')

# Средная цена купленных товаров пользователем
users_sales = data_train_ranker.groupby('user_id')[['sales_value', 'quantity']].sum().reset_index()
users_sales['avg_price'] = users_sales['sales_value'] / users_sales['quantity']
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_price']], on='user_id', how='left')


In [32]:
''' ДЛЯ ТОВАРОВ '''

# Среднее количество покупок товара в неделю
num_purchase_week = data_train_ranker.groupby('item_id').agg({'week_no': 'nunique', 'quantity': 'sum'}).reset_index()
num_purchase_week['avg_num_purchases_week'] = num_purchase_week['quantity'] / num_purchase_week['week_no']
df_ranker_train = df_ranker_train.merge(num_purchase_week[['item_id', 'avg_num_purchases_week']], on='item_id', how='left')
df_ranker_train['avg_num_purchases_week'].fillna(0, inplace= True)



# Цена товара
items_sales = data_department.groupby('item_id')[['sales_value', 'quantity']].sum().reset_index()
items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
items_sales['price'].fillna(0, inplace=True)
df_ranker_train = df_ranker_train.merge(items_sales[['item_id', 'price']], on='item_id', how='left')


# Среднее время покупки товара
bought_item_time = data_train_ranker.groupby('item_id')['trans_time'].mean().reset_index()
bought_item_time.rename(columns = {'trans_time':'mean_trans_time_by_item'}, inplace=True)
da_ranker_train = df_ranker_train.merge(bought_item_time, on = 'item_id', how = 'left')


# Количество магазинов, где есть товар
items_stores = data_department.groupby('item_id')['store_id'].sum().reset_index()
items_stores.rename(columns={'store_id': 'n_stores_with_item'}, inplace=True)
items_stores['n_stores_with_item'].fillna(0, inplace = True)
df_ranker_train = df_ranker_train.merge(items_stores, on='item_id', how='left')

# Количество уникальных магазинов, где есть товар
items_stores = data_department.groupby('item_id')['store_id'].nunique().reset_index()
items_stores.rename(columns={'store_id': 'n_unique_stores_with_item'}, inplace=True)
items_stores['n_unique_stores_with_item'].fillna(0, inplace = True)
df_ranker_train = df_ranker_train.merge(items_stores, on='item_id', how='left')


In [33]:
# Построим признак, отражающий средний интервал между покупками пользователя.
users_days = df_join_train_matcher.groupby('user_id')['day'].unique().reset_index()
users_days['day'] = users_days['day'].apply(lambda x: sorted(x))
users_days.head()

Unnamed: 0,user_id,day
0,1,"[246, 263, 274, 276, 282, 291, 300, 311, 317, ..."
1,2,"[239, 249, 263, 291, 318, 335, 345, 415, 432, ..."
2,3,"[242, 243, 248, 253, 260, 266, 291, 304, 320, ..."
3,4,"[244, 248, 251, 264, 288, 292, 302, 321, 328, ..."
4,5,"[258, 300, 320, 334, 418, 433, 449, 450, 460, ..."


In [34]:
def avg_ndays(days):
    diff = 0
    if len(days) > 1:
        for i in range(len(days) - 1):
            diff += days[i+1] - days[i]
        return diff / (len(days) - 1)
    else:
        return 0
    
users_days['avg_interval'] = users_days['day'].apply(avg_ndays)

df_ranker_train = df_ranker_train.merge(users_days[['user_id', 'avg_interval']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,mean_sales_value_per_basket,unique_bought_items,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,n_stores_with_item,n_unique_stores_with_item,avg_interval
0,1827,907631,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,,...,2.965362,28,2.4,2.54529,2.208947,16.0,2.985,276749.0,39.0,10.333333
1,1827,940947,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,,...,2.965362,28,2.4,3.825916,2.208947,19.4,2.650825,383770.0,48.0,10.333333


Построим признак, в котором будет закодировано место товара в пяти последних покупках клиента.

In [35]:
users_items = data_train_ranker.groupby('user_id')['item_id'].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])
users_items.head()

Unnamed: 0,user_id,item_id
0,1,"[5577022, 8293439, 9526676, 9527558, 10149640]"
1,6,"[1099058, 895268, 1017061, 1082185, 1119051]"
2,7,"[9837501, 12524016, 13072715, 13987153, 13987338]"
3,8,"[924610, 999142, 1080014, 1121694, 1130286]"
4,9,"[7467081, 10150194, 10457112, 12132773, 12171886]"


In [36]:
users_items.loc[users_items['user_id'] == 67, 'item_id'].item()

[1135408, 9194664, 9526666, 10182813, 12385050]

In [37]:
def code_last_sales(x, df=users_items):
    last_sales = df.loc[df['user_id'] == x[0], 'item_id'].item()
    code = str()
    last_sales.reverse()
    for item in last_sales:
        code += '1' if item == x[1] else '0'
    return code

df_ranker_train['Last5sales'] = df_ranker_train[['user_id', 'item_id']].apply(code_last_sales, axis=1)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,unique_bought_items,avg_basket_department,mean_sales_value_category,avg_price,avg_num_purchases_week,price,n_stores_with_item,n_unique_stores_with_item,avg_interval,Last5sales
0,1827,907631,0.0,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,,...,28,2.4,2.54529,2.208947,16.0,2.985,276749.0,39.0,10.333333,0
1,1827,940947,0.0,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,,...,28,2.4,3.825916,2.208947,19.4,2.650825,383770.0,48.0,10.333333,0


In [38]:
# Проверим наличие пропусков
df_ranker_train.isnull().sum()

user_id                            0
item_id                            0
target                             0
manufacturer                       0
department                         0
brand                              0
commodity_desc                     0
sub_commodity_desc                 0
curr_size_of_product               0
age_desc                       33239
marital_status_code            33239
income_desc                    33239
homeowner_desc                 33239
hh_comp_desc                   33239
household_size_desc            33239
kid_category_desc              33239
total_item_sales_value             0
total_quantity_value               0
item_freq                          0
user_freq                          0
total_user_sales_value             0
item_quantity_per_week             0
user_quantity_per_week             0
item_quantity_per_basket           0
user_quantity_per_basket           0
item_freq_per_basket               0
user_freq_per_basket               0
a

In [39]:
#Заполним пропуски у количественных признаков
df_ranker_train[['price', 'n_stores_with_item', 'n_unique_stores_with_item']].fillna(0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


**Разделение на X_train и y_train и обучение модели**

In [40]:
df_ranker_train.columns

Index(['user_id', 'item_id', 'target', 'manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc',
       'total_item_sales_value', 'total_quantity_value', 'item_freq',
       'user_freq', 'total_user_sales_value', 'item_quantity_per_week',
       'user_quantity_per_week', 'item_quantity_per_basket',
       'user_quantity_per_basket', 'item_freq_per_basket',
       'user_freq_per_basket', 'avg_cheque', 'users_unique_departments',
       'mean_trans_time_by_user', 'mean_sales_value_per_basket',
       'unique_bought_items', 'avg_basket_department',
       'mean_sales_value_category', 'avg_price', 'avg_num_purchases_week',
       'price', 'n_stores_with_item', 'n_unique_stores_with_item',
       'avg_interval', 'Last5sales'],
      dtype='object')

In [41]:
X_train = df_ranker_train.drop(columns = ['target',
                                          'total_quantity_value',
                                          'user_quantity_per_week',
                                          'mean_sales_value_category',
                                          'item_quantity_per_basket'
                                         ])

#Убрала количественные признаки в весом feature_importances ниже 1 (категориальные оставим)
y_train = df_ranker_train[['target']]

In [43]:
X_train.head(2)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,mean_sales_value_per_basket,unique_bought_items,avg_basket_department,avg_price,avg_num_purchases_week,price,n_stores_with_item,n_unique_stores_with_item,avg_interval,Last5sales
0,1827,907631,1039,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,,,,...,2.965362,28,2.4,2.208947,16.0,2.985,276749.0,39.0,10.333333,0
1,1827,940947,2082,MEAT-PCKGD,National,HEAT/SERVE,ENTREES,24 OZ,,,...,2.965362,28,2.4,2.208947,19.4,2.650825,383770.0,48.0,10.333333,0


In [44]:
# Обработка категориальных признаков

cat_feats = [
          'department',
         'brand',
         'commodity_desc',
         'sub_commodity_desc',
         'curr_size_of_product',
         'age_desc',
         'marital_status_code',
         'income_desc',
         'homeowner_desc',
         'hh_comp_desc',
         'household_size_desc',
         'kid_category_desc',
         'Last5sales'
    
]


for col in cat_feats:
    X_train[col].fillna(0, inplace=True)

X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'Last5sales']

In [45]:
#Рассчитаем дисбаланс классов - насколько объектов 0 класса больше, чем объектов 1 класса
disbalance = y_train.value_counts()[0] / y_train.value_counts()[1]
disbalance

6.137134838627376

**Пострение модели**

In [46]:
#ПАРАМЕТРЫ БЫЛИ ПОДОБРАНЫ ПОИСКОМ ПО СЕТКЕ, НЕ ВКЛЮЧЕНО В ИТОГОВЫЙ ПРОЕКТ ДЛЯ СОКРАЩЕНИЯ ВРЕМЕНИ РАБОТЫ НОУТБУКА

ctb = CatBoostClassifier(learning_rate=0.1,
                        max_depth=12,
                        n_estimators=550,
                        random_state=42, 
                        cat_features=cat_feats, 
                        class_weights=[1, disbalance],
                        silent=True)

ctb.fit(X_train, y_train)

train_preds = ctb.predict_proba(X_train)

In [47]:
# Изучим важность признаков в модели
fi = pd.DataFrame(ctb.feature_importances_, index=X_train.columns, columns=['importance'])
fi.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
unique_bought_items,6.064318
item_id,5.973385
user_id,5.334449
mean_trans_time_by_user,5.157933
avg_basket_department,5.036407
sub_commodity_desc,4.490401
commodity_desc,4.469624
price,4.458278
avg_interval,3.822881
avg_cheque,3.759799


In [48]:
# Оценим качество построенной модели 
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [49]:
result = data_val_ranker.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']

#Добавляем сначала предсказания (рекомендации) модели 1 уровня
result['own_rec'] = result['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=N))

Сделаем переранжирование рекомендаций на основе результатов классификации модели 2 уровня

In [50]:
def rerank(user_id, N):
    return df_ranker_predict[df_ranker_predict['user_id']==user_id].\
            sort_values('proba_item_purchase', ascending=False).head(N).item_id.tolist()

In [55]:
TOPK_PRECISION = 5

result['reranked_own_rec'] = result['user_id'].apply(lambda user_id:\
                                                            rerank(user_id, N=5))

# Оставляем заведомо больше ранжированных предсказаний для постфильтрации
result['postfiltered_reranked_own_rec'] = result['user_id'].apply(lambda user_id:\
                                     postfilter_items(rerank(user_id, N=20), item_features = item_features, N=5))
result['uniquue_reranked_own_rec'] = result['user_id'].apply(lambda user_id:\
                         make_unique_recommendations(rerank(user_id, N=20), N=5))

print(*sorted(calc_precision_at_k(result, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.2838502673796791)
('uniquue_reranked_own_rec', 0.2838502673796791)
('own_rec', 0.2621390374331551)
('postfiltered_reranked_own_rec', 0.23679144385026737)


Удалив дубликаты в df_train_ranker на этапе подготовки датасета, мы сразу получили уникальные товары, поэтому применяя фильтрацию товаров, чтобы остались только уникальные - метрика не меняется, как и товары.  
  
Когда мы делаем постфильтрацию товаров таким образом, чтобы каждый товар был из отдельной категории, метрика существенно падает -> постфильтрация ухудщает результат

**Проверим, везде ли одинаковое количество рекомендаций**  
если их кажется меньше, то необходимо будет реализовать дополнение

In [56]:
for num, row in enumerate(result['own_rec']):
    if len(row) != 30:
        print(num)

In [57]:
for num, row in enumerate(result['reranked_own_rec']):
    if len(row) != 5:
        print(num)

### Оценка на тестовом наборе данных

In [59]:
df_test = pd.read_csv('data/retail_test.csv')
df_test.shape

(88734, 12)

In [60]:
df_test = df_test[df_test.user_id.isin(common_users)]

In [61]:
final_test = df_test.groupby('user_id')['item_id'].unique().reset_index()
final_test.columns=['user_id', 'actual']

In [62]:
final_test['own_rec'] = final_test['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=N))
final_test['reranked_own_rec'] = final_test['user_id'].apply(lambda user_id: rerank(user_id, N=5))

print(*sorted(calc_precision_at_k(final_test, k), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.23681957186544345)
('own_rec', 0.22422018348623854)


In [63]:
# Проверим, есть ли строки где количество предсказаний != 5
for num, row in enumerate(final_test['reranked_own_rec']):
    if len(row) != 5:
        print(num)

In [64]:
# Сохраним рекомендации
recommendations = final_test[['user_id', 'reranked_own_rec']]
recommendations.rename(columns = {'reranked_own_rec' : 'recommendations'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [65]:
recommendations.to_csv('recommendations.csv', index=False)
recommendations.head(2)

Unnamed: 0,user_id,recommendations
0,1,"[5577022, 8293439, 10149640, 9527558, 9655212]"
1,6,"[1098844, 1024306, 6548453, 878996, 1029743]"
