Задание 1
A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?

Пока пробуем отобрать 50 кандидатов (k=50)
Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна
Дают ли own recommendtions + top-popular лучший recall?

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?

In [1]:
!pip install implicit



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pickle
import re

from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix, coo_matrix
import implicit
from implicit import als
from implicit.nearest_neighbours import ItemItemRecommender
from lightgbm import LGBMClassifier

import lightgbm as lgb

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

In [3]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy() 
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
def get_result_table(data: pd.DataFrame):
    result = data.groupby('user_id')['item_id'].unique().reset_index()
    result.columns = ['user_id', 'actual']
    return result

result_lvl_1 = get_result_table(data_val_lvl_1)
result_lvl_2 = get_result_table(data_val_lvl_2)
display(result_lvl_1.head(2), result_lvl_2.head(2))

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [5]:

n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1, top_n_popular = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 10086


In [6]:

common_users = data_train_lvl_1.user_id.values

data_val_lvl_1 = data_val_lvl_1[data_val_lvl_1.user_id.isin(common_users)]
data_train_lvl_2 = data_train_lvl_2[data_train_lvl_2.user_id.isin(common_users)]
data_val_lvl_2 = data_val_lvl_2[data_val_lvl_2.user_id.isin(common_users)]

In [7]:
recommender = MainRecommender(data_train_lvl_1, top_n_popular)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [8]:
model_recs = recommender.get_model_recommendation()

In [9]:
similar_items_recs = recommender.get_similar_items_recommendation()

In [10]:
recall_at_k_dict = {}

k_list = [20, 50, 100, 200, 500]

for k in k_list:
    comp_name = 'model_rec_' + str(k)

    model_recs = \
        recommender.get_model_recommendation(N=k)

    result_lvl_1 = result_lvl_1.merge(model_recs,
                                      on='user_id',
                                      how='inner').rename(columns={'model_rec': comp_name})

    recall_at_k_dict[comp_name] = \
        result_lvl_1.apply(lambda row: recall_at_k(row[comp_name],
                                                   row['actual'], k=k), axis=1).mean()

In [11]:
recall_at_k_dict

{'model_rec_20': 0.03478771998011408,
 'model_rec_50': 0.06581554103075965,
 'model_rec_100': 0.09981077903362479,
 'model_rec_200': 0.14556268212745618,
 'model_rec_500': 0.22788611157569408}

In [12]:

for k in k_list:
    comp_name = 'similar_recommendation_' + str(k)

    similar_items_recs = \
        recommender.get_similar_items_recommendation(N=k)

    result_lvl_1 = result_lvl_1.merge(similar_items_recs,
                                      on='user_id',
                                      how='inner').rename(columns={'similar_recommendation': comp_name})

    recall_at_k_dict[comp_name] = \
        result_lvl_1.apply(lambda row: recall_at_k(row[comp_name],
                                                   row['actual'], k=k), axis=1).mean()

In [13]:
recall_at_k_dict

{'model_rec_20': 0.03478771998011408,
 'model_rec_50': 0.06581554103075965,
 'model_rec_100': 0.09981077903362479,
 'model_rec_200': 0.14556268212745618,
 'model_rec_500': 0.22788611157569408,
 'similar_recommendation_20': 0.025150504663065645,
 'similar_recommendation_50': 0.04909394737371636,
 'similar_recommendation_100': 0.08119986479499786,
 'similar_recommendation_200': 0.1320420874977376,
 'similar_recommendation_500': 0.2160368866937793}

Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [14]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

In [15]:

users_lvl_2 = users_lvl_2.merge(recommender.get_model_recommendation(N=500),
                                on='user_id',
                                how='inner')

users_lvl_2.columns = ['user_id', 'candidates']
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1107553, 844165, 879755, 1053690, 1085604, 11..."
1,2021,"[871756, 981521, 1131344, 896938, 912681, 1037..."


In [16]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2070,1107553,1
0,2070,844165,1
0,2070,879755,1
0,2070,1053690,1


In [17]:
data_train_lvl_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0


In [18]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id', 'quantity', 'sales_value']].copy()
targets_lvl_2['target'] = 1  # тут только покупки

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,target
0,2070,1107553,,,0.0
1,2070,844165,,,0.0


In [19]:
item_features.info(), user_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92353 entries, 0 to 92352
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   item_id               92353 non-null  int64 
 1   manufacturer          92353 non-null  int64 
 2   department            92353 non-null  object
 3   brand                 92353 non-null  object
 4   commodity_desc        92353 non-null  object
 5   sub_commodity_desc    92353 non-null  object
 6   curr_size_of_product  92353 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   age_desc             801 non-null    object
 1   marital_status_code  801 non-null    object
 2   income_desc          801 non-null    object
 3   homeowner_desc       801 non-null    o

(None, None)

In [20]:
targets_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1080418 entries, 0 to 1080417
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   user_id      1080418 non-null  int64  
 1   item_id      1080418 non-null  int64  
 2   quantity     42542 non-null    float64
 3   sales_value  42542 non-null    float64
 4   target       1080418 non-null  float64
dtypes: float64(3), int64(2)
memory usage: 49.5 MB


In [21]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1107553,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,844165,,,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK 2 LITER BTL CARB INCL,2 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [22]:
targets_lvl_2['quantity'].fillna(targets_lvl_2['quantity'].median(),
                                 inplace=True)
targets_lvl_2['sales_value'].fillna(targets_lvl_2['sales_value'].mean(),
                                    inplace=True)

targets_lvl_2.info(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1080418 entries, 0 to 1080417
Data columns (total 18 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   user_id               1080418 non-null  int64  
 1   item_id               1080418 non-null  int64  
 2   quantity              1080418 non-null  float64
 3   sales_value           1080418 non-null  float64
 4   target                1080418 non-null  float64
 5   manufacturer          1080418 non-null  int64  
 6   department            1080418 non-null  object 
 7   brand                 1080418 non-null  object 
 8   commodity_desc        1080418 non-null  object 
 9   sub_commodity_desc    1080418 non-null  object 
 10  curr_size_of_product  1080418 non-null  object 
 11  age_desc              401633 non-null   object 
 12  marital_status_code   401633 non-null   object 
 13  income_desc           401633 non-null   object 
 14  homeowner_desc        401633 non-n

In [23]:
df = pd.pivot_table(targets_lvl_2,
                    index='user_id', columns='department',
                    values='quantity',
                    aggfunc='count',
                    fill_value=0)

df = df.idxmax(axis=1).reset_index()
df.columns = ['user_id', 'top_department']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='user_id',
                                    how='inner')

In [24]:
df = pd.pivot_table(targets_lvl_2,
                    index='user_id', columns='brand',
                    values='quantity',
                    aggfunc='count',
                    fill_value=0)

df = df.idxmax(axis=1).reset_index()
df.columns = ['user_id', 'top_brand']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='user_id',
                                    how='inner')
     

In [25]:
df = pd.pivot_table(targets_lvl_2,
                    index='user_id', columns='department',
                    values='sales_value',
                    aggfunc='mean',
                    fill_value=0
                    )

df = df.stack().reset_index()
df.columns = ['user_id', 'department', 'mean_sales_value_of_user_in_department']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on=['user_id', 'department'],
                                    how='inner')


In [26]:
targets_lvl_2['age_desc'].unique()

array(['45-54', nan, '35-44', '55-64', '25-34', '65+', '19-24'],
      dtype=object)

In [27]:
df = \
    targets_lvl_2.groupby(by=['user_id'])['age_desc']\
    .apply(lambda x: pd.Series.mode(x))
df = df.reset_index()
df.drop(columns='level_1',
        inplace=True)

df.columns=['user_id', 'age_desc_corrected']

targets_lvl_2 = targets_lvl_2.merge(df,
                                    on='user_id',
                                    how='inner')

In [28]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,...,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,top_department,top_brand,mean_sales_value_of_user_in_department,age_desc_corrected
0,2070,1107553,1.0,2.433413,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,...,U,50-74K,Unknown,Unknown,1,None/Unknown,GROCERY,National,2.37752,45-54
1,2070,844165,1.0,2.433413,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK 2 LITER BTL CARB INCL,...,U,50-74K,Unknown,Unknown,1,None/Unknown,GROCERY,National,2.37752,45-54


In [29]:
targets_lvl_2.columns

Index(['user_id', 'item_id', 'quantity', 'sales_value', 'target',
       'manufacturer', 'department', 'brand', 'commodity_desc',
       'sub_commodity_desc', 'curr_size_of_product', 'age_desc',
       'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc',
       'household_size_desc', 'kid_category_desc', 'top_department',
       'top_brand', 'mean_sales_value_of_user_in_department',
       'age_desc_corrected'],
      dtype='object')

In [30]:
feature_columns = \
    ['user_id',
     'item_id',
     'quantity',
     'sales_value',
     'department',
     'manufacturer',
     'age_desc_corrected',
     'brand',
     'top_department',
     'top_brand',
     'mean_sales_value_of_user_in_department'
    ]

In [31]:
targets_lvl_2[feature_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 401633 entries, 0 to 401632
Data columns (total 11 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   user_id                                 401633 non-null  int64  
 1   item_id                                 401633 non-null  int64  
 2   quantity                                401633 non-null  float64
 3   sales_value                             401633 non-null  float64
 4   department                              401633 non-null  object 
 5   manufacturer                            401633 non-null  int64  
 6   age_desc_corrected                      401633 non-null  object 
 7   brand                                   401633 non-null  object 
 8   top_department                          401633 non-null  object 
 9   top_brand                               401633 non-null  object 
 10  mean_sales_value_of_user_in_department  4016

In [32]:
X_train = targets_lvl_2[feature_columns]
y_train = targets_lvl_2['target']

X_train.head(2)

Unnamed: 0,user_id,item_id,quantity,sales_value,department,manufacturer,age_desc_corrected,brand,top_department,top_brand,mean_sales_value_of_user_in_department
0,2070,1107553,1.0,2.433413,GROCERY,103,45-54,National,GROCERY,National,2.37752
1,2070,844165,1.0,2.433413,GROCERY,103,45-54,National,GROCERY,National,2.37752


In [33]:
cat_feats = ['user_id', 'item_id', 'manufacturer',
              'age_desc_corrected', 'department',
             'brand', 'top_department','top_brand']

X_train[cat_feats] = X_train[cat_feats].astype('category')

In [34]:
X_train.isna().sum()

user_id                                   0
item_id                                   0
quantity                                  0
sales_value                               0
department                                0
manufacturer                              0
age_desc_corrected                        0
brand                                     0
top_department                            0
top_brand                                 0
mean_sales_value_of_user_in_department    0
dtype: int64

In [35]:
!pip install catboost



In [36]:
from catboost import CatBoostClassifier

In [37]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns = ['user_id', 'actual']
result_lvl_2.head(5)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [38]:
def map_at_k(recommended_list, bought_list, k=5):
    flags = np.isin(np.array(recommended_list), np.array(bought_list))
    if sum(flags) == 0:
        return 0
    sum_ = 0
    for i in range(min(k, len(flags))):
        if flags[i]:
            p_k = precision_at_k(recommended_list, bought_list, k=i+1)
            sum_ += p_k
    return sum_ / k

In [39]:
table_metric = {'iterations':[], 'learning_rate':[], 'depth':[],'Precision@5':[], 'MAP@5':[]}

for iterations_ in [50, 150, 300]:
  for learning_rate_ in [0.001, 0.015, 0.05]:
    for depth_ in [5, 10, 15]:
      model_ = CatBoostClassifier(
        random_seed=55,
        iterations=iterations_,
        learning_rate=learning_rate_,
        depth=depth_)

      model_.fit(X_train, y_train,
              cat_features=cat_feats,
              verbose=50)
      train_preds_ = model_.predict(X_train)
      train_preds_ = train_preds_.astype(bool)

      rec_items_ = X_train[train_preds_].groupby(by=['user_id'])['item_id'].unique().reset_index()
      rec_items_.columns = ['user_id', f'model_preds_iter_{iterations_}_rate{learning_rate_}_depth{depth_}']

      rec_items_[f'model_preds_iter_{iterations_}_rate{learning_rate_}_depth{depth_}'] = \
      rec_items_[f'model_preds_iter_{iterations_}_rate{learning_rate_}_depth{depth_}'].apply(lambda x: x[:5] if len(x) >= 5 else x)

      result_lvl_2 = result_lvl_2.merge(rec_items_,
                                   on='user_id',
                                   how='inner')

      test_presicion_ = result_lvl_2.apply(lambda row: precision_at_k(row[f'model_preds_iter_{iterations_}_rate{learning_rate_}_depth{depth_}'], row['actual']), axis=1).mean()
      test_map = result_lvl_2.apply(lambda row:  map_at_k(row[f'model_preds_iter_{iterations_}_rate{learning_rate_}_depth{depth_}'], row['actual']), axis=1).mean()

      table_metric['iterations'].append(iterations_)
      table_metric['learning_rate'].append(learning_rate_)
      table_metric['depth'].append(depth_)
      table_metric['Precision@5'].append(test_presicion_)
      table_metric['MAP@5'].append(test_map)


0:	learn: 0.6921492	total: 227ms	remaining: 11.1s
49:	learn: 0.6455774	total: 2.58s	remaining: 0us
0:	learn: 0.6921492	total: 68.4ms	remaining: 3.35s
49:	learn: 0.6455980	total: 2.92s	remaining: 0us
0:	learn: 0.6921492	total: 65.1ms	remaining: 3.19s
49:	learn: 0.6455924	total: 3.79s	remaining: 0us
0:	learn: 0.6782820	total: 60.4ms	remaining: 2.96s
49:	learn: 0.2688427	total: 2.4s	remaining: 0us
0:	learn: 0.6782820	total: 60.5ms	remaining: 2.96s
49:	learn: 0.2689607	total: 2.92s	remaining: 0us
0:	learn: 0.6782820	total: 61.7ms	remaining: 3.02s
49:	learn: 0.2689807	total: 3.77s	remaining: 0us
0:	learn: 0.6444698	total: 60.8ms	remaining: 2.98s
49:	learn: 0.0414574	total: 2.48s	remaining: 0us
0:	learn: 0.6444698	total: 60.5ms	remaining: 2.97s
49:	learn: 0.0414886	total: 3.1s	remaining: 0us
0:	learn: 0.6444698	total: 61.9ms	remaining: 3.04s
49:	learn: 0.0415213	total: 4.1s	remaining: 0us
0:	learn: 0.6921492	total: 66ms	remaining: 9.83s
50:	learn: 0.6446716	total: 2.44s	remaining: 4.74s
100:

In [40]:
table_metric = pd.DataFrame(table_metric)
table_metric.sort_values(by=['Precision@5'], ascending=False)

Unnamed: 0,iterations,learning_rate,depth,Precision@5,MAP@5
2,50,0.001,15,0.330511,0.23236
5,50,0.015,15,0.330511,0.23236
0,50,0.001,5,0.330242,0.231987
11,150,0.001,15,0.330242,0.231987
20,300,0.001,15,0.330242,0.231987
19,300,0.001,10,0.330242,0.231987
18,300,0.001,5,0.330242,0.231987
14,150,0.015,15,0.330242,0.231987
1,50,0.001,10,0.330242,0.232096
12,150,0.015,5,0.330242,0.231987
