# Курсовая работа

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import sys  
sys.path.insert(0, '../my_moduls/')

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

## Read data

In [2]:
data = pd.read_csv('../../data/retail_train.csv')
item_features = pd.read_csv('../../data/product.csv')
user_features = pd.read_csv('../../data/hh_demographic.csv')

# Process features dataset

In [3]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'

# N = Neighbors
N_PREDICT = 50 

In [4]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [5]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)

# для валидации будем ипользовать retail_test1.csv
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [6]:
# берем данные для тренировки matching модели
# data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]
data_train_matcher = data[data['week_no'] < data['week_no'].max() - VAL_MATCHER_WEEKS]

# берем данные для валидации matching модели
# data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
#                       (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]
data_val_matcher = data[data['week_no'] >= data['week_no'].max() - VAL_MATCHER_WEEKS]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
# data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]
data_val_ranker = pd.read_csv('../data/retail_test1.csv')

In [7]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [8]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [9]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2193515, 12) Users: 2499 Items: 85334
val_matcher
Shape: (203289, 12) Users: 2197 Items: 30040
train_ranker
Shape: (203289, 12) Users: 2197 Items: 30040
val_ranker
Shape: (88734, 12) Users: 1885 Items: 20497


In [10]:
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


# Prefilter items

In [11]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=4000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cost'] = data['price'] * np.maximum(data['quantity'], 1)


Decreased # items from 85334 to 4001


# Make cold-start to warm-start

In [12]:
# ищем общих пользователей
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values)))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2070946, 14) Users: 2196 Items: 4001
val_matcher
Shape: (203268, 12) Users: 2196 Items: 30039
train_ranker
Shape: (203268, 12) Users: 2196 Items: 30039
val_ranker
Shape: (87559, 12) Users: 1810 Items: 20389


# Init/train recommender

In [13]:
%%time
recommender = MainRecommender(data_train_matcher)



bm25_weight


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4001.0), HTML(value='')))


Wall time: 5.98 s


In [14]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [15]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [16]:
TOPK_RECALL = 50

TOPK_PRECISION = 5

# Ranking part

## Подготовка данных для трейна

In [17]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [18]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,84,"[829722, 12731432, 5569374, 1092937, 879504, 1..."
1,1753,"[879755, 1089066, 1085604, 901543, 13842224, 9..."


In [19]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,84,829722
0,84,12731432
0,84,5569374
0,84,1092937


### Check warm start

In [20]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (109800, 2) Users: 2196 Items: 3183


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [21]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace= True)

In [22]:
df_ranker_train.target.value_counts()

0.0    94734
1.0    25405
Name: target, dtype: int64

In [23]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target
0,84,829722,0.0
1,84,12731432,0.0


## Подготавливаем фичи для обучения модели

In [24]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [25]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


**Фичи user_id:**
    - Средний чек
    - Средняя сумма покупки 1 товара в каждой категории
    - Кол-во покупок в каждой категории
    - Частотность покупок раз/месяц
    - Долю покупок в выходные
    - Долю покупок утром/днем/вечером

**Фичи item_id**:
    - Кол-во покупок в неделю
    - Среднее ол-во покупок 1 товара в категории в неделю
    - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю)
    - Цена (Можно посчитать из retil_train.csv)
    - Цена / Средняя цена товара в категории
    
**Фичи пары user_id - item_id**
    - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id)
    - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
    - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

#### Фичи user_id:
Cредний чек

In [26]:
data_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price,cost
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1.39,1.39
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,0.82,0.82
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0,0.99,0.99
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0,1.21,1.21
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0,1.5,1.5


In [27]:
df_join_train_matcher['price'] = df_join_train_matcher['sales_value'] / (np.maximum(df_join_train_matcher['quantity'], 1))
df_join_train_matcher['cost'] = np.maximum(df_join_train_matcher['quantity'], 1) *  df_join_train_matcher['price']

user_avg_receipt = df_join_train_matcher.groupby(['user_id', 'basket_id'])[['cost']].mean().reset_index().groupby('user_id')['cost'].mean()
user_avg_receipt.name = 'user_avg_receipt'
user_avg_receipt[:3]

user_id
1    2.726818
2    3.281768
3    3.268026
Name: user_avg_receipt, dtype: float64

Частота покупок раз в месяц

In [28]:
df_join_train_matcher['month'] = df_join_train_matcher['day'].apply(lambda x: (x+30)//30)

user_life_cycle = df_join_train_matcher.groupby(['user_id']).agg({ 'month': ['min', 'max'] }).reset_index()

user_life_cycle['life_cycle_month'] = user_life_cycle['month','max'] - user_life_cycle['month','min']
user_life_cycle.drop([('month','min'), ('month','max')], axis=1, inplace=True)
user_life_cycle.set_index('user_id',inplace = True)

frq_pur_month = data_train_matcher.groupby(['user_id'])['basket_id'].count() / user_life_cycle['life_cycle_month']
frq_pur_month.name = 'frq_pur_month'
frq_pur_month[:3]

user_id
1    68.000000
2    36.294118
3    48.222222
Name: frq_pur_month, dtype: float64

#### Фичи item_id
Среднее кол-во покупок в неделю

In [29]:
avg_purch_week = df_join_train_matcher.groupby(['item_id'])['quantity'].sum() / df_join_train_matcher['week_no'].max() 
avg_purch_week.name = 'avg_purch_week'
avg_purch_week[:4]

item_id
25671    0.063158
26081    0.010526
26093    0.010526
26190    0.010526
Name: avg_purch_week, dtype: float64

Среднее ол-во покупок 1 товара в категории в неделю

In [30]:
merge_df = pd.merge(df_join_train_matcher, item_features, how='inner', on='item_id')

pusrch_depart = merge_df.groupby(['department'])['quantity'].sum()
pusrch_item_depart = merge_df.groupby(['department','item_id'])['quantity'].sum().reset_index()


purchases_dep = pd.merge(pusrch_item_depart, pusrch_depart, on='department')
purchases_dep['avg_sales_group_week'] = purchases_dep['quantity_x'] / purchases_dep['quantity_y'] / df_join_train_matcher['week_no'].max()
purchases_dep.set_index('item_id', inplace=True)
avg_sales_group_week = purchases_dep['avg_sales_group_week']
avg_sales_group_week[:3]

item_id
5126087   NaN
5126088   NaN
5126106   NaN
Name: avg_sales_group_week, dtype: float64

(Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю)

In [31]:
metrik1 = avg_purch_week / avg_sales_group_week
metrik1.name = 'metrik1'
metrik1[:3]

item_id
25671    2025562.0
26081       3274.0
26093      46029.0
Name: metrik1, dtype: float64

### Добавляем соответствующие фичи в таблицы

In [32]:
item_features = item_features.merge(avg_purch_week, how='left', on='item_id')

item_features = item_features.merge(avg_sales_group_week, how='left', on='item_id')

item_features = item_features.merge(metrik1, how='left', on='item_id')
item_features[~item_features['avg_sales_group_week'].isna()].head(3)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,avg_purch_week,avg_sales_group_week,metrik1
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,0.063158,3.118043e-08,2025562.0
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,0.010526,3.215124e-06,3274.0
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,,0.010526,2.286888e-07,46029.0


In [33]:
user_features = user_features.merge(user_avg_receipt, how='left', on='user_id')

user_features = user_features.merge(frq_pur_month, how='left', on='user_id')
user_features.head(3) 

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,user_avg_receipt,frq_pur_month
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,2.726818,68.0
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,2.989986,40.590909
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8,4.734543,75.15


In [34]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,avg_purch_week,...,metrik1,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_avg_receipt,frq_pur_month
0,84,829722,0.0,70,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,32 LOAD,1.736842,...,2025562.0,,,,,,,,,
1,84,12731432,0.0,4356,MEAT,National,SMOKED MEATS,HAMS-WHOLE BONELESS,,2.347368,...,110327.0,,,,,,,,,


## !!! Пока выполните нотбук без этих строк, потом вернитесь и запустите их, обучите ранкер и посмотрите на метрики с ранжированием

In [35]:
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


In [36]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

cat_feats = ['manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'age_desc',
       'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc',
       'household_size_desc', 'kid_category_desc']

X_train[cat_feats] = X_train[cat_feats].astype('category')

## Обучение модели ранжирования

In [37]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=12,
                     n_estimators=300,
                     learning_rate=0.08,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




## Подготавливаем test dataset

In [38]:
df_ranker_val = data_val_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_val = df_match_candidates.merge(df_ranker_val, on=[USER_COL, ITEM_COL], how='left')

df_ranker_val = df_ranker_val.merge(item_features, on='item_id', how='left')
df_ranker_val = df_ranker_val.merge(user_features, on='user_id', how='left')

cat_feats = ['manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'age_desc',
       'marital_status_code', 'income_desc', 'homeowner_desc', 'hh_comp_desc',
       'household_size_desc', 'kid_category_desc']
df_ranker_val[cat_feats] = df_ranker_val[cat_feats].astype('category')

df_ranker_val.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,avg_purch_week,avg_sales_group_week,metrik1,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_avg_receipt,frq_pur_month
0,84,829722,70,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,32 LOAD,1.736842,8.574618e-07,2025562.0,,,,,,,,,
1,84,12731432,4356,MEAT,National,SMOKED MEATS,HAMS-WHOLE BONELESS,,2.347368,2.127646e-05,110327.0,,,,,,,,,
2,84,5569374,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,15.505263,7.654796e-06,2025562.0,,,,,,,,,
3,84,1092937,1089,MEAT-PCKGD,National,LUNCHMEAT,BOLOGNA,16OZ,6.2,4.513062e-05,137379.0,,,,,,,,,
4,84,879504,972,GROCERY,National,BAG SNACKS,POTATO CHIPS,12 OZ,3.894737,1.922793e-06,2025562.0,,,,,,,,,


In [39]:
df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)


df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_val = df_ranker_val.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


# Оценка на тесте для выполнения курсового проекта он же data_val_ranker

In [40]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [41]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

Wall time: 18.9 s


In [44]:
val_preds = lgb.predict_proba(df_ranker_val)
df_ranker_val_pred = df_ranker_val.copy()
df_ranker_val_pred['proba_item_purchase'] = val_preds[:,1]
df_ranker_val_pred.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,avg_purch_week,avg_sales_group_week,...,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,proba_item_purchase
0,84,829722,70,GROCERY,National,LAUNDRY DETERGENTS,LIQUID LAUNDRY DETERGENTS,32 LOAD,1.736842,8.574618e-07,...,147,439,1153.65,1.736842,91.294737,0.000642,0.033747,0.000572,0.001708,0.054707
1,84,12731432,4356,MEAT,National,SMOKED MEATS,HAMS-WHOLE BONELESS,,2.347368,2.127646e-05,...,202,439,1153.65,2.347368,91.294737,0.000868,0.033747,0.000786,0.001708,0.154324
2,84,5569374,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,15.505263,7.654796e-06,...,819,439,1153.65,15.505263,91.294737,0.005731,0.033747,0.003187,0.001708,0.104297
3,84,1092937,1089,MEAT-PCKGD,National,LUNCHMEAT,BOLOGNA,16OZ,6.2,4.513062e-05,...,456,439,1153.65,6.2,91.294737,0.002292,0.033747,0.001774,0.001708,0.047733
4,84,879504,972,GROCERY,National,BAG SNACKS,POTATO CHIPS,12 OZ,3.894737,1.922793e-06,...,353,439,1153.65,3.894737,91.294737,0.00144,0.033747,0.001374,0.001708,0.081687


## Eval re-ranked matched result on test dataset
    Вспомним df_match_candidates сет, который был получен own_recommendations на юзерах, набор пользователей мы фиксировали и он одинаков, значи и прогноз одинаков, поэтому мы можем использовать этот датафрейм для переранжирования.
    

In [45]:
def rerank_val(user_id):
    return df_ranker_val_pred[df_ranker_val_pred[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [46]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank_val(user_id))

In [47]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.2317127071823175)
('own_rec', 0.1792265193370148)
