# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [127]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 500)

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
data = pd.read_csv('./raw_data/retail_train.csv')

# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
#все покупки
data['week_no'].max(), data['week_no'].min()

(95, 1)

In [3]:
#давние покупки
data_train_lvl_1['week_no'].max(), data_train_lvl_1['week_no'].min()

(85, 1)

In [6]:
#валидационные 6 недель для первого уровня
data_val_lvl_1['week_no'].max(), data_val_lvl_1['week_no'].min()

(91, 86)

In [5]:
#6 недель для второго уровня
data_train_lvl_2['week_no'].max(), data_train_lvl_2['week_no'].min()

(91, 86)

In [7]:
#валидационные 3 недели для первого уровня
data_val_lvl_2['week_no'].max(), data_val_lvl_2['week_no'].min()

(95, 92)

In [9]:
#популярность товаров считается как кол-во уников, купивших товар
popularity = data_train_lvl_1.groupby('item_id')['user_id'].nunique().reset_index()

In [10]:
#доля товаров - отношение кол-ва уников, купивших товар, к кол-ву всего уников
popularity['share'] = popularity['user_id']/ data_train_lvl_1['user_id'].nunique()

In [12]:
#товар, который покупали чаще всего
popularity.loc[popularity['share']==popularity['share'].max()]

Unnamed: 0,item_id,user_id,share
34192,1082185,1995,0.798639


In [14]:
#всего уникальных товаров в датасете data_train_lvl_1 
popularity.shape

(83685, 3)

In [15]:
#товары, которые покупали менее 2% юзеров
popularity[popularity['share'] < 0.02]

Unnamed: 0,item_id,user_id,share
0,25671,3,0.001201
1,26081,1,0.000400
2,26093,1,0.000400
3,26190,1,0.000400
4,26355,1,0.000400
...,...,...,...
83680,17179426,1,0.000400
83681,17208239,1,0.000400
83682,17208470,1,0.000400
83683,17209402,3,0.001201


In [16]:
#товары, которые покупали более 2% юзеров
popularity[popularity['share'] > 0.2]

Unnamed: 0,item_id,user_id,share
6205,826249,1236,0.494796
6952,833025,752,0.301041
7123,834484,805,0.322258
7763,840361,1234,0.493995
8180,844165,635,0.254203
...,...,...,...
59365,8090521,531,0.212570
59369,8090537,538,0.215372
63920,9526410,657,0.263010
64241,9527290,571,0.228583


In [19]:
item_features = pd.read_csv('./raw_data/product.csv')
user_features = pd.read_csv('./raw_data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [17]:
#всего уникальных товаров в датасете data_train_lvl_1 
n_items_before = data_train_lvl_1['item_id'].nunique()

In [20]:
#префильтрация товаров
data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

In [21]:
n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [22]:
recommender = MainRecommender(data_train_lvl_1)



weighting done


In [23]:
recommender.get_als_recommendations(200000375, N=200)

[999999,
 1029743,
 1106523,
 5569230,
 916122,
 844179,
 1044078,
 1126899,
 1070820,
 1127831,
 866211,
 8090521,
 878996,
 8090537,
 5569471,
 1004906,
 854852,
 899624,
 986912,
 933835,
 1075368,
 1081177,
 6034857,
 5585510,
 965267,
 834117,
 940947,
 983584,
 12810393,
 913210,
 874972,
 5569845,
 5568378,
 999858,
 908318,
 985999,
 901062,
 1040807,
 1018740,
 951412,
 1101010,
 1105488,
 1037840,
 880150,
 1043751,
 857503,
 1122358,
 1132771,
 823704,
 854405,
 909714,
 993638,
 930118,
 839419,
 971922,
 910032,
 863447,
 835098,
 865456,
 976199,
 12301109,
 1070702,
 5569374,
 8090532,
 872137,
 883003,
 1023720,
 1012587,
 1000753,
 893018,
 1024306,
 897954,
 1042438,
 907631,
 5568729,
 1068719,
 1137775,
 1087102,
 944534,
 1020581,
 999270,
 832678,
 838186,
 1138443,
 999104,
 1112238,
 1050851,
 902172,
 965766,
 12301100,
 952163,
 957951,
 8090509,
 852856,
 885863,
 1056509,
 845307,
 1062002,
 944836,
 819978,
 957736,
 999779,
 1027168,
 1051323,
 8065410,
 1

In [28]:
recommender.get_own_recommendations(2000000375, N=200)

[999999,
 1029743,
 1106523,
 5569230,
 916122,
 844179,
 1044078,
 1126899,
 1070820,
 1127831,
 866211,
 8090521,
 878996,
 8090537,
 5569471,
 1004906,
 854852,
 899624,
 986912,
 933835,
 1075368,
 1081177,
 6034857,
 5585510,
 965267,
 834117,
 940947,
 983584,
 12810393,
 913210,
 874972,
 5569845,
 5568378,
 999858,
 908318,
 985999,
 901062,
 1040807,
 1018740,
 951412,
 1101010,
 1105488,
 1037840,
 880150,
 1043751,
 857503,
 1122358,
 1132771,
 823704,
 854405,
 909714,
 993638,
 930118,
 839419,
 971922,
 910032,
 863447,
 835098,
 865456,
 976199,
 12301109,
 1070702,
 5569374,
 8090532,
 872137,
 883003,
 1023720,
 1012587,
 1000753,
 893018,
 1024306,
 897954,
 1042438,
 907631,
 5568729,
 1068719,
 1137775,
 1087102,
 944534,
 1020581,
 999270,
 832678,
 838186,
 1138443,
 999104,
 1112238,
 1050851,
 902172,
 965766,
 12301100,
 952163,
 957951,
 8090509,
 852856,
 885863,
 1056509,
 845307,
 1062002,
 944836,
 819978,
 957736,
 999779,
 1027168,
 1051323,
 8065410,
 1

In [25]:
recommender.get_similar_items_recommendation(20000000000000375, N=200)

[999999,
 1029743,
 1106523,
 5569230,
 916122,
 844179,
 1044078,
 1126899,
 1070820,
 1127831,
 866211,
 8090521,
 878996,
 8090537,
 5569471,
 1004906,
 854852,
 899624,
 986912,
 933835,
 1075368,
 1081177,
 6034857,
 5585510,
 965267,
 834117,
 940947,
 983584,
 12810393,
 913210,
 874972,
 5569845,
 5568378,
 999858,
 908318,
 985999,
 901062,
 1040807,
 1018740,
 951412,
 1101010,
 1105488,
 1037840,
 880150,
 1043751,
 857503,
 1122358,
 1132771,
 823704,
 854405,
 909714,
 993638,
 930118,
 839419,
 971922,
 910032,
 863447,
 835098,
 865456,
 976199,
 12301109,
 1070702,
 5569374,
 8090532,
 872137,
 883003,
 1023720,
 1012587,
 1000753,
 893018,
 1024306,
 897954,
 1042438,
 907631,
 5568729,
 1068719,
 1137775,
 1087102,
 944534,
 1020581,
 999270,
 832678,
 838186,
 1138443,
 999104,
 1112238,
 1050851,
 902172,
 965766,
 12301100,
 952163,
 957951,
 8090509,
 852856,
 885863,
 1056509,
 845307,
 1062002,
 944836,
 819978,
 957736,
 999779,
 1027168,
 1051323,
 8065410,
 1

In [26]:
recommender.get_similar_users_recommendation(20000375, N=200)

[999999,
 1029743,
 1106523,
 5569230,
 916122,
 844179,
 1044078,
 1126899,
 1070820,
 1127831,
 866211,
 8090521,
 878996,
 8090537,
 5569471,
 1004906,
 854852,
 899624,
 986912,
 933835,
 1075368,
 1081177,
 6034857,
 5585510,
 965267,
 834117,
 940947,
 983584,
 12810393,
 913210,
 874972,
 5569845,
 5568378,
 999858,
 908318,
 985999,
 901062,
 1040807,
 1018740,
 951412,
 1101010,
 1105488,
 1037840,
 880150,
 1043751,
 857503,
 1122358,
 1132771,
 823704,
 854405,
 909714,
 993638,
 930118,
 839419,
 971922,
 910032,
 863447,
 835098,
 865456,
 976199,
 12301109,
 1070702,
 5569374,
 8090532,
 872137,
 883003,
 1023720,
 1012587,
 1000753,
 893018,
 1024306,
 897954,
 1042438,
 907631,
 5568729,
 1068719,
 1137775,
 1087102,
 944534,
 1020581,
 999270,
 832678,
 838186,
 1138443,
 999104,
 1112238,
 1050851,
 902172,
 965766,
 12301100,
 952163,
 957951,
 8090509,
 852856,
 885863,
 1056509,
 845307,
 1062002,
 944836,
 819978,
 957736,
 999779,
 1027168,
 1051323,
 8065410,
 1

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 200 кандидатов (k=200)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [29]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


a) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?

In [30]:
%%time
result_lvl_1['get_als_recommendations'] = result_lvl_1['user_id'].apply(lambda x: 
                                                                   np.array(recommender.get_als_recommendations(x, N=200)))
result_lvl_1.head(2)

Wall time: 22.6 s


Unnamed: 0,user_id,actual,get_als_recommendations
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[959455, 865026, 1077133, 10455984, 9677939, 9..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[916122, 1127831, 9858819, 1004906, 9526159, 8..."


In [31]:
%%time
result_lvl_1['get_own_recommendations'] = result_lvl_1['user_id'].apply(lambda x: 
                                                                   np.array(recommender.get_own_recommendations(x, N=200)))
result_lvl_1.head(2)

Wall time: 8.29 s


Unnamed: 0,user_id,actual,get_als_recommendations,get_own_recommendations
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[959455, 865026, 1077133, 10455984, 9677939, 9...","[1029743, 1004906, 12810393, 8090521, 6034857,..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[916122, 1127831, 9858819, 1004906, 9526159, 8...","[916122, 1106523, 5569230, 1127831, 899624, 80..."


In [32]:
%%time
result_lvl_1['get_similar_items_recommendation'] = result_lvl_1['user_id'].apply(lambda x: 
                                                                   np.array(recommender.get_similar_items_recommendation(x, N=200)))
result_lvl_1.head(2)

Wall time: 30.8 s


Unnamed: 0,user_id,actual,get_als_recommendations,get_own_recommendations,get_similar_items_recommendation
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[959455, 865026, 1077133, 10455984, 9677939, 9...","[1029743, 1004906, 12810393, 8090521, 6034857,...","[1076056, 825665, 1007512, 974177, 904833, 896..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[916122, 1127831, 9858819, 1004906, 9526159, 8...","[916122, 1106523, 5569230, 1127831, 899624, 80...","[1076056, 8090509, 5569845, 8090537, 985999, 8..."


In [25]:
%%time
result_lvl_1['get_similar_users_recommendation'] = result_lvl_1['user_id'].apply(lambda x: 
                                                                   np.array(recommender.get_similar_users_recommendation(x, N=200)))
result_lvl_1.head(2)

Wall time: 26min 23s


Unnamed: 0,user_id,actual,get_als_recommendations,get_own_recommendations,get_similar_items_recommendation,get_similar_users_recommendation
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[8090560, 962615, 8090541, 10356272, 1120261, ...","[1029743, 1004906, 12810393, 8090521, 6034857,...","[944486, 882013, 5582712, 990656, 9419443, 103...","[1004906, 999999, 1126899, 1029743, 1127831, 1..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1127831, 866211, 967041, 910032, 1029743, 556...","[916122, 1106523, 5569230, 1127831, 899624, 80...","[944486, 8090509, 5569845, 1035207, 985999, 84...","[1022254, 1029743, 844179, 1029743, 916122, 10..."


In [32]:
%%time
print('precision_at_k', 'get_als_recommendations',
      result_lvl_1.apply(lambda row: precision_at_k(row['get_als_recommendations'], row['actual']), axis=1).mean())
print('precision_at_k', 'get_own_recommendations',
      result_lvl_1.apply(lambda row: precision_at_k(row['get_own_recommendations'], row['actual']), axis=1).mean())
print('precision_at_k', 'get_similar_items_recommendation',
      result_lvl_1.apply(lambda row: precision_at_k(row['get_similar_items_recommendation'], row['actual']), axis=1).mean())
print('precision_at_k', 'get_similar_users_recommendation',
      result_lvl_1.apply(lambda row: precision_at_k(row['get_similar_users_recommendation'], row['actual']), axis=1).mean())

precision_at_k get_als_recommendations 0.09953574744661027
precision_at_k get_own_recommendations 0.17901578458681228
precision_at_k get_similar_items_recommendation 0.04168987929433637
precision_at_k get_similar_users_recommendation 0.07706592386258108
Wall time: 535 ms


In [31]:
%%time
print('recall_at_k', 'get_als_recommendations', 
      result_lvl_1.apply(lambda row: recall_at_k(row['get_als_recommendations'], row['actual']), axis=1).mean())
print('recall_at_k', 'get_own_recommendations',
      result_lvl_1.apply(lambda row: recall_at_k(row['get_own_recommendations'], row['actual']), axis=1).mean())
print('recall_at_k', 'get_similar_items_recommendation',
      result_lvl_1.apply(lambda row: recall_at_k(row['get_similar_items_recommendation'], row['actual']), axis=1).mean())
print('recall_at_k', 'get_similar_users_recommendation', 
      result_lvl_1.apply(lambda row: recall_at_k(row['get_similar_users_recommendation'], row['actual']), axis=1).mean())

recall_at_k get_als_recommendations 0.009075426937001743
recall_at_k get_own_recommendations 0.020395456287105648
recall_at_k get_similar_items_recommendation 0.004816114666616715
recall_at_k get_similar_users_recommendation 0.00853458216425187
Wall time: 506 ms


own recommendtions + top-popular дают лучший recall и precision_at_k

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}

In [33]:
k = [20, 50, 100, 200, 500]
print('recall_at_k', 'get_own_recommendations')
for i in k:
    print(f'k={i}',
      result_lvl_1.apply(lambda row: recall_at_k(row['get_own_recommendations'], row['actual'], i), axis=1).mean())

recall_at_k get_own_recommendations
k=20 0.04225947870645031
k=50 0.06754947411543526
k=100 0.09010043577407759
k=200 0.12090688172022042
k=500 0.12090688172022042


Чем больше k, тем выше recall@k. В моем примере при recall@200 и recall@500 равны, потому что у меня всего 200 предсказаний на пользователя в таблице

C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?

Поскольку, в большинстве случаев пользователю показываются небольшое кол-во рекомендаций, то лучше брать небольшие значения - например, 20. Редко пользователи просматривают больше.

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [285]:
# создаем датасет с униками из data_train_lvl_2
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']
users_lvl_2['user_id'].nunique()

2154

In [286]:
# Пока только warm start (убираем тех из users_lvl_2, кого нет в датасете data_train_lvl_1)
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

In [287]:
# получаем собственные покупки пользователей датасета data_train_lvl_2 через предсказания ItemItemRecommender
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=200))
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1029743, 916122, 1044078, 1106523, 5569230, 1..."
1,2021,"[844179, 1044078, 5569230, 1004906, 899624, 10..."


In [288]:
# переводим колонку с собственными покупками в строки, чтобы было соответствие user_id - item_id
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['drop'] = 1  # фиктивная переменная

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,drop
0,2070,1029743,1
0,2070,916122,1
0,2070,1044078,1
0,2070,1106523,1


In [289]:
#Всего строк в таблице собственных покупок и всего уников
users_lvl_2.shape[0], users_lvl_2['user_id'].nunique()

(430200, 2151)

In [290]:
data_train_lvl_2['item_id'].nunique()

27649

In [291]:
targets_lvl_2['item_id'].nunique()

3052

In [292]:
users_lvl_2['item_id'].nunique()

3053

### Генерация фичей:

#### Фичи для пользователей

In [293]:
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,user_id_orders,timeperiod_order
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,5,night
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,5,night


In [294]:
#Вычисляем кол-во заказов на пользователя, кол-во уникальных товаров в заказах, сумму заказов на пользователя и т.д.
users_orders_df = data.groupby('user_id', as_index=False).agg({
    'basket_id': 'nunique',
    'item_id': 'nunique',
    'retail_disc': 'sum',
    'coupon_disc': 'sum',
    'coupon_match_disc': 'sum',
    'sales_value': 'sum'}).rename(columns={'basket_id': 'orders_per_user', 
                                           'item_id': 'uniques_items', 
                                           'sales_value': 'value_per_user'})
users_orders_df.head(2)

Unnamed: 0,user_id,orders_per_user,uniques_items,retail_disc,coupon_disc,coupon_match_disc,value_per_user
0,1,79,627,-653.47,-78.96,-24.75,3959.91
1,2,44,510,-318.96,-9.0,0.0,1823.45


In [295]:
#Вычисляем среднюю сумму заказа
users_orders_df['av_order_value'] = users_orders_df.value_per_user / users_orders_df.orders_per_user

#Вычисляем среднее кол-во уникальных товаров в заказе
users_orders_df['av_unique_items_per_order'] = users_orders_df.uniques_items / users_orders_df.orders_per_user
users_orders_df.head(2)

Unnamed: 0,user_id,orders_per_user,uniques_items,retail_disc,coupon_disc,coupon_match_disc,value_per_user,av_order_value,av_unique_items_per_order
0,1,79,627,-653.47,-78.96,-24.75,3959.91,50.125443,7.936709
1,2,44,510,-318.96,-9.0,0.0,1823.45,41.442045,11.590909


In [296]:
#вычисляем время суток (ночь/утро/день/вечер) 
data['timeperiod_order'] = 'night'
data.loc[(data.trans_time / 60 > 6) & (data.trans_time / 60 <= 11), 'timeperiod_order'] = 'morning'
data.loc[(data.trans_time / 60 > 17) & (data.trans_time / 60 <= 23), 'timeperiod_order'] = 'evening'
data.loc[(data.trans_time / 60 > 11) & (data.trans_time / 60 <= 17), 'timeperiod_order'] = 'day'

In [297]:
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,user_id_orders,timeperiod_order
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,5,night
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,5,night


In [298]:
#Вычисляем кол-во заказов пользователя по времени суток
users_time_df = data.groupby(['user_id', 'timeperiod_order']).agg({
    'basket_id': 'nunique'}).rename(columns={'basket_id': 'orders_per_timeperiod'})
users_time_df = users_time_df.unstack()
users_time_df.fillna(0, inplace=True)
users_time_df.columns = ['timeperiod_day', 
                          'timeperiod_evening', 
                          'timeperiod_morning', 
                          'timeperiod_night']
users_time_df.reset_index(inplace=True)
users_time_df.head(2)

Unnamed: 0,user_id,timeperiod_day,timeperiod_evening,timeperiod_morning,timeperiod_night
0,1,2.0,35.0,0.0,42.0
1,2,0.0,4.0,0.0,40.0


In [299]:
#Вычисляем кол-во заказов пользователя в неделю
users_week_df = data.groupby(['user_id', 'week_no'], as_index=False).agg({
    'basket_id': 'nunique'})
users_week_df = users_week_df.groupby('user_id', as_index=False).agg({
    'basket_id': 'mean'}).rename(columns={'basket_id': 'orders_per_week'})
users_week_df.head(2)

Unnamed: 0,user_id,orders_per_week
0,1,1.234375
1,2,1.294118


In [300]:
#Вычисляем кол-во заказов пользователя в день
users_day_df = data.groupby(['user_id', 'day'], as_index=False).agg({
    'basket_id': 'nunique'})
users_day_df = users_day_df.groupby('user_id', as_index=False).agg({
    'basket_id': 'mean'}).rename(columns={'basket_id': 'orders_per_day'})
users_day_df.head(2)

Unnamed: 0,user_id,orders_per_day
0,1,1.082192
1,2,1.0


In [301]:
#Вычисляем id магазина, где пользователь чаще всего покупает
users_store_df = data.groupby(['user_id', 'store_id'], as_index=False).agg({
    'basket_id': 'nunique'})
users_store_df = users_store_df.groupby('user_id', as_index=False).agg({
    'basket_id': 'max'}).rename(columns={'basket_id': 'favorite_store_id'})
users_store_df.head(2)

Unnamed: 0,user_id,favorite_store_id
0,1,78
1,2,25


In [302]:
#Кол-во товаров на пользователя
users_items_df = data.groupby(['user_id'], as_index=False).agg({
    'item_id': 'sum'}).rename(columns={'item_id': 'items_per_user'})
users_items_df.head(2)

Unnamed: 0,user_id,items_per_user
0,1,4731186649
1,2,1910021893


In [303]:
# Соединяем вместе все датафреймы с вычислениями
users_orders_df_feat = users_orders_df.merge(users_store_df, on='user_id', how='left')
users_orders_df_feat = users_orders_df_feat.merge(users_time_df, on='user_id', how='left')
users_orders_df_feat = users_orders_df_feat.merge(users_week_df, on='user_id', how='left')
users_orders_df_feat = users_orders_df_feat.merge(users_day_df, on='user_id', how='left')
users_orders_df_feat = users_orders_df_feat.merge(users_items_df, on='user_id', how='left')
users_orders_df_feat.head(2)

Unnamed: 0,user_id,orders_per_user,uniques_items,retail_disc,coupon_disc,coupon_match_disc,value_per_user,av_order_value,av_unique_items_per_order,favorite_store_id,timeperiod_day,timeperiod_evening,timeperiod_morning,timeperiod_night,orders_per_week,orders_per_day,items_per_user
0,1,79,627,-653.47,-78.96,-24.75,3959.91,50.125443,7.936709,78,2.0,35.0,0.0,42.0,1.234375,1.082192,4731186649
1,2,44,510,-318.96,-9.0,0.0,1823.45,41.442045,11.590909,25,0.0,4.0,0.0,40.0,1.294118,1.0,1910021893


In [304]:
users_orders_df_feat['av_items_per_order'] = users_orders_df_feat.items_per_user / users_orders_df_feat.orders_per_user
users_orders_df_feat.head(2)

Unnamed: 0,user_id,orders_per_user,uniques_items,retail_disc,coupon_disc,coupon_match_disc,value_per_user,av_order_value,av_unique_items_per_order,favorite_store_id,timeperiod_day,timeperiod_evening,timeperiod_morning,timeperiod_night,orders_per_week,orders_per_day,items_per_user,av_items_per_order
0,1,79,627,-653.47,-78.96,-24.75,3959.91,50.125443,7.936709,78,2.0,35.0,0.0,42.0,1.234375,1.082192,4731186649,59888440.0
1,2,44,510,-318.96,-9.0,0.0,1823.45,41.442045,11.590909,25,0.0,4.0,0.0,40.0,1.294118,1.0,1910021893,43409590.0


In [305]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,income_desc_mean,age_desc_mean
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,0,1,42,70
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,0,7,63,50


In [306]:
# средний income пользователя
user_features['income_desc_mean'] = user_features.income_desc.map({
    'Under 15K': 7,
    '50-74K': 63,
    '35-49K': 42,
    '75-99K': 83,
    '25-34K': 30,
    '15-24K': 20,
    '125-149K': 133,
    '100-124K': 113,
    '150-174K': 163,
    '250K+': 250,
    '175-199K': 188,
    '200-249K': 225,
})

In [307]:
# средний возраст пользователя
user_features['age_desc_mean'] = user_features.age_desc.map({
    '19-24': 22,
    '25-34': 30,
    '35-44': 40,
    '45-54': 50,
    '55-64': 60,
    '65+': 70,
})

In [308]:
# средний размер семьи
user_features['household_size_desc'] = user_features.household_size_desc.map({'5+': 5, 
                                                            '1': 1, 
                                                            '2': 2, 
                                                            '3': 3, 
                                                            '4': 4,
                                                           })

In [309]:
# кол-во детей в семье
user_features['kid_category_desc'] = user_features.kid_category_desc.map({'None/Unknown': 0, 
                                                            '1': 1, 
                                                            '2': 2, 
                                                            '3+': 3, 
                                                           })

In [310]:
new_user_features_about_users = user_features.drop(['age_desc', 'income_desc', 'hh_comp_desc'], axis=1)
new_user_features_about_users.head(2)

Unnamed: 0,marital_status_code,homeowner_desc,household_size_desc,kid_category_desc,user_id,income_desc_mean,age_desc_mean
0,A,Homeowner,,,1,42,70
1,A,Homeowner,,,7,63,50


In [311]:
#добавляем характеристики пользователя
users_orders_df_feat = users_orders_df_feat.merge(new_user_features_about_users, on='user_id', how='left')
users_orders_df_feat.head(2)

Unnamed: 0,user_id,orders_per_user,uniques_items,retail_disc,coupon_disc,coupon_match_disc,value_per_user,av_order_value,av_unique_items_per_order,favorite_store_id,timeperiod_day,timeperiod_evening,timeperiod_morning,timeperiod_night,orders_per_week,orders_per_day,items_per_user,av_items_per_order,marital_status_code,homeowner_desc,household_size_desc,kid_category_desc,income_desc_mean,age_desc_mean
0,1,79,627,-653.47,-78.96,-24.75,3959.91,50.125443,7.936709,78,2.0,35.0,0.0,42.0,1.234375,1.082192,4731186649,59888440.0,A,Homeowner,,,42.0,70.0
1,2,44,510,-318.96,-9.0,0.0,1823.45,41.442045,11.590909,25,0.0,4.0,0.0,40.0,1.294118,1.0,1910021893,43409590.0,,,,,,


#### Фичи для товаров

In [312]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,items_per_department,items_per_commodity_desc,items_per_sub_commodity_desc
0,25671,2,GROCERY,0,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,39021,29,29
1,26081,2,MISC. TRANS.,0,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,490,490,429


In [313]:
# СТМ или нет (0, 1) вместо строкового значения
item_features['brand'] = item_features['brand'].map({'National': 0, 'Private': 1})

In [314]:
#Кол-во товаров по категориальным фичам
item_features['items_per_department'] = item_features['department'].map(item_features.groupby(
    'department').size())

item_features['items_per_commodity_desc'] = item_features['commodity_desc'].map(item_features.groupby(
    'commodity_desc').size())

item_features['items_per_sub_commodity_desc'] = item_features['sub_commodity_desc'].map(item_features.groupby(
    'sub_commodity_desc').size())

In [315]:
item_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92353 entries, 0 to 92352
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   item_id                       92353 non-null  int64  
 1   manufacturer                  92353 non-null  int64  
 2   department                    92353 non-null  object 
 3   brand                         0 non-null      float64
 4   commodity_desc                92353 non-null  object 
 5   sub_commodity_desc            92353 non-null  object 
 6   curr_size_of_product          92353 non-null  object 
 7   items_per_department          92353 non-null  int64  
 8   items_per_commodity_desc      92353 non-null  int64  
 9   items_per_sub_commodity_desc  92353 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 7.0+ MB


In [316]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,items_per_department,items_per_commodity_desc,items_per_sub_commodity_desc
0,25671,2,GROCERY,,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,39021,29,29
1,26081,2,MISC. TRANS.,,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,490,490,429


In [317]:
# убираем лишнюю фичу
new_item_features = item_features.drop('curr_size_of_product', axis=1)
new_item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,items_per_department,items_per_commodity_desc,items_per_sub_commodity_desc
0,25671,2,GROCERY,,FRZN ICE,ICE - CRUSHED/CUBED,39021,29,29
1,26081,2,MISC. TRANS.,,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,490,490,429


In [318]:
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,user_id_orders,timeperiod_order
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,5,night
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,5,night


In [319]:
"""
sales_value = quantity * price, та цена, которую платит юзер (скидка уже вычтена) 
retail_disc - аналогично, скидка на весь объем покупки
"""

#Вычисляем кол-во заказов с товаром, кол-во покупок товара, сумму заказов по товару и т.д.
items_orders_df = data.groupby('item_id', as_index=False).agg({
    'basket_id': 'nunique',
    'quantity': 'sum',
    'retail_disc': 'sum',
    'coupon_disc': 'sum',
    'coupon_match_disc': 'sum',
    'sales_value': 'sum'}).rename(columns={'basket_id': 'unique_orders_per_item', 
                                           'quantity': 'amount_items', 
                                           'sales_value': 'value_per_item'})
items_orders_df.head(2)

Unnamed: 0,item_id,unique_orders_per_item,amount_items,retail_disc,coupon_disc,coupon_match_disc,value_per_item
0,25671,3,6,0.0,0.0,0.0,20.94
1,26081,1,1,0.0,0.0,0.0,0.99


In [320]:
#средняя стоимость товара
items_orders_df['av_item_price'] = items_orders_df.value_per_item / items_orders_df.amount_items

#средняя скидка на товар
items_orders_df['av_item_discount'] = items_orders_df.coupon_disc / items_orders_df.amount_items

In [321]:
#Кол-во уникальных товаров в датасетах
items_orders_df.shape[0], new_item_features.item_id.nunique()

(89051, 92353)

In [322]:
items_orders_df_feat = items_orders_df.merge(new_item_features, on='item_id', how='left')
items_orders_df_feat.head(2)

Unnamed: 0,item_id,unique_orders_per_item,amount_items,retail_disc,coupon_disc,coupon_match_disc,value_per_item,av_item_price,av_item_discount,manufacturer,department,brand,commodity_desc,sub_commodity_desc,items_per_department,items_per_commodity_desc,items_per_sub_commodity_desc
0,25671,3,6,0.0,0.0,0.0,20.94,3.49,0.0,2,GROCERY,,FRZN ICE,ICE - CRUSHED/CUBED,39021,29,29
1,26081,1,1,0.0,0.0,0.0,0.99,0.99,0.0,2,MISC. TRANS.,,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,490,490,429


#### Объединяем фичи

In [323]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

# Если товара не было в датасете targets_lvl_2, то значит его пользователь не покупал - ставим таргет 0
targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('drop', axis=1, inplace=True)
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1029743,0.0
1,2070,916122,1.0


In [324]:
users_orders_df_feat = users_orders_df_feat.rename(columns={'retail_disc': 'retail_disc_user', 
                                                            'coupon_disc': 'coupon_disc_user', 
                                                            'coupon_match_disc': 'coupon_match_disc_user'})
items_orders_df_feat = items_orders_df_feat.rename(columns={'retail_disc': 'retail_disc_item', 
                                                            'coupon_disc': 'coupon_disc_item', 
                                                            'coupon_match_disc': 'coupon_match_disc_item'})

In [325]:
# Добавляем фичи к датасету, который будем отправлять в модель 2-го уровня
targets_lvl_2 = targets_lvl_2.merge(users_orders_df_feat, on='user_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(items_orders_df_feat, on='item_id', how='left')
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,orders_per_user,uniques_items,retail_disc_user,coupon_disc_user,coupon_match_disc_user,value_per_user,av_order_value,av_unique_items_per_order,favorite_store_id,timeperiod_day,timeperiod_evening,timeperiod_morning,timeperiod_night,orders_per_week,orders_per_day,items_per_user,av_items_per_order,marital_status_code,homeowner_desc,household_size_desc,kid_category_desc,income_desc_mean,age_desc_mean,unique_orders_per_item,amount_items,retail_disc_item,coupon_disc_item,coupon_match_disc_item,value_per_item,av_item_price,av_item_discount,manufacturer,department,brand,commodity_desc,sub_commodity_desc,items_per_department,items_per_commodity_desc,items_per_sub_commodity_desc
0,2070,1029743,0.0,540,1289,-1615.06,-49.15,-8.45,6981.86,12.92937,2.387037,514,19.0,23.0,83.0,415.0,6.585366,1.69279,7278567672,13478830.0,U,Unknown,,,63.0,50.0,13455.0,15840.0,-2582.01,-62.02,0.0,37981.91,2.397848,-0.003915,69.0,GROCERY,,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,39021.0,455.0,194.0
1,2070,916122,1.0,540,1289,-1615.06,-49.15,-8.45,6981.86,12.92937,2.387037,514,19.0,23.0,83.0,415.0,6.585366,1.69279,7278567672,13478830.0,U,Unknown,,,63.0,50.0,4082.0,6493.0,-27890.91,0.0,0.0,26828.79,4.131956,0.0,4314.0,MEAT,,CHICKEN,CHICKEN BREAST BONELESS,2544.0,364.0,57.0


In [326]:
#где и сколько пропусков
targets_lvl_2.isna().sum()

user_id                              0
item_id                              0
target                               0
orders_per_user                      0
uniques_items                        0
retail_disc_user                     0
coupon_disc_user                     0
coupon_match_disc_user               0
value_per_user                       0
av_order_value                       0
av_unique_items_per_order            0
favorite_store_id                    0
timeperiod_day                       0
timeperiod_evening                   0
timeperiod_morning                   0
timeperiod_night                     0
orders_per_week                      0
orders_per_day                       0
items_per_user                       0
av_items_per_order                   0
marital_status_code             274930
homeowner_desc                  274930
household_size_desc             438129
kid_category_desc               438129
income_desc_mean                274930
age_desc_mean            

In [327]:
# Сколько значений в целевых классах:
targets_lvl_2['target'].value_counts()

0.0    410787
1.0     27342
Name: target, dtype: int64

In [328]:
targets_lvl_2['target'].mean()

0.06240627760317167

In [329]:
targets_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 438129 entries, 0 to 438128
Data columns (total 42 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   user_id                       438129 non-null  int64  
 1   item_id                       438129 non-null  int64  
 2   target                        438129 non-null  float64
 3   orders_per_user               438129 non-null  int64  
 4   uniques_items                 438129 non-null  int64  
 5   retail_disc_user              438129 non-null  float64
 6   coupon_disc_user              438129 non-null  float64
 7   coupon_match_disc_user        438129 non-null  float64
 8   value_per_user                438129 non-null  float64
 9   av_order_value                438129 non-null  float64
 10  av_unique_items_per_order     438129 non-null  float64
 11  favorite_store_id             438129 non-null  int64  
 12  timeperiod_day                438129 non-nul

### Поменяем фичи в валидационном датасете

In [330]:
# создаем датасет с униками из data_val_lvl_2
users_val_lvl_2 = pd.DataFrame(data_val_lvl_2['user_id'].unique())
users_val_lvl_2.columns = ['user_id']
users_val_lvl_2['user_id'].nunique()

2042

In [331]:
# получаем собственные покупки пользователей датасета data_val_lvl_2 через предсказания ItemItemRecommender
users_val_lvl_2['candidates'] = users_val_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=200))
users_val_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,338,"[1029743, 844179, 1106523, 1127831, 1004906, 8..."
1,2120,"[999999, 1029743, 1106523, 5569230, 916122, 84..."


In [332]:
# переводим колонку с собственными покупками в строки, чтобы было соответствие user_id - item_id
s = users_val_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_val_lvl_2 = users_val_lvl_2.drop('candidates', axis=1).join(s)
users_val_lvl_2['drop'] = 1  # фиктивная переменная

users_val_lvl_2.head(4)

Unnamed: 0,user_id,item_id,drop
0,338,1029743,1
0,338,844179,1
0,338,1106523,1
0,338,1127831,1


In [333]:
#Всего строк в таблице собственных покупок и всего уников
users_val_lvl_2.shape[0], users_val_lvl_2['user_id'].nunique()

(408400, 2042)

In [334]:
data_val_lvl_2.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.0,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,0.0,0.0


In [335]:
targets_val_lvl_2 = data_val_lvl_2[['user_id', 'item_id']].copy()
targets_val_lvl_2['target'] = 1  # тут только покупки 

targets_val_lvl_2 = users_val_lvl_2.merge(targets_val_lvl_2, on=['user_id', 'item_id'], how='left')

# Если товара не было в датасете targets_lvl_2, то значит его пользователь не покупал - ставим таргет 0
targets_val_lvl_2['target'].fillna(0, inplace= True)
targets_val_lvl_2.drop('drop', axis=1, inplace=True)
targets_val_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target
0,338,1029743,0.0
1,338,844179,0.0


In [336]:
# Добавляем фичи к датасету, который будем отправлять в модель 2-го уровня
targets_val_lvl_2 = targets_val_lvl_2.merge(users_orders_df_feat, on='user_id', how='left')
targets_val_lvl_2 = targets_val_lvl_2.merge(items_orders_df_feat, on='item_id', how='left')
targets_val_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,orders_per_user,uniques_items,retail_disc_user,coupon_disc_user,coupon_match_disc_user,value_per_user,av_order_value,av_unique_items_per_order,favorite_store_id,timeperiod_day,timeperiod_evening,timeperiod_morning,timeperiod_night,orders_per_week,orders_per_day,items_per_user,av_items_per_order,marital_status_code,homeowner_desc,household_size_desc,kid_category_desc,income_desc_mean,age_desc_mean,unique_orders_per_item,amount_items,retail_disc_item,coupon_disc_item,coupon_match_disc_item,value_per_item,av_item_price,av_item_discount,manufacturer,department,brand,commodity_desc,sub_commodity_desc,items_per_department,items_per_commodity_desc,items_per_sub_commodity_desc
0,338,1029743,0.0,128,473,-466.13,0.0,0.0,2252.54,17.597969,3.695312,38,2.0,20.0,0.0,106.0,2.245614,1.174312,2133458983,16667650.0,,,,,,,13455.0,15840.0,-2582.01,-62.02,0.0,37981.91,2.397848,-0.003915,69.0,GROCERY,,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,39021.0,455.0,194.0
1,338,844179,0.0,128,473,-466.13,0.0,0.0,2252.54,17.597969,3.695312,38,2.0,20.0,0.0,106.0,2.245614,1.174312,2133458983,16667650.0,,,,,,,3537.0,4877.0,-4876.98,0.0,0.0,18094.23,3.710115,0.0,2852.0,MEAT,,BEEF,PRIMAL,2544.0,1109.0,111.0


In [337]:
#где и сколько пропусков
targets_val_lvl_2.isna().sum()

user_id                              0
item_id                              0
target                               0
orders_per_user                      0
uniques_items                        0
retail_disc_user                     0
coupon_disc_user                     0
coupon_match_disc_user               0
value_per_user                       0
av_order_value                       0
av_unique_items_per_order            0
favorite_store_id                    0
timeperiod_day                       0
timeperiod_evening                   0
timeperiod_morning                   0
timeperiod_night                     0
orders_per_week                      0
orders_per_day                       0
items_per_user                       0
av_items_per_order                   0
marital_status_code             257170
homeowner_desc                  257170
household_size_desc             412473
kid_category_desc               412473
income_desc_mean                257170
age_desc_mean            

In [338]:
# Сколько значений в целевых классах:
targets_val_lvl_2['target'].value_counts()

0.0    393486
1.0     18987
Name: target, dtype: int64

### Запускаем модель

In [339]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

X_valid = targets_val_lvl_2.drop('target', axis=1)
y_valid = targets_val_lvl_2[['target']]

In [343]:
X_train.shape, X_valid.shape

((438129, 41), (412473, 41))

In [340]:
cat_feats = [
    'department',
    'commodity_desc',
    'sub_commodity_desc',
    'marital_status_code',
    'homeowner_desc',
]

In [341]:
X_train[cat_feats] = X_train[cat_feats].astype('category')
X_valid[cat_feats] = X_valid[cat_feats].astype('category')

In [344]:
%%time

lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict(X_train)

  return f(**kwargs)


Wall time: 5.82 s


In [345]:
valid_preds = lgb.predict(X_valid)

In [263]:
from sklearn.metrics import confusion_matrix, f1_score

In [347]:
confusion_matrix(y_train, train_preds)

array([[409263,   1524],
       [ 23556,   3786]], dtype=int64)

In [346]:
confusion_matrix(y_valid, valid_preds)

array([[391665,   1821],
       [ 16784,   2203]], dtype=int64)

In [348]:
f1_score(y_valid, valid_preds)

0.19147364304028505

In [349]:
f1_score(y_train, train_preds)

0.23190003675119442

In [350]:
valid_preds_proba = lgb.predict_proba(X_valid)

In [354]:
result = pd.concat([X_valid, pd.DataFrame(valid_preds_proba)], axis=1)
result.shape

(412473, 43)

In [358]:
result.sort_values(by=[1], ascending=False, inplace=True)
result.head(2)

Unnamed: 0,user_id,item_id,orders_per_user,uniques_items,retail_disc_user,coupon_disc_user,coupon_match_disc_user,value_per_user,av_order_value,av_unique_items_per_order,favorite_store_id,timeperiod_day,timeperiod_evening,timeperiod_morning,timeperiod_night,orders_per_week,orders_per_day,items_per_user,av_items_per_order,marital_status_code,homeowner_desc,household_size_desc,kid_category_desc,income_desc_mean,age_desc_mean,unique_orders_per_item,amount_items,retail_disc_item,coupon_disc_item,coupon_match_disc_item,value_per_item,av_item_price,av_item_discount,manufacturer,department,brand,commodity_desc,sub_commodity_desc,items_per_department,items_per_commodity_desc,items_per_sub_commodity_desc,0,1
22750,1609,1082185,383,1537,-2256.98,-77.48,-22.5,26081.05,68.096736,4.013055,196,35.0,206.0,1.0,141.0,4.352273,1.268212,17744636782,46330640.0,A,Homeowner,,,133.0,50.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.043719,0.956281
22744,1609,1082185,383,1537,-2256.98,-77.48,-22.5,26081.05,68.096736,4.013055,196,35.0,206.0,1.0,141.0,4.352273,1.268212,17744636782,46330640.0,A,Homeowner,,,133.0,50.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.043719,0.956281


In [361]:
X_train.sort_values(by=['item_id'], ascending=False).head(30)

Unnamed: 0,user_id,item_id,orders_per_user,uniques_items,retail_disc_user,coupon_disc_user,coupon_match_disc_user,value_per_user,av_order_value,av_unique_items_per_order,favorite_store_id,timeperiod_day,timeperiod_evening,timeperiod_morning,timeperiod_night,orders_per_week,orders_per_day,items_per_user,av_items_per_order,marital_status_code,homeowner_desc,household_size_desc,kid_category_desc,income_desc_mean,age_desc_mean,unique_orders_per_item,amount_items,retail_disc_item,coupon_disc_item,coupon_match_disc_item,value_per_item,av_item_price,av_item_discount,manufacturer,department,brand,commodity_desc,sub_commodity_desc,items_per_department,items_per_commodity_desc,items_per_sub_commodity_desc
203725,1752,15926886,186,1133,-1384.15,-6.7,-2.7,6150.26,33.065914,6.091398,47,10.0,21.0,0.0,155.0,2.48,1.338129,5563312127,29910280.0,,,,,,,356.0,378.0,-106.92,-11.19,0.0,837.08,2.214497,-0.029603,544.0,GROCERY,,BAG SNACKS,POTATO CHIPS,39021.0,1523.0,531.0
380513,2482,15926886,68,667,-565.64,-6.29,0.0,2495.53,36.698971,9.808824,54,0.0,9.0,0.0,59.0,1.7,1.333333,2289398280,33667620.0,,,,,,,356.0,378.0,-106.92,-11.19,0.0,837.08,2.214497,-0.029603,544.0,GROCERY,,BAG SNACKS,POTATO CHIPS,39021.0,1523.0,531.0
228603,1284,15926886,141,416,-357.71,-6.39,0.0,1693.01,12.007163,2.950355,103,4.0,32.0,0.0,105.0,2.517857,1.184874,2415584432,17131800.0,,,,,,,356.0,378.0,-106.92,-11.19,0.0,837.08,2.214497,-0.029603,544.0,GROCERY,,BAG SNACKS,POTATO CHIPS,39021.0,1523.0,531.0
228604,1284,15926886,141,416,-357.71,-6.39,0.0,1693.01,12.007163,2.950355,103,4.0,32.0,0.0,105.0,2.517857,1.184874,2415584432,17131800.0,,,,,,,356.0,378.0,-106.92,-11.19,0.0,837.08,2.214497,-0.029603,544.0,GROCERY,,BAG SNACKS,POTATO CHIPS,39021.0,1523.0,531.0
228605,1284,15926886,141,416,-357.71,-6.39,0.0,1693.01,12.007163,2.950355,103,4.0,32.0,0.0,105.0,2.517857,1.184874,2415584432,17131800.0,,,,,,,356.0,378.0,-106.92,-11.19,0.0,837.08,2.214497,-0.029603,544.0,GROCERY,,BAG SNACKS,POTATO CHIPS,39021.0,1523.0,531.0
228606,1284,15926886,141,416,-357.71,-6.39,0.0,1693.01,12.007163,2.950355,103,4.0,32.0,0.0,105.0,2.517857,1.184874,2415584432,17131800.0,,,,,,,356.0,378.0,-106.92,-11.19,0.0,837.08,2.214497,-0.029603,544.0,GROCERY,,BAG SNACKS,POTATO CHIPS,39021.0,1523.0,531.0
228607,1284,15926886,141,416,-357.71,-6.39,0.0,1693.01,12.007163,2.950355,103,4.0,32.0,0.0,105.0,2.517857,1.184874,2415584432,17131800.0,,,,,,,356.0,378.0,-106.92,-11.19,0.0,837.08,2.214497,-0.029603,544.0,GROCERY,,BAG SNACKS,POTATO CHIPS,39021.0,1523.0,531.0
28575,1762,15926886,309,1905,-2423.56,-53.07,-3.5,13826.69,44.74657,6.165049,147,106.0,61.0,0.0,142.0,3.814815,1.236,11877123974,38437290.0,A,Homeowner,,,133.0,50.0,356.0,378.0,-106.92,-11.19,0.0,837.08,2.214497,-0.029603,544.0,GROCERY,,BAG SNACKS,POTATO CHIPS,39021.0,1523.0,531.0
228608,1284,15926886,141,416,-357.71,-6.39,0.0,1693.01,12.007163,2.950355,103,4.0,32.0,0.0,105.0,2.517857,1.184874,2415584432,17131800.0,,,,,,,356.0,378.0,-106.92,-11.19,0.0,837.08,2.214497,-0.029603,544.0,GROCERY,,BAG SNACKS,POTATO CHIPS,39021.0,1523.0,531.0
83459,1293,15926886,157,646,-327.69,0.0,0.0,2510.24,15.98879,4.11465,140,27.0,76.0,0.0,54.0,2.180556,1.113475,2937051765,18707340.0,,,,,,,356.0,378.0,-106.92,-11.19,0.0,837.08,2.214497,-0.029603,544.0,GROCERY,,BAG SNACKS,POTATO CHIPS,39021.0,1523.0,531.0


In [359]:
result.head(10)

Unnamed: 0,user_id,item_id,orders_per_user,uniques_items,retail_disc_user,coupon_disc_user,coupon_match_disc_user,value_per_user,av_order_value,av_unique_items_per_order,favorite_store_id,timeperiod_day,timeperiod_evening,timeperiod_morning,timeperiod_night,orders_per_week,orders_per_day,items_per_user,av_items_per_order,marital_status_code,homeowner_desc,household_size_desc,kid_category_desc,income_desc_mean,age_desc_mean,unique_orders_per_item,amount_items,retail_disc_item,coupon_disc_item,coupon_match_disc_item,value_per_item,av_item_price,av_item_discount,manufacturer,department,brand,commodity_desc,sub_commodity_desc,items_per_department,items_per_commodity_desc,items_per_sub_commodity_desc,0,1
22750,1609,1082185,383,1537,-2256.98,-77.48,-22.5,26081.05,68.096736,4.013055,196,35.0,206.0,1.0,141.0,4.352273,1.268212,17744636782,46330640.0,A,Homeowner,,,133.0,50.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.043719,0.956281
22744,1609,1082185,383,1537,-2256.98,-77.48,-22.5,26081.05,68.096736,4.013055,196,35.0,206.0,1.0,141.0,4.352273,1.268212,17744636782,46330640.0,A,Homeowner,,,133.0,50.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.043719,0.956281
22749,1609,1082185,383,1537,-2256.98,-77.48,-22.5,26081.05,68.096736,4.013055,196,35.0,206.0,1.0,141.0,4.352273,1.268212,17744636782,46330640.0,A,Homeowner,,,133.0,50.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.043719,0.956281
22748,1609,1082185,383,1537,-2256.98,-77.48,-22.5,26081.05,68.096736,4.013055,196,35.0,206.0,1.0,141.0,4.352273,1.268212,17744636782,46330640.0,A,Homeowner,,,133.0,50.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.043719,0.956281
22747,1609,1082185,383,1537,-2256.98,-77.48,-22.5,26081.05,68.096736,4.013055,196,35.0,206.0,1.0,141.0,4.352273,1.268212,17744636782,46330640.0,A,Homeowner,,,133.0,50.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.043719,0.956281
22746,1609,1082185,383,1537,-2256.98,-77.48,-22.5,26081.05,68.096736,4.013055,196,35.0,206.0,1.0,141.0,4.352273,1.268212,17744636782,46330640.0,A,Homeowner,,,133.0,50.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.043719,0.956281
22745,1609,1082185,383,1537,-2256.98,-77.48,-22.5,26081.05,68.096736,4.013055,196,35.0,206.0,1.0,141.0,4.352273,1.268212,17744636782,46330640.0,A,Homeowner,,,133.0,50.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.043719,0.956281
141995,2317,1082185,142,1789,-2889.98,-57.02,-14.8,12968.84,91.329859,12.598592,107,8.0,41.0,4.0,89.0,1.844156,1.100775,11783050050,82979230.0,A,Homeowner,,,63.0,40.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.045711,0.954289
141996,2317,1082185,142,1789,-2889.98,-57.02,-14.8,12968.84,91.329859,12.598592,107,8.0,41.0,4.0,89.0,1.844156,1.100775,11783050050,82979230.0,A,Homeowner,,,63.0,40.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.045711,0.954289
141994,2317,1082185,142,1789,-2889.98,-57.02,-14.8,12968.84,91.329859,12.598592,107,8.0,41.0,4.0,89.0,1.844156,1.100775,11783050050,82979230.0,A,Homeowner,,,63.0,40.0,27362.0,28384.0,-1924.23,0.0,0.0,27291.02,0.961493,0.0,2.0,PRODUCE,,TROPICAL FRUIT,BANANAS,3118.0,83.0,17.0,0.045711,0.954289


In [366]:
result2 = result.groupby(['user_id', 'item_id'], as_index=False).agg({1: 'max'}).rename(columns={1: 'proba'})
result2.sort_values(by=['proba'], ascending=False, inplace=True)
result2.drop('proba', axis=1, inplace=True)
result2.head(2)

Unnamed: 0,user_id,item_id
248240,1609,1082185
361401,2317,1082185


In [367]:
result2.shape

(389616, 2)

In [386]:
result_lvl_2 = result2.groupby('user_id', as_index=False).agg({'item_id': list}).rename(columns={'item_id': 'lightgbm_result'})
result_lvl_2.head()

Unnamed: 0,user_id,lightgbm_result
0,1,"[1029743, 1106523, 8293439, 15926844, 1126899,..."
1,3,"[1029743, 1106523, 1126899, 1070820, 1110244, ..."
2,6,"[1082185, 1098844, 1029743, 1106523, 12757544,..."
3,7,"[1029743, 1106523, 1126899, 1070820, 866211, 1..."
4,8,"[1029743, 1106523, 1070820, 1126899, 916122, 6..."


In [387]:
result_lvl_2.shape

(2042, 2)

In [392]:
result_lvl_2_actual = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2_actual.columns=['user_id', 'actual']
result_lvl_2_actual.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [393]:
result_lvl_2 = result_lvl_2.merge(result_lvl_2_actual, on='user_id', how='left')
result_lvl_2.head(2)

Unnamed: 0,user_id,lightgbm_result,actual
0,1,"[1029743, 1106523, 8293439, 15926844, 1126899,...","[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[1029743, 1106523, 1126899, 1070820, 1110244, ...","[835476, 851057, 872021, 878302, 879948, 90963..."


In [394]:
%%time
print('precision_at_k', 'lightgbm_result',
      result_lvl_2.apply(lambda row: precision_at_k(row['lightgbm_result'], row['actual']), axis=1).mean())

precision_at_k lightgbm_result 0.1659157688540623
Wall time: 165 ms


#### В двухуровневой моделе precision_at_k немного ниже, чем в моделе get_own_recommendations. Выше я измерил precision_at_k для get_own_recommendations = 0.17901578458681228