# Вебинар 6. Двухуровневые модели рекомендаций


Код модулей src (utils, metrics, recommenders) можно скачать из [этого](https://github.com/geangohn/recsys-tutorial) GitHub-репозитория.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split


import os
import sys

# Put the parent directory on sys.path (once) so the `src` package imported
# below can be resolved when the notebook runs from a sub-folder.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
# Load the transactions and the item / household attribute tables.
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# Column processing: lower-case the attribute column names and rename the key
# columns to the item_id / user_id names used by the transactions table.
item_features.columns = item_features.columns.str.lower()
user_features.columns = user_features.columns.str.lower()
item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})


# The train / validation scheme matters!
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# The size of the 2nd dataset (6 weeks) can be tuned with a learning curve
# (recall@k as a function of dataset size).
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

# Week boundaries, computed once from the last week present in the data.
# NOTE(review): the last slice uses `>= last_week - 3`; if week_no holds
# consecutive integers that is 4 distinct weeks, not 3 - confirm the intent.
week_no = data['week_no']
last_week = week_no.max()
lvl_2_start = last_week - val_lvl_2_size_weeks
lvl_1_start = lvl_2_start - val_lvl_1_size_weeks

data_train_lvl_1 = data[week_no < lvl_1_start]
data_val_lvl_1 = data[(week_no >= lvl_1_start) & (week_no < lvl_2_start)]

# Copied for clarity; it will diverge from data_val_lvl_1 once changes are added.
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[week_no >= lvl_2_start]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# Preview the first rows of the item attribute table.
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [4]:
# Preview the first rows of the household (user) attribute table.
user_features.head()

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


In [5]:
# Distinct item count before filtering, kept so the reduction can be reported.
n_items_before = data_train_lvl_1['item_id'].nunique()

# Shrink the level-1 training set with the project's prefilter_items helper
# (take_n_popular=5000).
data_train_lvl_1 = prefilter_items(
    data_train_lvl_1,
    item_features=item_features,
    take_n_popular=5000,
)

n_items_after = data_train_lvl_1['item_id'].nunique()
report = 'Decreased # items from {} to {}'.format(n_items_before, n_items_after)
print(report)

Decreased # items from 83685 to 5001


In [6]:
# Build the level-1 recommender from the prefiltered level-1 training data.
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [7]:
# Smoke check: 5 items from get_als_recommendations for user 2375.
recommender.get_als_recommendations(2375, N=5)

[1044078, 899624, 1106523, 871756, 844179]

In [8]:
# Smoke check: 5 items from get_own_recommendations for user 2375.
recommender.get_own_recommendations(2375, N=5)

[948640, 918046, 847962, 907099, 873980]

In [9]:
# Smoke check: 5 items from get_similar_items_recommendation for user 2375.
recommender.get_similar_items_recommendation(2375, N=5)

[1046545, 1044078, 1042907, 903183, 1133312]

In [10]:
# Smoke check: 5 items from get_similar_users_recommendation for user 2375.
recommender.get_similar_users_recommendation(2375, N=5)

[1129805, 1116050, 820923, 1096573, 1133850]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendations + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [11]:
# One row per validation user; `actual` holds the distinct items that user
# bought during the level-1 validation weeks.
result_lvl_1 = (
    data_val_lvl_1.groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [12]:
# Number of candidates requested from every generator.
N = 50

users_train = data_train_lvl_1['user_id'].tolist()
users_valid = result_lvl_1['user_id'].tolist()

# Validation users absent from the training data (cold start) versus the
# remaining validation users, which do appear in training.
valid_user_set = set(users_valid)
new_users = list(valid_user_set - set(users_train))
users_list = list(valid_user_set - set(new_users))

# Candidate generators under comparison and the shared top-N popular items.
recs = ['top', 'als', 'own', 'similar_items']
overall_top_purchases = recommender.overall_top_purchases[:N]

In [13]:
# Add one empty column per generator, then switch those columns to object
# dtype so that each cell can later hold a whole list of item ids.
result_lvl_1[recs] = [np.nan] * len(recs)
result_lvl_1 = result_lvl_1.astype({name: object for name in recs})

In [14]:
# Preview the frame with the (still empty) recommendation columns.
result_lvl_1.head()

Unnamed: 0,user_id,actual,top,als,own,similar_items
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...",,,,
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...",,,,
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...",,,,
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...",,,,
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...",,,,


In [15]:
# Row labels of validation users that are absent from the level-1 training data.
ind_new = result_lvl_1.index[result_lvl_1['user_id'].isin(new_users)]
ind_new

Int64Index([255, 1567, 1715], dtype='int64')

In [16]:
# Users unseen in training get the popular-items list from every generator.
# All of these cells reference the same list object.
for col in recs:
    for row_label in ind_new:
        result_lvl_1.at[row_label, col] = overall_top_purchases

In [17]:
# Row labels of validation users that also appear in the level-1 training data.
ind_users = result_lvl_1.index[result_lvl_1['user_id'].isin(users_list)]

In [18]:
# Fill the N candidates of every generator for users known from training.
for i in ind_users:
    # `i` is an index *label*, so read the user id with label-based .at;
    # positional .iloc[i, 0] is only correct while the index is 0..n-1 and
    # user_id is the first column.
    user = result_lvl_1.at[i, 'user_id']
    result_lvl_1.at[i, 'top'] = overall_top_purchases
    result_lvl_1.at[i, 'als'] = recommender.get_als_recommendations(user, N)
    result_lvl_1.at[i, 'own'] = recommender.get_own_recommendations(user, N)
    result_lvl_1.at[i, 'similar_items'] = recommender.get_similar_items_recommendation(user, N)

In [19]:
# Mean recall@N over all validation users, reported per candidate generator.
for rec in recs:
    per_user_recall = result_lvl_1.apply(
        lambda row: recall_at_k(row[rec], row['actual'], N),
        axis=1,
    )
    recall = per_user_recall.mean()
    print(f'recall_at_{N} {rec}: {round(recall, 4)}')

recall_at_50 top: 0.0432
recall_at_50 als: 0.0489
recall_at_50 own: 0.0652
recall_at_50 similar_items: 0.0337


##### Вывод:
Лучший recall выдают own recommendations и ALS (хотя top-popular близко к ALS)

In [20]:
# Candidate-list sizes to compare, with one object-typed placeholder column
# per size for the "own" recommendations.
k = [20, 50, 100, 200, 500]
rec_own_k = [f'own_rec_{size}' for size in k]
result_lvl_1[rec_own_k] = [np.nan] * len(k)
result_lvl_1 = result_lvl_1.astype({name: object for name in rec_own_k})



In [21]:
# Preview: the generator columns are filled, the per-k columns are still empty.
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,top,als,own,similar_items,own_rec_20,own_rec_50,own_rec_100,own_rec_200,own_rec_500
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[1029743, 1106523, 5569230, 916122, 844179, 10...","[1037332, 856942, 1046816, 1108844, 5577022, 1...","[856942, 9297615, 5577022, 877391, 9655212, 88...","[824758, 1007512, 9297615, 5577022, 888210, 98...",,,,,
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1029743, 1106523, 5569230, 916122, 844179, 10...","[5569230, 995816, 8090521, 1090931, 916122, 86...","[911974, 1076580, 1103898, 5567582, 1056620, 9...","[8090537, 5569845, 1044078, 985999, 880888, 81...",,,,,


In [22]:
# Fill the "own" recommendations for every candidate-list size in k.
# Note: the loop rebinds the notebook-level N and overall_top_purchases.
for i, N in enumerate(k):
    overall_top_purchases = recommender.overall_top_purchases[:N]

    # Users unseen in training fall back to the N most popular items.
    for ind in ind_new:
        result_lvl_1.at[ind, rec_own_k[i]] = overall_top_purchases

    for ind in ind_users:
        # `ind` is an index label: use label-based .at, not positional .iloc.
        user = result_lvl_1.at[ind, 'user_id']
        result_lvl_1.at[ind, rec_own_k[i]] = recommender.get_own_recommendations(user, N)

In [23]:
# Mean recall@N of the "own" recommendations for each list size.
for col, N in zip(rec_own_k, k):
    per_user_recall = result_lvl_1.apply(
        lambda row: recall_at_k(row[col], row['actual'], N),
        axis=1,
    )
    recall = per_user_recall.mean()
    print(f'recall_at_{N} {col}: {round(recall, 4)}')

recall_at_20 own_rec_20: 0.0392
recall_at_50 own_rec_50: 0.0652
recall_at_100 own_rec_100: 0.096
recall_at_200 own_rec_200: 0.1353
recall_at_500 own_rec_500: 0.1819


In [24]:
# One object-typed placeholder column per list size for the ALS recommendations.
rec_als_k = [f'als_rec_{size}' for size in k]
result_lvl_1[rec_als_k] = [np.nan] * len(k)
result_lvl_1 = result_lvl_1.astype({name: object for name in rec_als_k})

In [25]:
# Fill the ALS recommendations for every candidate-list size in k.
# Note: the loop rebinds the notebook-level N and overall_top_purchases.
for i, N in enumerate(k):
    overall_top_purchases = recommender.overall_top_purchases[:N]

    # Users unseen in training fall back to the N most popular items.
    for ind in ind_new:
        result_lvl_1.at[ind, rec_als_k[i]] = overall_top_purchases

    for ind in ind_users:
        # `ind` is an index label: use label-based .at, not positional .iloc.
        user = result_lvl_1.at[ind, 'user_id']
        result_lvl_1.at[ind, rec_als_k[i]] = recommender.get_als_recommendations(user, N)

In [26]:
# Mean recall@N of the ALS recommendations for each list size.
for col, N in zip(rec_als_k, k):
    per_user_recall = result_lvl_1.apply(
        lambda row: recall_at_k(row[col], row['actual'], N),
        axis=1,
    )
    recall = per_user_recall.mean()
    print(f'recall_at_{N} {col}: {round(recall, 4)}')

recall_at_20 als_rec_20: 0.0297
recall_at_50 als_rec_50: 0.0489
recall_at_100 als_rec_100: 0.0694
recall_at_200 als_rec_200: 0.0973
recall_at_500 als_rec_500: 0.147


In [27]:
# One object-typed placeholder column per list size for the top-popular baseline.
rec_top_k = [f'top_rec_{size}' for size in k]
result_lvl_1[rec_top_k] = [np.nan] * len(k)
result_lvl_1 = result_lvl_1.astype({name: object for name in rec_top_k})

In [28]:
# The top-popular baseline gives every user, new or known, the same N items.
for col, N in zip(rec_top_k, k):
    overall_top_purchases = recommender.overall_top_purchases[:N]
    for ind in list(ind_new) + list(ind_users):
        result_lvl_1.at[ind, col] = overall_top_purchases

In [29]:
# Mean recall@N of the top-popular baseline for each list size.
for col, N in zip(rec_top_k, k):
    per_user_recall = result_lvl_1.apply(
        lambda row: recall_at_k(row[col], row['actual'], N),
        axis=1,
    )
    recall = per_user_recall.mean()
    print(f'recall_at_{N} {col}: {round(recall, 4)}')

recall_at_20 top_rec_20: 0.0303
recall_at_50 top_rec_50: 0.0432
recall_at_100 top_rec_100: 0.0616
recall_at_200 top_rec_200: 0.0894
recall_at_500 top_rec_500: 0.1318


##### Вывод:
При увеличении k растёт и полнота, причём характер роста примерно одинаков для всех вариантов генерации кандидатов.
Поскольку большое количество кандидатов замедляет работу двухуровневой модели, оптимальным можно считать k ≈ 200: например, дальнейшее увеличение числа кандидатов в 2.5 раза (с 200 до 500) увеличивает полноту всего лишь примерно в полтора раза.

-------

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [30]:
# Replace the age and income bracket labels with one number per bracket.
# The result is assigned back instead of using `column.replace(..., inplace=True)`:
# mutating a selected column in place is chained assignment, which warns on
# pandas 2.x and leaves user_features untouched under Copy-on-Write.
age_bracket_values = {
    '19-24': 22, '25-34': 30, '35-44': 40, '45-54': 50, '55-64': 60, '65+': 70,
}
income_bracket_values = {
    'Under 15K': 10, '15-24K': 20, '25-34K': 30, '35-49K': 40,
    '50-74K': 62, '75-99K': 87, '100-124K': 112, '125-149K': 137,
    '150-174K': 162, '175-199K': 187, '200-249K': 225, '250K+': 275,
}

user_features['age_desc'] = user_features['age_desc'].replace(age_bracket_values)
user_features['income_desc'] = user_features['income_desc'].replace(income_bracket_values)

user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,70,A,40,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,50,A,62,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [31]:
# Encode the remaining categorical household attributes as small integer codes.
# Results are assigned back rather than mutated through
# `column.replace(..., inplace=True)`, which is chained assignment (warns on
# pandas 2.x, no effect under Copy-on-Write).
marital_status_codes = {'U': 0, 'A': 1, 'B': 2}
homeowner_codes = {
    'Unknown': 0, 'Probable Renter': 1, 'Renter': 2,
    'Probable Owner': 3, 'Homeowner': 4,
}
hh_comp_codes = {
    'Unknown': 0, 'Single Male': 1, 'Single Female': 2,
    '1 Adult Kids': 3, '2 Adults No Kids': 4, '2 Adults Kids': 5,
}

user_features['marital_status_code'] = user_features['marital_status_code'].replace(marital_status_codes)
user_features['homeowner_desc'] = user_features['homeowner_desc'].replace(homeowner_codes)
user_features['hh_comp_desc'] = user_features['hh_comp_desc'].replace(hh_comp_codes)

user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,70,1,40,4,4,2,None/Unknown,1
1,50,1,62,4,4,2,None/Unknown,7


In [32]:
# Binary brand flag: 0 where the brand is 'Private', 1 for everything else.
is_private_brand = item_features['brand'] == 'Private'
item_features['brand'] = np.where(is_private_brand, 0, 1)


In [33]:
# Preview item_features after the brand column was turned into a 0/1 flag.
item_features.head()

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,1,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,1,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,0,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,0,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,0,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [34]:
# Preview the level-1 training transactions that the features below are built from.
data_train_lvl_1.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
11,1364,26984896261,1,999999,1,2.19,31742,0.0,1520,1,0.0,0.0,2.19
12,1364,26984896261,1,999999,1,2.99,31742,-0.4,1520,1,0.0,0.0,2.99
13,1364,26984896261,1,999999,1,3.09,31742,0.0,1520,1,0.0,0.0,3.09
14,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,0.0,0.0,2.5


In [35]:
# User feature: average basket total (mean_check) over the level-1 training data.
basket_totals = data_train_lvl_1.groupby(
    ['user_id', 'basket_id'], as_index=False
)['sales_value'].sum()
df = (
    basket_totals.groupby('user_id')['sales_value']
    .mean()
    .rename('mean_check')
    .reset_index()
)

# The merge uses the default join type, so only users present in both tables remain.
user_features = user_features.merge(df, on='user_id')
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,mean_check
0,70,1,40,4,4,2,None/Unknown,1,37.4205
1,50,1,62,4,4,2,None/Unknown,7,44.355676


In [36]:
# Item feature: mean `price` per item over the level-1 training data.
df = data_train_lvl_1.groupby('item_id', as_index=False)['price'].mean()

# The merge uses the default join type, so item_features keeps only items that
# occur in data_train_lvl_1.
# NOTE(review): this reduced table is later passed to prefilter_items for
# data_train - confirm that is intended.
item_features = item_features.merge(df, on='item_id')
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price
0,117847,450,NUTRITION,1,REFRIGERATED,SOY/RICE MILK,64 OZ,3.041364
1,818981,194,GROCERY,1,COLD CEREAL,ALL FAMILY CEREAL,10.4 OZ,3.222113


-----

In [37]:
# One row per user of the level-2 validation weeks; `actual` holds the
# distinct items that user bought in that period.
result_lvl_2 = (
    data_val_lvl_2.groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


##### 1. Одноуровневая модель

In [38]:
# Training data for the single-level model: everything before the level-2
# validation weeks.
lvl_2_cutoff = data['week_no'].max() - val_lvl_2_size_weeks
data_train = data[data['week_no'] < lvl_2_cutoff]

n_items_before = data_train['item_id'].nunique()
data_train = prefilter_items(
    data_train,
    item_features=item_features,
    take_n_popular=5000,
)

n_items_after = data_train['item_id'].nunique()
report = 'Decreased # items from {} to {}'.format(n_items_before, n_items_after)
print(report)

Decreased # items from 86865 to 5001


In [39]:
# Rebuild the recommender on data_train; this rebinds the notebook-level `recommender`.
recommender = MainRecommender(data_train)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [40]:
# Level-1 baseline: N "own" recommendations per level-2 validation user.
N = 5
users_train = data_train['user_id'].tolist()
users_valid = result_lvl_2['user_id'].tolist()
# Validation users absent from training versus those present in it.
new_users = list(set(users_valid) - set(users_train))
users_list = list(set(users_valid) - set(new_users))

overall_top_purchases = recommender.overall_top_purchases[:N]

# Object dtype so that each cell can hold a list of item ids.
result_lvl_2['lvl_1_recommendations'] = np.nan
result_lvl_2['lvl_1_recommendations'] = result_lvl_2['lvl_1_recommendations'].astype(object)

# Users unseen in training fall back to the N most popular items.
ind_new = result_lvl_2.index[result_lvl_2['user_id'].isin(new_users)]
for i in ind_new:
    result_lvl_2.at[i, 'lvl_1_recommendations'] = overall_top_purchases

ind_users = result_lvl_2.index[result_lvl_2['user_id'].isin(users_list)]
for i in ind_users:
    # `i` is an index label: use label-based .at, not positional .iloc.
    user = result_lvl_2.at[i, 'user_id']
    result_lvl_2.at[i, 'lvl_1_recommendations'] = recommender.get_own_recommendations(user, N)

In [41]:
# Preview actual purchases next to the level-1 recommendations.
result_lvl_2.head()

Unnamed: 0,user_id,actual,lvl_1_recommendations
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 9297615, 5577022, 8293439, 9655212]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1092937, 1008714, 12132312, 1075979, 998206]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[13003092, 995598, 972416, 13115971, 923600]"
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[896666, 998519, 7147142, 9338009, 939681]"
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[926808, 12808385, 981660, 7410201, 847374]"


In [42]:
# precision@5 of the level-1 recommendations, averaged over every row of result_lvl_2.
result_lvl_2.apply(lambda row: precision_at_k(row['lvl_1_recommendations'], row['actual'], 5), axis=1).mean()

0.15484818805092904

##### 2. Двухуровневая модель 

In [43]:
# Users of the level-2 training period, one row each.
users_lvl_2 = pd.DataFrame({'user_id': data_train_lvl_2['user_id'].unique()})

# Warm start only for now: keep users that also occur in the level-1 training
# data. The explicit .copy() makes the filtered frame independent, so adding a
# column below is not a write to a slice of the unfiltered frame.
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)].copy()

# `recommender` was rebound to the data_train model earlier, so build the
# level-1 model on data_train_lvl_1 again before generating candidates.
recommender = MainRecommender(data_train_lvl_1)
users_lvl_2['items200'] = users_lvl_2['user_id'].apply(
    lambda x: recommender.get_own_recommendations(x, N=200)
)
users_lvl_2.head(2)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))




Unnamed: 0,user_id,items200
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1019142, 102..."


In [44]:
# (number of warm users, number of columns)
users_lvl_2.shape

(2151, 2)

In [45]:
# Flatten the per-user candidate lists into one (user_id, item_id) row per candidate.
# Each user id is repeated by the length of that user's own list, instead of by
# the length of `items200[0]`: that was a label lookup (KeyError if the row
# labelled 0 was filtered out) and assumed every list has the same length.
candidate_counts = users_lvl_2['items200'].map(len).to_numpy()
df = pd.DataFrame({
    'user_id': np.repeat(users_lvl_2['user_id'].to_numpy(), candidate_counts),
    'item_id': np.concatenate(users_lvl_2['items200'].to_numpy()),
})

targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # this table contains purchases only

# Candidates that were bought in the level-2 training period get target 1, the rest 0.
# NOTE(review): data_train_lvl_2 may hold the same (user, item) pair several
# times, in which case this left merge repeats the candidate row - consider
# drop_duplicates() on the purchases if that is not intended.
targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')
# Assign back: fillna(..., inplace=True) on a selected column is chained assignment.
targets_lvl_2['target'] = targets_lvl_2['target'].fillna(0)

# Share of positive rows.
targets_lvl_2['target'].mean()

0.06198326333060635

In [46]:
# Attach item and user attributes to every candidate row; left joins keep
# candidates whose item or user has no attribute row (values become NaN).
targets_lvl_2 = (
    targets_lvl_2
    .merge(item_features, on='item_id', how='left')
    .merge(user_features, on='user_id', how='left')
)
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,mean_check
0,2070,1105426,0.0,69,DELI,0,SANDWICHES,SANDWICHES - (COLD),,3.99,50.0,0.0,62.0,0.0,0.0,1,None/Unknown,23.48875
1,2070,1097350,0.0,2468,GROCERY,1,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,11.47,50.0,0.0,62.0,0.0,0.0,1,None/Unknown,23.48875


In [47]:
# (number of candidate rows, number of columns) after the feature merges.
targets_lvl_2.shape

(437602, 18)

In [48]:
# Column dtypes and non-null counts of the level-2 training table.
targets_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 437602 entries, 0 to 437601
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   user_id               437602 non-null  int64  
 1   item_id               437602 non-null  int64  
 2   target                437602 non-null  float64
 3   manufacturer          437602 non-null  int64  
 4   department            437602 non-null  object 
 5   brand                 437602 non-null  int32  
 6   commodity_desc        437602 non-null  object 
 7   sub_commodity_desc    437602 non-null  object 
 8   curr_size_of_product  437602 non-null  object 
 9   price                 437602 non-null  float64
 10  age_desc              162827 non-null  float64
 11  marital_status_code   162827 non-null  float64
 12  income_desc           162827 non-null  float64
 13  homeowner_desc        162827 non-null  float64
 14  hh_comp_desc          162827 non-null  float64
 15  

In [49]:
# Columns fed to the level-2 model.
SELECTED_FEATURES_NAMES = [
    'brand',
    'price',
    'age_desc',
    'income_desc',
    'mean_check',
    'manufacturer',
    'marital_status_code',
    'homeowner_desc',
    'hh_comp_desc',
]

# Subset of the selected columns that is declared categorical to LightGBM.
categorical = [
    'manufacturer',
    'brand',
    'marital_status_code',
    'homeowner_desc',
    'hh_comp_desc',
]

In [50]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [51]:
# Hold out 20% of the candidate rows, stratified by target, for early stopping.
# Missing feature values are filled with 0.
X_train, X_valid, y_train, y_valid = train_test_split(
    targets_lvl_2[SELECTED_FEATURES_NAMES].fillna(0),
    targets_lvl_2[['target']],
    test_size=0.2,
    random_state=16,
    stratify=targets_lvl_2[['target']],
)

dtrain = lgb.Dataset(X_train, y_train, categorical_feature=categorical)
# reference=dtrain ties the validation set to the training set's feature binning.
dvalid = lgb.Dataset(X_valid, y_valid, categorical_feature=categorical, reference=dtrain)

# Parameters for the native lgb.train API.
# - The round count is passed once via num_boost_round below. The dict used to
#   carry both "num_boost_round": 10000 and its alias "n_estimators": 1500; the
#   recorded training log stopped at iteration 1500, so 1500 is kept.
# - "class_weight" was dropped: it is an argument of the scikit-learn wrapper
#   (LGBMClassifier), not a core parameter, so lgb.train did not apply it.
#   Use "is_unbalance" or "scale_pos_weight" here if class balancing is wanted.
params_lgb = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",  # alternative: "binary_logloss"
    "learning_rate": 0.1,
    "max_depth": 7,
    "n_jobs": 4,
    "seed": 42,
}

# Early stopping and logging go through callbacks: the `early_stopping_rounds`
# and `verbose_eval` keyword arguments were removed from lgb.train in LightGBM 4.
# Categorical features are already declared on the Datasets.
model_lgb = lgb.train(
    params=params_lgb,
    train_set=dtrain,
    num_boost_round=1500,
    valid_sets=[dtrain, dvalid],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=1000),
    ],
)



Training until validation scores don't improve for 50 rounds
[1000]	training's auc: 0.860768	valid_1's auc: 0.779246
Did not meet early stopping. Best iteration is:
[1500]	training's auc: 0.876918	valid_1's auc: 0.783188


In [52]:
# Score every candidate row of targets_lvl_2 (missing features filled with 0).
# NOTE(review): these are the same rows the model was trained and early-stopped on.
train_preds = model_lgb.predict(targets_lvl_2[SELECTED_FEATURES_NAMES].fillna(0))

In [53]:
# Attach the scores to the (user, item) pairs. .assign() builds a new frame, so
# nothing is written into a slice of targets_lvl_2 (the old column assignment
# raised SettingWithCopyWarning).
df = targets_lvl_2[['user_id', 'item_id']].assign(predictions=train_preds)

# A pair can occur on several rows; collapse them to the median score.
df = df.groupby(['user_id', 'item_id'])['predictions'].median().reset_index()

# Keep the 5 best-scored items of every user, best first.
df = df.sort_values(['predictions'], ascending=False).groupby(['user_id']).head(5)

# One row per user holding the array of recommended item ids.
df = df.groupby('user_id')['item_id'].unique().reset_index()
df.columns = ['user_id', 'lgb_recommendations']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [54]:
# Attach the two-level recommendations to the validation frame.
# NOTE(review): merge defaults to an inner join, so validation users without a
# row in `df` are dropped here and metrics computed afterwards cover fewer
# users than the level-1 precision computed before this merge.
result_lvl_2 = result_lvl_2[['user_id', 'actual', 'lvl_1_recommendations']].merge(df, on='user_id')
result_lvl_2

Unnamed: 0,user_id,actual,lvl_1_recommendations,lgb_recommendations
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[856942, 9297615, 5577022, 8293439, 9655212]","[8293439, 10149640, 1082269, 969403, 9527558]"
1,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[13003092, 995598, 972416, 13115971, 923600]","[1098844, 1082185, 878715, 907099, 942166]"
2,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[896666, 998519, 7147142, 9338009, 939681]","[9338009, 1070820, 1029743, 1126899, 899624]"
3,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[926808, 12808385, 981660, 7410201, 847374]","[12302069, 1090507, 7167903, 1004475, 991592]"
4,9,"[864335, 990865, 1029743, 9297474, 10457112, 8...","[889692, 1135468, 10456457, 1056005, 872146]","[1082185, 1029743, 1106523, 862799, 1126899]"
...,...,...,...,...
1910,2496,[6534178],"[872826, 983665, 12452939, 991546, 1134296]","[916122, 995876, 5569230, 933835, 839243]"
1911,2497,"[1016709, 9835695, 1132298, 16809501, 845294, ...","[870515, 1102207, 1117219, 1010950, 1103513]","[1103513, 1082185, 12731517, 1010950, 1121384]"
1912,2498,"[15716530, 834484, 901776, 914190, 958382, 972...","[1100379, 1022066, 1076580, 931579, 5565356]","[907993, 896074, 1029743, 1126899, 942045]"
1913,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[7168055, 1128395, 6904613, 5570048, 830202]","[846417, 1060872, 1126899, 837644, 1138858]"


In [55]:
# Compare both models on the SAME users. result_lvl_2 now holds only users that
# received LightGBM recommendations, so the level-1 precision is recomputed on
# this subset instead of reusing the value measured on all validation users.
precision_lvl_1_same_users = result_lvl_2.apply(
    lambda row: precision_at_k(row['lvl_1_recommendations'], row['actual'], 5), axis=1
).mean()
precision_lgb = result_lvl_2.apply(
    lambda row: precision_at_k(row['lgb_recommendations'], row['actual'], 5), axis=1
).mean()

print(f'precision@5 level-1 (same users): {precision_lvl_1_same_users}')
# Left as the final expression so the notebook still displays the two-level precision@5.
precision_lgb


0.17577023498694305

##### Вывод:

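precision@5 при двухуровневой модели вырос (0.1548 → 0.1758). Замечание: значение 0.1548 посчитано по всем пользователям валидации, а 0.1758 — только по тем, для кого есть предсказания LightGBM (после inner merge), поэтому для строгого сравнения метрики нужно считать на одном и том же наборе пользователей.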