In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix, coo_matrix

from sklearn.metrics import f1_score

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

## 1. Подготовка данных

In [2]:
data = pd.read_csv('../data/raw/retail_train.csv')
#data = pd.read_csv('../data/retail_train_sample.csv')
#data = data.drop(['Unnamed: 0'], axis=1)

item_features = pd.read_csv('../data/raw/product.csv')
user_features = pd.read_csv('../data/raw/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2396804 entries, 0 to 2396803
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   user_id            int64  
 1   basket_id          int64  
 2   day                int64  
 3   item_id            int64  
 4   quantity           int64  
 5   sales_value        float64
 6   store_id           int64  
 7   retail_disc        float64
 8   trans_time         int64  
 9   week_no            int64  
 10  coupon_disc        float64
 11  coupon_match_disc  float64
dtypes: float64(4), int64(8)
memory usage: 219.4 MB


In [4]:
data = data.sort_values('day')
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
151,718,26985360571,1,934676,1,1.37,324,-0.42,1115,1,-1.0,0.0
153,718,26985360571,1,947849,1,1.25,324,-0.44,1115,1,0.0,0.0
154,718,26985360571,1,948756,3,2.12,324,-0.9,1115,1,-0.75,-0.25
155,718,26985360571,1,950439,1,1.0,324,-0.59,1115,1,0.0,0.0


### а) Новые признаки пользотелей-товаров

In [5]:
def get_usr_itm_feats(input_data):
    
    # час совершения транзакции
    data = input_data.copy()
    data['hour'] = data['trans_time'] // 100
    user_item_features = data.groupby(['user_id', 'item_id'])['hour'].median().reset_index()
    user_item_features.columns = ['user_id', 'item_id', 'median_sales_hour']
    
    # день недели совершения транзакции
    data['weekday'] = data['day'] % 7
    df = data.groupby(['user_id', 'item_id'])['weekday'].median().reset_index()
    df.columns = ['user_id', 'item_id', 'median_weekday']
    user_item_features = user_item_features.merge(df, on=['user_id', 'item_id'])
    
    # cреднее кол-во дней между покупками
    df = data.groupby('user_id')['day'].nunique().reset_index()
    df['mean_visits_interval'] = (data.groupby('user_id')['day'].max() - data.groupby('user_id')['day'].min()) / df['day']
    user_item_features = user_item_features.merge(df[['user_id', 'mean_visits_interval']], on=['user_id'])
    
    # средний чек корзины клиента
    df = data.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
    df = df.groupby('user_id')['sales_value'].mean().reset_index()
    df.columns = ['user_id', 'mean_check']
    user_item_features = user_item_features.merge(df, on=['user_id'])
    
    # кол-во магазинов, в которых продавался товар
    df = data.groupby(['item_id'])['store_id'].nunique().reset_index()
    df.columns = ['item_id', 'n_stores']
    user_item_features = user_item_features.merge(df, on=['item_id'])
    
    # кол-во уникальных товаров, купленных клиентом
    df = data.groupby(['user_id'])['item_id'].nunique().reset_index()
    df.columns = ['user_id', 'n_items']
    user_item_features = user_item_features.merge(df, on=['user_id'])
    
    # кол-во транзакций клиента
    df = data.groupby(['user_id'])['item_id'].count().reset_index()
    df.columns = ['user_id', 'n_transactions']
    user_item_features = user_item_features.merge(df, on=['user_id'])
    
    # mean / max / std кол-ва уникальных товаров в корзине клиента
    df = data.groupby(['user_id', 'basket_id'])['item_id'].nunique().reset_index()
    df1 = df.groupby('user_id')['item_id'].mean().reset_index()
    df1.columns = ['user_id', 'mean_n_items_basket']
    user_item_features = user_item_features.merge(df1, on=['user_id'])

    df2 = df.groupby('user_id')['item_id'].max().reset_index()
    df2.columns = ['user_id', 'max_n_items_basket']
    user_item_features = user_item_features.merge(df2, on=['user_id'])

    df3 = df.groupby('user_id')['item_id'].std().reset_index()
    df3.columns = ['user_id', 'std_n_items_basket']
    user_item_features = user_item_features.merge(df3, on=['user_id'])
    
    # mean / max / std кол-ва уникальных категорий в корзине клиента
    data = data.merge(item_features[['item_id', 'commodity_desc']], on=['item_id'])
    df = data.groupby(['user_id', 'basket_id'])['commodity_desc'].nunique().reset_index()
    df1 = df.groupby('user_id')['commodity_desc'].mean().reset_index()
    df1.columns = ['user_id', 'mean_n_item_categories_basket']
    user_item_features = user_item_features.merge(df1, on=['user_id'])

    df2 = df.groupby('user_id')['commodity_desc'].max().reset_index()
    df2.columns = ['user_id', 'max_n_item_categories_basket']
    user_item_features = user_item_features.merge(df2, on=['user_id'])

    df3 = df.groupby('user_id')['commodity_desc'].std().reset_index()
    df3.columns = ['user_id', 'std_n_item_categories_basket']
    user_item_features = user_item_features.merge(df3, on=['user_id'])
    
    #Последний день когда было взаимодействие пользователя с товаром
    usr_itm_last_rec_day = data.groupby(['user_id', 'item_id'])['day'].max().reset_index()
    usr_itm_last_rec_day.columns=['user_id', 'item_id', 'last_rec_day']
    usr_itm_last_rec_day['last_rec_day'].astype('int64')
    user_item_features = user_item_features.merge(usr_itm_last_rec_day, on=['user_id', 'item_id'])
    
    return user_item_features

In [6]:
usr_itm_feats = get_usr_itm_feats(data)
usr_itm_feats.head(5)

Unnamed: 0,user_id,item_id,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,mean_n_items_basket,max_n_items_basket,std_n_items_basket,mean_n_item_categories_basket,max_n_item_categories_basket,std_n_item_categories_basket,last_rec_day
0,1,819312,18.0,4.0,,50.125443,3,627,1589,20.113924,55,15.409777,15.873418,41,11.636996,536
1,1,820165,12.0,1.5,,50.125443,110,627,1589,20.113924,55,15.409777,15.873418,41,11.636996,610
2,1,821815,16.0,3.0,,50.125443,18,627,1589,20.113924,55,15.409777,15.873418,41,11.636996,311
3,1,821867,12.0,2.0,,50.125443,85,627,1589,20.113924,55,15.409777,15.873418,41,11.636996,639
4,1,823721,13.0,4.0,,50.125443,101,627,1589,20.113924,55,15.409777,15.873418,41,11.636996,291


### б) Новые признаки товаров 

In [7]:
# средняя цена за товар
data_tmp = data.copy()
data_tmp = data_tmp[data_tmp['quantity'] > 0]
data_tmp['price'] = data_tmp['sales_value'] / data_tmp['quantity']

In [8]:
item_price_mean = data_tmp.groupby('item_id')['price'].mean().reset_index()
item_price_mean.columns=['item_id', 'price']
item_price_mean.head(2)

Unnamed: 0,item_id,price
0,25671,3.49
1,26081,0.99


In [9]:
item_price_mean[item_price_mean['item_id']==993339]

Unnamed: 0,item_id,price
25092,993339,1.982308


In [10]:
item_features = item_features.merge(item_price_mean, on='item_id', how='left')
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,3.49
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,0.99


In [11]:
#Последний день когда товар покупался
item_last_sales_day = data_tmp.groupby('item_id')['day'].max().reset_index()
item_last_sales_day.columns=['item_id', 'last_sales_day']
item_last_sales_day.head(2)

Unnamed: 0,item_id,last_sales_day
0,25671,410
1,26081,250


In [12]:
item_last_sales_day[item_last_sales_day['item_id']==993339]

Unnamed: 0,item_id,last_sales_day
25092,993339,663


In [13]:
item_features = item_features.merge(item_last_sales_day, on='item_id', how='left')
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price,last_sales_day
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,3.49,410.0
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,0.99,250.0


In [14]:
item_features[item_features['item_id']==993339]

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,price,last_sales_day
25561,993339,69,GROCERY,Private,YOGURT,YOGURT NOT MULTI-PACKS,32 OZ,1.982308,663.0


In [15]:
item_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92353 entries, 0 to 92352
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   item_id               92353 non-null  int64  
 1   manufacturer          92353 non-null  int64  
 2   department            92353 non-null  object 
 3   brand                 92353 non-null  object 
 4   commodity_desc        92353 non-null  object 
 5   sub_commodity_desc    92353 non-null  object 
 6   curr_size_of_product  92353 non-null  object 
 7   price                 88743 non-null  float64
 8   last_sales_day        88743 non-null  float64
dtypes: float64(2), int64(2), object(5)
memory usage: 7.0+ MB


### в) Новые признаки пользователя

In [16]:
#Есть ли дети
user_features["has_kid"] = user_features["kid_category_desc"].isin(['1','2', '3+'])
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,has_kid
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,False
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,False


In [17]:
# статус дохода можно сделать числовым и порядковым, чтоб по нему можно было делать более точный прогноз
user_features["income"] = 0
user_features.loc[user_features["income_desc"] == 'Under 15K', 'income'] = 1 
user_features.loc[user_features["income_desc"] == '15-24K', 'income'] = 2
user_features.loc[user_features["income_desc"] == '25-34K', 'income'] = 3
user_features.loc[user_features["income_desc"] == '35-49K', 'income'] = 4
user_features.loc[user_features["income_desc"] == '50-74K', 'income'] = 5
user_features.loc[user_features["income_desc"] == '75-99K', 'income'] = 6
user_features.loc[user_features["income_desc"] == '100-124K', 'income'] = 7
user_features.loc[user_features["income_desc"] == '125-149K', 'income'] = 8
user_features.loc[user_features["income_desc"] == '150-174K', 'income'] = 9
user_features.loc[user_features["income_desc"] == '175-199K', 'income'] = 10
user_features.loc[user_features["income_desc"] == '200-249K', 'income'] = 11
user_features.loc[user_features["income_desc"] == '250K+', 'income'] = 12

user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,has_kid,income
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,False,4
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,False,5


In [18]:
# порядковая категоря возраста - чтоб по ней можно было делаше более точный прогноз
user_features["age"] = 0
user_features.loc[user_features["age_desc"] ==  '19-24', 'age'] = 1 
user_features.loc[user_features["age_desc"] == '25-34', 'age'] = 2
user_features.loc[user_features["age_desc"] == '35-44', 'age'] = 3
user_features.loc[user_features["age_desc"] == '45-54', 'age'] = 4
user_features.loc[user_features["age_desc"] == '45-54', 'age'] = 5
user_features.loc[user_features["age_desc"] == '65+', 'age'] = 6

user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,has_kid,income,age
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,False,4,6
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,False,5,5


In [19]:
# аналогично для household_size_desc
user_features["household_size"] = 0
user_features.loc[user_features["household_size_desc"] ==  '1', 'household_size'] = 1 
user_features.loc[user_features["household_size_desc"] == '2', 'household_size'] = 2
user_features.loc[user_features["household_size_desc"] == '3', 'household_size'] = 3
user_features.loc[user_features["household_size_desc"] == '4', 'household_size'] = 4
user_features.loc[user_features["household_size_desc"] == '5+', 'household_size'] = 5

user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,has_kid,income,age,household_size
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,False,4,6,2
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,False,5,5,2


In [20]:
user_features = user_features.drop(['kid_category_desc','income_desc', 'age_desc', 'household_size_desc'], axis=1)
new_cat_feats = ['has_kid', 'income', 'age', 'household_size']
user_features[new_cat_feats] = user_features[new_cat_feats].astype('category')

In [21]:
user_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   marital_status_code  801 non-null    object  
 1   homeowner_desc       801 non-null    object  
 2   hh_comp_desc         801 non-null    object  
 3   user_id              801 non-null    int64   
 4   has_kid              801 non-null    category
 5   income               801 non-null    category
 6   age                  801 non-null    category
 7   household_size       801 non-null    category
dtypes: category(4), int64(1), object(3)
memory usage: 29.2+ KB


### г) Подготовка обучающих и валидационных данных

In [22]:
# -- давние покупки -- | -- 6 недель -- | -- 3 недели -- 
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
151,718,26985360571,1,934676,1,1.37,324,-0.42,1115,1,-1.0,0.0


In [23]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, delete_popular=False, item_features=item_features, take_n_popular=500)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 501


## 2. Отбор кандидатов

Подбор параметров модели отбора кандидатов

In [24]:
result_train = data_train_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_train.columns=['user_id', 'actual']

result_val = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_val.columns=['user_id', 'actual']

In [25]:
score = []
recommenders = []
best_pr = 0
recommender = None

for factors in range(35, 45, 1):
    recommenders.append(MainRecommender(data_train_lvl_1, weighting='tfidf_weight', n_factors=factors))

    column_name = 'als_f{}'.format(factors)

    result_train[column_name] = result_train['user_id'].apply(lambda x: recommenders[-1].get_als_recommendations(x, N=50))
    pr_train = result_train.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()
    rec_train = result_train.apply(lambda row: recall_at_k(row[column_name], row['actual']), axis=1).mean()
    
    result_val[column_name] = result_val['user_id'].apply(lambda x: recommenders[-1].get_als_recommendations(x, N=50))
    pr_val = result_val.apply(lambda row: precision_at_k(row[column_name], row['actual']), axis=1).mean()
    rec_val = result_val.apply(lambda row: recall_at_k(row[column_name], row['actual']), axis=1).mean()
    
    score.append([factors, pr_train, rec_train, pr_val, rec_val])
    if best_pr < pr_val:
        best_pr = pr_val
        recommender = recommenders[-1]



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=501.0), HTML(value='')))


Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=501.0), HTML(value='')))


Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=501.0), HTML(value='')))


Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=501.0), HTML(value='')))


Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=501.0), HTML(value='')))


Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=501.0), HTML(value='')))


Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=501.0), HTML(value='')))


Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=501.0), HTML(value='')))


Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=501.0), HTML(value='')))


Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=501.0), HTML(value='')))


Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


In [26]:
score_df = pd.DataFrame(score, columns=['factors','pr_train', 'rec_train', 'pr_val', 'rec_val'])
score_df.sort_values('pr_val', ascending=False)

Unnamed: 0,factors,pr_train,rec_train,pr_val,rec_val
8,43,0.829339,0.168208,0.165924,0.019666
1,36,0.789739,0.154679,0.16546,0.019179
3,38,0.804329,0.159678,0.164717,0.019568
0,35,0.782605,0.153908,0.164067,0.0191
7,42,0.824208,0.165821,0.164067,0.019907
4,39,0.806733,0.161245,0.163788,0.019246
6,41,0.818196,0.164821,0.163603,0.019242
2,37,0.797916,0.15743,0.163138,0.018865
5,40,0.818277,0.164238,0.162581,0.019315
9,44,0.833747,0.168596,0.159424,0.019283


Качество модели отбора кандидатов

In [27]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']

result_lvl_1['als'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=50))
#result_lvl_1['als'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
pr = result_lvl_1.apply(lambda row: precision_at_k(row['als'], row['actual']), axis=1).mean()
rec = result_lvl_1.apply(lambda row: recall_at_k(row['als'], row['actual']), axis=1).mean()
pr, rec

Get recommendations error. Return top. User ID: 296
Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984


(0.16592386258124184, 0.019666217313196867)

### 3. Модель 2-го уровня

In [28]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

In [29]:
warm_start = False
if warm_start:
    train_users = data_train_lvl_1['user_id'].unique()
    users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=100))
#users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=100))

Get recommendations error. Return top. User ID: 1813
Get recommendations error. Return top. User ID: 1984
Get recommendations error. Return top. User ID: 296


In [30]:
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,628,"[1029743, 8090521, 1106523, 5569230, 902172, 1..."
1,1631,"[12810393, 916122, 844179, 1044078, 12384775, ..."


In [31]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,628,1029743,1
0,628,8090521,1
0,628,1106523,1
0,628,5569230,1


In [32]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [33]:
targets_lvl_2 = targets_lvl_2.merge(usr_itm_feats, on=['user_id', 'item_id'], how='left')
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,...,curr_size_of_product,price,last_sales_day,marital_status_code,homeowner_desc,hh_comp_desc,has_kid,income,age,household_size
0,628,1029743,1.0,18.5,4.0,32.647059,61.258235,113.0,240.0,271.0,...,1 GA,2.412823,663.0,,,,,,,
1,628,8090521,0.0,21.0,0.0,32.647059,61.258235,87.0,240.0,271.0,...,12 OZ,3.258206,663.0,,,,,,,


In [34]:
TASK = Task(name='binary', metric=lambda y_true, y_pred: f1_score(y_true, (y_pred > 0.5)*1), greater_is_better=False)
TIMEOUT = 300000
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 42
TARGET_NAME = 'target'
TEST_SIZE=0.2

In [35]:
roles = {'target': TARGET_NAME, 'drop': ['user_id, item_id']}

In [36]:
automl_model = TabularAutoML(task=TASK,
                             timeout=TIMEOUT,
                             cpu_limit = N_THREADS,
                             gpu_ids='all',
                             reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                             general_params={'use_algos': [['lgb_tuned', 'cb_tuned', 'cb', 'lgb'], ['lgb_tuned', 'cb']]},
                             tuning_params={'max_tuning_iter': 10},
                            )

In [37]:
oof_pred = automl_model.fit_predict(targets_lvl_2, roles=roles)

INFO:optuna.storages._in_memory:A new study created in memory with name: no-name-1b1ce116-636f-411a-8eb7-6b9258b2e726
INFO:optuna.study.study:Trial 0 finished with value: -0.0 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 244}. Best is trial 0 with value: -0.0.
INFO:optuna.study.study:Trial 1 finished with value: -0.0 and parameters: {'feature_fraction': 0.8659969709057025, 'num_leaves': 159}. Best is trial 0 with value: -0.0.
INFO:optuna.study.study:Trial 2 finished with value: -0.0 and parameters: {'feature_fraction': 0.5780093202212182, 'num_leaves': 53}. Best is trial 0 with value: -0.0.
INFO:optuna.study.study:Trial 3 finished with value: -0.0 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 223}. Best is trial 0 with value: -0.0.
INFO:optuna.study.study:Trial 4 finished with value: -0.0 and parameters: {'feature_fraction': 0.8005575058716043, 'num_leaves': 185}. Best is trial 0 with value: -0.0.
INFO:optuna.study.study:Trial 5 finish

In [38]:
train_pred = automl_model.predict(targets_lvl_2)

In [39]:
f1_score(targets_lvl_2[['target']], (train_pred.data[:, 0] > 0.4)*1)

0.6999539999163634

__Валидация модели 2-го уровня__

In [40]:
users_lvl_2_val = pd.DataFrame(data_val_lvl_2['user_id'].unique())
users_lvl_2_val.columns = ['user_id']

users_lvl_2_val['candidates'] = users_lvl_2_val['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=100))

Get recommendations error. Return top. User ID: 1984
Get recommendations error. Return top. User ID: 2259


In [41]:
users_lvl_2_val.head(2)

Unnamed: 0,user_id,candidates
0,800,"[5569471, 916122, 857503, 999858, 897954, 5568..."
1,1985,"[1004906, 961979, 1043751, 8090440, 866211, 10..."


In [42]:
s_val = users_lvl_2_val.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s_val.name = 'item_id'

users_lvl_2_val = users_lvl_2_val.drop('candidates', axis=1).join(s_val)
users_lvl_2_val.head()

Unnamed: 0,user_id,item_id
0,800,5569471
0,800,916122
0,800,857503
0,800,999858
0,800,897954


In [43]:
targets_lvl_2_val = data_val_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2_val['target'] = 1  # тут только покупки 

targets_lvl_2_val = users_lvl_2_val.merge(targets_lvl_2_val, on=['user_id', 'item_id'], how='left')

targets_lvl_2_val['target'].fillna(0, inplace= True)

In [44]:
targets_lvl_2_val = targets_lvl_2_val.merge(usr_itm_feats, on=['user_id', 'item_id'], how='left')
targets_lvl_2_val = targets_lvl_2_val.merge(item_features, on='item_id', how='left')
targets_lvl_2_val = targets_lvl_2_val.merge(user_features, on='user_id', how='left')

targets_lvl_2_val.head(2)

Unnamed: 0,user_id,item_id,target,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,...,curr_size_of_product,price,last_sales_day,marital_status_code,homeowner_desc,hh_comp_desc,has_kid,income,age,household_size
0,800,5569471,0.0,17.5,2.5,1.784483,23.584289,108.0,1212.0,3966.0,...,12 OZ,3.195904,663.0,A,Homeowner,2 Adults No Kids,False,4,2,2
1,800,916122,0.0,15.5,4.0,1.784483,23.584289,112.0,1212.0,3966.0,...,,4.290817,663.0,A,Homeowner,2 Adults No Kids,False,4,2,2


In [45]:
targets_lvl_2_val['target'].value_counts()

0.0    195544
1.0     11121
Name: target, dtype: int64

In [46]:
test_pred = automl_model.predict(targets_lvl_2_val)

In [47]:
f1_score(targets_lvl_2_val[['target']], (test_pred.data[:, 0] > 0.4)*1)

0.17141541472399702

In [48]:
automl_model_res = targets_lvl_2_val[['user_id', 'item_id']]
automl_model_res['preds_proba'] = test_pred.data[:, 0]

automl_model_res = automl_model_res.sort_values('preds_proba', ascending=False)

automl_model_res[automl_model_res['user_id']==1].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  automl_model_res['preds_proba'] = test_pred.data[:, 0]


Unnamed: 0,user_id,item_id,preds_proba
88966,1,1082212,0.421325
88973,1,872137,0.412268
88952,1,10149640,0.408434
88930,1,940947,0.403087
88950,1,878285,0.397623
88955,1,1102067,0.39679
88961,1,856942,0.351457
88962,1,856942,0.351457
88931,1,1041796,0.325358
88965,1,1115576,0.322232


In [49]:
automl_model_res = automl_model_res.groupby('user_id')['item_id'].unique().reset_index()
automl_model_res.head()

Unnamed: 0,user_id,item_id
0,1,"[1082212, 872137, 10149640, 940947, 878285, 11..."
1,3,"[1106523, 983584, 899624, 866211, 1044078, 110..."
2,6,"[1098844, 1082185, 6548453, 878996, 12301109, ..."
3,7,"[993638, 1126899, 1106523, 909714, 1122358, 98..."
4,8,"[1044078, 12301839, 1029743, 1068719, 1105301,..."


In [50]:
result_lvl_2_val = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2_val.columns=['user_id', 'actual']

result_lvl_2_val = result_lvl_2_val.merge(automl_model_res, on='user_id', how='left')

result_lvl_2_val['pr']=result_lvl_2_val.apply(lambda row: precision_at_k(row['item_id'], row['actual']), axis=1)

result_lvl_2_val.head()

Unnamed: 0,user_id,actual,item_id,pr
0,1,"[13876341, 15971874, 17178953, 6534544, 113277...","[1082212, 872137, 10149640, 940947, 878285, 11...",0.2
1,3,"[851057, 13842214, 9526886, 9526563, 7167249, ...","[1106523, 983584, 899624, 866211, 1044078, 110...",0.0
2,6,"[920308, 946489, 926804, 17105539, 13776852, 1...","[1098844, 1082185, 6548453, 878996, 12301109, ...",0.0
3,7,"[909714, 929067, 953476, 954543, 976998, 99383...","[993638, 1126899, 1106523, 909714, 1122358, 98...",0.8
4,8,"[15629920, 13071586, 9337581, 9337369, 5569471...","[1044078, 12301839, 1029743, 1068719, 1105301,...",0.2


In [51]:
result_lvl_2_val['pr'].mean()

0.2534769833496545

### 4. Предсказания на retail_test1.csv

In [52]:
data_test = pd.read_csv('../data/raw/retail_test1.csv')

In [53]:
users_test = pd.DataFrame(data_test['user_id'].unique())
users_test.columns = ['user_id']

users_test['candidates'] = users_test['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=100))

Get recommendations error. Return top. User ID: 2259
Get recommendations error. Return top. User ID: 2325


In [54]:
users_test.head(2)

Unnamed: 0,user_id,candidates
0,1340,"[844179, 999858, 916122, 1044078, 12301109, 83..."
1,588,"[1138443, 1127831, 12810393, 866211, 1068719, ..."


In [55]:
s_val = users_test.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s_val.name = 'item_id'

users_test = users_test.drop('candidates', axis=1).join(s_val)
users_test.head()

Unnamed: 0,user_id,item_id
0,1340,844179
0,1340,999858
0,1340,916122
0,1340,1044078
0,1340,12301109


In [56]:
targets_test = data_test[['user_id', 'item_id']].copy()
targets_test['target'] = 1  # тут только покупки 

targets_test = users_test.merge(targets_test, on=['user_id', 'item_id'], how='left')

targets_test['target'].fillna(0, inplace= True)

In [57]:
targets_test = targets_test.merge(usr_itm_feats, on=['user_id', 'item_id'], how='left')
targets_test = targets_test.merge(item_features, on='item_id', how='left')
targets_test = targets_test.merge(user_features, on='user_id', how='left')

targets_test.head(2)

Unnamed: 0,user_id,item_id,target,median_sales_hour,median_weekday,mean_visits_interval,mean_check,n_stores,n_items,n_transactions,...,curr_size_of_product,price,last_sales_day,marital_status_code,homeowner_desc,hh_comp_desc,has_kid,income,age,household_size
0,1340,844179,0.0,,,,,,,,...,,3.793924,663.0,,,,,,,
1,1340,999858,0.0,23.0,6.0,11.346939,8.698462,110.0,148.0,177.0,...,,3.867612,663.0,,,,,,,


In [58]:
targets_test['target'].value_counts()

0.0    181979
1.0      7981
Name: target, dtype: int64

In [59]:
final_pred = automl_model.predict(targets_test)

In [60]:
f1_score(targets_test[['target']], (final_pred.data[:, 0] > 0.4)*1)

0.2211974820670473

In [61]:
final_res = targets_test[['user_id', 'item_id']]
final_res['preds_proba'] = final_pred.data[:, 0]

final_res = final_res.sort_values('preds_proba', ascending=False)

final_res[final_res['user_id']==1].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_res['preds_proba'] = final_pred.data[:, 0]


Unnamed: 0,user_id,item_id,preds_proba
158478,1,1082212,0.421325
158485,1,872137,0.412268
158465,1,10149640,0.408434
158443,1,940947,0.403087
158463,1,878285,0.397623


In [62]:
final_res = final_res.groupby('user_id')['item_id'].unique().reset_index()
final_res.head(5)

Unnamed: 0,user_id,item_id
0,1,"[1082212, 872137, 10149640, 940947, 878285, 11..."
1,2,"[1106523, 899624, 1082185, 916122, 866211, 828..."
2,3,"[1106523, 983584, 899624, 866211, 1044078, 110..."
3,6,"[1098844, 1082185, 6548453, 878996, 12301109, ..."
4,7,"[993638, 1126899, 1106523, 909714, 1122358, 98..."


In [63]:
result_test = data_test.groupby('user_id')['item_id'].unique().reset_index()
result_test.columns=['user_id', 'actual']

result_test = result_test.merge(final_res, on='user_id', how='left')

result_test['pr']=result_test.apply(lambda row: precision_at_k(row['item_id'], row['actual']), axis=1)

result_test.head()

Unnamed: 0,user_id,actual,item_id,pr
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[1082212, 872137, 10149640, 940947, 878285, 11...",0.2
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[1106523, 899624, 1082185, 916122, 866211, 828...",0.4
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[1106523, 983584, 899624, 866211, 1044078, 110...",0.0
3,6,"[956902, 960791, 1037863, 1119051, 1137688, 84...","[1098844, 1082185, 6548453, 878996, 12301109, ...",0.0
4,7,"[847270, 855557, 859987, 863407, 895454, 90663...","[993638, 1126899, 1106523, 909714, 1122358, 98...",0.4


In [64]:
result_test['pr'].mean()

0.16891246684349945

In [66]:
df = result_test[['user_id', 'item_id']].copy()
df.rename(columns = {'item_id':'rec'}, inplace = True)
df.to_csv('../recommendations.csv', index=False)
df.head(2)

Unnamed: 0,user_id,rec
0,1,"[1082212, 872137, 10149640, 940947, 878285, 11..."
1,2,"[1106523, 899624, 1082185, 916122, 866211, 828..."
