In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares

# Модель второго уровня
from xgboost import XGBClassifier

import sys
sys.path.append('../')

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

In [None]:
# Load the three raw CSV inputs: transactions, product attributes, household demographics.
data, item_features, user_features = (
    pd.read_csv(csv_path)
    for csv_path in ('retail_train.csv', 'product.csv', 'hh_demographic.csv')
)

In [None]:
# Canonical key-column names shared by the transaction, item and user frames.
ITEM_COL, USER_COL = 'item_id', 'user_id'

In [None]:
# Lower-case every feature column name, then align the key columns with the
# names used in the transaction data (ITEM_COL / USER_COL).
item_features.columns = item_features.columns.str.lower()
user_features.columns = user_features.columns.str.lower()

item_features = item_features.rename(columns={'product_id': ITEM_COL})
user_features = user_features.rename(columns={'household_key': USER_COL})

In [None]:
# Lengths (in week_no units) of the matcher and ranker validation windows
# used to derive the time-split boundaries.
VAL_MATCHER_WEEKS, VAL_RANKER_WEEKS = 6, 3

In [None]:
# Two-level time split on week_no:
#   train matcher | val matcher (== train ranker) | val ranker
week_no = data['week_no']
ranker_val_start = week_no.max() - VAL_RANKER_WEEKS
matcher_val_start = ranker_val_start - VAL_MATCHER_WEEKS

data_train_matcher = data[week_no < matcher_val_start]

data_val_matcher = data[(week_no >= matcher_val_start) & (week_no < ranker_val_start)]

# The ranker is trained on the very weeks the matcher is validated on.
data_train_ranker = data_val_matcher.copy()

# NOTE(review): `>=` keeps the boundary week, so this split covers
# VAL_RANKER_WEEKS + 1 distinct week numbers if week_no is a consecutive
# integer sequence — confirm this is intended.
data_val_ranker = data[week_no >= ranker_val_start]

In [None]:
# Preview the first two rows of the matcher training split.
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [None]:
# Shrink the item catalogue of the matcher training split with the project's
# prefilter_items helper and report how many distinct items remain.
n_items_before = data_train_matcher[ITEM_COL].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, n_popular=5000)

n_items_after = data_train_matcher[ITEM_COL].nunique()
print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 83685 to 5001


In [None]:
# Keep only users that appear in the matcher training split, so every later
# split contains users the recommender was built on.
common_users = data_train_matcher[USER_COL].values

data_val_matcher, data_train_ranker, data_val_ranker = (
    split[split[USER_COL].isin(common_users)]
    for split in (data_val_matcher, data_train_ranker, data_val_ranker)
)

In [None]:
# Build the project recommender from the pre-filtered matcher training split.
# NOTE(review): MainRecommender presumably fits its models in __init__ — confirm in recommenders.py.
recommender = MainRecommender(data_train_matcher)

In [None]:
# Name of the column holding each user's actually purchased items in evaluation frames.
ACTUAL_COL = 'actual'

In [None]:
# Ground truth for the matcher: one row per user with the unique items bought
# during the matcher validation weeks, stored under ACTUAL_COL.
result_eval_matcher = (
    data_val_matcher
    .groupby(USER_COL, sort=False)[ITEM_COL]
    .unique()
    .reset_index()
    .rename(columns={ITEM_COL: ACTUAL_COL})
)
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,2070,"[1019940, 834103, 918438, 878302, 949616, 9879..."
1,2021,"[840361, 856060, 869344, 896862, 951590, 10191..."


In [None]:
def calc_recall(df_result, top_k):
    """Lazily compute the mean recall@k of every recommendation column.

    Every column from the third one onward is treated as a column of
    recommendation lists (assumes the first two columns are the user id and
    ACTUAL_COL — TODO confirm for other callers). recall_at_k is evaluated
    row by row against ACTUAL_COL and averaged over all rows.

    Args:
        df_result: frame with an ACTUAL_COL column and recommendation columns.
        top_k: cut-off passed to recall_at_k as ``k``.

    Yields:
        (column name, mean recall@top_k) pairs, one per recommendation column.
    """
    for rec_col in df_result.columns[2:]:
        def row_recall(row, rec_col=rec_col):
            return recall_at_k(row[rec_col], row[ACTUAL_COL], k=top_k)

        per_row_recall = df_result.apply(row_recall, axis=1)
        yield rec_col, per_row_recall.mean()

In [None]:
def calc_precision(df_result, top_k):
    """Lazily compute the mean precision@k of every recommendation column.

    Every column from the third one onward is treated as a column of
    recommendation lists (assumes the first two columns are the user id and
    ACTUAL_COL — TODO confirm for other callers). precision_at_k is evaluated
    row by row against ACTUAL_COL and averaged over all rows.

    Args:
        df_result: frame with an ACTUAL_COL column and recommendation columns.
        top_k: cut-off passed to precision_at_k as ``k``.

    Yields:
        (column name, mean precision@top_k) pairs, one per recommendation column.
    """
    for rec_col in df_result.columns[2:]:
        def row_precision(row, rec_col=rec_col):
            return precision_at_k(row[rec_col], row[ACTUAL_COL], k=top_k)

        per_row_precision = df_result.apply(row_precision, axis=1)
        yield rec_col, per_row_precision.mean()

In [None]:
def make_recommendations(df_result, rec_name_model, N=50):
    """Add a column of per-user recommendations to ``df_result`` in place.

    Args:
        df_result: frame with a USER_COL column; it is mutated by this call.
        rec_name_model: sequence whose first element is the name of the new
            column and whose second element is a callable invoked as
            ``callable(user_id, N=N)``.
        N: number of items requested from the callable for every user.
    """
    column_name, recommend = rec_name_model[0], rec_name_model[1]

    def recommend_for(user_id):
        return recommend(user_id, N=N)

    df_result[column_name] = df_result[USER_COL].apply(recommend_for)

In [None]:
# (result column name, recommender method) pairs consumed by make_recommendations.
own_rec, als_rec, sim_user_rec, sim_item_rec = (
    ('own_recs', recommender.get_own_recommendations),
    ('als_recs', recommender.get_als_recommendations),
    ('similar_user_recs', recommender.get_similar_users_recommendation),
    ('similar_item_recs', recommender.get_similar_items_recommendation),
)

In [None]:
# Produce 50 candidates per user with each of the four candidate generators.
for name_and_model in [own_rec, als_rec, sim_user_rec, sim_item_rec]:
    make_recommendations(result_eval_matcher, name_and_model, N=50)

In [None]:
# Seed the 'own+top_pop' column with 25 own recommendations per user;
# a later cell pads these lists with popular items.
make_recommendations(result_eval_matcher, ('own+top_pop', recommender.get_own_recommendations), N=25)

In [None]:
def fill_with_tops(column, N=5, top_items=None):
    """Pad a recommendation list with popular items that are not already in it.

    Args:
        column: sequence of item ids already recommended to one user.
        N: maximum number of popular items to append.
        top_items: ordered sequence of popular item ids to draw from. When
            omitted, the notebook-level ``recommender.overall_top_purchases``
            is used, which is the original behaviour.

    Returns:
        numpy array holding ``column`` followed by the first ``N`` entries of
        ``top_items`` that do not occur in ``column`` (fewer if not enough
        unseen items are available).
    """
    if top_items is None:
        # Default source: popularity list of the global recommender.
        top_items = recommender.overall_top_purchases

    tops = np.asarray(top_items)
    recs = np.asarray(column)
    if recs.size == 0:
        # An empty list turns into a float64 array; adopt the id dtype so the
        # appended item ids are not silently cast to float.
        recs = recs.astype(tops.dtype)

    unseen_tops = tops[~np.isin(tops, recs)]

    return np.append(recs, unseen_tops[:N])

In [None]:
# Append up to 25 popular items to every user's own recommendations.
# NOTE: not idempotent — re-running this cell pads the already padded lists again.
own_recs_to_pad = result_eval_matcher['own+top_pop']
result_eval_matcher['own+top_pop'] = own_recs_to_pad.apply(fill_with_tops, N=25)

In [None]:
# Sanity check: length of the first user's combined 'own+top_pop' list.
len(result_eval_matcher.iloc[0]['own+top_pop'])

50

Посмотрим на recall@50 кандидатов, полученных разными способами:

In [None]:
# Rank the candidate generators by mean recall@50, best first.
recall_at_50 = list(calc_recall(result_eval_matcher, 50))
recall_at_50.sort(key=lambda name_and_score: name_and_score[1], reverse=True)
recall_at_50

[('own_recs', 0.07934879656918649),
 ('own+top_pop', 0.0738710471311752),
 ('als_recs', 0.048092753849589144),
 ('similar_item_recs', 0.032414477233487456),
 ('similar_user_recs', 0.025494293767761935)]

Сначала измерим precision@5 имеющихся кандидатов:

In [None]:
# Rank the candidate generators by mean precision@5, best first.
precision_at_5 = list(calc_precision(result_eval_matcher, 5))
precision_at_5.sort(key=lambda name_and_score: name_and_score[1], reverse=True)
precision_at_5

[('own_recs', 0.2242565055762082),
 ('own+top_pop', 0.2242565055762082),
 ('als_recs', 0.09433085501858736),
 ('similar_item_recs', 0.06282527881040893),
 ('similar_user_recs', 0.02732342007434944)]

In [None]:
# One row per distinct user of the ranker training split.
df_match_candidates = pd.DataFrame({USER_COL: data_train_ranker[USER_COL].unique()})

In [None]:
# First-level candidates for the ranker: 50 ALS recommendations per user.
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(
    recommender.get_als_recommendations, N=50
)

In [None]:
# Preview the per-user candidate lists.
df_match_candidates.head(3)

Unnamed: 0,user_id,candidates
0,2070,"[1107553, 1042942, 9526410, 879755, 9527290, 8..."
1,2021,"[981521, 12731544, 1013928, 1009333, 1019142, ..."
2,1753,"[1110572, 926422, 991951, 861445, 879755, 1037..."


In [None]:
# Flatten the per-user candidate lists into one item id per row. The result
# keeps the row index of df_match_candidates, so it can be joined back on it.
# Series.explode replaces the former row-wise `apply(pd.Series).stack()`, which
# built one Series per user and was needlessly slow.
df_items = (
    df_match_candidates['candidates']
    .explode()
    .dropna()            # empty candidate lists explode to NaN; stack() used to drop them
    .astype('int64')     # explode yields object dtype; restore integer item ids
    .rename(ITEM_COL)
)

In [None]:
# Swap the list-valued column for one row per (user, candidate item);
# the join aligns df_items on the shared row index.
df_match_candidates = df_match_candidates.drop(columns='candidates').join(df_items)

In [None]:
# Preview the flattened (user, candidate item) pairs.
df_match_candidates.head()

Unnamed: 0,user_id,item_id
0,2070,1107553
0,2070,1042942
0,2070,9526410
0,2070,879755
0,2070,9527290


In [None]:
# Every (user, item) row of the ranker training weeks is a positive example:
# this frame contains purchases only.
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].assign(target=1)

In [None]:
# Display the positive (user, item, target) rows; pandas truncates the rendering.
df_ranker_train

Unnamed: 0,user_id,item_id,target
2104867,2070,1019940,1
2107468,2021,840361,1
2107469,2021,856060,1
2107470,2021,869344,1
2107471,2021,896862,1
...,...,...,...
2282320,222,1120741,1
2282321,462,993339,1
2282322,462,995242,1
2282323,462,10180324,1


In [None]:
# Label the candidates: a candidate that was actually bought in the ranker
# training weeks gets target 1, every other candidate gets target 0.
#   1. left-join the purchases onto the candidates (unmatched rows get NaN),
#   2. collapse repeated purchases of the same (user, item) pair,
#   3. turn the NaN targets of never-bought candidates into 0.
# The fill is done through `assign` rather than `df['target'].fillna(0, inplace=True)`:
# the in-place call on a column selection warns in pandas 2.x and is a no-op
# under Copy-on-Write, which would leave NaN targets in the frame.
df_ranker_train = (
    df_match_candidates
    .merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')
    .drop_duplicates(subset=[USER_COL, ITEM_COL])
    .assign(target=lambda frame: frame['target'].fillna(0))
)

In [None]:
# Share of positive (target == 1) rows among all labelled candidates.
df_ranker_train['target'].mean()

0.05697955390334573

In [None]:
# Attach the raw item attributes and household demographics to every candidate row.
df_ranker_train = (
    df_ranker_train
    .merge(item_features, on=ITEM_COL, how='left')
    .merge(user_features, on=USER_COL, how='left')
)

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1107553,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1042942,0.0,69,GROCERY,Private,ICE CREAM/MILK/SHERBTS,PAILS,4 QT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [None]:
# User feature 'avg_bill': mean sales_value over the user's rows in the matcher
# training split (a per-row mean, not a per-basket total).
user_avg_sales = (
    data_train_matcher
    .groupby(USER_COL, sort=False)['sales_value']
    .mean()
    .rename('avg_bill')
    .reset_index()
)
df_ranker_train = df_ranker_train.merge(user_avg_sales, how='left', on=USER_COL)

In [None]:
# The department-level features below need a 'department' column on the
# transactions. Add it only once: merging it again (e.g. when the cell is
# re-run) would produce 'department_x' / 'department_y' and break every
# groupby on 'department'.
if 'department' not in data_train_matcher.columns:
    data_train_matcher = data_train_matcher.merge(item_features[[ITEM_COL, 'department']], how='left', on=ITEM_COL)

# User x department feature 'avg_cat_spendings': mean sales_value over the
# user's rows within the candidate item's department.
user_dept_avg_sales = (
    data_train_matcher
    .groupby([USER_COL, 'department'], sort=False)['sales_value']
    .mean()
    .rename('avg_cat_spendings')
    .reset_index()
)
df_ranker_train = df_ranker_train.merge(user_dept_avg_sales, how='left', on=[USER_COL, 'department'])

In [None]:
# Preview the frame after adding 'avg_bill' and 'avg_cat_spendings'.
df_ranker_train.head(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill,avg_cat_spendings
0,2070,1107553,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.562354,2.523977
1,2070,1042942,0.0,69,GROCERY,Private,ICE CREAM/MILK/SHERBTS,PAILS,4 QT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.562354,2.523977
2,2070,9526410,0.0,544,GROCERY,National,BAG SNACKS,POTATO CHIPS,11.5 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.562354,2.523977


In [None]:
# Item feature 'avg_week_purchases': number of transaction rows of the item
# divided by the number of distinct weeks in the matcher training split.
# count() counts rows with a non-null quantity; it does not sum the quantities.
n_train_weeks = data_train_matcher['week_no'].nunique()
item_weekly_rows = (
    data_train_matcher
    .groupby(ITEM_COL, sort=False)['quantity']
    .count()
    .div(n_train_weeks)
    .rename('avg_week_purchases')
    .reset_index()
)
df_ranker_train = df_ranker_train.merge(item_weekly_rows, how='left', on=ITEM_COL)

In [None]:
# Department feature 'avg_week_purchases_cat': number of transaction rows in
# the department divided by the number of distinct training weeks.
n_train_weeks = data_train_matcher['week_no'].nunique()
dept_weekly_rows = (
    data_train_matcher
    .groupby('department', sort=False)['quantity']
    .count()
    .div(n_train_weeks)
    .rename('avg_week_purchases_cat')
    .reset_index()
)
df_ranker_train = df_ranker_train.merge(dept_weekly_rows, how='left', on='department')

In [None]:
# Preview the frame after adding the weekly purchase-count features.
df_ranker_train.head(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill,avg_cat_spendings,avg_week_purchases,avg_week_purchases_cat
0,2070,1107553,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.562354,2.523977,9.035294,7342.223529
1,2070,1042942,0.0,69,GROCERY,Private,ICE CREAM/MILK/SHERBTS,PAILS,4 QT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.562354,2.523977,3.282353,7342.223529
2,2070,9526410,0.0,544,GROCERY,National,BAG SNACKS,POTATO CHIPS,11.5 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,3.562354,2.523977,20.870588,7342.223529


In [None]:

# Department feature 'avg_cat_spendings_items': mean sales_value over all
# transaction rows of the department in the matcher training split.
dept_avg_sales = (
    data_train_matcher
    .groupby('department', sort=False)['sales_value']
    .mean()
    .rename('avg_cat_spendings_items')
    .reset_index()
)
df_ranker_train = df_ranker_train.merge(dept_avg_sales, how='left', on='department')

In [None]:

# User x department feature 'user_week_cat_purchase': number of the user's
# transaction rows in the department divided by the number of distinct
# training weeks.
n_train_weeks = data_train_matcher['week_no'].nunique()
user_dept_weekly_rows = (
    data_train_matcher
    .groupby([USER_COL, 'department'], sort=False)['quantity']
    .count()
    .div(n_train_weeks)
    .rename('user_week_cat_purchase')
    .reset_index()
)
df_ranker_train = df_ranker_train.merge(user_dept_weekly_rows, how='left', on=[USER_COL, 'department'])

In [None]:
# Preview the frame with all engineered features attached.
df_ranker_train.head(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill,avg_cat_spendings,avg_week_purchases,avg_week_purchases_cat,avg_cat_spendings_items,user_week_cat_purchase
0,2070,1107553,0.0,103,GROCERY,National,SOFT DRINKS,SFT DRNK SNGL SRV BTL CARB (EX,20 OZ,45-54,...,Unknown,Unknown,1,None/Unknown,3.562354,2.523977,9.035294,7342.223529,2.635412,6.094118
1,2070,1042942,0.0,69,GROCERY,Private,ICE CREAM/MILK/SHERBTS,PAILS,4 QT,45-54,...,Unknown,Unknown,1,None/Unknown,3.562354,2.523977,3.282353,7342.223529,2.635412,6.094118
2,2070,9526410,0.0,544,GROCERY,National,BAG SNACKS,POTATO CHIPS,11.5 OZ,45-54,...,Unknown,Unknown,1,None/Unknown,3.562354,2.523977,20.870588,7342.223529,2.635412,6.094118


In [None]:
# Features: everything except the identifier columns and the label. The
# columns are dropped by name instead of by position (`iloc[:, 2:]`), so the
# split no longer depends on user/item ids being the first two columns.
X_train = df_ranker_train.drop(columns=[USER_COL, ITEM_COL, 'target'])
y_train = df_ranker_train['target']

In [None]:
# Engineered numeric features; every other column of X_train is treated as
# categorical. Naming them explicitly replaces the positional `[:-6]` slice,
# which silently picked the wrong columns whenever a feature was added or the
# column order changed.
NUMERIC_FEATS = [
    'avg_bill',
    'avg_cat_spendings',
    'avg_week_purchases',
    'avg_week_purchases_cat',
    'avg_cat_spendings_items',
    'user_week_cat_purchase',
]

cat_feats = [col for col in X_train.columns if col not in NUMERIC_FEATS]
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [None]:
# Columns that contain at least one missing value.
nan_counts = X_train.isna().sum()
nan_cols = nan_counts[nan_counts > 0].index.tolist()

for col in nan_cols:
    # Remember where the value was missing before it is imputed.
    X_train[f'{col}_nan'] = X_train[col].isna().astype('int64')
    # Impute with the most frequent value of the column. The result is assigned
    # back instead of calling `X_train[col].fillna(..., inplace=True)`: the
    # in-place call on a column selection warns in pandas 2.x and does nothing
    # under Copy-on-Write, which would leave the NaNs in place.
    most_frequent = X_train[col].value_counts().index[0]
    X_train[col] = X_train[col].fillna(most_frequent)

In [None]:
# Total number of missing values left in X_train.
X_train.isna().sum().sum()

0

In [None]:
# One-hot encode the categorical features and append them to the numeric ones.
# The encoder keeps its default sparse output, densified with .toarray():
# the former `sparse=False` argument was renamed to `sparse_output` and later
# removed from scikit-learn, so this form runs on old and new versions alike.
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_cats = pd.DataFrame(
    encoder.fit_transform(X_train[cat_feats]).toarray(),
    columns=encoder.get_feature_names_out(cat_feats),
    index=X_train.index,  # keep the rows aligned for the index-based join below
)
X_train = X_train.drop(cat_feats, axis=1).join(encoded_cats)

In [None]:
from catboost import CatBoostClassifier

# Second-level (ranking) model settings.
# NOTE(review): task_type='GPU' presumably requires a CUDA-capable device — confirm
# before running on a CPU-only machine.
catboost_params = dict(
    iterations=200,
    silent=True,
    eta=0.1,
    task_type='GPU',
    auto_class_weights='Balanced',
    random_state=29,
)

model = CatBoostClassifier(**catboost_params)
model.fit(X_train, y_train)

# Predictions on the training rows themselves, not on held-out data.
train_preds = model.predict_proba(X_train)

In [None]:
# Work on a copy so the prediction column is not added to the training frame.
df_ranker_predict = df_ranker_train.copy()

In [None]:
# Second column of predict_proba's output; presumably the probability of the
# positive class (target == 1) — confirm against model.classes_.
df_ranker_predict['proba_item_purchase'] = train_preds[:, 1]

In [None]:
# Peek at the first ten predicted purchase probabilities.
df_ranker_predict['proba_item_purchase'].head(10)

0    0.719326
1    0.623822
2    0.237627
3    0.771141
4    0.235181
5    0.714822
6    0.671295
7    0.866600
8    0.692142
9    0.777229
Name: proba_item_purchase, dtype: float64