In [1]:
pip install implicit



In [2]:
pip install catboost



In [3]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler

import os, sys
module_path = os.path.abspath(os.path.join(os.curdir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender



In [4]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель --
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631.0,1.0,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631.0,1.0,0.0,0.0


In [5]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=18000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 60731 to 18001


In [6]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data['user_id'].nunique()} Items: {df_data['item_id'].nunique()}")

In [7]:
# общие пользователи
common_users = list(set(data_train_lvl_1.user_id.values)&(set(data_val_lvl_1.user_id.values))&set(data_val_lvl_2.user_id.values))

data_train_lvl_1 = data_train_lvl_1[data_train_lvl_1.user_id.isin(common_users)]
data_val_lvl_1 = data_val_lvl_1[data_val_lvl_1.user_id.isin(common_users)]
data_train_lvl_2 = data_train_lvl_2[data_train_lvl_2.user_id.isin(common_users)]
data_val_lvl_2 = data_val_lvl_2[data_val_lvl_2.user_id.isin(common_users)]

print_stats_data(data_train_lvl_1,'train_lvl_1')
print_stats_data(data_val_lvl_1,'val_lvl_1')
print_stats_data(data_train_lvl_2,'train_lvl_2')
print_stats_data(data_val_lvl_2,'val_lvl_2')

train_lvl_1
Shape: (371621, 13) Users: 1780 Items: 17874
val_lvl_1
Shape: (160900, 12) Users: 1780 Items: 26933
train_lvl_2
Shape: (160900, 12) Users: 1780 Items: 26933
val_lvl_2
Shape: (94398, 12) Users: 1780 Items: 21224


### MainRecommender

In [8]:
recommender = MainRecommender(data_train_lvl_1)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1780 [00:00<?, ?it/s]

In [9]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[838867, 856942, 859191, 861272, 898011, 93436..."
1,3,"[824915, 826385, 827656, 831063, 839147, 85440..."


In [10]:
def do_recommend(model, func, n, k):

    func_dic = {
        'get_als_recommendations': model.get_als_recommendations,
        'get_own_recommendations': model.get_own_recommendations,
        'get_similar_items_recommendation': model.get_similar_items_recommendation,
        'get_similar_users_recommendation': model.get_similar_users_recommendation
    }

    try:
        result = func_dic[func](n, k)

    except (IndexError, ValueError) as e:

        result = recommender._extend_with_top_popular([], k)

    return result

In [11]:
%%time

k = 50
result_lvl_1['als'] = result_lvl_1['user_id'].apply(lambda x:
                                                    do_recommend(recommender, 'get_als_recommendations', x, k))

CPU times: user 4.28 s, sys: 0 ns, total: 4.28 s
Wall time: 4.37 s


In [None]:
%%time

result_lvl_1['own'] = result_lvl_1['user_id'].apply(lambda x:
                                                    do_recommend(recommender, 'get_own_recommendations', x, k))

In [12]:
%%time

result_lvl_1['sim_item'] = result_lvl_1['user_id'].apply(lambda x:
                                                    do_recommend(recommender, 'get_similar_items_recommendation', x, k))

KeyError: ignored

In [None]:
%time

result_lvl_1['sim_user'] = result_lvl_1['user_id'].apply(lambda x:
                                                    do_recommend(recommender, 'get_similar_users_recommendation', x, k))

In [None]:
for col in result_lvl_1.columns[2:]:
    print(f"{col}: {result_lvl_1.apply(lambda row: recall_at_k(row[col], row['actual'], k=5), axis=1).mean()}")

In [None]:
for col in result_lvl_1.columns[2:]:
    print(f"{col}: {result_lvl_1.apply(lambda row: precision_at_k(row[col], row['actual'], k=5), axis=1).mean()}")

### 2-lvl model

In [None]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

users_lvl_2.head(2)

In [None]:
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=k))
users_lvl_2.head(4)

In [None]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)

users_lvl_2.head(4)

In [None]:
targets_lvl_2 = data_train_lvl_2.copy()
targets_lvl_2['target'] = 1  # тут только покупки

In [None]:
targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

In [None]:
targets_lvl_2['target'].fillna(0, inplace= True)

In [None]:
targets_lvl_2['target'].mean()

## Catboost

In [None]:
item_features.head(2)

In [None]:
user_features.head(2)

In [None]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

## FE

In [None]:
def gen_new_user_feachs(df):

    # добавим средний чек по каждому пользователю
    basket_sum = df.groupby(['user_id', 'basket_id'], as_index=False).agg({'sales_value': 'sum'})
    basket_mean = basket_sum.groupby(['user_id'], as_index=False).agg({'sales_value': 'mean'}).\
                rename(columns={'sales_value':'basket_mean'})

    df = df.merge(basket_mean, on='user_id', how='left')

    # кол-во покупок в каждой категрии по пользователям
    user_cat_sale_count = df.groupby(['user_id', 'commodity_desc'], as_index=True).\
                            agg({'commodity_desc': 'count'}).\
                            rename(columns={'commodity_desc':'commodity_count'}).reset_index()

    df = df.merge(user_cat_sale_count, on=['user_id', 'commodity_desc'], how='left')

    # Средняя сумма покупки 1 товара в каждой категории
    mean_good_price_cat = df.groupby(['user_id', 'commodity_desc'], as_index=True).\
                agg({'sales_value': 'mean'}).rename(columns={'sales_value':'mean_good_price_cat'}).reset_index()

    df = df.merge(mean_good_price_cat, on=['user_id', 'commodity_desc'], how='left')

    # Частотность покупок раз/неделя
    sales_count = df.groupby(['user_id', 'week_no'], as_index=True).\
                agg({'basket_id': 'count'}).rename(columns={'basket_id': 'sales_week_count'}).reset_index()

    df = df.merge(sales_count, on=['user_id', 'week_no'], how='left')


    # Кол-во покупок утром/днем/вечером
    df['trans_time'] = df['trans_time']/100
    df.loc[(df['trans_time'] <= 12.0) & (df['trans_time'] > 5.0), 'period'] = 'morning'
    df.loc[(df['trans_time'] <= 18.0) & (df['trans_time'] > 12.0), 'period'] = 'afternoon'
    df.loc[(df['trans_time'] <= 22.0) & (df['trans_time'] > 18.0), 'period'] = 'evening'
    df.loc[(df['trans_time'] <= 5.0) | (df['trans_time'] > 22.0), 'period'] = 'night'

    period_sale_count = df.groupby(['user_id', 'period'], as_index=True).agg({'basket_id': 'count'}).\
                        rename(columns={'basket_id': 'period_sale_count'})

    df = df.merge(period_sale_count, on=['user_id', 'period'], how='left')

    return df

In [None]:
def gen_new_item_feachs(df):

    # средняя цена товара в категории / стоимость товара к средней стоимости по категории
    commodity_mean_price = df.groupby(['commodity_desc', 'item_id'], as_index=True).\
                            agg({'sales_value': 'mean'}).\
                            rename(columns={'sales_value': 'item_price'}).\
                            groupby(['commodity_desc']).agg({'item_price': 'mean'}).reset_index()

    df = df.merge(commodity_mean_price, on='commodity_desc', how='left')
    df['price_to_cat'] = df[['sales_value', 'item_price']].\
                        apply(lambda row: row['sales_value']/row['item_price'], axis=1)


    return df

In [None]:
X_train = targets_lvl_2.copy().drop('target', axis=1)
y_train = targets_lvl_2[['target']].copy()

In [None]:
X_train = gen_new_user_feachs(X_train)
X_train = gen_new_item_feachs(X_train)

In [None]:
X_train.columns

In [None]:
cat_feats = ['manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc',
             # 'period'
            ]
ids = ['user_id', 'item_id', 'basket_id']

other_feats = [ 'day', 'quantity', 'sales_value',
       'store_id', 'retail_disc', 'trans_time', 'week_no', 'coupon_disc',
       # 'coupon_match_disc',
       'basket_mean', 'commodity_count', 'mean_good_price_cat',
       'sales_week_count', 'period_sale_count', 'item_price',
       'price_to_cat']

X_train = X_train[ids + other_feats + cat_feats]
X_train.fillna(7777777, inplace=True)
X_train['store_id'] = X_train['store_id'].astype('str')
X_train['manufacturer'] = X_train['manufacturer'].astype('str')
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [None]:
scaler = StandardScaler()
scaler.fit(X_train[other_feats])
X_train[other_feats] = scaler.transform(X_train[other_feats])

In [None]:
cat_model = CatBoostClassifier(random_seed=42, verbose=False, cat_features=cat_feats,
                               max_depth=6, l2_leaf_reg=1, learning_rate=0.03)

# grid = {'learning_rate': [0.03, 0.1],
#         'depth': [4, 6, 10],
#         'l2_leaf_reg': [1, 3, 5, 7, 9]}

# grid_search_result = cat_model.grid_search(grid,
#                                        X=X_train,
#                                        y=y_train,
#                                        plot=True)

# {'depth': 6, 'l2_leaf_reg': 1, 'learning_rate': 0.03}

In [None]:
cat_model.fit(X_train, y_train)

In [None]:
feature_importance = cat_model.feature_importances_
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X_train.columns)[sorted_idx])
plt.title('Feature Importance')

In [None]:
X_test = data_val_lvl_2.copy()
X_test = X_test.merge(item_features, on='item_id', how='left')
X_test = X_test.merge(user_features, on='user_id', how='left')
X_test = gen_new_user_feachs(X_test)
X_test = gen_new_item_feachs(X_test)
X_test = X_test[ids + other_feats + cat_feats]
X_test.fillna(7777777, inplace=True)
X_test['store_id'] = X_test['store_id'].astype('str')
X_test[cat_feats] = X_test[cat_feats].astype('category')
X_test[other_feats] = scaler.transform(X_test[other_feats])

In [None]:
y_pred_raw = cat_model.predict(X_train, prediction_type='RawFormulaVal')

In [None]:
y_pred_raw

In [None]:
train_users_lvl2 = targets_lvl_2['user_id'].unique()
train_users_lvl2

In [None]:
def get_xgb_rec(uid, df=targets_lvl_2):
    return df[df['user_id']==uid].\
                sort_values('own_proba', ascending=False).head(5).item_id.to_list()

In [None]:
targets_lvl_2['own_proba'] = y_pred_raw
targets_lvl_2['own_proba'].describe()

In [None]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

In [None]:
train_users_2 = users_lvl_2['user_id'].unique()
result_lvl_2 = result_lvl_2[result_lvl_2['user_id'].isin(train_users_2)]

In [None]:
result_lvl_2['own'] = result_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
result_lvl_2.head(3)

In [None]:
result_lvl_2['xgb_candidates'] = result_lvl_2['user_id'].apply(lambda x:
                                                          get_xgb_rec(x))
result_lvl_2.head(3)

In [None]:
prec_1lvl_model = result_lvl_2.apply(lambda row:
                        precision_at_k(row['own'], row['actual'], 5), axis=1).mean()

prec_2lvl_model = result_lvl_2.apply(lambda row:
                        precision_at_k(row['xgb_candidates'], row['actual'], 5), axis=1).mean()

print(f'1-lvl model Precision is {prec_1lvl_model}\n2-lvl model Precision is {prec_2lvl_model}')

## Test

In [None]:
data_test = pd.read_csv('retail_test1.csv')
data_test = prefilter_items(data=data_test, take_n_popular=18000, item_features=item_features)
data_test.head(5)

In [None]:
data_test = data_test[data_test['user_id'].isin(common_users)]

data_test

In [None]:
result_test = data_test.groupby('user_id')['item_id'].unique().reset_index()
result_test.columns=['user_id', 'actual']
result_test.head(2)

In [None]:
%%time

k = 50
result_test['own'] = result_test['user_id'].apply(lambda x:
                                    recommender.get_own_recommendations(x, k))

In [None]:
result_test['xgb_candidates'] = result_test['user_id'].apply(lambda x:
                                                          get_xgb_rec(x))
result_test.head(3)

In [None]:
precision_k = result_test.apply(lambda row:
                        precision_at_k(row['own'], row['actual'], 5), axis=1).mean()

precision_k_xgb = result_test.apply(lambda row:
                        precision_at_k(row['xgb_candidates'], row['actual'], 5), axis=1).mean()

precision_k, precision_k_xgb