In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Sparse-matrix utilities
from scipy.sparse import csr_matrix

# Matrix factorization
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight, ItemItemRecommender, CosineRecommender, TFIDFRecommender
from implicit.bpr import BayesianPersonalizedRanking

# Functions from the 1st webinar
import os, sys

# Put the parent directory on sys.path (once) so the `src.*` imports
# below can be resolved when the notebook runs from its own folder.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
    
from src.metrics import precision_at_k, recall_at_k, money_precision_at_k
from src.utils import prefilter_items, postfilter_items
from src.recommenders import MainRecommender

In [2]:
# Load the raw transaction log and normalise the column names:
# lower-case everything, then switch to the user_id / item_id naming
# used by the rest of the notebook.
data = pd.read_csv('./raw_data/retail_train.csv')
data.columns = data.columns.str.lower()
data = data.rename(columns={'household_key': 'user_id', 'product_id': 'item_id'})

# Time-based hold-out: rows strictly before `split_week` go to train,
# rows at or after it go to test.
# NOTE(review): the `>=` boundary is inclusive, so if week_no is
# integer-valued the test part spans test_size_weeks + 1 distinct
# weeks -- confirm this is intended.
test_size_weeks = 3
split_week = data['week_no'].max() - test_size_weeks

data_train = data[data['week_no'] < split_week]
data_test = data[data['week_no'] >= split_week]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# Validation frame: one row per user seen in the test period, with the
# array of distinct items that user bought there stored as `actual`.
result = (
    data_test
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [4]:
# Product catalogue: lower-case the headers and rename the key column
# so it matches the `item_id` naming of the transaction data.
item_features = (
    pd.read_csv('./raw_data/product.csv')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [5]:
# Turn the sub-commodity description into a categorical column and keep
# its integer category codes alongside it as a separate column.
sub_commodity = item_features['sub_commodity_desc'].astype('category')
item_features['sub_commodity_desc'] = sub_commodity
item_features['sub_commodity_desc_code'] = sub_commodity.cat.codes
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,sub_commodity_desc_code
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB,1079
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,1446


In [6]:
# Per-item totals over the training period: summed sales value and
# summed quantity, one row per item_id.
# The columns are selected with a *list* (double brackets): selecting
# them with a bare tuple (`['sales_value', 'quantity']` directly on the
# groupby) was deprecated in pandas 1.0 and raises in pandas 2.0+.
item_mean_cost = (
    data_train
    .groupby(['item_id'])[['sales_value', 'quantity']]
    .sum()
    .reset_index()
)

# Average price per unit = total sales value / total quantity.
# NOTE(review): an item whose summed quantity is 0 ends up with an
# inf/NaN mean_price here -- check how prefilter_items treats those.
item_mean_cost['mean_price'] = item_mean_cost['sales_value'] / item_mean_cost['quantity']

In [7]:
# Among training rows whose per-unit price (sales_value / quantity) is
# at least 7, total the purchased quantity per item and keep the id of
# the item with the largest total.
is_expensive_row = data_train['sales_value'] / data_train['quantity'] >= 7
expensive_item_qty = (
    data_train.loc[is_expensive_row]
    .groupby('item_id')['quantity']
    .sum()
    .reset_index()
    .sort_values('quantity', ascending=False)
)
popular_exp_item = expensive_item_qty[:1].item_id.values[0]
popular_exp_item

6533765

In [8]:
def _count_items(frame):
    """Number of distinct item_id values in `frame`."""
    return frame['item_id'].nunique()


n_items_before = _count_items(data_train)

# NOTE: `data_train` is rebound to the filtered frame here, so re-running
# this cell filters the already-filtered data rather than the original split.
data_train, top_popular = prefilter_items(
    data_train,
    take_n_popular=5000,
    item_mean_cost=item_mean_cost,
)

n_items_after = _count_items(data_train)
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

86825
26745
26094
26090
16492
Decreased # items from 86865 to 16492


In [9]:
# Build the project's recommender wrapper from the prefiltered training
# data plus the item-level lookups prepared in the cells above.
obj = MainRecommender(
    data=data_train,
    top_popular=top_popular,
    item_features=item_features,
    item_mean_cost=item_mean_cost,
    popular_exp_item=popular_exp_item,
    weighting=True,
)



HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




HBox(children=(IntProgress(value=0, max=16492), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2497), HTML(value='')))




In [10]:
%%time
    
def _recommend_own(user_id):
    """Request N=200 recommendations for one user from `obj.own_recommender`."""
    return obj.get_recommendations(user_id, model=obj.own_recommender, N=200)


# One candidate list per user of the validation frame.
result['bm25_item'] = result['user_id'].apply(_recommend_own)

Wall time: 39.4 s


In [11]:
%%time
    
def _recommend_model(user_id):
    """Request N=200 recommendations for one user from `obj.model`."""
    return obj.get_recommendations(user_id, model=obj.model, N=200)


# One candidate list per user of the validation frame.
result['bm25_als'] = result['user_id'].apply(_recommend_model)

Wall time: 38.8 s


In [12]:
%%time
def _postfilter_user(user_id):
    """Run `postfilter_items` for one user on the 'bm25_item' candidates (N=5)."""
    return postfilter_items(
        user=user_id,
        data=result,
        data1=data_train,
        item_features=obj.item_features,
        col='bm25_item',
        N=5,
        item_mean_cost=obj.item_mean_cost,
        all_rec=obj.all_recommendations,
        top=obj.top_popular,
        userid_to_id=obj.userid_to_id,
        id_to_itemid=obj.id_to_itemid,
        popular_exp_item=obj.popular_exp_item,
    )


# Final recommendation per user, stored in the `result` column.
result['result'] = result['user_id'].apply(_postfilter_user)

Wall time: 4min 15s


In [13]:
%%time

# Pass each user's final recommendation to the wrapper's price lookup;
# the per-user outcome is kept next to the recommendation as `price_item`.
result['price_item'] = result['result'].apply(obj.get_recommendations_price)

Wall time: 5.25 s


In [14]:
%%time

# Score every user with money_precision_at_k (recommended items, actual
# purchases, prices of the recommended items), then average over users.
per_user_money_precision = result.apply(
    lambda row: money_precision_at_k(row['result'], row['actual'], row['price_item']),
    axis=1,
)
per_user_money_precision.mean()

Wall time: 289 ms


0.1652330662811777

In [15]:
# Export only the user id and the final recommendation, without the index.
submission = result[['user_id', 'result']]
submission.to_csv('IUMarchenko.csv', index=False)