In [1]:
from implicit.bpr import BayesianPersonalizedRanking

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Функции из 1-ого вебинара
import os, sys

# Put the parent directory of the current working directory on the import
# path (once), so modules stored next to the notebook folder can be imported.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
    
#from metrics import precision_at_k, recall_at_k
#from utils import prefilter_items

In [5]:
# Load the raw transactions, lower-case every column name and switch to the
# user_id / item_id naming used by the rest of the notebook.
data = (
    pd.read_csv('retail_train.csv')
    .rename(columns=str.lower)
    .rename(columns={'household_key': 'user_id', 'product_id': 'item_id'})
)

test_size_weeks = 3

# Time-based split: every row dated before the boundary week goes to train,
# every row on or after it goes to test.
# NOTE(review): the boundary week itself lands in the test part, so with integer
# week numbers the hold-out spans test_size_weeks + 1 distinct weeks - confirm
# that this is intended.
first_test_week = data['week_no'].max() - test_size_weeks
data_train = data[data['week_no'] < first_test_week]
data_test = data[data['week_no'] >= first_test_week]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [6]:
# Load the product catalogue with lower-cased column names and the same
# item_id key that the transactions frame uses.
item_features = (
    pd.read_csv('product.csv')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [13]:
%%writefile "utils.py"

import numpy as np


class Prefilter_Items():
    """Learn item black-lists on training data and strip those items from any frame.

    ``fit`` inspects the training interactions and stores, for every enabled
    rule, the list of ``item_id`` values to discard. ``transform`` removes the
    rows of a frame whose ``item_id`` appears on any of those lists.

    Parameters
    ----------
    drop_popular : bool
        Drop items bought by more than ``POPULAR_SHARE`` of the unique users.
    drop_not_popular : bool
        Drop items bought by fewer than ``NOT_POPULAR_SHARE`` of the unique users.
    drop_old : bool
        Drop items whose most recent sale is ``OLD_WEEKS`` or more weeks
        before the last week present in the training data.
    drop_low_price, drop_high_price : bool
        Drop items whose per-unit price (``sales_value / max(quantity, 1)``,
        taken from the item's last row in the training frame) is below
        ``LOW_PRICE`` / above ``HIGH_PRICE``.
    drop_not_top_N : int or None
        When given, keep only the N items with the largest share of unique
        buyers and drop all the others.
    item_features : pandas.DataFrame or None
        Frame with ``item_id`` and ``department`` columns; only read when
        ``drop_wrong_department`` is given.
    drop_wrong_department : list-like or None
        Departments whose items must be dropped.
    """

    # Thresholds applied by ``fit``. Kept as class attributes so they can be
    # tuned on a subclass or an instance without touching the constructor.
    POPULAR_SHARE = 0.5        # share of unique users above which an item is "popular"
    NOT_POPULAR_SHARE = 0.01   # share of unique users below which an item is "not popular"
    OLD_WEEKS = 52             # weeks without a sale after which an item is "old"
    LOW_PRICE = 1              # per-unit price below which an item is "too cheap"
    HIGH_PRICE = 100           # per-unit price above which an item is "too expensive"

    def __init__(self, drop_popular=False, drop_not_popular=False, drop_old=False, drop_low_price=False, 
                 drop_high_price=False, drop_not_top_N=None, item_features=None, drop_wrong_department=None):
        
        self.drop_popular = drop_popular
        self.drop_not_popular = drop_not_popular
        self.drop_old = drop_old
        self.drop_low_price = drop_low_price
        self.drop_high_price = drop_high_price
        
        self.drop_not_top_N = drop_not_top_N
        self.item_features = item_features
        self.drop_wrong_department = drop_wrong_department
        
        # Black-lists filled in by fit(); None means "not fitted / rule disabled".
        self.popular = None
        self.not_popular = None
        self.old = None
        self.low_price = None
        self.high_price = None
        self.not_top_N = None
        self.wrong_department = None
    
    def fit(self, data_train):
        """Compute the item black-lists of every enabled rule from ``data_train``.

        ``data_train`` must contain ``item_id`` plus the columns the enabled
        rules read: ``user_id`` (popularity and top-N rules), ``week_no``
        (``drop_old``), ``sales_value`` and ``quantity`` (price rules).

        Returns
        -------
        Prefilter_Items
            ``self``, so that ``fit(...).transform(...)`` can be chained.

        Raises
        ------
        ValueError
            If ``drop_wrong_department`` is set but ``item_features`` is missing.
        """
        if self.drop_wrong_department is not None and self.item_features is None:
            raise ValueError('item_features is required when drop_wrong_department is set')
        
        if self.drop_popular or self.drop_not_popular or (self.drop_not_top_N is not None):
            # Share of all unique users that bought each item, indexed by item_id.
            n_users = data_train['user_id'].nunique()
            share = data_train.groupby('item_id')['user_id'].nunique() / n_users
        
        if self.drop_popular:
            self.popular = share[share > self.POPULAR_SHARE].index.tolist()
        
        if self.drop_not_popular:
            self.not_popular = share[share < self.NOT_POPULAR_SHARE].index.tolist()
        
        if self.drop_old:
            # An item is old only if its LAST sale is old; looking at individual
            # rows would also flag items that are still being sold today.
            last_sale_week = data_train.groupby('item_id')['week_no'].max()
            cutoff_week = data_train['week_no'].max() - self.OLD_WEEKS
            self.old = last_sale_week[last_sale_week <= cutoff_week].index.tolist()
        
        if self.drop_low_price or self.drop_high_price:
            # Price of one unit; quantity is clipped at 1 to avoid dividing by zero.
            # Plain arrays are used so a non-unique frame index cannot misalign rows.
            unit_price = data_train['sales_value'].values / np.maximum(data_train['quantity'].values, 1)
            price = (
                data_train[['item_id']]
                .assign(unit_price=unit_price)
                .groupby('item_id')['unit_price']
                .last()
            )
        
        if self.drop_low_price:
            self.low_price = price[price < self.LOW_PRICE].index.tolist()
        
        if self.drop_high_price:
            self.high_price = price[price > self.HIGH_PRICE].index.tolist()
            
        if self.drop_not_top_N is not None:
            ranked = share.sort_values(ascending=True)
            # Everything except the N most popular items. The count is computed
            # explicitly because a negative-zero slice (N == 0) would select nothing.
            n_to_drop = max(len(ranked) - self.drop_not_top_N, 0)
            self.not_top_N = ranked.index[:n_to_drop].tolist()
        
        if self.drop_wrong_department is not None:
            in_wrong_department = self.item_features['department'].isin(self.drop_wrong_department)
            self.wrong_department = self.item_features.loc[in_wrong_department, 'item_id'].tolist()
        
        return self
            
        
    def transform(self, data):
        """Return ``data`` without the rows whose ``item_id`` is black-listed.

        Raises
        ------
        RuntimeError
            If a rule is enabled but ``fit`` has not produced its black-list yet.
        """
        rules = (
            # The most popular items (they will be bought anyway)
            (self.drop_popular, self.popular),
            # The least popular items (they will not be bought anyway)
            (self.drop_not_popular, self.not_popular),
            # Items that have not been sold during the last 12 months
            (self.drop_old, self.old),
            # Too cheap items (no money to be made on them);
            # one purchase from a mailing costs 60 rubles
            (self.drop_low_price, self.low_price),
            # Too expensive items
            (self.drop_high_price, self.high_price),
            # Items that did not make it into the top N
            (self.drop_not_top_N is not None, self.not_top_N),
            # Categories (departments) that are of no interest for recommendations
            (self.drop_wrong_department is not None, self.wrong_department),
        )
        
        for enabled, items in rules:
            if not enabled:
                continue
            if items is None:
                raise RuntimeError('Prefilter_Items.transform() called before fit()')
            data = data[~data['item_id'].isin(items)]
        
        return data

Writing utils.py


In [16]:
def get_similar_items_recommendation(self, user, N=5):
    """Recommend items similar to the top-N items bought by the user.

    The first N rows of ``self.popularity`` for ``user`` are taken as the
    user's top items. Their neighbours from ``self.model.similar_items`` are
    then collected round-robin (closest neighbour of every item first, then
    the next closest, and so on) until N distinct items are gathered.

    Parameters
    ----------
    user : user id as stored in ``self.popularity['user_id']``
    N : int
        Number of top items to start from and number of items to return.

    Returns
    -------
    list
        Up to N distinct item ids in the order they were found. Fewer than N
        (possibly none) are returned when the user has no rows in
        ``self.popularity`` or the model runs out of neighbours.
    """
    user_rows = self.popularity.loc[self.popularity['user_id'] == user]
    top_items = user_rows['item_id'].head(N).tolist()

    recommendations = []
    seen = set()

    # NOTE(review): similar_items is expected to return a list of
    # (internal_id, score) pairs - confirm against the installed implicit version.
    # Ranks start at 2, i.e. the first hit is skipped - presumably because it
    # is the query item itself.
    rank = 2
    # A model cannot be asked for more neighbours than there are known items.
    max_rank = len(self.itemid_to_id)

    while len(recommendations) < N and rank <= max_rank:
        found_candidate = False

        for item in top_items:
            neighbours = self.model.similar_items(self.itemid_to_id[item], N=rank)
            if len(neighbours) < rank:
                # This item has no neighbour at the requested rank.
                continue
            found_candidate = True

            candidate = self.id_to_itemid[neighbours[rank - 1][0]]
            if candidate not in seen:
                seen.add(candidate)
                recommendations.append(candidate)
            if len(recommendations) == N:
                break

        if not found_candidate:
            # No purchases for this user, or every neighbour list is exhausted:
            # another round cannot add anything, so stop instead of spinning.
            break
        rank += 1

    return recommendations
     


In [17]:
def get_similar_users_recommendation(self, user, N=5):
    """Recommend top-N items among those recommended for similar users.

    The N nearest neighbours of ``user`` are taken from
    ``self.model.similar_users``. Items are then collected round-robin from
    ``self.own_recommender``: the first recommendation of every neighbour,
    then the second one, and so on, until N distinct items are gathered.

    Parameters
    ----------
    user : user id, a key of ``self.userid_to_id``
    N : int
        Number of neighbours to consult and number of items to return.

    Returns
    -------
    list
        Up to N distinct item ids in the order they were found. Fewer than N
        are returned when the neighbours run out of recommendations.
    """
    # N + 1 neighbours are requested and the first one is dropped -
    # presumably because the most similar user is the query user itself.
    neighbours = self.model.similar_users(self.userid_to_id[user], N=N + 1)[1:]

    # The interaction matrix does not change between calls: convert it once.
    user_items = csr_matrix(self.user_item_matrix).tocsr()

    recommendations = []
    seen = set()
    rank = 1
    # A recommender cannot return more distinct items than the id map knows.
    max_rank = len(self.id_to_itemid)

    while len(recommendations) < N and rank <= max_rank:
        found_candidate = False

        for neighbour in neighbours:
            # NOTE(review): recommend is expected to return a list of
            # (internal_item_id, score) pairs - confirm against the installed
            # implicit version.
            ranked_items = self.own_recommender.recommend(
                userid=neighbour[0],
                user_items=user_items,
                N=rank,
                filter_already_liked_items=False,
                recalculate_user=True)
            if len(ranked_items) < rank:
                # This neighbour has no recommendation at the requested rank.
                continue
            found_candidate = True

            candidate = self.id_to_itemid[ranked_items[rank - 1][0]]
            if candidate not in seen:
                seen.add(candidate)
                recommendations.append(candidate)
            if len(recommendations) == N:
                break

        if not found_candidate:
            # No neighbours at all, or every neighbour is exhausted: another
            # round cannot add anything, so stop instead of spinning.
            break
        rank += 1

    return recommendations

In [18]:
def recs_top_n(self, N=5):
    """Return the ids of the N items with the most rows in ``self.popularity``.

    Rows are counted per ``item_id`` (non-null ``quantity`` values only) and
    the items are ordered by that count, largest first.
    """
    rows_per_item = (
        self.popularity
        .groupby('item_id')['quantity']
        .count()
        .reset_index()
        .sort_values('quantity', ascending=False)
    )

    return rows_per_item['item_id'].head(N).tolist()