# practice

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, coo_matrix
from implicit.nearest_neighbours import bm25_weight

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

from additional import DataProcessor
from sklearn.preprocessing import StandardScaler

from hyperopt import hp, fmin, tpe
import json

## load & split

In [40]:
# Load the purchase log.
purchases = pd.read_csv('retail_train.csv')

# Time-based train/test split on week_no.
# Rows strictly below the boundary week go to train, rows at or above it go to test.
# Note: the boundary week itself lands in test, so test can cover up to
# test_size_weeks + 1 distinct week_no values.
test_size_weeks = 3
split_week = purchases['week_no'].max() - test_size_weeks
train = purchases[purchases['week_no'].lt(split_week)].copy()
test = purchases[purchases['week_no'].ge(split_week)].copy()

EDA в отдельном блокноте

## prepare dataset

Подготовим параметры обработки датасета:
* `defaults_params`: на основе количества проданных товаров
* `mix_feat_params`: на комбинации стоимости и количества проданных товаров

In [41]:
# Dataset-processing presets (keyword arguments for DataProcessor).

# mix_feat: based on a combination of sales value and quantity sold.
# NOTE(review): the exact meaning of 'beta', 'k' and 'scaler' is defined by
# DataProcessor and is not visible here - confirm against its implementation.
# The optional uim_config entry 'weights': bm25_weight is left disabled.
mix_feat_params = dict(
    top_config=dict(
        fields=['quantity', 'sales_value'],
        beta=[1., 1.],
        k=5000,
        scaler=StandardScaler,
    ),
    uim_config=dict(aggfunc='sum'),
)

# defaults: based on the quantity of items sold only.
# The optional uim_config entry 'weights': bm25_weight is left disabled here too.
defaults_params = dict(
    top_config=dict(fields=['quantity'], k=5000),
    uim_config=dict(aggfunc='count'),
)

In [42]:
# Create the container holding the training and validation data, configured
# with the mix_feat preset, then fit it.
# NOTE(review): DataProcessor comes from the local `additional` module; what
# fit() computes is not visible here. Later cells read train_uim,
# train_uim_sparse, train_uim_weighted, test_uim_sparse and test_uim_weighted
# from this object.
preparer = DataProcessor(train, test, **mix_feat_params)
preparer.fit()

## Item featuring

In [43]:
# Columns kept from the item catalogue. Too much information hurts as well:
# weight info is intuitively uninformative, and commodity_desc is only an
# intermediate category (the dataset also has department and sub_commodity_desc).
# Wider alternative (disabled):
#   ['item_id', 'manufacturer', 'sub_commodity_desc', 'department', 'brand']
keep_cols = ['item_id', 'manufacturer', 'sub_commodity_desc']

# Load the item data, lower-case the column names, rename the key column to
# item_id and keep only the selected columns.
item_data = (
    pd.read_csv('product.csv')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
    [keep_cols]
)

In [44]:
# Disabled experiment: per-item purchase-frequency ("day rate") category feature.
# # day rate
# day_rate = (purchases.groupby('item_id')['day'].nunique() / purchases['day'].max()).rename('day_rate')
# everyday = day_rate > 0.71                      # ~ corresponds to 5 days out of 7
# everyweek = ~everyday & (day_rate > 0.42)       # ~ corresponds to 3 days out of 7
# day_rate[everyday] = 'everyday'
# day_rate[everyweek] = 'everyweek'
# day_rate[~everyday & ~everyweek] = 'episodic'
# item_data = item_data.merge(day_rate, on='item_id', how='left').fillna('unknown')

In [45]:
# Build the item feature table: one row per column (item) of the training
# user-item matrix, in the same order, with the catalogue attributes attached.
item_features = (
    pd.DataFrame(preparer.train_uim.columns)
    .merge(item_data, on='item_id', how='left')
    .set_index('item_id')
)

# One-hot encode every attribute column.
item_features = pd.get_dummies(item_features, columns=list(item_features.columns))

# The raw catalogue is no longer needed.
# Note: because of this `del`, re-running the cell requires re-running the load cell first.
del item_data

In [46]:
# item_features

## User featuring

In [47]:
# Load the user data (prepared in the notebook of the same name) and align it
# with the rows (users) of the training user-item matrix; users without a
# record in the file get zeros.
user_data = pd.read_csv('user_features_corrected.csv')
user_features = (
    pd.DataFrame(preparer.train_uim.index)
    .merge(user_data, on='user_id', how='left')
    .fillna(0)
    .set_index('user_id')
)

# Select the feature columns by name prefix. Only the income group is used.
# Other groups that were selectable the same way are currently disabled
# (prefixes: age, marital, homeowner, single, hh_size, kids), e.g.
#   user_features = user_features[[*income, *kids, *size, *age, *marital]]
income = [col for col in user_data.columns if col.startswith('income')]
user_features = user_features[income]

# Free the raw user table.
del user_data

In [48]:
# user_features

In [49]:
# Disabled cell: builds a test matrix reshaped to the train matrix layout.
# # test matrix for a check without features - the model then works as matrix factorization
# add_cols = [col for col in preparer.train_uim.columns if col not in preparer.test_uim.columns]
# tst = pd.concat([preparer.test_uim, pd.DataFrame(columns=add_cols)], axis=1)[preparer.train_uim.columns]
# add_index = [row for row in preparer.train_uim.index if row not in preparer.test_uim.index]
# tst = pd.concat([tst, pd.DataFrame(add_index)])
# tst.fillna(0, inplace=True)
# tst.shape

## LightFM

In [50]:
# Model hyper-parameters are read from the saved hyperopt result.
# The file stores no_components as a float (e.g. 13.0), so it is cast back to int.
# Baseline values for reference:
#   no_components=10, learning_rate=0.1, item_alpha=0.1, user_alpha=0.1
with open('hypopt.json') as params_file:  # context manager closes the file handle
    saved_params = json.load(params_file)['params']
model_params = {k: int(v) if k == 'no_components' else v for k, v in saved_params.items()}

# Build the sparse feature matrices once and reuse them for training and for
# both evaluations (csr_matrix already returns CSR, no extra .tocsr() needed).
user_features_csr = csr_matrix(user_features.values)
item_features_csr = csr_matrix(item_features.values)

model = LightFM(loss='warp',  # alternative: 'bpr'
                random_state=42,
                **model_params)

model.fit((preparer.train_uim_sparse > 0) * 1,  # user-item matrix of 0s and 1s
          sample_weight=coo_matrix(preparer.train_uim_weighted),
          user_features=user_features_csr,
          item_features=item_features_csr,
          epochs=10)

# Mean precision@5 on the training interactions.
train_pr = precision_at_k(model, preparer.train_uim_weighted, k=5,
                          user_features=user_features_csr,
                          item_features=item_features_csr
                          ).mean()

# Mean precision@5 on the held-out interactions.
# (Disabled variant evaluated on the reshaped matrix: csr_matrix(tst).tocsr())
test_pr = precision_at_k(model, preparer.test_uim_weighted, k=5,
                         user_features=user_features_csr,
                         item_features=item_features_csr
                         ).mean()

print(f'Train pr@5: {train_pr}', f'Test pr@5: {test_pr}', sep='\n')

Train pr@5: 0.03731942176818848
Test pr@5: 0.043522268533706665


In [8]:
# item_index = np.arange(preparer.train_uim.columns.size)
# predictions = model.predict(user_ids=6, item_ids=item_index,
#                             user_features=csr_matrix(user_features.values).tocsr(),
#                             item_features=csr_matrix(item_features.values).tocsr(),
#                             num_threads=4)

## hyperopt

In [35]:
# Every evaluated parameter set and its precision@5, in evaluation order.
# Both lists start with a placeholder entry (empty params, metric 0),
# presumably so that argmax/max work before any trial has run.
hopt_history = [{},]
hopt_metrics = [0,]

# define objective function
def objective(params):
    """Train LightFM with ``params`` and return a loss for hyperopt to minimise.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``LightFM`` (``loss`` and
        ``random_state`` are fixed here and must not be included).

    Returns
    -------
    float
        ``1 / precision@5`` measured on ``preparer.test_uim_sparse``, or
        ``np.inf`` when the precision is zero.

    Side effects
    ------------
    Appends ``params`` to ``hopt_history`` and the precision to ``hopt_metrics``.

    NOTE(review): the score is computed on the test split, so parameters chosen
    this way are tuned on the same data that is later reported as the test metric.
    """
    # Convert the feature tables once per trial instead of once per call site.
    user_feats = csr_matrix(user_features.values)
    item_feats = csr_matrix(item_features.values)

    model = LightFM(**params, loss='warp', random_state=42)

    model.fit((preparer.train_uim_sparse > 0) * 1,  # user-item matrix of 0s and 1s
              sample_weight=coo_matrix(preparer.train_uim_weighted),
              user_features=user_feats,
              item_features=item_feats,
              epochs=10)

    _pr = precision_at_k(model, preparer.test_uim_sparse, k=5,
                         user_features=user_feats,
                         item_features=item_feats).mean()
    hopt_history.append(params)
    hopt_metrics.append(_pr)
    # hyperopt minimises, so higher precision must map to a lower loss.
    return 1 / _pr if _pr else np.inf

In [36]:
# Hyperopt search space for the LightFM hyper-parameters.
search_space = dict(
    no_components=5 + hp.randint('no_components', 95),  # random integer shifted up by 5
    learning_rate=hp.uniform('learning_rate', 1e-5, 0.4),
    item_alpha=hp.uniform('item_alpha', 0, 0.4),
    user_alpha=hp.uniform('user_alpha', 0, 0.4),
)

# Settings that are not searched; they repeat the values hard-coded in objective().
static_params = dict(loss='warp', random_state=42)

In [11]:
# %%time
# # manual searching
# best = fmin(objective, search_space, algo=tpe.suggest, max_evals=5)
# best.update(static_params)
# hopt_history[np.array(hopt_metrics).argmax()], max(hopt_metrics)
# best

In [12]:
# # manual save
# saved_best = json.load(open('hypopt.json'))
# params, metric = hopt_history[np.array(hopt_metrics).argmax()], max(hopt_metrics)
# if metric > saved_best['metric']:
#     saved_best = {'metric': float(metric), 'params': {k: float(v) for k, v in params.items()}}
#     json.dump(saved_best, open('hypopt.json', 'w'))

In [37]:
# Disabled cell: repeated short hyperopt runs, saving the best result to hypopt.json.
# %%time
# # search in several rounds
# n_cycles = 20
# best_arr = []       # parameter sets that the algorithm considered best

# saved_best = json.load(open('hypopt.json'))
# for _ in range(n_cycles):
#     try:
#         best = fmin(objective, search_space, algo=tpe.suggest, max_evals=5)
#     except ValueError:
#         pass
#     else:
#         best.update(static_params)
#         best_arr.append(best)

#     params, metric = hopt_history[np.array(hopt_metrics).argmax()], max(hopt_metrics)
#     if metric > saved_best['metric']:
#         saved_best = {'metric': float(metric), 'params': {k: float(v) for k, v in params.items()}}
#         json.dump(saved_best, open('hypopt.json', 'w'))

# saved_best

100%|██████████| 5/5 [01:28<00:00, 17.78s/trial, best loss: 83.02520590987774] 
100%|██████████| 5/5 [01:35<00:00, 19.01s/trial, best loss: 161.96720476642002]
100%|██████████| 5/5 [01:22<00:00, 16.57s/trial, best loss: 22.976743485361535]
100%|██████████| 5/5 [01:10<00:00, 14.04s/trial, best loss: 103.999986052515]  
100%|██████████| 5/5 [00:58<00:00, 11.76s/trial, best loss: 164.6666625260066] 
100%|██████████| 5/5 [01:53<00:00, 22.76s/trial, best loss: 111.01123230281387]
100%|██████████| 5/5 [01:26<00:00, 17.34s/trial, best loss: 141.14285823201038]
100%|██████████| 5/5 [01:14<00:00, 14.86s/trial, best loss: 125.06328326027291]
100%|██████████| 5/5 [01:15<00:00, 15.16s/trial, best loss: 87.43363128173183]
100%|██████████| 5/5 [01:57<00:00, 23.46s/trial, best loss: 129.9999920092528] 
100%|██████████| 5/5 [01:00<00:00, 12.12s/trial, best loss: 135.34246618854752]
100%|██████████| 5/5 [01:59<00:00, 23.89s/trial, best loss: 120.48781063649244]
100%|██████████| 5/5 [01:26<00:00, 17.28s

{'metric': 0.043522268533706665,
 'params': {'item_alpha': 0.3715697055298993,
  'learning_rate': 0.21255221183657733,
  'no_components': 13.0,
  'user_alpha': 0.2145757171965058}}

baseline: 0.4366 / 0.0026

common: 0.390 / 0.00587

In [None]:
# 