In [1]:
import cloudpickle
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from featuring import Merger, ColumnsCorrector, LastDaysRate, BasketRate, DepartmentSellRate, SameDepartmentPurchases, \
    MeanDepartmentExpenses

from datasplit import DataSplit, prepare_true_values
from preprocess import DataPreprocessor
from candidate_model import CandidateModel
from metrics import precision_at_k, recall_at_k, calc_mean_metric

## prepare data & fit candidate model

In [2]:
purchases = pd.read_csv('retail_train.csv')

# train/valid split
splitter = DataSplit(purchases, 'week_no', [6, 4])

train_lv1 = purchases[splitter.part0].copy()
valid_lv1 = purchases[splitter.part1].copy()
valid_lv2 = purchases[splitter.part2].copy()

# prepare lv1 validation true values
true_train_lv1 = prepare_true_values(train_lv1)
true_valid_lv1 = prepare_true_values(valid_lv1)

In [3]:
mix_feat_params = {
    'top_config': {'fields': ['quantity', 'sales_value'],
                   'beta': [1., 1.],
                   'k': 5000,
                   'scaler': StandardScaler
                    },
    'uim_config': {'aggfunc': 'sum',
                   # 'weights': tfidf_weight
                   },
}

pre = DataPreprocessor(train_lv1, **mix_feat_params)
pre.fit()

In [4]:
candidate_params = {
    'train': pre.train_uim_sparse,
    'weighted': pre.train_uim_weighted,
    'top_items': pre.top_k_items,
    'placeholder_id': pre.placeholder_id,
    'idx_to_item': pre.idx_to_item,
    'item_to_idx': pre.item_to_idx,
    'user_to_idx': pre.user_to_idx
}
n_candidates = 100
cm = CandidateModel('BM25', **candidate_params)
cm.fit(K=1)

train_candidates = cm.predict(true_train_lv1['user_id'], N=n_candidates)
valid_candidates = cm.predict(true_valid_lv1['user_id'], N=n_candidates)

recall_lv1_train = calc_mean_metric(recall_at_k, true_train_lv1['actual'], train_candidates, k=n_candidates)
recall_lv1_valid = calc_mean_metric(recall_at_k, true_valid_lv1['actual'], valid_candidates, k=n_candidates)
recall_lv1_train, recall_lv1_valid

(0.30147727453621564, 0.2123693837009425)

In [5]:
# collect candidates for ALL existing users
all_users = pd.Series(purchases['user_id'].sort_values().unique(), name='user_id')    # sort здесь не обязателен, но удобен при отладке
pred_candidates = cm.predict(all_users, N=n_candidates)
candidates = pd.DataFrame.from_dict(pred_candidates.to_dict(), orient='index').set_index(all_users)
candidates = candidates.stack().reset_index(level=1, drop=True).rename('item_id').reset_index()
candidates.shape

(249900, 2)

In [6]:
# общая доля релевантных товаров среди кандидатов в обучающей выборке
relevant = valid_lv1[['user_id', 'item_id']].copy()
relevant['target'] = 1
zeros, ones = candidates.merge(relevant, on=['user_id', 'item_id'], how='left').fillna(0)['target'].value_counts()
ones / (zeros + ones)

0.15684617242212026

## prepare data for lv2 model

In [7]:
def merge_candidates(df, cand, users=None):
    """ Prepare dataset lv2 for featuring
    :param df: required data to be prepared
    :param cand: dataset with stacked candidates
    :param users: leave only specified users
    """
    if users is not None:
        warm = df['user_id'].isin(users)
        target = df[warm].copy()
    else:
        target = df.copy()
    required_users = cand['user_id'].isin(target['user_id'].unique())       # keep candidates for only required users
    target['target'] = 1      # flag means this item was really bought
    target = cand[required_users].merge(target, on=['user_id', 'item_id'], how='left').fillna(0)
    return target.drop(columns='target'), target['target']

In [8]:
# train lv2: prepare for featuring (markup & merge candidates)
train_lv2_merged, train_lv2_target = merge_candidates(valid_lv1, candidates)    # both warm & cold: candidates for cold users are predicted from top5k

# valid lv2: prepare for featuring (markup & merge candidates)
valid_lv2_merged, valid_lv2_target = merge_candidates(valid_lv2, candidates)
train_lv2_merged.shape, valid_lv2_merged.shape

((230524, 12), (211663, 12))

### feature engineering

In [9]:
# # baseline item features
# drop_columns = ['basket_id', 'day', 'quantity', 'sales_value', 'store_id', 'retail_disc', 'trans_time', 'week_no', 'coupon_disc', 'coupon_match_disc']
# item_data = pd.read_csv('product.csv')
# item_data.columns = item_data.columns.str.lower()
# item_data.rename(columns={'product_id': 'item_id'}, inplace=True)
# # item_data.drop(columns=['curr_size_of_product'], inplace=True)
# keep_cols = ['item_id', 'department',]
# item_data = item_data[keep_cols]
#
# # baseline user features
# user_data = pd.read_csv('../hw5/user_features_corrected.csv')
#
# # merge dummies
# featured_train_lv2 = train_lv2.copy()
# featured_train_lv2 = featured_train_lv2.merge(pd.get_dummies(item_data), on='item_id', how='left').fillna(0)
# featured_train_lv2 = featured_train_lv2.merge(user_data, on='user_id', how='left').fillna(0)
# featured_train_lv2.drop(columns=drop_columns, inplace=True)
#
# featured_valid_lv2 = valid_lv2.copy()
# featured_valid_lv2 = featured_valid_lv2.merge(pd.get_dummies(item_data), on='item_id', how='left').fillna(0)
# featured_valid_lv2 = featured_valid_lv2.merge(user_data, on='user_id', how='left').fillna(0)
# featured_valid_lv2.drop(columns=drop_columns, inplace=True)

In [10]:
# load items data
item_data = pd.read_csv('product.csv')
item_data.columns = item_data.columns.str.lower()
item_data.rename(columns={'product_id': 'item_id'}, inplace=True)

# load users data
user_data = pd.read_csv('hh_demographic.csv')
user_data.columns = user_data.columns.str.lower()
user_data.rename(columns={'household_key': 'user_id'}, inplace=True)

# load prepared user features
user_data_corrected = pd.read_csv('../hw5/user_features_corrected.csv')

In [11]:
drop_columns = ['basket_id', 'day', 'quantity', 'sales_value', 'store_id', 'retail_disc', 'trans_time', 'week_no',
                'coupon_disc', 'coupon_match_disc']
keep_user_cols = ['hh_comp', 'hh_size', 'kids', 'single_female', 'single_male']

featuring = Pipeline([('UserFeaturesMerger', Merger(user_data_corrected, on='user_id', cols=keep_user_cols)),
                      # ('LastDaysRate', LastDaysRate(n_days=25)),
                      # ('BasketRate', BasketRate(n_days=14)),
                      ('MeanDepartmentExpenses', MeanDepartmentExpenses(item_data, n_days=28)),
                      ('DepartmentSellRate', DepartmentSellRate(item_data, n_days=7)),
                      ('SameDepartmentPurchases', SameDepartmentPurchases(item_data, n_days=3)),
                      # ('', ),
                      # ('', ),
                      ('drop', ColumnsCorrector(drop_columns, mode='drop')),
                      ])

featuring.fit(train_lv2_merged)
featured_train_lv2 = featuring.transform(train_lv2_merged)
featured_valid_lv2 = featuring.transform(valid_lv2_merged)

## fit lv2 LGBM

In [12]:
model = LGBMClassifier(#max_depth=3,
                       num_leaves=13,
                       learning_rate=0.00625,
                       n_estimators=150,
                       random_state=193, n_jobs=-1)
model.fit(featured_train_lv2, train_lv2_target)

train_pred = model.predict(featured_train_lv2)
valid_pred = model.predict(featured_valid_lv2)
lgb_pr_train = precision_score(train_lv2_target, train_pred)
lgb_pr_valid = precision_score(valid_lv2_target, valid_pred)
lgb_rc_train = recall_score(train_lv2_target, train_pred)
lgb_rc_valid = recall_score(valid_lv2_target, valid_pred)

lgb_pr_train, lgb_pr_valid, lgb_rc_train, lgb_rc_valid

(0.9664045746962115,
 0.9606986899563319,
 0.032525019245573515,
 0.03205711995920003)

In [13]:
def predict_recommends(data, *, k):
    """ Rank and collect candidates for given users """
    keep_users = candidates['user_id'].isin(data['user_id'].unique())
    data_merged, _ = merge_candidates(data, candidates[keep_users])
    data_featured = featuring.transform(data_merged)
    true_values = prepare_true_values(data_merged)

    proba = pd.Series(model.predict_proba(data_featured).T[1], name='proba')
    ranked_pred = pd.concat([data_featured[['user_id', 'item_id']], proba], axis=1)
    # collect recommends
    sorted_cand = candidates[keep_users].merge(ranked_pred, on=['user_id', 'item_id'], how='left')\
        .sort_values(by=['user_id', 'proba'], ascending=[True, False])\
        .groupby('user_id').head(k)
    predicts = sorted_cand.groupby('user_id')['item_id'].unique()

    # calc rank metric
    metric = calc_mean_metric(precision_at_k, true_values['actual'], predicts.reset_index(drop=True), k=k)
    # fill missing predictions from top K items
    predicts = cm.fill_from_top(predicts, k)
    # можно считать rank_metric тут - но тогда она будет включать неранжированные товары (и возможно, будет выше)
    # metric = calc_mean_metric(precision_at_k, true_values['actual'], predicts.reset_index(drop=True), k=k)
    return predicts, metric

In [14]:
# рассчитываем два варианта метрик
k = 5
true_train_lv2_real = prepare_true_values(valid_lv1)
true_valid_lv2_real = prepare_true_values(valid_lv2)

train_rec, train_rank_precision = predict_recommends(valid_lv1, k=k)
valid_rec, valid_rank_precision = predict_recommends(valid_lv2, k=k)
precision_train_lv2 = calc_mean_metric(precision_at_k, true_train_lv2_real['actual'], train_rec.reset_index(drop=True), k=k)
precision_valid_lv2 = calc_mean_metric(precision_at_k, true_valid_lv2_real['actual'], valid_rec.reset_index(drop=True), k=k)

In [15]:
# Эта метрика показывает, насколько точно (pr@5) предикты соответствуют отобранным кандидатам
train_rank_precision, valid_rank_precision

(0.8355617455896008, 0.8992164544564152)

In [16]:
# А эта - насколько (pr@5) рекомендации совпадают с реальными данными покупок
precision_train_lv2, precision_valid_lv2

(0.2700092850510678, 0.25592556317335946)

In [17]:
# save models
cloudpickle.dump(cm, open('models/candidate_model.pkl', 'wb'))
cloudpickle.dump(featuring, open('models/featuring_pipeline.pkl', 'wb'))
cloudpickle.dump(model, open('models/rank_model.pkl', 'wb'))

# где-то в процессе обработки предиктов list видимо заменяется на массив другого типа, из-за чего криво преобразуется в str
# поэтому сделаем принудительный рекаст
valid_rec = valid_rec.apply(list)
valid_rec.to_csv('recommendations.csv')     # recommendations for 2042 users from lv2 validation data

In [18]:
# читаем предикты из файла
valid_rec_check = pd.read_csv('recommendations.csv', index_col='user_id', converters={'item_id': pd.eval})
# проверяем, что метрика не изменилась - значит все сохранилось/считалось правильно
calc_mean_metric(precision_at_k, true_valid_lv2_real['actual'], valid_rec_check.reset_index(drop=True), k=k)

item_id    0.255926
dtype: float64

In [19]:
#